1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
24 #define INCLUDE_STRING
26 #include "coretypes.h"
37 #include "stringpool.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
56 #include "langhooks.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
77 /* This file should be included last. */
78 #include "target-def.h"
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
86 enum insn_type
{ MOV
, MVN
};
87 enum modifier_type
{ LSL
, MSL
};
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode
, rtx
);
91 simd_immediate_info (scalar_int_mode
, unsigned HOST_WIDE_INT
,
92 insn_type
= MOV
, modifier_type
= LSL
,
94 simd_immediate_info (scalar_mode
, rtx
, rtx
);
96 /* The mode of the elements. */
99 /* The value of each element if all elements are the same, or the
100 first value if the constant is a series. */
103 /* The value of the step if the constant is a series, null otherwise. */
106 /* The instruction to use to move the immediate into a vector. */
109 /* The kind of shift modifier to use, and the number of bits to shift.
110 This is (LSL, 0) if no shift is needed. */
111 modifier_type modifier
;
115 /* Construct a floating-point immediate in which each element has mode
116 ELT_MODE_IN and value VALUE_IN. */
117 inline simd_immediate_info
118 ::simd_immediate_info (scalar_float_mode elt_mode_in
, rtx value_in
)
119 : elt_mode (elt_mode_in
), value (value_in
), step (NULL_RTX
), insn (MOV
),
120 modifier (LSL
), shift (0)
123 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
124 and value VALUE_IN. The other parameters are as for the structure
126 inline simd_immediate_info
127 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
128 unsigned HOST_WIDE_INT value_in
,
129 insn_type insn_in
, modifier_type modifier_in
,
130 unsigned int shift_in
)
131 : elt_mode (elt_mode_in
), value (gen_int_mode (value_in
, elt_mode_in
)),
132 step (NULL_RTX
), insn (insn_in
), modifier (modifier_in
), shift (shift_in
)
135 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
136 and where element I is equal to VALUE_IN + I * STEP_IN. */
137 inline simd_immediate_info
138 ::simd_immediate_info (scalar_mode elt_mode_in
, rtx value_in
, rtx step_in
)
139 : elt_mode (elt_mode_in
), value (value_in
), step (step_in
), insn (MOV
),
140 modifier (LSL
), shift (0)
143 /* The current code model. */
144 enum aarch64_code_model aarch64_cmodel
;
146 /* The number of 64-bit elements in an SVE vector. */
147 poly_uint16 aarch64_sve_vg
;
150 #undef TARGET_HAVE_TLS
151 #define TARGET_HAVE_TLS 1
154 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
155 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
157 machine_mode
*, int *,
159 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
160 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
161 static void aarch64_override_options_after_change (void);
162 static bool aarch64_vector_mode_supported_p (machine_mode
);
163 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
164 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
168 static machine_mode
aarch64_simd_container_mode (scalar_mode
, poly_int64
);
169 static bool aarch64_print_address_internal (FILE*, machine_mode
, rtx
,
170 aarch64_addr_query_type
);
171 static HOST_WIDE_INT
aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
);
173 /* Major revision number of the ARM Architecture implemented by the target. */
174 unsigned aarch64_architecture_version
;
176 /* The processor for which instructions should be scheduled. */
177 enum aarch64_processor aarch64_tune
= cortexa53
;
179 /* Mask to specify which instruction scheduling options should be used. */
180 uint64_t aarch64_tune_flags
= 0;
182 /* Global flag for PC relative loads. */
183 bool aarch64_pcrelative_literal_loads
;
185 /* Global flag for whether frame pointer is enabled. */
186 bool aarch64_use_frame_pointer
;
188 #define BRANCH_PROTECT_STR_MAX 255
189 char *accepted_branch_protection_string
= NULL
;
191 static enum aarch64_parse_opt_result
192 aarch64_parse_branch_protection (const char*, char**);
194 /* Support for command line parsing of boolean flags in the tuning
196 struct aarch64_flag_desc
202 #define AARCH64_FUSION_PAIR(name, internal_name) \
203 { name, AARCH64_FUSE_##internal_name },
204 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
206 { "none", AARCH64_FUSE_NOTHING
},
207 #include "aarch64-fusion-pairs.def"
208 { "all", AARCH64_FUSE_ALL
},
209 { NULL
, AARCH64_FUSE_NOTHING
}
212 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
213 { name, AARCH64_EXTRA_TUNE_##internal_name },
214 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
216 { "none", AARCH64_EXTRA_TUNE_NONE
},
217 #include "aarch64-tuning-flags.def"
218 { "all", AARCH64_EXTRA_TUNE_ALL
},
219 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
222 /* Tuning parameters. */
224 static const struct cpu_addrcost_table generic_addrcost_table
=
234 0, /* register_offset */
235 0, /* register_sextend */
236 0, /* register_zextend */
240 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
250 1, /* register_offset */
251 1, /* register_sextend */
252 2, /* register_zextend */
256 static const struct cpu_addrcost_table xgene1_addrcost_table
=
266 0, /* register_offset */
267 1, /* register_sextend */
268 1, /* register_zextend */
272 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
282 2, /* register_offset */
283 3, /* register_sextend */
284 3, /* register_zextend */
288 static const struct cpu_addrcost_table tsv110_addrcost_table
=
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
304 static const struct cpu_addrcost_table qdf24xx_addrcost_table
=
314 3, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
320 static const struct cpu_regmove_cost generic_regmove_cost
=
323 /* Avoid the use of slow int<->fp moves for spilling by setting
324 their cost higher than memmov_cost. */
330 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
333 /* Avoid the use of slow int<->fp moves for spilling by setting
334 their cost higher than memmov_cost. */
340 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
343 /* Avoid the use of slow int<->fp moves for spilling by setting
344 their cost higher than memmov_cost. */
350 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
353 /* Avoid the use of slow int<->fp moves for spilling by setting
354 their cost higher than memmov_cost (actual, 4 and 9). */
360 static const struct cpu_regmove_cost thunderx_regmove_cost
=
368 static const struct cpu_regmove_cost xgene1_regmove_cost
=
371 /* Avoid the use of slow int<->fp moves for spilling by setting
372 their cost higher than memmov_cost. */
378 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
381 /* Avoid the use of int<->fp moves for spilling. */
387 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
390 /* Avoid the use of int<->fp moves for spilling. */
396 static const struct cpu_regmove_cost tsv110_regmove_cost
=
399 /* Avoid the use of slow int<->fp moves for spilling by setting
400 their cost higher than memmov_cost. */
406 /* Generic costs for vector insn classes. */
407 static const struct cpu_vector_cost generic_vector_cost
=
409 1, /* scalar_int_stmt_cost */
410 1, /* scalar_fp_stmt_cost */
411 1, /* scalar_load_cost */
412 1, /* scalar_store_cost */
413 1, /* vec_int_stmt_cost */
414 1, /* vec_fp_stmt_cost */
415 2, /* vec_permute_cost */
416 1, /* vec_to_scalar_cost */
417 1, /* scalar_to_vec_cost */
418 1, /* vec_align_load_cost */
419 1, /* vec_unalign_load_cost */
420 1, /* vec_unalign_store_cost */
421 1, /* vec_store_cost */
422 3, /* cond_taken_branch_cost */
423 1 /* cond_not_taken_branch_cost */
426 /* QDF24XX costs for vector insn classes. */
427 static const struct cpu_vector_cost qdf24xx_vector_cost
=
429 1, /* scalar_int_stmt_cost */
430 1, /* scalar_fp_stmt_cost */
431 1, /* scalar_load_cost */
432 1, /* scalar_store_cost */
433 1, /* vec_int_stmt_cost */
434 3, /* vec_fp_stmt_cost */
435 2, /* vec_permute_cost */
436 1, /* vec_to_scalar_cost */
437 1, /* scalar_to_vec_cost */
438 1, /* vec_align_load_cost */
439 1, /* vec_unalign_load_cost */
440 1, /* vec_unalign_store_cost */
441 1, /* vec_store_cost */
442 3, /* cond_taken_branch_cost */
443 1 /* cond_not_taken_branch_cost */
446 /* ThunderX costs for vector insn classes. */
447 static const struct cpu_vector_cost thunderx_vector_cost
=
449 1, /* scalar_int_stmt_cost */
450 1, /* scalar_fp_stmt_cost */
451 3, /* scalar_load_cost */
452 1, /* scalar_store_cost */
453 4, /* vec_int_stmt_cost */
454 1, /* vec_fp_stmt_cost */
455 4, /* vec_permute_cost */
456 2, /* vec_to_scalar_cost */
457 2, /* scalar_to_vec_cost */
458 3, /* vec_align_load_cost */
459 5, /* vec_unalign_load_cost */
460 5, /* vec_unalign_store_cost */
461 1, /* vec_store_cost */
462 3, /* cond_taken_branch_cost */
463 3 /* cond_not_taken_branch_cost */
466 static const struct cpu_vector_cost tsv110_vector_cost
=
468 1, /* scalar_int_stmt_cost */
469 1, /* scalar_fp_stmt_cost */
470 5, /* scalar_load_cost */
471 1, /* scalar_store_cost */
472 2, /* vec_int_stmt_cost */
473 2, /* vec_fp_stmt_cost */
474 2, /* vec_permute_cost */
475 3, /* vec_to_scalar_cost */
476 2, /* scalar_to_vec_cost */
477 5, /* vec_align_load_cost */
478 5, /* vec_unalign_load_cost */
479 1, /* vec_unalign_store_cost */
480 1, /* vec_store_cost */
481 1, /* cond_taken_branch_cost */
482 1 /* cond_not_taken_branch_cost */
485 /* Generic costs for vector insn classes. */
486 static const struct cpu_vector_cost cortexa57_vector_cost
=
488 1, /* scalar_int_stmt_cost */
489 1, /* scalar_fp_stmt_cost */
490 4, /* scalar_load_cost */
491 1, /* scalar_store_cost */
492 2, /* vec_int_stmt_cost */
493 2, /* vec_fp_stmt_cost */
494 3, /* vec_permute_cost */
495 8, /* vec_to_scalar_cost */
496 8, /* scalar_to_vec_cost */
497 4, /* vec_align_load_cost */
498 4, /* vec_unalign_load_cost */
499 1, /* vec_unalign_store_cost */
500 1, /* vec_store_cost */
501 1, /* cond_taken_branch_cost */
502 1 /* cond_not_taken_branch_cost */
505 static const struct cpu_vector_cost exynosm1_vector_cost
=
507 1, /* scalar_int_stmt_cost */
508 1, /* scalar_fp_stmt_cost */
509 5, /* scalar_load_cost */
510 1, /* scalar_store_cost */
511 3, /* vec_int_stmt_cost */
512 3, /* vec_fp_stmt_cost */
513 3, /* vec_permute_cost */
514 3, /* vec_to_scalar_cost */
515 3, /* scalar_to_vec_cost */
516 5, /* vec_align_load_cost */
517 5, /* vec_unalign_load_cost */
518 1, /* vec_unalign_store_cost */
519 1, /* vec_store_cost */
520 1, /* cond_taken_branch_cost */
521 1 /* cond_not_taken_branch_cost */
524 /* Generic costs for vector insn classes. */
525 static const struct cpu_vector_cost xgene1_vector_cost
=
527 1, /* scalar_int_stmt_cost */
528 1, /* scalar_fp_stmt_cost */
529 5, /* scalar_load_cost */
530 1, /* scalar_store_cost */
531 2, /* vec_int_stmt_cost */
532 2, /* vec_fp_stmt_cost */
533 2, /* vec_permute_cost */
534 4, /* vec_to_scalar_cost */
535 4, /* scalar_to_vec_cost */
536 10, /* vec_align_load_cost */
537 10, /* vec_unalign_load_cost */
538 2, /* vec_unalign_store_cost */
539 2, /* vec_store_cost */
540 2, /* cond_taken_branch_cost */
541 1 /* cond_not_taken_branch_cost */
544 /* Costs for vector insn classes for Vulcan. */
545 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
547 1, /* scalar_int_stmt_cost */
548 6, /* scalar_fp_stmt_cost */
549 4, /* scalar_load_cost */
550 1, /* scalar_store_cost */
551 5, /* vec_int_stmt_cost */
552 6, /* vec_fp_stmt_cost */
553 3, /* vec_permute_cost */
554 6, /* vec_to_scalar_cost */
555 5, /* scalar_to_vec_cost */
556 8, /* vec_align_load_cost */
557 8, /* vec_unalign_load_cost */
558 4, /* vec_unalign_store_cost */
559 4, /* vec_store_cost */
560 2, /* cond_taken_branch_cost */
561 1 /* cond_not_taken_branch_cost */
564 /* Generic costs for branch instructions. */
565 static const struct cpu_branch_cost generic_branch_cost
=
567 1, /* Predictable. */
568 3 /* Unpredictable. */
571 /* Generic approximation modes. */
572 static const cpu_approx_modes generic_approx_modes
=
574 AARCH64_APPROX_NONE
, /* division */
575 AARCH64_APPROX_NONE
, /* sqrt */
576 AARCH64_APPROX_NONE
/* recip_sqrt */
579 /* Approximation modes for Exynos M1. */
580 static const cpu_approx_modes exynosm1_approx_modes
=
582 AARCH64_APPROX_NONE
, /* division */
583 AARCH64_APPROX_ALL
, /* sqrt */
584 AARCH64_APPROX_ALL
/* recip_sqrt */
587 /* Approximation modes for X-Gene 1. */
588 static const cpu_approx_modes xgene1_approx_modes
=
590 AARCH64_APPROX_NONE
, /* division */
591 AARCH64_APPROX_NONE
, /* sqrt */
592 AARCH64_APPROX_ALL
/* recip_sqrt */
595 /* Generic prefetch settings (which disable prefetch). */
596 static const cpu_prefetch_tune generic_prefetch_tune
=
599 -1, /* l1_cache_size */
600 -1, /* l1_cache_line_size */
601 -1, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 -1 /* default_opt_level */
607 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
610 -1, /* l1_cache_size */
611 64, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
618 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 512, /* l2_cache_size */
624 false, /* prefetch_dynamic_strides */
625 2048, /* minimum_stride */
626 3 /* default_opt_level */
629 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
632 32, /* l1_cache_size */
633 128, /* l1_cache_line_size */
634 16*1024, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 3 /* default_opt_level */
640 static const cpu_prefetch_tune thunderx_prefetch_tune
=
643 32, /* l1_cache_size */
644 128, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
651 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 256, /* l2_cache_size */
657 true, /* prefetch_dynamic_strides */
658 -1, /* minimum_stride */
659 -1 /* default_opt_level */
662 static const cpu_prefetch_tune tsv110_prefetch_tune
=
665 64, /* l1_cache_size */
666 64, /* l1_cache_line_size */
667 512, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 -1 /* default_opt_level */
673 static const cpu_prefetch_tune xgene1_prefetch_tune
=
676 32, /* l1_cache_size */
677 64, /* l1_cache_line_size */
678 256, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
684 static const struct tune_params generic_tunings
=
686 &cortexa57_extra_costs
,
687 &generic_addrcost_table
,
688 &generic_regmove_cost
,
689 &generic_vector_cost
,
690 &generic_branch_cost
,
691 &generic_approx_modes
,
692 SVE_NOT_IMPLEMENTED
, /* sve_width */
695 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
696 "8", /* function_align. */
697 "4", /* jump_align. */
698 "8", /* loop_align. */
699 2, /* int_reassoc_width. */
700 4, /* fp_reassoc_width. */
701 1, /* vec_reassoc_width. */
702 2, /* min_div_recip_mul_sf. */
703 2, /* min_div_recip_mul_df. */
704 0, /* max_case_values. */
705 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
706 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
707 &generic_prefetch_tune
710 static const struct tune_params cortexa35_tunings
=
712 &cortexa53_extra_costs
,
713 &generic_addrcost_table
,
714 &cortexa53_regmove_cost
,
715 &generic_vector_cost
,
716 &generic_branch_cost
,
717 &generic_approx_modes
,
718 SVE_NOT_IMPLEMENTED
, /* sve_width */
721 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
722 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
723 "16", /* function_align. */
724 "4", /* jump_align. */
725 "8", /* loop_align. */
726 2, /* int_reassoc_width. */
727 4, /* fp_reassoc_width. */
728 1, /* vec_reassoc_width. */
729 2, /* min_div_recip_mul_sf. */
730 2, /* min_div_recip_mul_df. */
731 0, /* max_case_values. */
732 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
733 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
734 &generic_prefetch_tune
737 static const struct tune_params cortexa53_tunings
=
739 &cortexa53_extra_costs
,
740 &generic_addrcost_table
,
741 &cortexa53_regmove_cost
,
742 &generic_vector_cost
,
743 &generic_branch_cost
,
744 &generic_approx_modes
,
745 SVE_NOT_IMPLEMENTED
, /* sve_width */
748 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
749 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
750 "16", /* function_align. */
751 "4", /* jump_align. */
752 "8", /* loop_align. */
753 2, /* int_reassoc_width. */
754 4, /* fp_reassoc_width. */
755 1, /* vec_reassoc_width. */
756 2, /* min_div_recip_mul_sf. */
757 2, /* min_div_recip_mul_df. */
758 0, /* max_case_values. */
759 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
760 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
761 &generic_prefetch_tune
764 static const struct tune_params cortexa57_tunings
=
766 &cortexa57_extra_costs
,
767 &generic_addrcost_table
,
768 &cortexa57_regmove_cost
,
769 &cortexa57_vector_cost
,
770 &generic_branch_cost
,
771 &generic_approx_modes
,
772 SVE_NOT_IMPLEMENTED
, /* sve_width */
775 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
776 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
777 "16", /* function_align. */
778 "4", /* jump_align. */
779 "8", /* loop_align. */
780 2, /* int_reassoc_width. */
781 4, /* fp_reassoc_width. */
782 1, /* vec_reassoc_width. */
783 2, /* min_div_recip_mul_sf. */
784 2, /* min_div_recip_mul_df. */
785 0, /* max_case_values. */
786 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
787 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
788 &generic_prefetch_tune
791 static const struct tune_params cortexa72_tunings
=
793 &cortexa57_extra_costs
,
794 &generic_addrcost_table
,
795 &cortexa57_regmove_cost
,
796 &cortexa57_vector_cost
,
797 &generic_branch_cost
,
798 &generic_approx_modes
,
799 SVE_NOT_IMPLEMENTED
, /* sve_width */
802 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
803 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
804 "16", /* function_align. */
805 "4", /* jump_align. */
806 "8", /* loop_align. */
807 2, /* int_reassoc_width. */
808 4, /* fp_reassoc_width. */
809 1, /* vec_reassoc_width. */
810 2, /* min_div_recip_mul_sf. */
811 2, /* min_div_recip_mul_df. */
812 0, /* max_case_values. */
813 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
814 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
815 &generic_prefetch_tune
818 static const struct tune_params cortexa73_tunings
=
820 &cortexa57_extra_costs
,
821 &generic_addrcost_table
,
822 &cortexa57_regmove_cost
,
823 &cortexa57_vector_cost
,
824 &generic_branch_cost
,
825 &generic_approx_modes
,
826 SVE_NOT_IMPLEMENTED
, /* sve_width */
827 4, /* memmov_cost. */
829 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
830 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
831 "16", /* function_align. */
832 "4", /* jump_align. */
833 "8", /* loop_align. */
834 2, /* int_reassoc_width. */
835 4, /* fp_reassoc_width. */
836 1, /* vec_reassoc_width. */
837 2, /* min_div_recip_mul_sf. */
838 2, /* min_div_recip_mul_df. */
839 0, /* max_case_values. */
840 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
841 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
842 &generic_prefetch_tune
847 static const struct tune_params exynosm1_tunings
=
849 &exynosm1_extra_costs
,
850 &exynosm1_addrcost_table
,
851 &exynosm1_regmove_cost
,
852 &exynosm1_vector_cost
,
853 &generic_branch_cost
,
854 &exynosm1_approx_modes
,
855 SVE_NOT_IMPLEMENTED
, /* sve_width */
858 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
859 "4", /* function_align. */
860 "4", /* jump_align. */
861 "4", /* loop_align. */
862 2, /* int_reassoc_width. */
863 4, /* fp_reassoc_width. */
864 1, /* vec_reassoc_width. */
865 2, /* min_div_recip_mul_sf. */
866 2, /* min_div_recip_mul_df. */
867 48, /* max_case_values. */
868 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
869 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
870 &exynosm1_prefetch_tune
873 static const struct tune_params thunderxt88_tunings
=
875 &thunderx_extra_costs
,
876 &generic_addrcost_table
,
877 &thunderx_regmove_cost
,
878 &thunderx_vector_cost
,
879 &generic_branch_cost
,
880 &generic_approx_modes
,
881 SVE_NOT_IMPLEMENTED
, /* sve_width */
884 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
885 "8", /* function_align. */
886 "8", /* jump_align. */
887 "8", /* loop_align. */
888 2, /* int_reassoc_width. */
889 4, /* fp_reassoc_width. */
890 1, /* vec_reassoc_width. */
891 2, /* min_div_recip_mul_sf. */
892 2, /* min_div_recip_mul_df. */
893 0, /* max_case_values. */
894 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
895 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
896 &thunderxt88_prefetch_tune
899 static const struct tune_params thunderx_tunings
=
901 &thunderx_extra_costs
,
902 &generic_addrcost_table
,
903 &thunderx_regmove_cost
,
904 &thunderx_vector_cost
,
905 &generic_branch_cost
,
906 &generic_approx_modes
,
907 SVE_NOT_IMPLEMENTED
, /* sve_width */
910 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
911 "8", /* function_align. */
912 "8", /* jump_align. */
913 "8", /* loop_align. */
914 2, /* int_reassoc_width. */
915 4, /* fp_reassoc_width. */
916 1, /* vec_reassoc_width. */
917 2, /* min_div_recip_mul_sf. */
918 2, /* min_div_recip_mul_df. */
919 0, /* max_case_values. */
920 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
921 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
922 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
923 &thunderx_prefetch_tune
926 static const struct tune_params tsv110_tunings
=
929 &tsv110_addrcost_table
,
930 &tsv110_regmove_cost
,
932 &generic_branch_cost
,
933 &generic_approx_modes
,
934 SVE_NOT_IMPLEMENTED
, /* sve_width */
937 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_CMP_BRANCH
938 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
939 "16", /* function_align. */
940 "4", /* jump_align. */
941 "8", /* loop_align. */
942 2, /* int_reassoc_width. */
943 4, /* fp_reassoc_width. */
944 1, /* vec_reassoc_width. */
945 2, /* min_div_recip_mul_sf. */
946 2, /* min_div_recip_mul_df. */
947 0, /* max_case_values. */
948 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
949 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
950 &tsv110_prefetch_tune
953 static const struct tune_params xgene1_tunings
=
956 &xgene1_addrcost_table
,
957 &xgene1_regmove_cost
,
959 &generic_branch_cost
,
960 &xgene1_approx_modes
,
961 SVE_NOT_IMPLEMENTED
, /* sve_width */
964 AARCH64_FUSE_NOTHING
, /* fusible_ops */
965 "16", /* function_align. */
966 "16", /* jump_align. */
967 "16", /* loop_align. */
968 2, /* int_reassoc_width. */
969 4, /* fp_reassoc_width. */
970 1, /* vec_reassoc_width. */
971 2, /* min_div_recip_mul_sf. */
972 2, /* min_div_recip_mul_df. */
973 17, /* max_case_values. */
974 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
975 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
976 &xgene1_prefetch_tune
979 static const struct tune_params emag_tunings
=
982 &xgene1_addrcost_table
,
983 &xgene1_regmove_cost
,
985 &generic_branch_cost
,
986 &xgene1_approx_modes
,
990 AARCH64_FUSE_NOTHING
, /* fusible_ops */
991 "16", /* function_align. */
992 "16", /* jump_align. */
993 "16", /* loop_align. */
994 2, /* int_reassoc_width. */
995 4, /* fp_reassoc_width. */
996 1, /* vec_reassoc_width. */
997 2, /* min_div_recip_mul_sf. */
998 2, /* min_div_recip_mul_df. */
999 17, /* max_case_values. */
1000 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1001 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1002 &xgene1_prefetch_tune
1005 static const struct tune_params qdf24xx_tunings
=
1007 &qdf24xx_extra_costs
,
1008 &qdf24xx_addrcost_table
,
1009 &qdf24xx_regmove_cost
,
1010 &qdf24xx_vector_cost
,
1011 &generic_branch_cost
,
1012 &generic_approx_modes
,
1013 SVE_NOT_IMPLEMENTED
, /* sve_width */
1014 4, /* memmov_cost */
1016 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1017 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
1018 "16", /* function_align. */
1019 "8", /* jump_align. */
1020 "16", /* loop_align. */
1021 2, /* int_reassoc_width. */
1022 4, /* fp_reassoc_width. */
1023 1, /* vec_reassoc_width. */
1024 2, /* min_div_recip_mul_sf. */
1025 2, /* min_div_recip_mul_df. */
1026 0, /* max_case_values. */
1027 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1028 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS
, /* tune_flags. */
1029 &qdf24xx_prefetch_tune
1032 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1034 static const struct tune_params saphira_tunings
=
1036 &generic_extra_costs
,
1037 &generic_addrcost_table
,
1038 &generic_regmove_cost
,
1039 &generic_vector_cost
,
1040 &generic_branch_cost
,
1041 &generic_approx_modes
,
1042 SVE_NOT_IMPLEMENTED
, /* sve_width */
1043 4, /* memmov_cost */
1045 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1046 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
1047 "16", /* function_align. */
1048 "8", /* jump_align. */
1049 "16", /* loop_align. */
1050 2, /* int_reassoc_width. */
1051 4, /* fp_reassoc_width. */
1052 1, /* vec_reassoc_width. */
1053 2, /* min_div_recip_mul_sf. */
1054 2, /* min_div_recip_mul_df. */
1055 0, /* max_case_values. */
1056 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1057 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1058 &generic_prefetch_tune
1061 static const struct tune_params thunderx2t99_tunings
=
1063 &thunderx2t99_extra_costs
,
1064 &thunderx2t99_addrcost_table
,
1065 &thunderx2t99_regmove_cost
,
1066 &thunderx2t99_vector_cost
,
1067 &generic_branch_cost
,
1068 &generic_approx_modes
,
1069 SVE_NOT_IMPLEMENTED
, /* sve_width */
1070 4, /* memmov_cost. */
1071 4, /* issue_rate. */
1072 (AARCH64_FUSE_CMP_BRANCH
| AARCH64_FUSE_AES_AESMC
1073 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
1074 "16", /* function_align. */
1075 "8", /* jump_align. */
1076 "16", /* loop_align. */
1077 3, /* int_reassoc_width. */
1078 2, /* fp_reassoc_width. */
1079 2, /* vec_reassoc_width. */
1080 2, /* min_div_recip_mul_sf. */
1081 2, /* min_div_recip_mul_df. */
1082 0, /* max_case_values. */
1083 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1084 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1085 &thunderx2t99_prefetch_tune
1088 static const struct tune_params neoversen1_tunings
=
1090 &cortexa57_extra_costs
,
1091 &generic_addrcost_table
,
1092 &generic_regmove_cost
,
1093 &cortexa57_vector_cost
,
1094 &generic_branch_cost
,
1095 &generic_approx_modes
,
1096 SVE_NOT_IMPLEMENTED
, /* sve_width */
1097 4, /* memmov_cost */
1099 AARCH64_FUSE_AES_AESMC
, /* fusible_ops */
1100 "32:16", /* function_align. */
1101 "32:16", /* jump_align. */
1102 "32:16", /* loop_align. */
1103 2, /* int_reassoc_width. */
1104 4, /* fp_reassoc_width. */
1105 2, /* vec_reassoc_width. */
1106 2, /* min_div_recip_mul_sf. */
1107 2, /* min_div_recip_mul_df. */
1108 0, /* max_case_values. */
1109 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1110 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1111 &generic_prefetch_tune
1114 /* Support for fine-grained override of the tuning structures. */
1115 struct aarch64_tuning_override_function
1118 void (*parse_override
)(const char*, struct tune_params
*);
1121 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
1122 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
1123 static void aarch64_parse_sve_width_string (const char*, struct tune_params
*);
1125 static const struct aarch64_tuning_override_function
1126 aarch64_tuning_override_functions
[] =
1128 { "fuse", aarch64_parse_fuse_string
},
1129 { "tune", aarch64_parse_tune_string
},
1130 { "sve_width", aarch64_parse_sve_width_string
},
1134 /* A processor implementing AArch64. */
1137 const char *const name
;
1138 enum aarch64_processor ident
;
1139 enum aarch64_processor sched_core
;
1140 enum aarch64_arch arch
;
1141 unsigned architecture_version
;
1142 const uint64_t flags
;
1143 const struct tune_params
*const tune
;
1146 /* Architectures implementing AArch64. */
1147 static const struct processor all_architectures
[] =
1149 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1150 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1151 #include "aarch64-arches.def"
1152 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1155 /* Processor cores implementing AArch64. */
1156 static const struct processor all_cores
[] =
1158 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1159 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1160 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1161 FLAGS, &COSTS##_tunings},
1162 #include "aarch64-cores.def"
1163 {"generic", generic
, cortexa53
, AARCH64_ARCH_8A
, 8,
1164 AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
1165 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1169 /* Target specification. These are populated by the -march, -mtune, -mcpu
1170 handling code or by target attributes. */
1171 static const struct processor
*selected_arch
;
1172 static const struct processor
*selected_cpu
;
1173 static const struct processor
*selected_tune
;
1175 enum aarch64_key_type aarch64_ra_sign_key
= AARCH64_KEY_A
;
1177 /* The current tuning set. */
1178 struct tune_params aarch64_tune_params
= generic_tunings
;
1180 /* Table of machine attributes. */
1181 static const struct attribute_spec aarch64_attribute_table
[] =
1183 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1184 affects_type_identity, handler, exclude } */
1185 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL
, NULL
},
1186 { NULL
, 0, 0, false, false, false, false, NULL
, NULL
}
1189 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1191 /* An ISA extension in the co-processor and main instruction set space. */
1192 struct aarch64_option_extension
1194 const char *const name
;
1195 const unsigned long flags_on
;
1196 const unsigned long flags_off
;
1199 typedef enum aarch64_cond_code
/* Condition codes in AArch64 encoding order.  Each even/odd pair
   (EQ/NE, CS/CC, MI/PL, ...) are logical inverses of each other.  */
1201 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
1202 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
1203 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
/* Flipping the low bit maps a condition to its logical inverse, per the
   pairing noted above.  */
1207 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1209 struct aarch64_branch_protect_type
1211 /* The type's name that the user passes to the branch-protection option
1214 /* Function to handle the protection type and set global variables.
1215 First argument is the string token corresponding with this type and the
1216 second argument is the next token in the option string.
1218 * AARCH64_PARSE_OK: Handling was successful.
1219 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1220 should print an error.
1221 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1223 enum aarch64_parse_opt_result (*handler
)(char*, char*);
1224 /* A list of types that can follow this type in the option string. */
1225 const aarch64_branch_protect_type
* subtypes
;
/* Number of entries in SUBTYPES.  */
1226 unsigned int num_subtypes
;
1229 static enum aarch64_parse_opt_result
1230 aarch64_handle_no_branch_protection (char* str
, char* rest
)
1232 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
1233 aarch64_enable_bti
= 0;
1236 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1237 return AARCH64_PARSE_INVALID_FEATURE
;
1239 return AARCH64_PARSE_OK
;
1242 static enum aarch64_parse_opt_result
1243 aarch64_handle_standard_branch_protection (char* str
, char* rest
)
1245 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1246 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1247 aarch64_enable_bti
= 1;
1250 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1251 return AARCH64_PARSE_INVALID_FEATURE
;
1253 return AARCH64_PARSE_OK
;
1256 static enum aarch64_parse_opt_result
1257 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED
,
1258 char* rest ATTRIBUTE_UNUSED
)
1260 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1261 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1262 return AARCH64_PARSE_OK
;
1265 static enum aarch64_parse_opt_result
1266 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED
,
1267 char* rest ATTRIBUTE_UNUSED
)
1269 aarch64_ra_sign_scope
= AARCH64_FUNCTION_ALL
;
1270 return AARCH64_PARSE_OK
;
1273 static enum aarch64_parse_opt_result
1274 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED
,
1275 char* rest ATTRIBUTE_UNUSED
)
1277 aarch64_ra_sign_key
= AARCH64_KEY_B
;
1278 return AARCH64_PARSE_OK
;
1281 static enum aarch64_parse_opt_result
1282 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED
,
1283 char* rest ATTRIBUTE_UNUSED
)
1285 aarch64_enable_bti
= 1;
1286 return AARCH64_PARSE_OK
;
/* Subtype tokens that may follow "pac-ret" in a branch-protection string.  */
1289 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes
[] = {
1290 { "leaf", aarch64_handle_pac_ret_leaf
, NULL
, 0 },
1291 { "b-key", aarch64_handle_pac_ret_b_key
, NULL
, 0 },
/* Terminator.  */
1292 { NULL
, NULL
, NULL
, 0 }
/* Top-level branch-protection types, with their handlers and subtype
   tables; only "pac-ret" currently has subtypes.  */
1295 static const struct aarch64_branch_protect_type aarch64_branch_protect_types
[] = {
1296 { "none", aarch64_handle_no_branch_protection
, NULL
, 0 },
1297 { "standard", aarch64_handle_standard_branch_protection
, NULL
, 0 },
1298 { "pac-ret", aarch64_handle_pac_ret_protection
, aarch64_pac_ret_subtypes
,
1299 ARRAY_SIZE (aarch64_pac_ret_subtypes
) },
1300 { "bti", aarch64_handle_bti_protection
, NULL
, 0 },
/* Terminator.  */
1301 { NULL
, NULL
, NULL
, 0 }
1304 /* The condition codes of the processor, and the inverse function. */
/* Indexed by aarch64_cond_code; AARCH64_INVERSE_CONDITION_CODE relies on
   the even/odd pairing of inverse conditions in this order.  */
1305 static const char * const aarch64_condition_codes
[] =
1307 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1308 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1311 /* Generate code to enable conditional branches in functions over 1 MiB. */
1313 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
1314 const char * branch_format
)
1316 rtx_code_label
* tmp_label
= gen_label_rtx ();
1317 char label_buf
[256];
1319 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
1320 CODE_LABEL_NUMBER (tmp_label
));
1321 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
1322 rtx dest_label
= operands
[pos_label
];
1323 operands
[pos_label
] = tmp_label
;
1325 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
1326 output_asm_insn (buffer
, operands
);
1328 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
1329 operands
[pos_label
] = dest_label
;
1330 output_asm_insn (buffer
, operands
);
1335 aarch64_err_no_fpadvsimd (machine_mode mode
)
1337 if (TARGET_GENERAL_REGS_ONLY
)
1338 if (FLOAT_MODE_P (mode
))
1339 error ("%qs is incompatible with the use of floating-point types",
1340 "-mgeneral-regs-only");
1342 error ("%qs is incompatible with the use of vector types",
1343 "-mgeneral-regs-only");
1345 if (FLOAT_MODE_P (mode
))
1346 error ("%qs feature modifier is incompatible with the use of"
1347 " floating-point types", "+nofp");
1349 error ("%qs feature modifier is incompatible with the use of"
1350 " vector types", "+nofp");
1353 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1354 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1355 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1356 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1357 and GENERAL_REGS is lower than the memory cost (in this case the best class
1358 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1359 cost results in bad allocations with many redundant int<->FP moves which
1360 are expensive on various cores.
1361 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1362 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1363 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1364 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1365 The result of this is that it is no longer inefficient to have a higher
1366 memory move cost than the register move cost.
1370 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
1371 reg_class_t best_class
)
1375 if (!reg_class_subset_p (GENERAL_REGS
, allocno_class
)
1376 || !reg_class_subset_p (FP_REGS
, allocno_class
))
1377 return allocno_class
;
1379 if (!reg_class_subset_p (GENERAL_REGS
, best_class
)
1380 || !reg_class_subset_p (FP_REGS
, best_class
))
1383 mode
= PSEUDO_REGNO_MODE (regno
);
1384 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
1388 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1390 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1391 return aarch64_tune_params
.min_div_recip_mul_sf
;
1392 return aarch64_tune_params
.min_div_recip_mul_df
;
1395 /* Return the reassociation width of treeop OPC with mode MODE. */
1397 aarch64_reassociation_width (unsigned opc
, machine_mode mode
)
1399 if (VECTOR_MODE_P (mode
))
1400 return aarch64_tune_params
.vec_reassoc_width
;
1401 if (INTEGRAL_MODE_P (mode
))
1402 return aarch64_tune_params
.int_reassoc_width
;
1403 /* Avoid reassociating floating point addition so we emit more FMAs. */
1404 if (FLOAT_MODE_P (mode
) && opc
!= PLUS_EXPR
)
1405 return aarch64_tune_params
.fp_reassoc_width
;
1409 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1411 aarch64_dbx_register_number (unsigned regno
)
1413 if (GP_REGNUM_P (regno
))
1414 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1415 else if (regno
== SP_REGNUM
)
1416 return AARCH64_DWARF_SP
;
1417 else if (FP_REGNUM_P (regno
))
1418 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1419 else if (PR_REGNUM_P (regno
))
1420 return AARCH64_DWARF_P0
+ regno
- P0_REGNUM
;
1421 else if (regno
== VG_REGNUM
)
1422 return AARCH64_DWARF_VG
;
1424 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1425 equivalent DWARF register. */
1426 return DWARF_FRAME_REGISTERS
;
1429 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1431 aarch64_advsimd_struct_mode_p (machine_mode mode
)
1434 && (mode
== OImode
|| mode
== CImode
|| mode
== XImode
));
1437 /* Return true if MODE is an SVE predicate mode. */
1439 aarch64_sve_pred_mode_p (machine_mode mode
)
1442 && (mode
== VNx16BImode
1443 || mode
== VNx8BImode
1444 || mode
== VNx4BImode
1445 || mode
== VNx2BImode
));
1448 /* Three mutually-exclusive flags describing a vector or predicate type. */
/* Bitmask flags combined and returned by aarch64_classify_vector_mode.  */
1449 const unsigned int VEC_ADVSIMD
= 1;
1450 const unsigned int VEC_SVE_DATA
= 2;
1451 const unsigned int VEC_SVE_PRED
= 4;
1452 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1453 a structure of 2, 3 or 4 vectors. */
1454 const unsigned int VEC_STRUCT
= 8;
1455 /* Useful combinations of the above. */
1456 const unsigned int VEC_ANY_SVE
= VEC_SVE_DATA
| VEC_SVE_PRED
;
1457 const unsigned int VEC_ANY_DATA
= VEC_ADVSIMD
| VEC_SVE_DATA
;
1459 /* Return a set of flags describing the vector properties of mode MODE.
1460 Ignore modes that are not supported by the current target. */
1462 aarch64_classify_vector_mode (machine_mode mode
)
1464 if (aarch64_advsimd_struct_mode_p (mode
))
1465 return VEC_ADVSIMD
| VEC_STRUCT
;
1467 if (aarch64_sve_pred_mode_p (mode
))
1468 return VEC_SVE_PRED
;
1470 scalar_mode inner
= GET_MODE_INNER (mode
);
1471 if (VECTOR_MODE_P (mode
)
1478 || inner
== DFmode
))
1482 if (known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
))
1483 return VEC_SVE_DATA
;
1484 if (known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 2)
1485 || known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 3)
1486 || known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 4))
1487 return VEC_SVE_DATA
| VEC_STRUCT
;
1490 /* This includes V1DF but not V1DI (which doesn't exist). */
1492 && (known_eq (GET_MODE_BITSIZE (mode
), 64)
1493 || known_eq (GET_MODE_BITSIZE (mode
), 128)))
1500 /* Return true if MODE is any of the data vector modes, including
1503 aarch64_vector_data_mode_p (machine_mode mode
)
1505 return aarch64_classify_vector_mode (mode
) & VEC_ANY_DATA
;
1508 /* Return true if MODE is an SVE data vector mode; either a single vector
1509 or a structure of vectors. */
1511 aarch64_sve_data_mode_p (machine_mode mode
)
1513 return aarch64_classify_vector_mode (mode
) & VEC_SVE_DATA
;
1516 /* Implement target hook TARGET_ARRAY_MODE. */
1517 static opt_machine_mode
1518 aarch64_array_mode (machine_mode mode
, unsigned HOST_WIDE_INT nelems
)
1520 if (aarch64_classify_vector_mode (mode
) == VEC_SVE_DATA
1521 && IN_RANGE (nelems
, 2, 4))
1522 return mode_for_vector (GET_MODE_INNER (mode
),
1523 GET_MODE_NUNITS (mode
) * nelems
);
1525 return opt_machine_mode ();
1528 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1530 aarch64_array_mode_supported_p (machine_mode mode
,
1531 unsigned HOST_WIDE_INT nelems
)
1534 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1535 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1536 && (nelems
>= 2 && nelems
<= 4))
1542 /* Return the SVE predicate mode to use for elements that have
1543 ELEM_NBYTES bytes, if such a mode exists. */
1546 aarch64_sve_pred_mode (unsigned int elem_nbytes
)
1550 if (elem_nbytes
== 1)
1552 if (elem_nbytes
== 2)
1554 if (elem_nbytes
== 4)
1556 if (elem_nbytes
== 8)
1559 return opt_machine_mode ();
1562 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1564 static opt_machine_mode
1565 aarch64_get_mask_mode (poly_uint64 nunits
, poly_uint64 nbytes
)
1567 if (TARGET_SVE
&& known_eq (nbytes
, BYTES_PER_SVE_VECTOR
))
1569 unsigned int elem_nbytes
= vector_element_size (nbytes
, nunits
);
1570 machine_mode pred_mode
;
1571 if (aarch64_sve_pred_mode (elem_nbytes
).exists (&pred_mode
))
1575 return default_get_mask_mode (nunits
, nbytes
);
1578 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1579 prefer to use the first arithmetic operand as the else value if
1580 the else value doesn't matter, since that exactly matches the SVE
1581 destructive merging form. For ternary operations we could either
1582 pick the first operand and use FMAD-like instructions or the last
1583 operand and use FMLA-like instructions; the latter seems more
1587 aarch64_preferred_else_value (unsigned, tree
, unsigned int nops
, tree
*ops
)
1589 return nops
== 3 ? ops
[2] : ops
[0];
1592 /* Implement TARGET_HARD_REGNO_NREGS. */
1595 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1597 /* ??? Logically we should only need to provide a value when
1598 HARD_REGNO_MODE_OK says that the combination is valid,
1599 but at the moment we need to handle all modes. Just ignore
1600 any runtime parts for registers that can't store them. */
1601 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
1602 switch (aarch64_regno_regclass (regno
))
1606 if (aarch64_sve_data_mode_p (mode
))
1607 return exact_div (GET_MODE_SIZE (mode
),
1608 BYTES_PER_SVE_VECTOR
).to_constant ();
1609 return CEIL (lowest_size
, UNITS_PER_VREG
);
1615 return CEIL (lowest_size
, UNITS_PER_WORD
);
1620 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1623 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
1625 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1626 return regno
== CC_REGNUM
;
1628 if (regno
== VG_REGNUM
)
1629 /* This must have the same size as _Unwind_Word. */
1630 return mode
== DImode
;
1632 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1633 if (vec_flags
& VEC_SVE_PRED
)
1634 return PR_REGNUM_P (regno
);
1636 if (PR_REGNUM_P (regno
))
1639 if (regno
== SP_REGNUM
)
1640 /* The purpose of comparing with ptr_mode is to support the
1641 global register variable associated with the stack pointer
1642 register via the syntax of asm ("wsp") in ILP32. */
1643 return mode
== Pmode
|| mode
== ptr_mode
;
1645 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
1646 return mode
== Pmode
;
1648 if (GP_REGNUM_P (regno
))
1650 if (known_le (GET_MODE_SIZE (mode
), 8))
1652 else if (known_le (GET_MODE_SIZE (mode
), 16))
1653 return (regno
& 1) == 0;
1655 else if (FP_REGNUM_P (regno
))
1657 if (vec_flags
& VEC_STRUCT
)
1658 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
1660 return !VECTOR_MODE_P (mode
) || vec_flags
!= 0;
1666 /* Return true if this is a definition of a vectorized simd function. */
1669 aarch64_simd_decl_p (tree fndecl
)
1675 fntype
= TREE_TYPE (fndecl
);
1679 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1680 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype
)) != NULL
)
1686 /* Return the mode a register save/restore should use. DImode for integer
1687 registers, DFmode for FP registers in non-SIMD functions (they only save
1688 the bottom half of a 128 bit register), or TFmode for FP registers in
1692 aarch64_reg_save_mode (tree fndecl
, unsigned regno
)
1694 return GP_REGNUM_P (regno
)
1696 : (aarch64_simd_decl_p (fndecl
) ? E_TFmode
: E_DFmode
);
1699 /* Return true if the instruction is a call to a SIMD function, false
1700 if it is not a SIMD function or if we do not know anything about
1704 aarch64_simd_call_p (rtx_insn
*insn
)
1710 gcc_assert (CALL_P (insn
));
1711 call
= get_call_rtx_from (insn
);
1712 symbol
= XEXP (XEXP (call
, 0), 0);
1713 if (GET_CODE (symbol
) != SYMBOL_REF
)
1715 fndecl
= SYMBOL_REF_DECL (symbol
);
1719 return aarch64_simd_decl_p (fndecl
);
1722 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1723 a function that uses the SIMD ABI, take advantage of the extra
1724 call-preserved registers that the ABI provides. */
1727 aarch64_remove_extra_call_preserved_regs (rtx_insn
*insn
,
1728 HARD_REG_SET
*return_set
)
1730 if (aarch64_simd_call_p (insn
))
1732 for (int regno
= 0; regno
< FIRST_PSEUDO_REGISTER
; regno
++)
1733 if (FP_SIMD_SAVED_REGNUM_P (regno
))
1734 CLEAR_HARD_REG_BIT (*return_set
, regno
);
1738 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1739 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1740 clobbers the top 64 bits when restoring the bottom 64 bits. */
1743 aarch64_hard_regno_call_part_clobbered (rtx_insn
*insn
, unsigned int regno
,
1746 bool simd_p
= insn
&& CALL_P (insn
) && aarch64_simd_call_p (insn
);
1747 return FP_REGNUM_P (regno
)
1748 && maybe_gt (GET_MODE_SIZE (mode
), simd_p
? 16 : 8);
1751 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1754 aarch64_return_call_with_max_clobbers (rtx_insn
*call_1
, rtx_insn
*call_2
)
1756 gcc_assert (CALL_P (call_1
) && CALL_P (call_2
));
1758 if (!aarch64_simd_call_p (call_1
) || aarch64_simd_call_p (call_2
))
1764 /* Implement REGMODE_NATURAL_SIZE. */
1766 aarch64_regmode_natural_size (machine_mode mode
)
1768 /* The natural size for SVE data modes is one SVE data vector,
1769 and similarly for predicates. We can't independently modify
1770 anything smaller than that. */
1771 /* ??? For now, only do this for variable-width SVE registers.
1772 Doing it for constant-sized registers breaks lower-subreg.c. */
1773 /* ??? And once that's fixed, we should probably have similar
1774 code for Advanced SIMD. */
1775 if (!aarch64_sve_vg
.is_constant ())
1777 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1778 if (vec_flags
& VEC_SVE_PRED
)
1779 return BYTES_PER_SVE_PRED
;
1780 if (vec_flags
& VEC_SVE_DATA
)
1781 return BYTES_PER_SVE_VECTOR
;
1783 return UNITS_PER_WORD
;
1786 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1788 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned,
1791 /* The predicate mode determines which bits are significant and
1792 which are "don't care". Decreasing the number of lanes would
1793 lose data while increasing the number of lanes would make bits
1794 unnecessarily significant. */
1795 if (PR_REGNUM_P (regno
))
1797 if (known_ge (GET_MODE_SIZE (mode
), 4))
1803 /* Return true if I's bits are consecutive ones from the MSB. */
1805 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i
)
1807 return exact_log2 (-i
) != HOST_WIDE_INT_M1
;
1810 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1811 that strcpy from constants will be faster. */
1813 static HOST_WIDE_INT
1814 aarch64_constant_alignment (const_tree exp
, HOST_WIDE_INT align
)
1816 if (TREE_CODE (exp
) == STRING_CST
&& !optimize_size
)
1817 return MAX (align
, BITS_PER_WORD
);
1821 /* Return true if calls to DECL should be treated as
1822 long-calls (ie called via a register). */
1824 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
1829 /* Return true if calls to symbol-ref SYM should be treated as
1830 long-calls (ie called via a register). */
1832 aarch64_is_long_call_p (rtx sym
)
1834 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
1837 /* Return true if calls to symbol-ref SYM should not go through
1841 aarch64_is_noplt_call_p (rtx sym
)
1843 const_tree decl
= SYMBOL_REF_DECL (sym
);
1848 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
1849 && !targetm
.binds_local_p (decl
))
1855 /* Return true if the offsets to a zero/sign-extract operation
1856 represent an expression that matches an extend operation. The
1857 operands represent the parameters from
1859 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1861 aarch64_is_extend_from_extract (scalar_int_mode mode
, rtx mult_imm
,
1864 HOST_WIDE_INT mult_val
, extract_val
;
1866 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
1869 mult_val
= INTVAL (mult_imm
);
1870 extract_val
= INTVAL (extract_imm
);
1873 && extract_val
< GET_MODE_BITSIZE (mode
)
1874 && exact_log2 (extract_val
& ~7) > 0
1875 && (extract_val
& 7) <= 4
1876 && mult_val
== (1 << (extract_val
& 7)))
1882 /* Emit an insn that's a simple single-set. Both the operands must be
1883 known to be valid. */
1884 inline static rtx_insn
*
1885 emit_set_insn (rtx x
, rtx y
)
1887 return emit_insn (gen_rtx_SET (x
, y
));
1890 /* X and Y are two things to compare using CODE. Emit the compare insn and
1891 return the rtx for register 0 in the proper mode. */
1893 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
1895 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
1896 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
1898 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
1902 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1905 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code
, rtx x
, rtx y
,
1906 machine_mode y_mode
)
1908 if (y_mode
== E_QImode
|| y_mode
== E_HImode
)
1910 if (CONST_INT_P (y
))
1911 y
= GEN_INT (INTVAL (y
) & GET_MODE_MASK (y_mode
));
1915 machine_mode cc_mode
;
1917 t
= gen_rtx_ZERO_EXTEND (SImode
, y
);
1918 t
= gen_rtx_COMPARE (CC_SWPmode
, t
, x
);
1919 cc_mode
= CC_SWPmode
;
1920 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
1921 emit_set_insn (cc_reg
, t
);
1926 return aarch64_gen_compare_reg (code
, x
, y
);
1929 /* Build the SYMBOL_REF for __tls_get_addr. */
1931 static GTY(()) rtx tls_get_addr_libfunc
;
1934 aarch64_tls_get_addr (void)
1936 if (!tls_get_addr_libfunc
)
1937 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
1938 return tls_get_addr_libfunc
;
1941 /* Return the TLS model to use for ADDR. */
1943 static enum tls_model
1944 tls_symbolic_operand_type (rtx addr
)
1946 enum tls_model tls_kind
= TLS_MODEL_NONE
;
1947 if (GET_CODE (addr
) == CONST
)
1950 rtx sym
= strip_offset (addr
, &addend
);
1951 if (GET_CODE (sym
) == SYMBOL_REF
)
1952 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
1954 else if (GET_CODE (addr
) == SYMBOL_REF
)
1955 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
1960 /* We'll allow lo_sum's in addresses in our legitimate addresses
1961 so that combine would take care of combining addresses where
1962 necessary, but for generation purposes, we'll generate the address
1965 tmp = hi (symbol_ref); adrp x1, foo
1966 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1970 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1971 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1975 Load TLS symbol, depending on TLS mechanism and TLS access model.
1977 Global Dynamic - Traditional TLS:
1978 adrp tmp, :tlsgd:imm
1979 add dest, tmp, #:tlsgd_lo12:imm
1982 Global Dynamic - TLS Descriptors:
1983 adrp dest, :tlsdesc:imm
1984 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1985 add dest, dest, #:tlsdesc_lo12:imm
1992 adrp tmp, :gottprel:imm
1993 ldr dest, [tmp, #:gottprel_lo12:imm]
1998 add t0, tp, #:tprel_hi12:imm, lsl #12
1999 add t0, t0, #:tprel_lo12_nc:imm
2003 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
2004 enum aarch64_symbol_type type
)
2008 case SYMBOL_SMALL_ABSOLUTE
:
2010 /* In ILP32, the mode of dest can be either SImode or DImode. */
2012 machine_mode mode
= GET_MODE (dest
);
2014 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2016 if (can_create_pseudo_p ())
2017 tmp_reg
= gen_reg_rtx (mode
);
2019 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2020 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
2024 case SYMBOL_TINY_ABSOLUTE
:
2025 emit_insn (gen_rtx_SET (dest
, imm
));
2028 case SYMBOL_SMALL_GOT_28K
:
2030 machine_mode mode
= GET_MODE (dest
);
2031 rtx gp_rtx
= pic_offset_table_rtx
;
2035 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2036 here before rtl expand. Tree IVOPT will generate rtl pattern to
2037 decide rtx costs, in which case pic_offset_table_rtx is not
2038 initialized. For that case no need to generate the first adrp
2039 instruction as the final cost for global variable access is
2043 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
2044 using the page base as GOT base, the first page may be wasted,
2045 in the worst scenario, there is only 28K space for GOT).
2047 The generated instruction sequence for accessing global variable
2050 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2052 Only one instruction needed. But we must initialize
2053 pic_offset_table_rtx properly. We generate initialize insn for
2054 every global access, and allow CSE to remove all redundant.
2056 The final instruction sequences will look like the following
2057 for multiply global variables access.
2059 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2061 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2062 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2063 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2066 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
2067 crtl
->uses_pic_offset_table
= 1;
2068 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
2070 if (mode
!= GET_MODE (gp_rtx
))
2071 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
2075 if (mode
== ptr_mode
)
2078 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
2080 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
2082 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2086 gcc_assert (mode
== Pmode
);
2088 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
2089 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2092 /* The operand is expected to be MEM. Whenever the related insn
2093 pattern changed, above code which calculate mem should be
2095 gcc_assert (GET_CODE (mem
) == MEM
);
2096 MEM_READONLY_P (mem
) = 1;
2097 MEM_NOTRAP_P (mem
) = 1;
2102 case SYMBOL_SMALL_GOT_4G
:
2104 /* In ILP32, the mode of dest can be either SImode or DImode,
2105 while the got entry is always of SImode size. The mode of
2106 dest depends on how dest is used: if dest is assigned to a
2107 pointer (e.g. in the memory), it has SImode; it may have
2108 DImode if dest is dereferenced to access the memory.
2109 This is why we have to handle three different ldr_got_small
2110 patterns here (two patterns for ILP32). */
2115 machine_mode mode
= GET_MODE (dest
);
2117 if (can_create_pseudo_p ())
2118 tmp_reg
= gen_reg_rtx (mode
);
2120 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2121 if (mode
== ptr_mode
)
2124 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
2126 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
2128 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2132 gcc_assert (mode
== Pmode
);
2134 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
2135 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2138 gcc_assert (GET_CODE (mem
) == MEM
);
2139 MEM_READONLY_P (mem
) = 1;
2140 MEM_NOTRAP_P (mem
) = 1;
2145 case SYMBOL_SMALL_TLSGD
:
2148 machine_mode mode
= GET_MODE (dest
);
2149 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
2153 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
2155 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
2156 insns
= get_insns ();
2159 RTL_CONST_CALL_P (insns
) = 1;
2160 emit_libcall_block (insns
, dest
, result
, imm
);
2164 case SYMBOL_SMALL_TLSDESC
:
2166 machine_mode mode
= GET_MODE (dest
);
2167 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
2170 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2172 /* In ILP32, the got entry is always of SImode size. Unlike
2173 small GOT, the dest is fixed at reg 0. */
2175 emit_insn (gen_tlsdesc_small_si (imm
));
2177 emit_insn (gen_tlsdesc_small_di (imm
));
2178 tp
= aarch64_load_tp (NULL
);
2181 tp
= gen_lowpart (mode
, tp
);
2183 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
2185 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2189 case SYMBOL_SMALL_TLSIE
:
2191 /* In ILP32, the mode of dest can be either SImode or DImode,
2192 while the got entry is always of SImode size. The mode of
2193 dest depends on how dest is used: if dest is assigned to a
2194 pointer (e.g. in the memory), it has SImode; it may have
2195 DImode if dest is dereferenced to access the memory.
2196 This is why we have to handle three different tlsie_small
2197 patterns here (two patterns for ILP32). */
2198 machine_mode mode
= GET_MODE (dest
);
2199 rtx tmp_reg
= gen_reg_rtx (mode
);
2200 rtx tp
= aarch64_load_tp (NULL
);
2202 if (mode
== ptr_mode
)
2205 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
2208 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
2209 tp
= gen_lowpart (mode
, tp
);
2214 gcc_assert (mode
== Pmode
);
2215 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
2218 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
2220 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2224 case SYMBOL_TLSLE12
:
2225 case SYMBOL_TLSLE24
:
2226 case SYMBOL_TLSLE32
:
2227 case SYMBOL_TLSLE48
:
2229 machine_mode mode
= GET_MODE (dest
);
2230 rtx tp
= aarch64_load_tp (NULL
);
2233 tp
= gen_lowpart (mode
, tp
);
2237 case SYMBOL_TLSLE12
:
2238 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
2241 case SYMBOL_TLSLE24
:
2242 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
2245 case SYMBOL_TLSLE32
:
2246 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
2248 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2251 case SYMBOL_TLSLE48
:
2252 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
2254 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2262 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2266 case SYMBOL_TINY_GOT
:
2267 emit_insn (gen_ldr_got_tiny (dest
, imm
));
2270 case SYMBOL_TINY_TLSIE
:
2272 machine_mode mode
= GET_MODE (dest
);
2273 rtx tp
= aarch64_load_tp (NULL
);
2275 if (mode
== ptr_mode
)
2278 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
2281 tp
= gen_lowpart (mode
, tp
);
2282 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
2287 gcc_assert (mode
== Pmode
);
2288 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
2292 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2301 /* Emit a move from SRC to DEST. Assume that the move expanders can
2302 handle all moves if !can_create_pseudo_p (). The distinction is
2303 important because, unlike emit_move_insn, the move expanders know
2304 how to force Pmode objects into the constant pool even when the
2305 constant pool address is not itself legitimate. */
2307 aarch64_emit_move (rtx dest
, rtx src
)
2309 return (can_create_pseudo_p ()
2310 ? emit_move_insn (dest
, src
)
2311 : emit_move_insn_1 (dest
, src
));
2314 /* Apply UNOPTAB to OP and store the result in DEST. */
2317 aarch64_emit_unop (rtx dest
, optab unoptab
, rtx op
)
2319 rtx tmp
= expand_unop (GET_MODE (dest
), unoptab
, op
, dest
, 0);
2321 emit_move_insn (dest
, tmp
);
2324 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2327 aarch64_emit_binop (rtx dest
, optab binoptab
, rtx op0
, rtx op1
)
2329 rtx tmp
= expand_binop (GET_MODE (dest
), binoptab
, op0
, op1
, dest
, 0,
2332 emit_move_insn (dest
, tmp
);
2335 /* Split a 128-bit move operation into two 64-bit move operations,
2336 taking care to handle partial overlap of register to register
2337 copies. Special cases are needed when moving between GP regs and
2338 FP regs. SRC can be a register, constant or memory; DST a register
2339 or memory. If either operand is memory it must not have any side
2342 aarch64_split_128bit_move (rtx dst
, rtx src
)
2347 machine_mode mode
= GET_MODE (dst
);
2349 gcc_assert (mode
== TImode
|| mode
== TFmode
);
2350 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
2351 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
2353 if (REG_P (dst
) && REG_P (src
))
2355 int src_regno
= REGNO (src
);
2356 int dst_regno
= REGNO (dst
);
2358 /* Handle FP <-> GP regs. */
2359 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
2361 src_lo
= gen_lowpart (word_mode
, src
);
2362 src_hi
= gen_highpart (word_mode
, src
);
2364 emit_insn (gen_aarch64_movlow_di (mode
, dst
, src_lo
));
2365 emit_insn (gen_aarch64_movhigh_di (mode
, dst
, src_hi
));
2368 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
2370 dst_lo
= gen_lowpart (word_mode
, dst
);
2371 dst_hi
= gen_highpart (word_mode
, dst
);
2373 emit_insn (gen_aarch64_movdi_low (mode
, dst_lo
, src
));
2374 emit_insn (gen_aarch64_movdi_high (mode
, dst_hi
, src
));
2379 dst_lo
= gen_lowpart (word_mode
, dst
);
2380 dst_hi
= gen_highpart (word_mode
, dst
);
2381 src_lo
= gen_lowpart (word_mode
, src
);
2382 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
2384 /* At most one pairing may overlap. */
2385 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
2387 aarch64_emit_move (dst_hi
, src_hi
);
2388 aarch64_emit_move (dst_lo
, src_lo
);
2392 aarch64_emit_move (dst_lo
, src_lo
);
2393 aarch64_emit_move (dst_hi
, src_hi
);
2398 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
2400 return (! REG_P (src
)
2401 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
2404 /* Split a complex SIMD combine. */
2407 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
2409 machine_mode src_mode
= GET_MODE (src1
);
2410 machine_mode dst_mode
= GET_MODE (dst
);
2412 gcc_assert (VECTOR_MODE_P (dst_mode
));
2413 gcc_assert (register_operand (dst
, dst_mode
)
2414 && register_operand (src1
, src_mode
)
2415 && register_operand (src2
, src_mode
));
2417 emit_insn (gen_aarch64_simd_combine (src_mode
, dst
, src1
, src2
));
2421 /* Split a complex SIMD move. */
2424 aarch64_split_simd_move (rtx dst
, rtx src
)
2426 machine_mode src_mode
= GET_MODE (src
);
2427 machine_mode dst_mode
= GET_MODE (dst
);
2429 gcc_assert (VECTOR_MODE_P (dst_mode
));
2431 if (REG_P (dst
) && REG_P (src
))
2433 gcc_assert (VECTOR_MODE_P (src_mode
));
2434 emit_insn (gen_aarch64_split_simd_mov (src_mode
, dst
, src
));
2439 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
2440 machine_mode ymode
, rtx y
)
2442 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
2443 gcc_assert (r
!= NULL
);
2444 return rtx_equal_p (x
, r
);
2449 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
2451 if (can_create_pseudo_p ())
2452 return force_reg (mode
, value
);
2456 aarch64_emit_move (x
, value
);
2461 /* Return true if we can move VALUE into a register using a single
2462 CNT[BHWD] instruction. */
2465 aarch64_sve_cnt_immediate_p (poly_int64 value
)
2467 HOST_WIDE_INT factor
= value
.coeffs
[0];
2468 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2469 return (value
.coeffs
[1] == factor
2470 && IN_RANGE (factor
, 2, 16 * 16)
2471 && (factor
& 1) == 0
2472 && factor
<= 16 * (factor
& -factor
));
2475 /* Likewise for rtx X. */
2478 aarch64_sve_cnt_immediate_p (rtx x
)
2481 return poly_int_rtx_p (x
, &value
) && aarch64_sve_cnt_immediate_p (value
);
2484 /* Return the asm string for an instruction with a CNT-like vector size
2485 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2486 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2487 first part of the operands template (the part that comes before the
2488 vector size itself). FACTOR is the number of quadwords.
2489 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2490 If it is zero, we can use any element size. */
2493 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2494 unsigned int factor
,
2495 unsigned int nelts_per_vq
)
2497 static char buffer
[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2499 if (nelts_per_vq
== 0)
2500 /* There is some overlap in the ranges of the four CNT instructions.
2501 Here we always use the smallest possible element size, so that the
2502 multiplier is 1 whereever possible. */
2503 nelts_per_vq
= factor
& -factor
;
2504 int shift
= std::min (exact_log2 (nelts_per_vq
), 4);
2505 gcc_assert (IN_RANGE (shift
, 1, 4));
2506 char suffix
= "dwhb"[shift
- 1];
2509 unsigned int written
;
2511 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s",
2512 prefix
, suffix
, operands
);
2514 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, all, mul #%d",
2515 prefix
, suffix
, operands
, factor
);
2516 gcc_assert (written
< sizeof (buffer
));
2520 /* Return the asm string for an instruction with a CNT-like vector size
2521 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2522 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2523 first part of the operands template (the part that comes before the
2524 vector size itself). X is the value of the vector size operand,
2525 as a polynomial integer rtx. */
2528 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2531 poly_int64 value
= rtx_to_poly_int64 (x
);
2532 gcc_assert (aarch64_sve_cnt_immediate_p (value
));
2533 return aarch64_output_sve_cnt_immediate (prefix
, operands
,
2534 value
.coeffs
[1], 0);
2537 /* Return true if we can add VALUE to a register using a single ADDVL
2538 or ADDPL instruction. */
2541 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value
)
2543 HOST_WIDE_INT factor
= value
.coeffs
[0];
2544 if (factor
== 0 || value
.coeffs
[1] != factor
)
2546 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2547 and a value of 16 is one vector width. */
2548 return (((factor
& 15) == 0 && IN_RANGE (factor
, -32 * 16, 31 * 16))
2549 || ((factor
& 1) == 0 && IN_RANGE (factor
, -32 * 2, 31 * 2)));
2552 /* Likewise for rtx X. */
2555 aarch64_sve_addvl_addpl_immediate_p (rtx x
)
2558 return (poly_int_rtx_p (x
, &value
)
2559 && aarch64_sve_addvl_addpl_immediate_p (value
));
2562 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2563 and storing the result in operand 0. */
2566 aarch64_output_sve_addvl_addpl (rtx dest
, rtx base
, rtx offset
)
2568 static char buffer
[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2569 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
2570 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value
));
2572 /* Use INC or DEC if possible. */
2573 if (rtx_equal_p (dest
, base
) && GP_REGNUM_P (REGNO (dest
)))
2575 if (aarch64_sve_cnt_immediate_p (offset_value
))
2576 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2577 offset_value
.coeffs
[1], 0);
2578 if (aarch64_sve_cnt_immediate_p (-offset_value
))
2579 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2580 -offset_value
.coeffs
[1], 0);
2583 int factor
= offset_value
.coeffs
[1];
2584 if ((factor
& 15) == 0)
2585 snprintf (buffer
, sizeof (buffer
), "addvl\t%%x0, %%x1, #%d", factor
/ 16);
2587 snprintf (buffer
, sizeof (buffer
), "addpl\t%%x0, %%x1, #%d", factor
/ 2);
2591 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2592 instruction. If it is, store the number of elements in each vector
2593 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2594 factor in *FACTOR_OUT (if nonnull). */
2597 aarch64_sve_inc_dec_immediate_p (rtx x
, int *factor_out
,
2598 unsigned int *nelts_per_vq_out
)
2603 if (!const_vec_duplicate_p (x
, &elt
)
2604 || !poly_int_rtx_p (elt
, &value
))
2607 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x
));
2608 if (nelts_per_vq
!= 8 && nelts_per_vq
!= 4 && nelts_per_vq
!= 2)
2609 /* There's no vector INCB. */
2612 HOST_WIDE_INT factor
= value
.coeffs
[0];
2613 if (value
.coeffs
[1] != factor
)
2616 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2617 if ((factor
% nelts_per_vq
) != 0
2618 || !IN_RANGE (abs (factor
), nelts_per_vq
, 16 * nelts_per_vq
))
2622 *factor_out
= factor
;
2623 if (nelts_per_vq_out
)
2624 *nelts_per_vq_out
= nelts_per_vq
;
2628 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2632 aarch64_sve_inc_dec_immediate_p (rtx x
)
2634 return aarch64_sve_inc_dec_immediate_p (x
, NULL
, NULL
);
2637 /* Return the asm template for an SVE vector INC or DEC instruction.
2638 OPERANDS gives the operands before the vector count and X is the
2639 value of the vector count operand itself. */
2642 aarch64_output_sve_inc_dec_immediate (const char *operands
, rtx x
)
2645 unsigned int nelts_per_vq
;
2646 if (!aarch64_sve_inc_dec_immediate_p (x
, &factor
, &nelts_per_vq
))
2649 return aarch64_output_sve_cnt_immediate ("dec", operands
, -factor
,
2652 return aarch64_output_sve_cnt_immediate ("inc", operands
, factor
,
2657 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
2658 scalar_int_mode mode
)
2661 unsigned HOST_WIDE_INT val
, val2
, mask
;
2662 int one_match
, zero_match
;
2667 if (aarch64_move_imm (val
, mode
))
2670 emit_insn (gen_rtx_SET (dest
, imm
));
2674 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2675 (with XXXX non-zero). In that case check to see if the move can be done in
2677 val2
= val
& 0xffffffff;
2679 && aarch64_move_imm (val2
, SImode
)
2680 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
2683 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
2685 /* Check if we have to emit a second instruction by checking to see
2686 if any of the upper 32 bits of the original DI mode value is set. */
2690 i
= (val
>> 48) ? 48 : 32;
2693 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2694 GEN_INT ((val
>> i
) & 0xffff)));
2699 if ((val
>> 32) == 0 || mode
== SImode
)
2703 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
2705 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
2706 GEN_INT ((val
>> 16) & 0xffff)));
2708 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
2709 GEN_INT ((val
>> 16) & 0xffff)));
2714 /* Remaining cases are all for DImode. */
2717 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
2718 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
2719 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
2720 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
2722 if (zero_match
!= 2 && one_match
!= 2)
2724 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2725 For a 64-bit bitmask try whether changing 16 bits to all ones or
2726 zeroes creates a valid bitmask. To check any repeated bitmask,
2727 try using 16 bits from the other 32-bit half of val. */
2729 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
2732 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2735 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2737 val2
= val2
& ~mask
;
2738 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
2739 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2746 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
2747 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2748 GEN_INT ((val
>> i
) & 0xffff)));
2754 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2755 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2756 otherwise skip zero bits. */
2760 val2
= one_match
> zero_match
? ~val
: val
;
2761 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
2764 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
2765 ? (val
| ~(mask
<< i
))
2766 : (val
& (mask
<< i
)))));
2767 for (i
+= 16; i
< 64; i
+= 16)
2769 if ((val2
& (mask
<< i
)) == 0)
2772 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2773 GEN_INT ((val
>> i
) & 0xffff)));
2780 /* Return whether imm is a 128-bit immediate which is simple enough to
2783 aarch64_mov128_immediate (rtx imm
)
2785 if (GET_CODE (imm
) == CONST_INT
)
2788 gcc_assert (CONST_WIDE_INT_NUNITS (imm
) == 2);
2790 rtx lo
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 0));
2791 rtx hi
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 1));
2793 return aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, DImode
)
2794 + aarch64_internal_mov_immediate (NULL_RTX
, hi
, false, DImode
) <= 4;
2798 /* Return the number of temporary registers that aarch64_add_offset_1
2799 would need to add OFFSET to a register. */
2802 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset
)
2804 return abs_hwi (offset
) < 0x1000000 ? 0 : 1;
2807 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2808 a non-polynomial OFFSET. MODE is the mode of the addition.
2809 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2810 be set and CFA adjustments added to the generated instructions.
2812 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2813 temporary if register allocation is already complete. This temporary
2814 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2815 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2816 the immediate again.
2818 Since this function may be used to adjust the stack pointer, we must
2819 ensure that it cannot cause transient stack deallocation (for example
2820 by first incrementing SP and then decrementing when adjusting by a
2821 large immediate). */
2824 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
2825 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
2826 bool frame_related_p
, bool emit_move_imm
)
2828 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
2829 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
2831 HOST_WIDE_INT moffset
= abs_hwi (offset
);
2836 if (!rtx_equal_p (dest
, src
))
2838 insn
= emit_insn (gen_rtx_SET (dest
, src
));
2839 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2844 /* Single instruction adjustment. */
2845 if (aarch64_uimm12_shift (moffset
))
2847 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
2848 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2852 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2855 a) the offset cannot be loaded by a 16-bit move or
2856 b) there is no spare register into which we can move it. */
2857 if (moffset
< 0x1000000
2858 && ((!temp1
&& !can_create_pseudo_p ())
2859 || !aarch64_move_imm (moffset
, mode
)))
2861 HOST_WIDE_INT low_off
= moffset
& 0xfff;
2863 low_off
= offset
< 0 ? -low_off
: low_off
;
2864 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
2865 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2866 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
2867 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2871 /* Emit a move immediate if required and an addition/subtraction. */
2874 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
2875 temp1
= aarch64_force_temporary (mode
, temp1
, GEN_INT (moffset
));
2877 insn
= emit_insn (offset
< 0
2878 ? gen_sub3_insn (dest
, src
, temp1
)
2879 : gen_add3_insn (dest
, src
, temp1
));
2880 if (frame_related_p
)
2882 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2883 rtx adj
= plus_constant (mode
, src
, offset
);
2884 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
2888 /* Return the number of temporary registers that aarch64_add_offset
2889 would need to move OFFSET into a register or add OFFSET to a register;
2890 ADD_P is true if we want the latter rather than the former. */
2893 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
2895 /* This follows the same structure as aarch64_add_offset. */
2896 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
2899 unsigned int count
= 0;
2900 HOST_WIDE_INT factor
= offset
.coeffs
[1];
2901 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
2902 poly_int64
poly_offset (factor
, factor
);
2903 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
2904 /* Need one register for the ADDVL/ADDPL result. */
2906 else if (factor
!= 0)
2908 factor
= abs (factor
);
2909 if (factor
> 16 * (factor
& -factor
))
2910 /* Need one register for the CNT result and one for the multiplication
2911 factor. If necessary, the second temporary can be reused for the
2912 constant part of the offset. */
2914 /* Need one register for the CNT result (which might then
2918 return count
+ aarch64_add_offset_1_temporaries (constant
);
2921 /* If X can be represented as a poly_int64, return the number
2922 of temporaries that are required to add it to a register.
2923 Return -1 otherwise. */
2926 aarch64_add_offset_temporaries (rtx x
)
2929 if (!poly_int_rtx_p (x
, &offset
))
2931 return aarch64_offset_temporaries (true, offset
);
2934 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2935 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2936 be set and CFA adjustments added to the generated instructions.
2938 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2939 temporary if register allocation is already complete. This temporary
2940 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2941 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2942 false to avoid emitting the immediate again.
2944 TEMP2, if nonnull, is a second temporary register that doesn't
2945 overlap either DEST or REG.
2947 Since this function may be used to adjust the stack pointer, we must
2948 ensure that it cannot cause transient stack deallocation (for example
2949 by first incrementing SP and then decrementing when adjusting by a
2950 large immediate). */
2953 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
2954 poly_int64 offset
, rtx temp1
, rtx temp2
,
2955 bool frame_related_p
, bool emit_move_imm
= true)
2957 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
2958 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
2959 gcc_assert (temp1
== NULL_RTX
2961 || !reg_overlap_mentioned_p (temp1
, dest
));
2962 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
2964 /* Try using ADDVL or ADDPL to add the whole value. */
2965 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
2967 rtx offset_rtx
= gen_int_mode (offset
, mode
);
2968 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2969 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2973 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2974 SVE vector register, over and above the minimum size of 128 bits.
2975 This is equivalent to half the value returned by CNTD with a
2976 vector shape of ALL. */
2977 HOST_WIDE_INT factor
= offset
.coeffs
[1];
2978 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
2980 /* Try using ADDVL or ADDPL to add the VG-based part. */
2981 poly_int64
poly_offset (factor
, factor
);
2982 if (src
!= const0_rtx
2983 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
2985 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
2986 if (frame_related_p
)
2988 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2989 RTX_FRAME_RELATED_P (insn
) = true;
2994 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
2995 src
= aarch64_force_temporary (mode
, temp1
, addr
);
3000 /* Otherwise use a CNT-based sequence. */
3001 else if (factor
!= 0)
3003 /* Use a subtraction if we have a negative factor. */
3004 rtx_code code
= PLUS
;
3011 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3012 into the multiplication. */
3016 /* Use a right shift by 1. */
3020 HOST_WIDE_INT low_bit
= factor
& -factor
;
3021 if (factor
<= 16 * low_bit
)
3023 if (factor
> 16 * 8)
3025 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3026 the value with the minimum multiplier and shift it into
3028 int extra_shift
= exact_log2 (low_bit
);
3029 shift
+= extra_shift
;
3030 factor
>>= extra_shift
;
3032 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
3036 /* Use CNTD, then multiply it by FACTOR. */
3037 val
= gen_int_mode (poly_int64 (2, 2), mode
);
3038 val
= aarch64_force_temporary (mode
, temp1
, val
);
3040 /* Go back to using a negative multiplication factor if we have
3041 no register from which to subtract. */
3042 if (code
== MINUS
&& src
== const0_rtx
)
3047 rtx coeff1
= gen_int_mode (factor
, mode
);
3048 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
3049 val
= gen_rtx_MULT (mode
, val
, coeff1
);
3054 /* Multiply by 1 << SHIFT. */
3055 val
= aarch64_force_temporary (mode
, temp1
, val
);
3056 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
3058 else if (shift
== -1)
3061 val
= aarch64_force_temporary (mode
, temp1
, val
);
3062 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
3065 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3066 if (src
!= const0_rtx
)
3068 val
= aarch64_force_temporary (mode
, temp1
, val
);
3069 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
3071 else if (code
== MINUS
)
3073 val
= aarch64_force_temporary (mode
, temp1
, val
);
3074 val
= gen_rtx_NEG (mode
, val
);
3077 if (constant
== 0 || frame_related_p
)
3079 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
3080 if (frame_related_p
)
3082 RTX_FRAME_RELATED_P (insn
) = true;
3083 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
3084 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
3093 src
= aarch64_force_temporary (mode
, temp1
, val
);
3098 emit_move_imm
= true;
3101 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
3102 frame_related_p
, emit_move_imm
);
3105 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3106 than a poly_int64. */
3109 aarch64_split_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
3110 rtx offset_rtx
, rtx temp1
, rtx temp2
)
3112 aarch64_add_offset (mode
, dest
, src
, rtx_to_poly_int64 (offset_rtx
),
3113 temp1
, temp2
, false);
3116 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3117 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3118 if TEMP1 already contains abs (DELTA). */
3121 aarch64_add_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool emit_move_imm
)
3123 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, delta
,
3124 temp1
, temp2
, true, emit_move_imm
);
3127 /* Subtract DELTA from the stack pointer, marking the instructions
3128 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3132 aarch64_sub_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool frame_related_p
,
3133 bool emit_move_imm
= true)
3135 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, -delta
,
3136 temp1
, temp2
, frame_related_p
, emit_move_imm
);
3139 /* Set DEST to (vec_series BASE STEP). */
3142 aarch64_expand_vec_series (rtx dest
, rtx base
, rtx step
)
3144 machine_mode mode
= GET_MODE (dest
);
3145 scalar_mode inner
= GET_MODE_INNER (mode
);
3147 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3148 if (!aarch64_sve_index_immediate_p (base
))
3149 base
= force_reg (inner
, base
);
3150 if (!aarch64_sve_index_immediate_p (step
))
3151 step
= force_reg (inner
, step
);
3153 emit_set_insn (dest
, gen_rtx_VEC_SERIES (mode
, base
, step
));
3156 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3157 integer of mode INT_MODE. Return true on success. */
3160 aarch64_expand_sve_widened_duplicate (rtx dest
, scalar_int_mode src_mode
,
3163 /* If the constant is smaller than 128 bits, we can do the move
3164 using a vector of SRC_MODEs. */
3165 if (src_mode
!= TImode
)
3167 poly_uint64 count
= exact_div (GET_MODE_SIZE (GET_MODE (dest
)),
3168 GET_MODE_SIZE (src_mode
));
3169 machine_mode dup_mode
= mode_for_vector (src_mode
, count
).require ();
3170 emit_move_insn (gen_lowpart (dup_mode
, dest
),
3171 gen_const_vec_duplicate (dup_mode
, src
));
3175 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3176 src
= force_const_mem (src_mode
, src
);
3180 /* Make sure that the address is legitimate. */
3181 if (!aarch64_sve_ld1r_operand_p (src
))
3183 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
3184 src
= replace_equiv_address (src
, addr
);
3187 machine_mode mode
= GET_MODE (dest
);
3188 unsigned int elem_bytes
= GET_MODE_UNIT_SIZE (mode
);
3189 machine_mode pred_mode
= aarch64_sve_pred_mode (elem_bytes
).require ();
3190 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
3191 src
= gen_rtx_UNSPEC (mode
, gen_rtvec (2, ptrue
, src
), UNSPEC_LD1RQ
);
3192 emit_insn (gen_rtx_SET (dest
, src
));
3196 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3197 isn't a simple duplicate or series. */
3200 aarch64_expand_sve_const_vector (rtx dest
, rtx src
)
3202 machine_mode mode
= GET_MODE (src
);
3203 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
3204 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
3205 gcc_assert (npatterns
> 1);
3207 if (nelts_per_pattern
== 1)
3209 /* The constant is a repeating seqeuence of at least two elements,
3210 where the repeating elements occupy no more than 128 bits.
3211 Get an integer representation of the replicated value. */
3212 scalar_int_mode int_mode
;
3213 if (BYTES_BIG_ENDIAN
)
3214 /* For now, always use LD1RQ to load the value on big-endian
3215 targets, since the handling of smaller integers includes a
3216 subreg that is semantically an element reverse. */
3220 unsigned int int_bits
= GET_MODE_UNIT_BITSIZE (mode
) * npatterns
;
3221 gcc_assert (int_bits
<= 128);
3222 int_mode
= int_mode_for_size (int_bits
, 0).require ();
3224 rtx int_value
= simplify_gen_subreg (int_mode
, src
, mode
, 0);
3226 && aarch64_expand_sve_widened_duplicate (dest
, int_mode
, int_value
))
3230 /* Expand each pattern individually. */
3231 rtx_vector_builder builder
;
3232 auto_vec
<rtx
, 16> vectors (npatterns
);
3233 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3235 builder
.new_vector (mode
, 1, nelts_per_pattern
);
3236 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
3237 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
3238 vectors
.quick_push (force_reg (mode
, builder
.build ()));
3241 /* Use permutes to interleave the separate vectors. */
3242 while (npatterns
> 1)
3245 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3247 rtx tmp
= (npatterns
== 1 ? dest
: gen_reg_rtx (mode
));
3248 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
3249 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
3253 gcc_assert (vectors
[0] == dest
);
3256 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3257 is a pattern that can be used to set DEST to a replicated scalar
3261 aarch64_expand_mov_immediate (rtx dest
, rtx imm
,
3262 rtx (*gen_vec_duplicate
) (rtx
, rtx
))
3264 machine_mode mode
= GET_MODE (dest
);
3266 /* Check on what type of symbol it is. */
3267 scalar_int_mode int_mode
;
3268 if ((GET_CODE (imm
) == SYMBOL_REF
3269 || GET_CODE (imm
) == LABEL_REF
3270 || GET_CODE (imm
) == CONST
3271 || GET_CODE (imm
) == CONST_POLY_INT
)
3272 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
3276 HOST_WIDE_INT const_offset
;
3277 enum aarch64_symbol_type sty
;
3279 /* If we have (const (plus symbol offset)), separate out the offset
3280 before we start classifying the symbol. */
3281 rtx base
= strip_offset (imm
, &offset
);
3283 /* We must always add an offset involving VL separately, rather than
3284 folding it into the relocation. */
3285 if (!offset
.is_constant (&const_offset
))
3287 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
3288 emit_insn (gen_rtx_SET (dest
, imm
));
3291 /* Do arithmetic on 32-bit values if the result is smaller
3293 if (partial_subreg_p (int_mode
, SImode
))
3295 /* It is invalid to do symbol calculations in modes
3296 narrower than SImode. */
3297 gcc_assert (base
== const0_rtx
);
3298 dest
= gen_lowpart (SImode
, dest
);
3301 if (base
!= const0_rtx
)
3303 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3304 aarch64_add_offset (int_mode
, dest
, base
, offset
,
3305 NULL_RTX
, NULL_RTX
, false);
3308 aarch64_add_offset (int_mode
, dest
, base
, offset
,
3309 dest
, NULL_RTX
, false);
3314 sty
= aarch64_classify_symbol (base
, const_offset
);
3317 case SYMBOL_FORCE_TO_MEM
:
3318 if (const_offset
!= 0
3319 && targetm
.cannot_force_const_mem (int_mode
, imm
))
3321 gcc_assert (can_create_pseudo_p ());
3322 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3323 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
3324 NULL_RTX
, NULL_RTX
, false);
3328 mem
= force_const_mem (ptr_mode
, imm
);
3331 /* If we aren't generating PC relative literals, then
3332 we need to expand the literal pool access carefully.
3333 This is something that needs to be done in a number
3334 of places, so could well live as a separate function. */
3335 if (!aarch64_pcrelative_literal_loads
)
3337 gcc_assert (can_create_pseudo_p ());
3338 base
= gen_reg_rtx (ptr_mode
);
3339 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
3340 if (ptr_mode
!= Pmode
)
3341 base
= convert_memory_address (Pmode
, base
);
3342 mem
= gen_rtx_MEM (ptr_mode
, base
);
3345 if (int_mode
!= ptr_mode
)
3346 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
3348 emit_insn (gen_rtx_SET (dest
, mem
));
3352 case SYMBOL_SMALL_TLSGD
:
3353 case SYMBOL_SMALL_TLSDESC
:
3354 case SYMBOL_SMALL_TLSIE
:
3355 case SYMBOL_SMALL_GOT_28K
:
3356 case SYMBOL_SMALL_GOT_4G
:
3357 case SYMBOL_TINY_GOT
:
3358 case SYMBOL_TINY_TLSIE
:
3359 if (const_offset
!= 0)
3361 gcc_assert(can_create_pseudo_p ());
3362 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3363 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
3364 NULL_RTX
, NULL_RTX
, false);
3369 case SYMBOL_SMALL_ABSOLUTE
:
3370 case SYMBOL_TINY_ABSOLUTE
:
3371 case SYMBOL_TLSLE12
:
3372 case SYMBOL_TLSLE24
:
3373 case SYMBOL_TLSLE32
:
3374 case SYMBOL_TLSLE48
:
3375 aarch64_load_symref_appropriately (dest
, imm
, sty
);
3383 if (!CONST_INT_P (imm
))
3385 rtx base
, step
, value
;
3386 if (GET_CODE (imm
) == HIGH
3387 || aarch64_simd_valid_immediate (imm
, NULL
))
3388 emit_insn (gen_rtx_SET (dest
, imm
));
3389 else if (const_vec_series_p (imm
, &base
, &step
))
3390 aarch64_expand_vec_series (dest
, base
, step
);
3391 else if (const_vec_duplicate_p (imm
, &value
))
3393 /* If the constant is out of range of an SVE vector move,
3394 load it from memory if we can, otherwise move it into
3395 a register and use a DUP. */
3396 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
3397 rtx op
= force_const_mem (inner_mode
, value
);
3399 op
= force_reg (inner_mode
, value
);
3400 else if (!aarch64_sve_ld1r_operand_p (op
))
3402 rtx addr
= force_reg (Pmode
, XEXP (op
, 0));
3403 op
= replace_equiv_address (op
, addr
);
3405 emit_insn (gen_vec_duplicate (dest
, op
));
3407 else if (GET_CODE (imm
) == CONST_VECTOR
3408 && !GET_MODE_NUNITS (GET_MODE (imm
)).is_constant ())
3409 aarch64_expand_sve_const_vector (dest
, imm
);
3412 rtx mem
= force_const_mem (mode
, imm
);
3414 emit_move_insn (dest
, mem
);
3420 aarch64_internal_mov_immediate (dest
, imm
, true,
3421 as_a
<scalar_int_mode
> (mode
));
3424 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3425 that is known to contain PTRUE. */
3428 aarch64_emit_sve_pred_move (rtx dest
, rtx pred
, rtx src
)
3430 expand_operand ops
[3];
3431 machine_mode mode
= GET_MODE (dest
);
3432 create_output_operand (&ops
[0], dest
, mode
);
3433 create_input_operand (&ops
[1], pred
, GET_MODE(pred
));
3434 create_input_operand (&ops
[2], src
, mode
);
3435 expand_insn (code_for_aarch64_pred_mov (mode
), 3, ops
);
3438 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3439 operand is in memory. In this case we need to use the predicated LD1
3440 and ST1 instead of LDR and STR, both for correctness on big-endian
3441 targets and because LD1 and ST1 support a wider range of addressing modes.
3442 PRED_MODE is the mode of the predicate.
3444 See the comment at the head of aarch64-sve.md for details about the
3445 big-endian handling. */
3448 aarch64_expand_sve_mem_move (rtx dest
, rtx src
, machine_mode pred_mode
)
3450 machine_mode mode
= GET_MODE (dest
);
3451 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
3452 if (!register_operand (src
, mode
)
3453 && !register_operand (dest
, mode
))
3455 rtx tmp
= gen_reg_rtx (mode
);
3457 aarch64_emit_sve_pred_move (tmp
, ptrue
, src
);
3459 emit_move_insn (tmp
, src
);
3462 aarch64_emit_sve_pred_move (dest
, ptrue
, src
);
3465 /* Called only on big-endian targets. See whether an SVE vector move
3466 from SRC to DEST is effectively a REV[BHW] instruction, because at
3467 least one operand is a subreg of an SVE vector that has wider or
3468 narrower elements. Return true and emit the instruction if so.
3472 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3474 represents a VIEW_CONVERT between the following vectors, viewed
3477 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3478 R1: { [0], [1], [2], [3], ... }
3480 The high part of lane X in R2 should therefore correspond to lane X*2
3481 of R1, but the register representations are:
3484 R2: ...... [1].high [1].low [0].high [0].low
3485 R1: ...... [3] [2] [1] [0]
3487 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3488 We therefore need a reverse operation to swap the high and low values
3491 This is purely an optimization. Without it we would spill the
3492 subreg operand to the stack in one mode and reload it in the
3493 other mode, which has the same effect as the REV. */
3496 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
3498 gcc_assert (BYTES_BIG_ENDIAN
);
3499 if (GET_CODE (dest
) == SUBREG
)
3500 dest
= SUBREG_REG (dest
);
3501 if (GET_CODE (src
) == SUBREG
)
3502 src
= SUBREG_REG (src
);
3504 /* The optimization handles two single SVE REGs with different element
3508 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
3509 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
3510 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
3511 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
3514 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3515 rtx ptrue
= force_reg (VNx16BImode
, CONSTM1_RTX (VNx16BImode
));
3516 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
3518 emit_insn (gen_rtx_SET (dest
, unspec
));
3522 /* Return a copy of X with mode MODE, without changing its other
3523 attributes. Unlike gen_lowpart, this doesn't care whether the
3524 mode change is valid. */
3527 aarch64_replace_reg_mode (rtx x
, machine_mode mode
)
3529 if (GET_MODE (x
) == mode
)
3532 x
= shallow_copy_rtx (x
);
3533 set_mode_and_regno (x
, mode
, REGNO (x
));
3537 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3541 aarch64_split_sve_subreg_move (rtx dest
, rtx ptrue
, rtx src
)
3543 /* Decide which REV operation we need. The mode with narrower elements
3544 determines the mode of the operands and the mode with the wider
3545 elements determines the reverse width. */
3546 machine_mode mode_with_wider_elts
= GET_MODE (dest
);
3547 machine_mode mode_with_narrower_elts
= GET_MODE (src
);
3548 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts
)
3549 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts
))
3550 std::swap (mode_with_wider_elts
, mode_with_narrower_elts
);
3552 unsigned int wider_bytes
= GET_MODE_UNIT_SIZE (mode_with_wider_elts
);
3553 unsigned int unspec
;
3554 if (wider_bytes
== 8)
3555 unspec
= UNSPEC_REV64
;
3556 else if (wider_bytes
== 4)
3557 unspec
= UNSPEC_REV32
;
3558 else if (wider_bytes
== 2)
3559 unspec
= UNSPEC_REV16
;
3562 machine_mode pred_mode
= aarch64_sve_pred_mode (wider_bytes
).require ();
3566 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3567 UNSPEC_MERGE_PTRUE))
3569 with the appropriate modes. */
3570 ptrue
= gen_lowpart (pred_mode
, ptrue
);
3571 dest
= aarch64_replace_reg_mode (dest
, mode_with_narrower_elts
);
3572 src
= aarch64_replace_reg_mode (src
, mode_with_narrower_elts
);
3573 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (1, src
), unspec
);
3574 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (2, ptrue
, src
),
3575 UNSPEC_MERGE_PTRUE
);
3576 emit_insn (gen_rtx_SET (dest
, src
));
3580 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
3581 tree exp ATTRIBUTE_UNUSED
)
3583 if (aarch64_simd_decl_p (cfun
->decl
) != aarch64_simd_decl_p (decl
))
3589 /* Implement TARGET_PASS_BY_REFERENCE. */
3592 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
3595 bool named ATTRIBUTE_UNUSED
)
3598 machine_mode dummymode
;
3601 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3602 if (mode
== BLKmode
&& type
)
3603 size
= int_size_in_bytes (type
);
3605 /* No frontends can create types with variable-sized modes, so we
3606 shouldn't be asked to pass or return them. */
3607 size
= GET_MODE_SIZE (mode
).to_constant ();
3609 /* Aggregates are passed by reference based on their size. */
3610 if (type
&& AGGREGATE_TYPE_P (type
))
3612 size
= int_size_in_bytes (type
);
3615 /* Variable sized arguments are always returned by reference. */
3619 /* Can this be a candidate to be passed in fp/simd register(s)? */
3620 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
3625 /* Arguments which are variable sized or larger than 2 registers are
3626 passed by reference unless they are a homogenous floating point
3628 return size
> 2 * UNITS_PER_WORD
;
3631 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3633 aarch64_return_in_msb (const_tree valtype
)
3635 machine_mode dummy_mode
;
3638 /* Never happens in little-endian mode. */
3639 if (!BYTES_BIG_ENDIAN
)
3642 /* Only composite types smaller than or equal to 16 bytes can
3643 be potentially returned in registers. */
3644 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
3645 || int_size_in_bytes (valtype
) <= 0
3646 || int_size_in_bytes (valtype
) > 16)
3649 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3650 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3651 is always passed/returned in the least significant bits of fp/simd
3653 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
3654 &dummy_mode
, &dummy_int
, NULL
))
3660 /* Implement TARGET_FUNCTION_VALUE.
3661 Define how to find the value returned by a function. */
3664 aarch64_function_value (const_tree type
, const_tree func
,
3665 bool outgoing ATTRIBUTE_UNUSED
)
3670 machine_mode ag_mode
;
3672 mode
= TYPE_MODE (type
);
3673 if (INTEGRAL_TYPE_P (type
))
3674 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
3676 if (aarch64_return_in_msb (type
))
3678 HOST_WIDE_INT size
= int_size_in_bytes (type
);
3680 if (size
% UNITS_PER_WORD
!= 0)
3682 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
3683 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
3687 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
3688 &ag_mode
, &count
, NULL
))
3690 if (!aarch64_composite_type_p (type
, mode
))
3692 gcc_assert (count
== 1 && mode
== ag_mode
);
3693 return gen_rtx_REG (mode
, V0_REGNUM
);
3700 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
3701 for (i
= 0; i
< count
; i
++)
3703 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
3704 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
3705 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3706 XVECEXP (par
, 0, i
) = tmp
;
3712 return gen_rtx_REG (mode
, R0_REGNUM
);
3715 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3716 Return true if REGNO is the number of a hard register in which the values
3717 of called function may come back. */
3720 aarch64_function_value_regno_p (const unsigned int regno
)
3722 /* Maximum of 16 bytes can be returned in the general registers. Examples
3723 of 16-byte return values are: 128-bit integers and 16-byte small
3724 structures (excluding homogeneous floating-point aggregates). */
3725 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
3728 /* Up to four fp/simd registers can return a function value, e.g. a
3729 homogeneous floating-point aggregate having four members. */
3730 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
3731 return TARGET_FLOAT
;
3736 /* Implement TARGET_RETURN_IN_MEMORY.
3738 If the type T of the result of a function is such that
3740 would require that arg be passed as a value in a register (or set of
3741 registers) according to the parameter passing rules, then the result
3742 is returned in the same registers as would be used for such an
3746 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
3749 machine_mode ag_mode
;
3752 if (!AGGREGATE_TYPE_P (type
)
3753 && TREE_CODE (type
) != COMPLEX_TYPE
3754 && TREE_CODE (type
) != VECTOR_TYPE
)
3755 /* Simple scalar types always returned in registers. */
3758 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
3765 /* Types larger than 2 registers returned in memory. */
3766 size
= int_size_in_bytes (type
);
3767 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
3771 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
3772 const_tree type
, int *nregs
)
3774 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3775 return aarch64_vfp_is_call_or_return_candidate (mode
,
3777 &pcum
->aapcs_vfp_rmode
,
3782 /* Given MODE and TYPE of a function argument, return the alignment in
3783 bits. The idea is to suppress any stronger alignment requested by
3784 the user and opt for the natural alignment (specified in AAPCS64 \S
3785 4.1). ABI_BREAK is set to true if the alignment was incorrectly
3786 calculated in versions of GCC prior to GCC-9. This is a helper
3787 function for local use only. */
3790 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
,
3795 return GET_MODE_ALIGNMENT (mode
);
3797 if (integer_zerop (TYPE_SIZE (type
)))
3800 gcc_assert (TYPE_MODE (type
) == mode
);
3802 if (!AGGREGATE_TYPE_P (type
))
3803 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
3805 if (TREE_CODE (type
) == ARRAY_TYPE
)
3806 return TYPE_ALIGN (TREE_TYPE (type
));
3808 unsigned int alignment
= 0;
3809 unsigned int bitfield_alignment
= 0;
3810 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
3811 if (TREE_CODE (field
) == FIELD_DECL
)
3813 alignment
= std::max (alignment
, DECL_ALIGN (field
));
3814 if (DECL_BIT_FIELD_TYPE (field
))
3816 = std::max (bitfield_alignment
,
3817 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field
)));
3820 if (bitfield_alignment
> alignment
)
3823 return bitfield_alignment
;
3829 /* Layout a function argument according to the AAPCS64 rules. The rule
3830 numbers refer to the rule numbers in the AAPCS64. */
3833 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
3835 bool named ATTRIBUTE_UNUSED
)
3837 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3838 int ncrn
, nvrn
, nregs
;
3839 bool allocate_ncrn
, allocate_nvrn
;
3843 /* We need to do this once per argument. */
3844 if (pcum
->aapcs_arg_processed
)
3847 pcum
->aapcs_arg_processed
= true;
3849 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3851 size
= int_size_in_bytes (type
);
3853 /* No frontends can create types with variable-sized modes, so we
3854 shouldn't be asked to pass or return them. */
3855 size
= GET_MODE_SIZE (mode
).to_constant ();
3856 size
= ROUND_UP (size
, UNITS_PER_WORD
);
3858 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
3859 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
3864 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3865 The following code thus handles passing by SIMD/FP registers first. */
3867 nvrn
= pcum
->aapcs_nvrn
;
3869 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
3870 and homogenous short-vector aggregates (HVA). */
3874 aarch64_err_no_fpadvsimd (mode
);
3876 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
3878 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
3879 if (!aarch64_composite_type_p (type
, mode
))
3881 gcc_assert (nregs
== 1);
3882 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
3888 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3889 for (i
= 0; i
< nregs
; i
++)
3891 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
3892 V0_REGNUM
+ nvrn
+ i
);
3893 rtx offset
= gen_int_mode
3894 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
3895 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3896 XVECEXP (par
, 0, i
) = tmp
;
3898 pcum
->aapcs_reg
= par
;
3904 /* C.3 NSRN is set to 8. */
3905 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
3910 ncrn
= pcum
->aapcs_ncrn
;
3911 nregs
= size
/ UNITS_PER_WORD
;
3913 /* C6 - C9. though the sign and zero extension semantics are
3914 handled elsewhere. This is the case where the argument fits
3915 entirely general registers. */
3916 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
3918 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
3920 /* C.8 if the argument has an alignment of 16 then the NGRN is
3921 rounded up to the next even number. */
3924 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3925 comparison is there because for > 16 * BITS_PER_UNIT
3926 alignment nregs should be > 2 and therefore it should be
3927 passed by reference rather than value. */
3928 && (aarch64_function_arg_alignment (mode
, type
, &abi_break
)
3929 == 16 * BITS_PER_UNIT
))
3931 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
3932 inform (input_location
, "parameter passing for argument of type "
3933 "%qT changed in GCC 9.1", type
);
3935 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
3938 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3939 A reg is still generated for it, but the caller should be smart
3940 enough not to use it. */
3941 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
3942 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
3948 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3949 for (i
= 0; i
< nregs
; i
++)
3951 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
3952 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
3953 GEN_INT (i
* UNITS_PER_WORD
));
3954 XVECEXP (par
, 0, i
) = tmp
;
3956 pcum
->aapcs_reg
= par
;
3959 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
3964 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
3966 /* The argument is passed on stack; record the needed number of words for
3967 this argument and align the total size if necessary. */
3969 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
3971 if (aarch64_function_arg_alignment (mode
, type
, &abi_break
)
3972 == 16 * BITS_PER_UNIT
)
3974 int new_size
= ROUND_UP (pcum
->aapcs_stack_size
, 16 / UNITS_PER_WORD
);
3975 if (pcum
->aapcs_stack_size
!= new_size
)
3977 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
3978 inform (input_location
, "parameter passing for argument of type "
3979 "%qT changed in GCC 9.1", type
);
3980 pcum
->aapcs_stack_size
= new_size
;
3986 /* Implement TARGET_FUNCTION_ARG. */
3989 aarch64_function_arg (cumulative_args_t pcum_v
, machine_mode mode
,
3990 const_tree type
, bool named
)
3992 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3993 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
3995 if (mode
== VOIDmode
)
3998 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
3999 return pcum
->aapcs_reg
;
4003 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
4004 const_tree fntype ATTRIBUTE_UNUSED
,
4005 rtx libname ATTRIBUTE_UNUSED
,
4006 const_tree fndecl ATTRIBUTE_UNUSED
,
4007 unsigned n_named ATTRIBUTE_UNUSED
)
4009 pcum
->aapcs_ncrn
= 0;
4010 pcum
->aapcs_nvrn
= 0;
4011 pcum
->aapcs_nextncrn
= 0;
4012 pcum
->aapcs_nextnvrn
= 0;
4013 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
4014 pcum
->aapcs_reg
= NULL_RTX
;
4015 pcum
->aapcs_arg_processed
= false;
4016 pcum
->aapcs_stack_words
= 0;
4017 pcum
->aapcs_stack_size
= 0;
4020 && fndecl
&& TREE_PUBLIC (fndecl
)
4021 && fntype
&& fntype
!= error_mark_node
)
4023 const_tree type
= TREE_TYPE (fntype
);
4024 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
4025 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
4026 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
4027 &mode
, &nregs
, NULL
))
4028 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
4034 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
4039 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4040 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
4042 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
4043 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
4044 != (pcum
->aapcs_stack_words
!= 0));
4045 pcum
->aapcs_arg_processed
= false;
4046 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
4047 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
4048 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
4049 pcum
->aapcs_stack_words
= 0;
4050 pcum
->aapcs_reg
= NULL_RTX
;
4055 aarch64_function_arg_regno_p (unsigned regno
)
4057 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
4058 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
4061 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4062 PARM_BOUNDARY bits of alignment, but will be given anything up
4063 to STACK_BOUNDARY bits if the type requires it. This makes sure
4064 that both before and after the layout of each argument, the Next
4065 Stacked Argument Address (NSAA) will have a minimum alignment of
4069 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
4072 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
,
4074 if (abi_break
& warn_psabi
)
4075 inform (input_location
, "parameter passing for argument of type "
4076 "%qT changed in GCC 9.1", type
);
4078 return MIN (MAX (alignment
, PARM_BOUNDARY
), STACK_BOUNDARY
);
4081 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4083 static fixed_size_mode
4084 aarch64_get_reg_raw_mode (int regno
)
4086 if (TARGET_SVE
&& FP_REGNUM_P (regno
))
4087 /* Don't use the SVE part of the register for __builtin_apply and
4088 __builtin_return. The SVE registers aren't used by the normal PCS,
4089 so using them there would be a waste of time. The PCS extensions
4090 for SVE types are fundamentally incompatible with the
4091 __builtin_return/__builtin_apply interface. */
4092 return as_a
<fixed_size_mode
> (V16QImode
);
4093 return default_get_reg_raw_mode (regno
);
4096 /* Implement TARGET_FUNCTION_ARG_PADDING.
4098 Small aggregate types are placed in the lowest memory address.
4100 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4102 static pad_direction
4103 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
4105 /* On little-endian targets, the least significant byte of every stack
4106 argument is passed at the lowest byte address of the stack slot. */
4107 if (!BYTES_BIG_ENDIAN
)
4110 /* Otherwise, integral, floating-point and pointer types are padded downward:
4111 the least significant byte of a stack argument is passed at the highest
4112 byte address of the stack slot. */
4114 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
4115 || POINTER_TYPE_P (type
))
4116 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
4117 return PAD_DOWNWARD
;
4119 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4123 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4125 It specifies padding for the last (may also be the only)
4126 element of a block move between registers and memory. If
4127 assuming the block is in the memory, padding upward means that
4128 the last element is padded after its highest significant byte,
4129 while in downward padding, the last element is padded at the
4130 its least significant byte side.
4132 Small aggregates and small complex types are always padded
4135 We don't need to worry about homogeneous floating-point or
4136 short-vector aggregates; their move is not affected by the
4137 padding direction determined here. Regardless of endianness,
4138 each element of such an aggregate is put in the least
4139 significant bits of a fp/simd register.
4141 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4142 register has useful data, and return the opposite if the most
4143 significant byte does. */
4146 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
4147 bool first ATTRIBUTE_UNUSED
)
4150 /* Small composite types are always padded upward. */
4151 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
4155 size
= int_size_in_bytes (type
);
4157 /* No frontends can create types with variable-sized modes, so we
4158 shouldn't be asked to pass or return them. */
4159 size
= GET_MODE_SIZE (mode
).to_constant ();
4160 if (size
< 2 * UNITS_PER_WORD
)
4164 /* Otherwise, use the default padding. */
4165 return !BYTES_BIG_ENDIAN
;
4168 static scalar_int_mode
4169 aarch64_libgcc_cmp_return_mode (void)
4174 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4176 /* We use the 12-bit shifted immediate arithmetic instructions so values
4177 must be multiple of (1 << 12), i.e. 4096. */
4178 #define ARITH_FACTOR 4096
4180 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4181 #error Cannot use simple address calculation for stack probing
4184 /* The pair of scratch registers used for stack probing. */
4185 #define PROBE_STACK_FIRST_REG R9_REGNUM
4186 #define PROBE_STACK_SECOND_REG R10_REGNUM
4188 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4189 inclusive. These are offsets from the current stack pointer. */
4192 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
4195 if (!poly_size
.is_constant (&size
))
4197 sorry ("stack probes for SVE frames");
4201 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
4203 /* See the same assertion on PROBE_INTERVAL above. */
4204 gcc_assert ((first
% ARITH_FACTOR
) == 0);
4206 /* See if we have a constant small number of probes to generate. If so,
4207 that's the easy case. */
4208 if (size
<= PROBE_INTERVAL
)
4210 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
4212 emit_set_insn (reg1
,
4213 plus_constant (Pmode
,
4214 stack_pointer_rtx
, -(first
+ base
)));
4215 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
4218 /* The run-time loop is made up of 8 insns in the generic case while the
4219 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
4220 else if (size
<= 4 * PROBE_INTERVAL
)
4222 HOST_WIDE_INT i
, rem
;
4224 emit_set_insn (reg1
,
4225 plus_constant (Pmode
,
4227 -(first
+ PROBE_INTERVAL
)));
4228 emit_stack_probe (reg1
);
4230 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4231 it exceeds SIZE. If only two probes are needed, this will not
4232 generate any code. Then probe at FIRST + SIZE. */
4233 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
4235 emit_set_insn (reg1
,
4236 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
4237 emit_stack_probe (reg1
);
4240 rem
= size
- (i
- PROBE_INTERVAL
);
4243 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
4245 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
4246 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
4249 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
4252 /* Otherwise, do the same as above, but in a loop. Note that we must be
4253 extra careful with variables wrapping around because we might be at
4254 the very top (or the very bottom) of the address space and we have
4255 to be able to handle this case properly; in particular, we use an
4256 equality test for the loop condition. */
4259 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
4261 /* Step 1: round SIZE to the previous multiple of the interval. */
4263 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
4266 /* Step 2: compute initial and final value of the loop counter. */
4268 /* TEST_ADDR = SP + FIRST. */
4269 emit_set_insn (reg1
,
4270 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
4272 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4273 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
4274 if (! aarch64_uimm12_shift (adjustment
))
4276 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
4278 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
4281 emit_set_insn (reg2
,
4282 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
4288 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4291 while (TEST_ADDR != LAST_ADDR)
4293 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4294 until it is equal to ROUNDED_SIZE. */
4296 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
4299 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4300 that SIZE is equal to ROUNDED_SIZE. */
4302 if (size
!= rounded_size
)
4304 HOST_WIDE_INT rem
= size
- rounded_size
;
4308 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
4310 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
4311 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
4314 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
4318 /* Make sure nothing is scheduled before we are done. */
4319 emit_insn (gen_blockage ());
4322 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4323 absolute addresses. */
4326 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
4328 static int labelno
= 0;
4332 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
4335 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
4337 HOST_WIDE_INT stack_clash_probe_interval
4338 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
4340 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4342 HOST_WIDE_INT interval
;
4343 if (flag_stack_clash_protection
)
4344 interval
= stack_clash_probe_interval
;
4346 interval
= PROBE_INTERVAL
;
4348 gcc_assert (aarch64_uimm12_shift (interval
));
4349 xops
[1] = GEN_INT (interval
);
4351 output_asm_insn ("sub\t%0, %0, %1", xops
);
4353 /* If doing stack clash protection then we probe up by the ABI specified
4354 amount. We do this because we're dropping full pages at a time in the
4355 loop. But if we're doing non-stack clash probing, probe at SP 0. */
4356 if (flag_stack_clash_protection
)
4357 xops
[1] = GEN_INT (STACK_CLASH_CALLER_GUARD
);
4359 xops
[1] = CONST0_RTX (GET_MODE (xops
[1]));
4361 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4362 by this amount for each iteration. */
4363 output_asm_insn ("str\txzr, [%0, %1]", xops
);
4365 /* Test if TEST_ADDR == LAST_ADDR. */
4367 output_asm_insn ("cmp\t%0, %1", xops
);
4370 fputs ("\tb.ne\t", asm_out_file
);
4371 assemble_name_raw (asm_out_file
, loop_lab
);
4372 fputc ('\n', asm_out_file
);
4377 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4378 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4379 of GUARD_SIZE. When a probe is emitted it is done at most
4380 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4381 at most MIN_PROBE_THRESHOLD. By the end of this function
4382 BASE = BASE - ADJUSTMENT. */
4385 aarch64_output_probe_sve_stack_clash (rtx base
, rtx adjustment
,
4386 rtx min_probe_threshold
, rtx guard_size
)
4388 /* This function is not allowed to use any instruction generation function
4389 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4390 so instead emit the code you want using output_asm_insn. */
4391 gcc_assert (flag_stack_clash_protection
);
4392 gcc_assert (CONST_INT_P (min_probe_threshold
) && CONST_INT_P (guard_size
));
4393 gcc_assert (INTVAL (guard_size
) > INTVAL (min_probe_threshold
));
4395 /* The minimum required allocation before the residual requires probing. */
4396 HOST_WIDE_INT residual_probe_guard
= INTVAL (min_probe_threshold
);
4398 /* Clamp the value down to the nearest value that can be used with a cmp. */
4399 residual_probe_guard
= aarch64_clamp_to_uimm12_shift (residual_probe_guard
);
4400 rtx probe_offset_value_rtx
= gen_int_mode (residual_probe_guard
, Pmode
);
4402 gcc_assert (INTVAL (min_probe_threshold
) >= residual_probe_guard
);
4403 gcc_assert (aarch64_uimm12_shift (residual_probe_guard
));
4405 static int labelno
= 0;
4406 char loop_start_lab
[32];
4407 char loop_end_lab
[32];
4410 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab
, "SVLPSPL", labelno
);
4411 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab
, "SVLPEND", labelno
++);
4413 /* Emit loop start label. */
4414 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_start_lab
);
4416 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4417 xops
[0] = adjustment
;
4418 xops
[1] = probe_offset_value_rtx
;
4419 output_asm_insn ("cmp\t%0, %1", xops
);
4421 /* Branch to end if not enough adjustment to probe. */
4422 fputs ("\tb.lt\t", asm_out_file
);
4423 assemble_name_raw (asm_out_file
, loop_end_lab
);
4424 fputc ('\n', asm_out_file
);
4426 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4428 xops
[1] = probe_offset_value_rtx
;
4429 output_asm_insn ("sub\t%0, %0, %1", xops
);
4431 /* Probe at BASE. */
4432 xops
[1] = const0_rtx
;
4433 output_asm_insn ("str\txzr, [%0, %1]", xops
);
4435 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4436 xops
[0] = adjustment
;
4437 xops
[1] = probe_offset_value_rtx
;
4438 output_asm_insn ("sub\t%0, %0, %1", xops
);
4440 /* Branch to start if still more bytes to allocate. */
4441 fputs ("\tb\t", asm_out_file
);
4442 assemble_name_raw (asm_out_file
, loop_start_lab
);
4443 fputc ('\n', asm_out_file
);
4445 /* No probe leave. */
4446 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_end_lab
);
4448 /* BASE = BASE - ADJUSTMENT. */
4450 xops
[1] = adjustment
;
4451 output_asm_insn ("sub\t%0, %0, %1", xops
);
4455 /* Determine whether a frame chain needs to be generated. */
4457 aarch64_needs_frame_chain (void)
4459 /* Force a frame chain for EH returns so the return address is at FP+8. */
4460 if (frame_pointer_needed
|| crtl
->calls_eh_return
)
4463 /* A leaf function cannot have calls or write LR. */
4464 bool is_leaf
= crtl
->is_leaf
&& !df_regs_ever_live_p (LR_REGNUM
);
4466 /* Don't use a frame chain in leaf functions if leaf frame pointers
4468 if (flag_omit_leaf_frame_pointer
&& is_leaf
)
4471 return aarch64_use_frame_pointer
;
4474 /* Mark the registers that need to be saved by the callee and calculate
4475 the size of the callee-saved registers area and frame record (both FP
4476 and LR may be omitted). */
4478 aarch64_layout_frame (void)
4480 HOST_WIDE_INT offset
= 0;
4481 int regno
, last_fp_reg
= INVALID_REGNUM
;
4482 bool simd_function
= aarch64_simd_decl_p (cfun
->decl
);
4484 cfun
->machine
->frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
4486 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4487 the mid-end is doing. */
4488 crtl
->outgoing_args_size
= STACK_DYNAMIC_OFFSET (cfun
);
4490 #define SLOT_NOT_REQUIRED (-2)
4491 #define SLOT_REQUIRED (-1)
4493 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
4494 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
4496 /* If this is a non-leaf simd function with calls we assume that
4497 at least one of those calls is to a non-simd function and thus
4498 we must save V8 to V23 in the prologue. */
4500 if (simd_function
&& !crtl
->is_leaf
)
4502 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4503 if (FP_SIMD_SAVED_REGNUM_P (regno
))
4504 df_set_regs_ever_live (regno
, true);
4507 /* First mark all the registers that really need to be saved... */
4508 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4509 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
4511 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4512 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
4514 /* ... that includes the eh data registers (if needed)... */
4515 if (crtl
->calls_eh_return
)
4516 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
4517 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
4520 /* ... and any callee saved register that dataflow says is live. */
4521 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4522 if (df_regs_ever_live_p (regno
)
4523 && (regno
== R30_REGNUM
4524 || !call_used_regs
[regno
]))
4525 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
4527 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4528 if (df_regs_ever_live_p (regno
)
4529 && (!call_used_regs
[regno
]
4530 || (simd_function
&& FP_SIMD_SAVED_REGNUM_P (regno
))))
4532 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
4533 last_fp_reg
= regno
;
4536 if (cfun
->machine
->frame
.emit_frame_chain
)
4538 /* FP and LR are placed in the linkage record. */
4539 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
4540 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
4541 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
4542 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
4543 offset
= 2 * UNITS_PER_WORD
;
4546 /* With stack-clash, LR must be saved in non-leaf functions. */
4547 gcc_assert (crtl
->is_leaf
4548 || (cfun
->machine
->frame
.reg_offset
[R30_REGNUM
]
4549 != SLOT_NOT_REQUIRED
));
4551 /* Now assign stack slots for them. */
4552 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4553 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
4555 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
4556 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
4557 cfun
->machine
->frame
.wb_candidate1
= regno
;
4558 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
)
4559 cfun
->machine
->frame
.wb_candidate2
= regno
;
4560 offset
+= UNITS_PER_WORD
;
4563 HOST_WIDE_INT max_int_offset
= offset
;
4564 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
4565 bool has_align_gap
= offset
!= max_int_offset
;
4567 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4568 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
4570 /* If there is an alignment gap between integer and fp callee-saves,
4571 allocate the last fp register to it if possible. */
4572 if (regno
== last_fp_reg
4575 && (offset
& 8) == 0)
4577 cfun
->machine
->frame
.reg_offset
[regno
] = max_int_offset
;
4581 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
4582 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
4583 cfun
->machine
->frame
.wb_candidate1
= regno
;
4584 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
4585 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
4586 cfun
->machine
->frame
.wb_candidate2
= regno
;
4587 offset
+= simd_function
? UNITS_PER_VREG
: UNITS_PER_WORD
;
4590 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
4592 cfun
->machine
->frame
.saved_regs_size
= offset
;
4594 HOST_WIDE_INT varargs_and_saved_regs_size
4595 = offset
+ cfun
->machine
->frame
.saved_varargs_size
;
4597 cfun
->machine
->frame
.hard_fp_offset
4598 = aligned_upper_bound (varargs_and_saved_regs_size
4599 + get_frame_size (),
4600 STACK_BOUNDARY
/ BITS_PER_UNIT
);
4602 /* Both these values are already aligned. */
4603 gcc_assert (multiple_p (crtl
->outgoing_args_size
,
4604 STACK_BOUNDARY
/ BITS_PER_UNIT
));
4605 cfun
->machine
->frame
.frame_size
4606 = (cfun
->machine
->frame
.hard_fp_offset
4607 + crtl
->outgoing_args_size
);
4609 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.saved_varargs_size
;
4611 cfun
->machine
->frame
.initial_adjust
= 0;
4612 cfun
->machine
->frame
.final_adjust
= 0;
4613 cfun
->machine
->frame
.callee_adjust
= 0;
4614 cfun
->machine
->frame
.callee_offset
= 0;
4616 HOST_WIDE_INT max_push_offset
= 0;
4617 if (cfun
->machine
->frame
.wb_candidate2
!= INVALID_REGNUM
)
4618 max_push_offset
= 512;
4619 else if (cfun
->machine
->frame
.wb_candidate1
!= INVALID_REGNUM
)
4620 max_push_offset
= 256;
4622 HOST_WIDE_INT const_size
, const_fp_offset
;
4623 if (cfun
->machine
->frame
.frame_size
.is_constant (&const_size
)
4624 && const_size
< max_push_offset
4625 && known_eq (crtl
->outgoing_args_size
, 0))
4627 /* Simple, small frame with no outgoing arguments:
4628 stp reg1, reg2, [sp, -frame_size]!
4629 stp reg3, reg4, [sp, 16] */
4630 cfun
->machine
->frame
.callee_adjust
= const_size
;
4632 else if (known_lt (crtl
->outgoing_args_size
4633 + cfun
->machine
->frame
.saved_regs_size
, 512)
4634 && !(cfun
->calls_alloca
4635 && known_lt (cfun
->machine
->frame
.hard_fp_offset
,
4638 /* Frame with small outgoing arguments:
4639 sub sp, sp, frame_size
4640 stp reg1, reg2, [sp, outgoing_args_size]
4641 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4642 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.frame_size
;
4643 cfun
->machine
->frame
.callee_offset
4644 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.hard_fp_offset
;
4646 else if (cfun
->machine
->frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
4647 && const_fp_offset
< max_push_offset
)
4649 /* Frame with large outgoing arguments but a small local area:
4650 stp reg1, reg2, [sp, -hard_fp_offset]!
4651 stp reg3, reg4, [sp, 16]
4652 sub sp, sp, outgoing_args_size */
4653 cfun
->machine
->frame
.callee_adjust
= const_fp_offset
;
4654 cfun
->machine
->frame
.final_adjust
4655 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
4659 /* Frame with large local area and outgoing arguments using frame pointer:
4660 sub sp, sp, hard_fp_offset
4661 stp x29, x30, [sp, 0]
4663 stp reg3, reg4, [sp, 16]
4664 sub sp, sp, outgoing_args_size */
4665 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
4666 cfun
->machine
->frame
.final_adjust
4667 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.initial_adjust
;
4670 cfun
->machine
->frame
.laid_out
= true;
4673 /* Return true if the register REGNO is saved on entry to
4674 the current function. */
4677 aarch64_register_saved_on_entry (int regno
)
4679 return cfun
->machine
->frame
.reg_offset
[regno
] >= 0;
/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  Returns LIMIT + 1 if no further callee-save exists.  */

static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}
4693 /* Push the register number REGNO of mode MODE to the stack with write-back
4694 adjusting the stack by ADJUSTMENT. */
4697 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
4698 HOST_WIDE_INT adjustment
)
4700 rtx base_rtx
= stack_pointer_rtx
;
4703 reg
= gen_rtx_REG (mode
, regno
);
4704 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
4705 plus_constant (Pmode
, base_rtx
, -adjustment
));
4706 mem
= gen_frame_mem (mode
, mem
);
4708 insn
= emit_move_insn (mem
, reg
);
4709 RTX_FRAME_RELATED_P (insn
) = 1;
4712 /* Generate and return an instruction to store the pair of registers
4713 REG and REG2 of mode MODE to location BASE with write-back adjusting
4714 the stack location BASE by ADJUSTMENT. */
4717 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
4718 HOST_WIDE_INT adjustment
)
4723 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
4724 GEN_INT (-adjustment
),
4725 GEN_INT (UNITS_PER_WORD
- adjustment
));
4727 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
4728 GEN_INT (-adjustment
),
4729 GEN_INT (UNITS_PER_WORD
- adjustment
));
4731 return gen_storewb_pairtf_di (base
, base
, reg
, reg2
,
4732 GEN_INT (-adjustment
),
4733 GEN_INT (UNITS_PER_VREG
- adjustment
));
4739 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4740 stack pointer by ADJUSTMENT. */
4743 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
4746 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno1
);
4748 if (regno2
== INVALID_REGNUM
)
4749 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
4751 rtx reg1
= gen_rtx_REG (mode
, regno1
);
4752 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4754 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
4756 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
4757 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
4758 RTX_FRAME_RELATED_P (insn
) = 1;
4761 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
4762 adjusting it by ADJUSTMENT afterwards. */
4765 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
4766 HOST_WIDE_INT adjustment
)
4771 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
4772 GEN_INT (UNITS_PER_WORD
));
4774 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
4775 GEN_INT (UNITS_PER_WORD
));
4777 return gen_loadwb_pairtf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
4778 GEN_INT (UNITS_PER_VREG
));
4784 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4785 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4789 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
4792 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno1
);
4793 rtx reg1
= gen_rtx_REG (mode
, regno1
);
4795 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
4797 if (regno2
== INVALID_REGNUM
)
4799 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
4800 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
4801 emit_move_insn (reg1
, gen_frame_mem (mode
, mem
));
4805 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4806 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
4807 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
4812 /* Generate and return a store pair instruction of mode MODE to store
4813 register REG1 to MEM1 and register REG2 to MEM2. */
4816 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
4822 return gen_store_pair_dw_didi (mem1
, reg1
, mem2
, reg2
);
4825 return gen_store_pair_dw_dfdf (mem1
, reg1
, mem2
, reg2
);
4828 return gen_store_pair_dw_tftf (mem1
, reg1
, mem2
, reg2
);
4835 /* Generate and regurn a load pair isntruction of mode MODE to load register
4836 REG1 from MEM1 and register REG2 from MEM2. */
4839 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
4845 return gen_load_pair_dw_didi (reg1
, mem1
, reg2
, mem2
);
4848 return gen_load_pair_dw_dfdf (reg1
, mem1
, reg2
, mem2
);
4851 return gen_load_pair_dw_tftf (reg1
, mem1
, reg2
, mem2
);
4858 /* Return TRUE if return address signing should be enabled for the current
4859 function, otherwise return FALSE. */
4862 aarch64_return_address_signing_enabled (void)
4864 /* This function should only be called after frame laid out. */
4865 gcc_assert (cfun
->machine
->frame
.laid_out
);
4867 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4868 if its LR is pushed onto stack. */
4869 return (aarch64_ra_sign_scope
== AARCH64_FUNCTION_ALL
4870 || (aarch64_ra_sign_scope
== AARCH64_FUNCTION_NON_LEAF
4871 && cfun
->machine
->frame
.reg_offset
[LR_REGNUM
] >= 0));
4874 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
4876 aarch64_bti_enabled (void)
4878 return (aarch64_enable_bti
== 1);
4881 /* Emit code to save the callee-saved registers from register number START
4882 to LIMIT to the stack at the location starting at offset START_OFFSET,
4883 skipping any write-back candidates if SKIP_WB is true. */
4886 aarch64_save_callee_saves (machine_mode mode
, poly_int64 start_offset
,
4887 unsigned start
, unsigned limit
, bool skip_wb
)
4893 for (regno
= aarch64_next_callee_save (start
, limit
);
4895 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
4902 && (regno
== cfun
->machine
->frame
.wb_candidate1
4903 || regno
== cfun
->machine
->frame
.wb_candidate2
))
4906 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
4909 reg
= gen_rtx_REG (mode
, regno
);
4910 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
4911 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
4914 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
4915 offset_diff
= cfun
->machine
->frame
.reg_offset
[regno2
]
4916 - cfun
->machine
->frame
.reg_offset
[regno
];
4919 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
4920 && known_eq (GET_MODE_SIZE (mode
), offset_diff
))
4922 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4925 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
4926 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
4928 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
4931 /* The first part of a frame-related parallel insn is
4932 always assumed to be relevant to the frame
4933 calculations; subsequent parts, are only
4934 frame-related if explicitly marked. */
4935 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
4939 insn
= emit_move_insn (mem
, reg
);
4941 RTX_FRAME_RELATED_P (insn
) = 1;
4945 /* Emit code to restore the callee registers of mode MODE from register
4946 number START up to and including LIMIT. Restore from the stack offset
4947 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4948 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4951 aarch64_restore_callee_saves (machine_mode mode
,
4952 poly_int64 start_offset
, unsigned start
,
4953 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
4955 rtx base_rtx
= stack_pointer_rtx
;
4960 for (regno
= aarch64_next_callee_save (start
, limit
);
4962 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
4964 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
4971 && (regno
== cfun
->machine
->frame
.wb_candidate1
4972 || regno
== cfun
->machine
->frame
.wb_candidate2
))
4975 reg
= gen_rtx_REG (mode
, regno
);
4976 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
4977 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
4979 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
4980 offset_diff
= cfun
->machine
->frame
.reg_offset
[regno2
]
4981 - cfun
->machine
->frame
.reg_offset
[regno
];
4984 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
4985 && known_eq (GET_MODE_SIZE (mode
), offset_diff
))
4987 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4990 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
4991 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
4992 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
4994 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
4998 emit_move_insn (reg
, mem
);
4999 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
5003 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5007 offset_4bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5009 HOST_WIDE_INT multiple
;
5010 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5011 && IN_RANGE (multiple
, -8, 7));
5014 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
5018 offset_6bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
5020 HOST_WIDE_INT multiple
;
5021 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5022 && IN_RANGE (multiple
, 0, 63));
5025 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5029 aarch64_offset_7bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5031 HOST_WIDE_INT multiple
;
5032 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5033 && IN_RANGE (multiple
, -64, 63));
5036 /* Return true if OFFSET is a signed 9-bit value. */
5039 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED
,
5042 HOST_WIDE_INT const_offset
;
5043 return (offset
.is_constant (&const_offset
)
5044 && IN_RANGE (const_offset
, -256, 255));
5047 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5051 offset_9bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5053 HOST_WIDE_INT multiple
;
5054 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5055 && IN_RANGE (multiple
, -256, 255));
5058 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5062 offset_12bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
5064 HOST_WIDE_INT multiple
;
5065 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5066 && IN_RANGE (multiple
, 0, 4095));
5069 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5072 aarch64_get_separate_components (void)
5074 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
5075 bitmap_clear (components
);
5077 /* The registers we need saved to the frame. */
5078 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
5079 if (aarch64_register_saved_on_entry (regno
))
5081 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5082 if (!frame_pointer_needed
)
5083 offset
+= cfun
->machine
->frame
.frame_size
5084 - cfun
->machine
->frame
.hard_fp_offset
;
5085 /* Check that we can access the stack slot of the register with one
5086 direct load with no adjustments needed. */
5087 if (offset_12bit_unsigned_scaled_p (DImode
, offset
))
5088 bitmap_set_bit (components
, regno
);
5091 /* Don't mess with the hard frame pointer. */
5092 if (frame_pointer_needed
)
5093 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
5095 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
5096 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
5097 /* If registers have been chosen to be stored/restored with
5098 writeback don't interfere with them to avoid having to output explicit
5099 stack adjustment instructions. */
5100 if (reg2
!= INVALID_REGNUM
)
5101 bitmap_clear_bit (components
, reg2
);
5102 if (reg1
!= INVALID_REGNUM
)
5103 bitmap_clear_bit (components
, reg1
);
5105 bitmap_clear_bit (components
, LR_REGNUM
);
5106 bitmap_clear_bit (components
, SP_REGNUM
);
5111 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5114 aarch64_components_for_bb (basic_block bb
)
5116 bitmap in
= DF_LIVE_IN (bb
);
5117 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
5118 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
5119 bool simd_function
= aarch64_simd_decl_p (cfun
->decl
);
5121 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
5122 bitmap_clear (components
);
5124 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5125 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
5126 if ((!call_used_regs
[regno
]
5127 || (simd_function
&& FP_SIMD_SAVED_REGNUM_P (regno
)))
5128 && (bitmap_bit_p (in
, regno
)
5129 || bitmap_bit_p (gen
, regno
)
5130 || bitmap_bit_p (kill
, regno
)))
5132 unsigned regno2
, offset
, offset2
;
5133 bitmap_set_bit (components
, regno
);
5135 /* If there is a callee-save at an adjacent offset, add it too
5136 to increase the use of LDP/STP. */
5137 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5138 regno2
= ((offset
& 8) == 0) ? regno
+ 1 : regno
- 1;
5140 if (regno2
<= LAST_SAVED_REGNUM
)
5142 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
5143 if ((offset
& ~8) == (offset2
& ~8))
5144 bitmap_set_bit (components
, regno2
);
5151 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5152 Nothing to do for aarch64. */
5155 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
5159 /* Return the next set bit in BMP from START onwards. Return the total number
5160 of bits in BMP if no set bit is found at or after START. */
5163 aarch64_get_next_set_bit (sbitmap bmp
, unsigned int start
)
5165 unsigned int nbits
= SBITMAP_SIZE (bmp
);
5169 gcc_assert (start
< nbits
);
5170 for (unsigned int i
= start
; i
< nbits
; i
++)
5171 if (bitmap_bit_p (bmp
, i
))
5177 /* Do the work for aarch64_emit_prologue_components and
5178 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5179 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5180 for these components or the epilogue sequence. That is, it determines
5181 whether we should emit stores or loads and what kind of CFA notes to attach
5182 to the insns. Otherwise the logic for the two sequences is very
5186 aarch64_process_components (sbitmap components
, bool prologue_p
)
5188 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
5189 ? HARD_FRAME_POINTER_REGNUM
5190 : STACK_POINTER_REGNUM
);
5192 unsigned last_regno
= SBITMAP_SIZE (components
);
5193 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
5194 rtx_insn
*insn
= NULL
;
5196 while (regno
!= last_regno
)
5198 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5199 so DFmode for the vector registers is enough. For simd functions
5200 we want to save the low 128 bits. */
5201 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno
);
5203 rtx reg
= gen_rtx_REG (mode
, regno
);
5204 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5205 if (!frame_pointer_needed
)
5206 offset
+= cfun
->machine
->frame
.frame_size
5207 - cfun
->machine
->frame
.hard_fp_offset
;
5208 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
5209 rtx mem
= gen_frame_mem (mode
, addr
);
5211 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
5212 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
5213 /* No more registers to handle after REGNO.
5214 Emit a single save/restore and exit. */
5215 if (regno2
== last_regno
)
5217 insn
= emit_insn (set
);
5218 RTX_FRAME_RELATED_P (insn
) = 1;
5220 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
5222 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
5226 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
5227 /* The next register is not of the same class or its offset is not
5228 mergeable with the current one into a pair. */
5229 if (!satisfies_constraint_Ump (mem
)
5230 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
5231 || (aarch64_simd_decl_p (cfun
->decl
) && FP_REGNUM_P (regno
))
5232 || maybe_ne ((offset2
- cfun
->machine
->frame
.reg_offset
[regno
]),
5233 GET_MODE_SIZE (mode
)))
5235 insn
= emit_insn (set
);
5236 RTX_FRAME_RELATED_P (insn
) = 1;
5238 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
5240 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
5246 /* REGNO2 can be saved/restored in a pair with REGNO. */
5247 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5248 if (!frame_pointer_needed
)
5249 offset2
+= cfun
->machine
->frame
.frame_size
5250 - cfun
->machine
->frame
.hard_fp_offset
;
5251 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
5252 rtx mem2
= gen_frame_mem (mode
, addr2
);
5253 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
5254 : gen_rtx_SET (reg2
, mem2
);
5257 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
, reg2
));
5259 insn
= emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
5261 RTX_FRAME_RELATED_P (insn
) = 1;
5264 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
5265 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
5269 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
5270 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
5273 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
5277 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5280 aarch64_emit_prologue_components (sbitmap components
)
5282 aarch64_process_components (components
, true);
5285 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5288 aarch64_emit_epilogue_components (sbitmap components
)
5290 aarch64_process_components (components
, false);
5293 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5296 aarch64_set_handled_components (sbitmap components
)
5298 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
5299 if (bitmap_bit_p (components
, regno
))
5300 cfun
->machine
->reg_is_wrapped_separately
[regno
] = true;
5303 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
5304 determining the probe offset for alloca. */
5306 static HOST_WIDE_INT
5307 aarch64_stack_clash_protection_alloca_probe_range (void)
5309 return STACK_CLASH_CALLER_GUARD
;
5313 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5314 registers. If POLY_SIZE is not large enough to require a probe this function
5315 will only adjust the stack. When allocating the stack space
5316 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5317 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5318 arguments. If we are then we ensure that any allocation larger than the ABI
5319 defined buffer needs a probe so that the invariant of having a 1KB buffer is
5322 We emit barriers after each stack adjustment to prevent optimizations from
5323 breaking the invariant that we never drop the stack more than a page. This
5324 invariant is needed to make it easier to correctly handle asynchronous
5325 events, e.g. if we were to allow the stack to be dropped by more than a page
5326 and then have multiple probes up and we take a signal somewhere in between
5327 then the signal handler doesn't know the state of the stack and can make no
5328 assumptions about which pages have been probed. */
5331 aarch64_allocate_and_probe_stack_space (rtx temp1
, rtx temp2
,
5332 poly_int64 poly_size
,
5333 bool frame_related_p
,
5334 bool final_adjustment_p
)
5336 HOST_WIDE_INT guard_size
5337 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
5338 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
5339 /* When doing the final adjustment for the outgoing argument size we can't
5340 assume that LR was saved at position 0. So subtract it's offset from the
5341 ABI safe buffer so that we don't accidentally allow an adjustment that
5342 would result in an allocation larger than the ABI buffer without
5344 HOST_WIDE_INT min_probe_threshold
5345 = final_adjustment_p
5346 ? guard_used_by_caller
- cfun
->machine
->frame
.reg_offset
[LR_REGNUM
]
5347 : guard_size
- guard_used_by_caller
;
5349 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
5351 /* We should always have a positive probe threshold. */
5352 gcc_assert (min_probe_threshold
> 0);
5354 if (flag_stack_clash_protection
&& !final_adjustment_p
)
5356 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
5357 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
5359 if (known_eq (frame_size
, 0))
5361 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME
, false);
5363 else if (known_lt (initial_adjust
, guard_size
- guard_used_by_caller
)
5364 && known_lt (final_adjust
, guard_used_by_caller
))
5366 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME
, true);
5370 /* If SIZE is not large enough to require probing, just adjust the stack and
5372 if (known_lt (poly_size
, min_probe_threshold
)
5373 || !flag_stack_clash_protection
)
5375 aarch64_sub_sp (temp1
, temp2
, poly_size
, frame_related_p
);
5380 /* Handle the SVE non-constant case first. */
5381 if (!poly_size
.is_constant (&size
))
5385 fprintf (dump_file
, "Stack clash SVE prologue: ");
5386 print_dec (poly_size
, dump_file
);
5387 fprintf (dump_file
, " bytes, dynamic probing will be required.\n");
5390 /* First calculate the amount of bytes we're actually spilling. */
5391 aarch64_add_offset (Pmode
, temp1
, CONST0_RTX (Pmode
),
5392 poly_size
, temp1
, temp2
, false, true);
5394 rtx_insn
*insn
= get_last_insn ();
5396 if (frame_related_p
)
5398 /* This is done to provide unwinding information for the stack
5399 adjustments we're about to do, however to prevent the optimizers
5400 from removing the R11 move and leaving the CFA note (which would be
5401 very wrong) we tie the old and new stack pointer together.
5402 The tie will expand to nothing but the optimizers will not touch
5404 rtx stack_ptr_copy
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
5405 emit_move_insn (stack_ptr_copy
, stack_pointer_rtx
);
5406 emit_insn (gen_stack_tie (stack_ptr_copy
, stack_pointer_rtx
));
5408 /* We want the CFA independent of the stack pointer for the
5409 duration of the loop. */
5410 add_reg_note (insn
, REG_CFA_DEF_CFA
, stack_ptr_copy
);
5411 RTX_FRAME_RELATED_P (insn
) = 1;
5414 rtx probe_const
= gen_int_mode (min_probe_threshold
, Pmode
);
5415 rtx guard_const
= gen_int_mode (guard_size
, Pmode
);
5417 insn
= emit_insn (gen_probe_sve_stack_clash (Pmode
, stack_pointer_rtx
,
5418 stack_pointer_rtx
, temp1
,
5419 probe_const
, guard_const
));
5421 /* Now reset the CFA register if needed. */
5422 if (frame_related_p
)
5424 add_reg_note (insn
, REG_CFA_DEF_CFA
,
5425 gen_rtx_PLUS (Pmode
, stack_pointer_rtx
,
5426 gen_int_mode (poly_size
, Pmode
)));
5427 RTX_FRAME_RELATED_P (insn
) = 1;
5435 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5436 " bytes, probing will be required.\n", size
);
5438 /* Round size to the nearest multiple of guard_size, and calculate the
5439 residual as the difference between the original size and the rounded
5441 HOST_WIDE_INT rounded_size
= ROUND_DOWN (size
, guard_size
);
5442 HOST_WIDE_INT residual
= size
- rounded_size
;
5444 /* We can handle a small number of allocations/probes inline. Otherwise
5446 if (rounded_size
<= STACK_CLASH_MAX_UNROLL_PAGES
* guard_size
)
5448 for (HOST_WIDE_INT i
= 0; i
< rounded_size
; i
+= guard_size
)
5450 aarch64_sub_sp (NULL
, temp2
, guard_size
, true);
5451 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
5452 guard_used_by_caller
));
5453 emit_insn (gen_blockage ());
5455 dump_stack_clash_frame_info (PROBE_INLINE
, size
!= rounded_size
);
5459 /* Compute the ending address. */
5460 aarch64_add_offset (Pmode
, temp1
, stack_pointer_rtx
, -rounded_size
,
5461 temp1
, NULL
, false, true);
5462 rtx_insn
*insn
= get_last_insn ();
5464 /* For the initial allocation, we don't have a frame pointer
5465 set up, so we always need CFI notes. If we're doing the
5466 final allocation, then we may have a frame pointer, in which
5467 case it is the CFA, otherwise we need CFI notes.
5469 We can determine which allocation we are doing by looking at
5470 the value of FRAME_RELATED_P since the final allocations are not
5472 if (frame_related_p
)
5474 /* We want the CFA independent of the stack pointer for the
5475 duration of the loop. */
5476 add_reg_note (insn
, REG_CFA_DEF_CFA
,
5477 plus_constant (Pmode
, temp1
, rounded_size
));
5478 RTX_FRAME_RELATED_P (insn
) = 1;
5481 /* This allocates and probes the stack. Note that this re-uses some of
5482 the existing Ada stack protection code. However we are guaranteed not
5483 to enter the non loop or residual branches of that code.
5485 The non-loop part won't be entered because if our allocation amount
5486 doesn't require a loop, the case above would handle it.
5488 The residual amount won't be entered because TEMP1 is a mutliple of
5489 the allocation size. The residual will always be 0. As such, the only
5490 part we are actually using from that code is the loop setup. The
5491 actual probing is done in aarch64_output_probe_stack_range. */
5492 insn
= emit_insn (gen_probe_stack_range (stack_pointer_rtx
,
5493 stack_pointer_rtx
, temp1
));
5495 /* Now reset the CFA register if needed. */
5496 if (frame_related_p
)
5498 add_reg_note (insn
, REG_CFA_DEF_CFA
,
5499 plus_constant (Pmode
, stack_pointer_rtx
, rounded_size
));
5500 RTX_FRAME_RELATED_P (insn
) = 1;
5503 emit_insn (gen_blockage ());
5504 dump_stack_clash_frame_info (PROBE_LOOP
, size
!= rounded_size
);
5507 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5508 be probed. This maintains the requirement that each page is probed at
5509 least once. For initial probing we probe only if the allocation is
5510 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5511 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5512 GUARD_SIZE. This works that for any allocation that is large enough to
5513 trigger a probe here, we'll have at least one, and if they're not large
5514 enough for this code to emit anything for them, The page would have been
5515 probed by the saving of FP/LR either by this function or any callees. If
5516 we don't have any callees then we won't have more stack adjustments and so
5520 HOST_WIDE_INT residual_probe_offset
= guard_used_by_caller
;
5521 /* If we're doing final adjustments, and we've done any full page
5522 allocations then any residual needs to be probed. */
5523 if (final_adjustment_p
&& rounded_size
!= 0)
5524 min_probe_threshold
= 0;
5525 /* If doing a small final adjustment, we always probe at offset 0.
5526 This is done to avoid issues when LR is not at position 0 or when
5527 the final adjustment is smaller than the probing offset. */
5528 else if (final_adjustment_p
&& rounded_size
== 0)
5529 residual_probe_offset
= 0;
5531 aarch64_sub_sp (temp1
, temp2
, residual
, frame_related_p
);
5532 if (residual
>= min_probe_threshold
)
5536 "Stack clash AArch64 prologue residuals: "
5537 HOST_WIDE_INT_PRINT_DEC
" bytes, probing will be required."
5540 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
5541 residual_probe_offset
));
5542 emit_insn (gen_blockage ());
5547 /* Return 1 if the register is used by the epilogue. We need to say the
5548 return register is used, but only after epilogue generation is complete.
5549 Note that in the case of sibcalls, the values "used by the epilogue" are
5550 considered live at the start of the called function.
5552 For SIMD functions we need to return 1 for FP registers that are saved and
5553 restored by a function but are not zero in call_used_regs. If we do not do
5554 this optimizations may remove the restore of the register. */
5557 aarch64_epilogue_uses (int regno
)
5559 if (epilogue_completed
)
5561 if (regno
== LR_REGNUM
)
5563 if (aarch64_simd_decl_p (cfun
->decl
) && FP_SIMD_SAVED_REGNUM_P (regno
))
5569 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5570 is saved at BASE + OFFSET. */
5573 aarch64_add_cfa_expression (rtx_insn
*insn
, unsigned int reg
,
5574 rtx base
, poly_int64 offset
)
5576 rtx mem
= gen_frame_mem (DImode
, plus_constant (Pmode
, base
, offset
));
5577 add_reg_note (insn
, REG_CFA_EXPRESSION
,
5578 gen_rtx_SET (mem
, regno_reg_rtx
[reg
]));
5581 /* AArch64 stack frames generated by this compiler look like:
5583 +-------------------------------+
5585 | incoming stack arguments |
5587 +-------------------------------+
5588 | | <-- incoming stack pointer (aligned)
5589 | callee-allocated save area |
5590 | for register varargs |
5592 +-------------------------------+
5593 | local variables | <-- frame_pointer_rtx
5595 +-------------------------------+
5597 +-------------------------------+ |
5598 | callee-saved registers | | frame.saved_regs_size
5599 +-------------------------------+ |
5601 +-------------------------------+ |
5602 | FP' | / <- hard_frame_pointer_rtx (aligned)
5603 +-------------------------------+
5604 | dynamic allocation |
5605 +-------------------------------+
5607 +-------------------------------+
5608 | outgoing stack arguments | <-- arg_pointer
5610 +-------------------------------+
5611 | | <-- stack_pointer_rtx (aligned)
5613 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5614 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5617 By default for stack-clash we assume the guard is at least 64KB, but this
5618 value is configurable to either 4KB or 64KB. We also force the guard size to
5619 be the same as the probing interval and both values are kept in sync.
5621 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5622 on the guard size) of stack space without probing.
5624 When probing is needed, we emit a probe at the start of the prologue
5625 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5627 We have to track how much space has been allocated and the only stores
5628 to the stack we track as implicit probes are the FP/LR stores.
5630 For outgoing arguments we probe if the size is larger than 1KB, such that
5631 the ABI specified buffer is maintained for the next callee.
5633 The following registers are reserved during frame layout and should not be
5634 used for any other purpose:
5636 - r11: Used by stack clash protection when SVE is enabled.
5637 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
5638 - r14 and r15: Used for speculation tracking.
5639 - r16(IP0), r17(IP1): Used by indirect tailcalls.
5640 - r30(LR), r29(FP): Used by standard frame layout.
5642 These registers must be avoided in frame layout related code unless the
5643 explicit intention is to interact with one of the features listed above. */
5645 /* Generate the prologue instructions for entry into a function.
5646 Establish the stack frame by decreasing the stack pointer with a
5647 properly calculated size and, if necessary, create a frame record
5648 filled with the values of LR and previous frame pointer. The
5649 current FP is also set up if it is in use. */
5652 aarch64_expand_prologue (void)
5654 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
5655 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
5656 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
5657 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
5658 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
5659 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
5660 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
5661 bool emit_frame_chain
= cfun
->machine
->frame
.emit_frame_chain
;
5664 /* Sign return address for functions. */
5665 if (aarch64_return_address_signing_enabled ())
5667 switch (aarch64_ra_sign_key
)
5670 insn
= emit_insn (gen_paciasp ());
5673 insn
= emit_insn (gen_pacibsp ());
5678 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
5679 RTX_FRAME_RELATED_P (insn
) = 1;
5682 if (flag_stack_usage_info
)
5683 current_function_static_stack_size
= constant_lower_bound (frame_size
);
5685 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
5687 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
5689 if (maybe_gt (frame_size
, PROBE_INTERVAL
)
5690 && maybe_gt (frame_size
, get_stack_check_protect ()))
5691 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5693 - get_stack_check_protect ()));
5695 else if (maybe_gt (frame_size
, 0))
5696 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size
);
5699 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
5700 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
5702 /* In theory we should never have both an initial adjustment
5703 and a callee save adjustment. Verify that is the case since the
5704 code below does not handle it for -fstack-clash-protection. */
5705 gcc_assert (known_eq (initial_adjust
, 0) || callee_adjust
== 0);
5707 /* Will only probe if the initial adjustment is larger than the guard
5708 less the amount of the guard reserved for use by the caller's
5710 aarch64_allocate_and_probe_stack_space (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
5713 if (callee_adjust
!= 0)
5714 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
5716 if (emit_frame_chain
)
5718 poly_int64 reg_offset
= callee_adjust
;
5719 if (callee_adjust
== 0)
5723 reg_offset
= callee_offset
;
5724 aarch64_save_callee_saves (DImode
, reg_offset
, reg1
, reg2
, false);
5726 aarch64_add_offset (Pmode
, hard_frame_pointer_rtx
,
5727 stack_pointer_rtx
, callee_offset
,
5728 tmp1_rtx
, tmp0_rtx
, frame_pointer_needed
);
5729 if (frame_pointer_needed
&& !frame_size
.is_constant ())
5731 /* Variable-sized frames need to describe the save slot
5732 address using DW_CFA_expression rather than DW_CFA_offset.
5733 This means that, without taking further action, the
5734 locations of the registers that we've already saved would
5735 remain based on the stack pointer even after we redefine
5736 the CFA based on the frame pointer. We therefore need new
5737 DW_CFA_expressions to re-express the save slots with addresses
5738 based on the frame pointer. */
5739 rtx_insn
*insn
= get_last_insn ();
5740 gcc_assert (RTX_FRAME_RELATED_P (insn
));
5742 /* Add an explicit CFA definition if this was previously
5744 if (!find_reg_note (insn
, REG_CFA_ADJUST_CFA
, NULL_RTX
))
5746 rtx src
= plus_constant (Pmode
, stack_pointer_rtx
,
5748 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
5749 gen_rtx_SET (hard_frame_pointer_rtx
, src
));
5752 /* Change the save slot expressions for the registers that
5753 we've already saved. */
5754 reg_offset
-= callee_offset
;
5755 aarch64_add_cfa_expression (insn
, reg2
, hard_frame_pointer_rtx
,
5756 reg_offset
+ UNITS_PER_WORD
);
5757 aarch64_add_cfa_expression (insn
, reg1
, hard_frame_pointer_rtx
,
5760 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
5763 aarch64_save_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
5764 callee_adjust
!= 0 || emit_frame_chain
);
5765 if (aarch64_simd_decl_p (cfun
->decl
))
5766 aarch64_save_callee_saves (TFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
5767 callee_adjust
!= 0 || emit_frame_chain
);
5769 aarch64_save_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
5770 callee_adjust
!= 0 || emit_frame_chain
);
5772 /* We may need to probe the final adjustment if it is larger than the guard
5773 that is assumed by the called. */
5774 aarch64_allocate_and_probe_stack_space (tmp1_rtx
, tmp0_rtx
, final_adjust
,
5775 !frame_pointer_needed
, true);
5778 /* Return TRUE if we can use a simple_return insn.
5780 This function checks whether the callee saved stack is empty, which
5781 means no restore actions are need. The pro_and_epilogue will use
5782 this to check whether shrink-wrapping opt is feasible. */
5785 aarch64_use_return_insn_p (void)
5787 if (!reload_completed
)
5793 return known_eq (cfun
->machine
->frame
.frame_size
, 0);
5796 /* Return false for non-leaf SIMD functions in order to avoid
5797 shrink-wrapping them. Doing this will lose the necessary
5798 save/restore of FP registers. */
5801 aarch64_use_simple_return_insn_p (void)
5803 if (aarch64_simd_decl_p (cfun
->decl
) && !crtl
->is_leaf
)
5809 /* Generate the epilogue instructions for returning from a function.
5810 This is almost exactly the reverse of the prolog sequence, except
5811 that we need to insert barriers to avoid scheduling loads that read
5812 from a deallocated stack, and we optimize the unwind records by
5813 emitting them all together if possible. */
5815 aarch64_expand_epilogue (bool for_sibcall
)
5817 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
5818 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
5819 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
5820 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
5821 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
5822 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
5825 /* A stack clash protection prologue may not have left EP0_REGNUM or
5826 EP1_REGNUM in a usable state. The same is true for allocations
5827 with an SVE component, since we then need both temporary registers
5828 for each allocation. For stack clash we are in a usable state if
5829 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5830 HOST_WIDE_INT guard_size
5831 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
5832 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
5834 /* We can re-use the registers when the allocation amount is smaller than
5835 guard_size - guard_used_by_caller because we won't be doing any probes
5836 then. In such situations the register should remain live with the correct
5838 bool can_inherit_p
= (initial_adjust
.is_constant ()
5839 && final_adjust
.is_constant ())
5840 && (!flag_stack_clash_protection
5841 || known_lt (initial_adjust
,
5842 guard_size
- guard_used_by_caller
));
5844 /* We need to add memory barrier to prevent read from deallocated stack. */
5846 = maybe_ne (get_frame_size ()
5847 + cfun
->machine
->frame
.saved_varargs_size
, 0);
5849 /* Emit a barrier to prevent loads from a deallocated stack. */
5850 if (maybe_gt (final_adjust
, crtl
->outgoing_args_size
)
5851 || cfun
->calls_alloca
5852 || crtl
->calls_eh_return
)
5854 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
5855 need_barrier_p
= false;
5858 /* Restore the stack pointer from the frame pointer if it may not
5859 be the same as the stack pointer. */
5860 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
5861 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
5862 if (frame_pointer_needed
5863 && (maybe_ne (final_adjust
, 0) || cfun
->calls_alloca
))
5864 /* If writeback is used when restoring callee-saves, the CFA
5865 is restored on the instruction doing the writeback. */
5866 aarch64_add_offset (Pmode
, stack_pointer_rtx
,
5867 hard_frame_pointer_rtx
, -callee_offset
,
5868 tmp1_rtx
, tmp0_rtx
, callee_adjust
== 0);
5870 /* The case where we need to re-use the register here is very rare, so
5871 avoid the complicated condition and just always emit a move if the
5872 immediate doesn't fit. */
5873 aarch64_add_sp (tmp1_rtx
, tmp0_rtx
, final_adjust
, true);
5875 aarch64_restore_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
5876 callee_adjust
!= 0, &cfi_ops
);
5877 if (aarch64_simd_decl_p (cfun
->decl
))
5878 aarch64_restore_callee_saves (TFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
5879 callee_adjust
!= 0, &cfi_ops
);
5881 aarch64_restore_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
5882 callee_adjust
!= 0, &cfi_ops
);
5885 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
5887 if (callee_adjust
!= 0)
5888 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
5890 if (callee_adjust
!= 0 || maybe_gt (initial_adjust
, 65536))
5892 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5893 insn
= get_last_insn ();
5894 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
5895 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
5896 RTX_FRAME_RELATED_P (insn
) = 1;
5900 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
5901 add restriction on emit_move optimization to leaf functions. */
5902 aarch64_add_sp (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
5903 (!can_inherit_p
|| !crtl
->is_leaf
5904 || df_regs_ever_live_p (EP0_REGNUM
)));
5908 /* Emit delayed restores and reset the CFA to be SP. */
5909 insn
= get_last_insn ();
5910 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
5911 REG_NOTES (insn
) = cfi_ops
;
5912 RTX_FRAME_RELATED_P (insn
) = 1;
5915 /* We prefer to emit the combined return/authenticate instruction RETAA,
5916 however there are three cases in which we must instead emit an explicit
5917 authentication instruction.
5919 1) Sibcalls don't return in a normal way, so if we're about to call one
5920 we must authenticate.
5922 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5923 generating code for !TARGET_ARMV8_3 we can't use it and must
5924 explicitly authenticate.
5926 3) On an eh_return path we make extra stack adjustments to update the
5927 canonical frame address to be the exception handler's CFA. We want
5928 to authenticate using the CFA of the function which calls eh_return.
5930 if (aarch64_return_address_signing_enabled ()
5931 && (for_sibcall
|| !TARGET_ARMV8_3
|| crtl
->calls_eh_return
))
5933 switch (aarch64_ra_sign_key
)
5936 insn
= emit_insn (gen_autiasp ());
5939 insn
= emit_insn (gen_autibsp ());
5944 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
5945 RTX_FRAME_RELATED_P (insn
) = 1;
5948 /* Stack adjustment for exception handler. */
5949 if (crtl
->calls_eh_return
&& !for_sibcall
)
5951 /* We need to unwind the stack by the offset computed by
5952 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5953 to be SP; letting the CFA move during this adjustment
5954 is just as correct as retaining the CFA from the body
5955 of the function. Therefore, do nothing special. */
5956 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
5959 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
5961 emit_jump_insn (ret_rtx
);
5964 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5965 normally or return to a previous frame after unwinding.
5967 An EH return uses a single shared return sequence. The epilogue is
5968 exactly like a normal epilogue except that it has an extra input
5969 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5970 that must be applied after the frame has been destroyed. An extra label
5971 is inserted before the epilogue which initializes this register to zero,
5972 and this is the entry point for a normal return.
5974 An actual EH return updates the return address, initializes the stack
5975 adjustment and jumps directly into the epilogue (bypassing the zeroing
5976 of the adjustment). Since the return address is typically saved on the
5977 stack when a function makes a call, the saved LR must be updated outside
5980 This poses problems as the store is generated well before the epilogue,
5981 so the offset of LR is not known yet. Also optimizations will remove the
5982 store as it appears dead, even after the epilogue is generated (as the
5983 base or offset for loading LR is different in many cases).
5985 To avoid these problems this implementation forces the frame pointer
5986 in eh_return functions so that the location of LR is fixed and known early.
5987 It also marks the store volatile, so no optimization is permitted to
5988 remove the store. */
5990 aarch64_eh_return_handler_rtx (void)
5992 rtx tmp
= gen_frame_mem (Pmode
,
5993 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
5995 /* Mark the store volatile, so no optimization is permitted to remove it. */
5996 MEM_VOLATILE_P (tmp
) = true;
6000 /* Output code to add DELTA to the first argument, and then jump
6001 to FUNCTION. Used for C++ multiple inheritance. */
6003 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
6004 HOST_WIDE_INT delta
,
6005 HOST_WIDE_INT vcall_offset
,
6008 /* The this pointer is always in x0. Note that this differs from
6009 Arm where the this pointer maybe bumped to r1 if r0 is required
6010 to return a pointer to an aggregate. On AArch64 a result value
6011 pointer will be in x8. */
6012 int this_regno
= R0_REGNUM
;
6013 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
6015 const char *fnname
= IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk
));
6017 if (aarch64_bti_enabled ())
6018 emit_insn (gen_bti_c());
6020 reload_completed
= 1;
6021 emit_note (NOTE_INSN_PROLOGUE_END
);
6023 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
6024 temp0
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
6025 temp1
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
6027 if (vcall_offset
== 0)
6028 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
, temp1
, temp0
, false);
6031 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
6036 if (delta
>= -256 && delta
< 256)
6037 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
6038 plus_constant (Pmode
, this_rtx
, delta
));
6040 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
,
6041 temp1
, temp0
, false);
6044 if (Pmode
== ptr_mode
)
6045 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
6047 aarch64_emit_move (temp0
,
6048 gen_rtx_ZERO_EXTEND (Pmode
,
6049 gen_rtx_MEM (ptr_mode
, addr
)));
6051 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
6052 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
6055 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
6057 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
6060 if (Pmode
== ptr_mode
)
6061 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
6063 aarch64_emit_move (temp1
,
6064 gen_rtx_SIGN_EXTEND (Pmode
,
6065 gen_rtx_MEM (ptr_mode
, addr
)));
6067 emit_insn (gen_add2_insn (this_rtx
, temp1
));
6070 /* Generate a tail call to the target function. */
6071 if (!TREE_USED (function
))
6073 assemble_external (function
);
6074 TREE_USED (function
) = 1;
6076 funexp
= XEXP (DECL_RTL (function
), 0);
6077 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
6078 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
6079 SIBLING_CALL_P (insn
) = 1;
6081 insn
= get_insns ();
6082 shorten_branches (insn
);
6084 assemble_start_function (thunk
, fnname
);
6085 final_start_function (insn
, file
, 1);
6086 final (insn
, file
, 1);
6087 final_end_function ();
6088 assemble_end_function (thunk
, fnname
);
6090 /* Stop pretending to be a post-reload pass. */
6091 reload_completed
= 0;
6095 aarch64_tls_referenced_p (rtx x
)
6097 if (!TARGET_HAVE_TLS
)
6099 subrtx_iterator::array_type array
;
6100 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
6102 const_rtx x
= *iter
;
6103 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
6105 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6106 TLS offsets, not real symbol references. */
6107 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
6108 iter
.skip_subrtxes ();
6114 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6115 a left shift of 0 or 12 bits. */
6117 aarch64_uimm12_shift (HOST_WIDE_INT val
)
6119 return ((val
& (((HOST_WIDE_INT
) 0xfff) << 0)) == val
6120 || (val
& (((HOST_WIDE_INT
) 0xfff) << 12)) == val
6124 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6125 that can be created with a left shift of 0 or 12. */
6126 static HOST_WIDE_INT
6127 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
)
6129 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6130 handle correctly. */
6131 gcc_assert ((val
& 0xffffff) == val
);
6133 if (((val
& 0xfff) << 0) == val
)
6136 return val
& (0xfff << 12);
6139 /* Return true if val is an immediate that can be loaded into a
6140 register by a MOVZ instruction. */
6142 aarch64_movw_imm (HOST_WIDE_INT val
, scalar_int_mode mode
)
6144 if (GET_MODE_SIZE (mode
) > 4)
6146 if ((val
& (((HOST_WIDE_INT
) 0xffff) << 32)) == val
6147 || (val
& (((HOST_WIDE_INT
) 0xffff) << 48)) == val
)
6152 /* Ignore sign extension. */
6153 val
&= (HOST_WIDE_INT
) 0xffffffff;
6155 return ((val
& (((HOST_WIDE_INT
) 0xffff) << 0)) == val
6156 || (val
& (((HOST_WIDE_INT
) 0xffff) << 16)) == val
);
6159 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6160 64-bit (DImode) integer. */
6162 static unsigned HOST_WIDE_INT
6163 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val
, machine_mode mode
)
6165 unsigned int size
= GET_MODE_UNIT_PRECISION (mode
);
6168 val
&= (HOST_WIDE_INT_1U
<< size
) - 1;
6175 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6177 static const unsigned HOST_WIDE_INT bitmask_imm_mul
[] =
6179 0x0000000100000001ull
,
6180 0x0001000100010001ull
,
6181 0x0101010101010101ull
,
6182 0x1111111111111111ull
,
6183 0x5555555555555555ull
,
6187 /* Return true if val is a valid bitmask immediate. */
6190 aarch64_bitmask_imm (HOST_WIDE_INT val_in
, machine_mode mode
)
6192 unsigned HOST_WIDE_INT val
, tmp
, mask
, first_one
, next_one
;
6195 /* Check for a single sequence of one bits and return quickly if so.
6196 The special cases of all ones and all zeroes returns false. */
6197 val
= aarch64_replicate_bitmask_imm (val_in
, mode
);
6198 tmp
= val
+ (val
& -val
);
6200 if (tmp
== (tmp
& -tmp
))
6201 return (val
+ 1) > 1;
6203 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6205 val
= (val
<< 32) | (val
& 0xffffffff);
6207 /* Invert if the immediate doesn't start with a zero bit - this means we
6208 only need to search for sequences of one bits. */
6212 /* Find the first set bit and set tmp to val with the first sequence of one
6213 bits removed. Return success if there is a single sequence of ones. */
6214 first_one
= val
& -val
;
6215 tmp
= val
& (val
+ first_one
);
6220 /* Find the next set bit and compute the difference in bit position. */
6221 next_one
= tmp
& -tmp
;
6222 bits
= clz_hwi (first_one
) - clz_hwi (next_one
);
6225 /* Check the bit position difference is a power of 2, and that the first
6226 sequence of one bits fits within 'bits' bits. */
6227 if ((mask
>> bits
) != 0 || bits
!= (bits
& -bits
))
6230 /* Check the sequence of one bits is repeated 64/bits times. */
6231 return val
== mask
* bitmask_imm_mul
[__builtin_clz (bits
) - 26];
6234 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6235 Assumed precondition: VAL_IN Is not zero. */
6237 unsigned HOST_WIDE_INT
6238 aarch64_and_split_imm1 (HOST_WIDE_INT val_in
)
6240 int lowest_bit_set
= ctz_hwi (val_in
);
6241 int highest_bit_set
= floor_log2 (val_in
);
6242 gcc_assert (val_in
!= 0);
6244 return ((HOST_WIDE_INT_UC (2) << highest_bit_set
) -
6245 (HOST_WIDE_INT_1U
<< lowest_bit_set
));
6248 /* Create constant where bits outside of lowest bit set to highest bit set
6251 unsigned HOST_WIDE_INT
6252 aarch64_and_split_imm2 (HOST_WIDE_INT val_in
)
6254 return val_in
| ~aarch64_and_split_imm1 (val_in
);
6257 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6260 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in
, machine_mode mode
)
6262 scalar_int_mode int_mode
;
6263 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
6266 if (aarch64_bitmask_imm (val_in
, int_mode
))
6269 if (aarch64_move_imm (val_in
, int_mode
))
6272 unsigned HOST_WIDE_INT imm2
= aarch64_and_split_imm2 (val_in
);
6274 return aarch64_bitmask_imm (imm2
, int_mode
);
6277 /* Return true if val is an immediate that can be loaded into a
6278 register in a single instruction. */
6280 aarch64_move_imm (HOST_WIDE_INT val
, machine_mode mode
)
6282 scalar_int_mode int_mode
;
6283 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
6286 if (aarch64_movw_imm (val
, int_mode
) || aarch64_movw_imm (~val
, int_mode
))
6288 return aarch64_bitmask_imm (val
, int_mode
);
6292 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
6296 if (GET_CODE (x
) == HIGH
)
6299 /* There's no way to calculate VL-based values using relocations. */
6300 subrtx_iterator::array_type array
;
6301 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
6302 if (GET_CODE (*iter
) == CONST_POLY_INT
)
6305 split_const (x
, &base
, &offset
);
6306 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
6308 if (aarch64_classify_symbol (base
, INTVAL (offset
))
6309 != SYMBOL_FORCE_TO_MEM
)
6312 /* Avoid generating a 64-bit relocation in ILP32; leave
6313 to aarch64_expand_mov_immediate to handle it properly. */
6314 return mode
!= ptr_mode
;
6317 return aarch64_tls_referenced_p (x
);
6320 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6321 The expansion for a table switch is quite expensive due to the number
6322 of instructions, the table lookup and hard to predict indirect jump.
6323 When optimizing for speed, and -O3 enabled, use the per-core tuning if
6324 set, otherwise use tables for > 16 cases as a tradeoff between size and
6325 performance. When optimizing for size, use the default setting. */
6328 aarch64_case_values_threshold (void)
6330 /* Use the specified limit for the number of cases before using jump
6331 tables at higher optimization levels. */
6333 && selected_cpu
->tune
->max_case_values
!= 0)
6334 return selected_cpu
->tune
->max_case_values
;
6336 return optimize_size
? default_case_values_threshold () : 17;
6339 /* Return true if register REGNO is a valid index register.
6340 STRICT_P is true if REG_OK_STRICT is in effect. */
6343 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
6345 if (!HARD_REGISTER_NUM_P (regno
))
6353 regno
= reg_renumber
[regno
];
6355 return GP_REGNUM_P (regno
);
6358 /* Return true if register REGNO is a valid base register for mode MODE.
6359 STRICT_P is true if REG_OK_STRICT is in effect. */
6362 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
6364 if (!HARD_REGISTER_NUM_P (regno
))
6372 regno
= reg_renumber
[regno
];
6375 /* The fake registers will be eliminated to either the stack or
6376 hard frame pointer, both of which are usually valid base registers.
6377 Reload deals with the cases where the eliminated form isn't valid. */
6378 return (GP_REGNUM_P (regno
)
6379 || regno
== SP_REGNUM
6380 || regno
== FRAME_POINTER_REGNUM
6381 || regno
== ARG_POINTER_REGNUM
);
6384 /* Return true if X is a valid base register for mode MODE.
6385 STRICT_P is true if REG_OK_STRICT is in effect. */
6388 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
6391 && GET_CODE (x
) == SUBREG
6392 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (x
))])
6395 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
6398 /* Return true if address offset is a valid index. If it is, fill in INFO
6399 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6402 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
6403 machine_mode mode
, bool strict_p
)
6405 enum aarch64_address_type type
;
6410 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
6411 && GET_MODE (x
) == Pmode
)
6413 type
= ADDRESS_REG_REG
;
6417 /* (sign_extend:DI (reg:SI)) */
6418 else if ((GET_CODE (x
) == SIGN_EXTEND
6419 || GET_CODE (x
) == ZERO_EXTEND
)
6420 && GET_MODE (x
) == DImode
6421 && GET_MODE (XEXP (x
, 0)) == SImode
)
6423 type
= (GET_CODE (x
) == SIGN_EXTEND
)
6424 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6425 index
= XEXP (x
, 0);
6428 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6429 else if (GET_CODE (x
) == MULT
6430 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
6431 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
6432 && GET_MODE (XEXP (x
, 0)) == DImode
6433 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
6434 && CONST_INT_P (XEXP (x
, 1)))
6436 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
6437 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6438 index
= XEXP (XEXP (x
, 0), 0);
6439 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
6441 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6442 else if (GET_CODE (x
) == ASHIFT
6443 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
6444 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
6445 && GET_MODE (XEXP (x
, 0)) == DImode
6446 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
6447 && CONST_INT_P (XEXP (x
, 1)))
6449 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
6450 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6451 index
= XEXP (XEXP (x
, 0), 0);
6452 shift
= INTVAL (XEXP (x
, 1));
6454 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6455 else if ((GET_CODE (x
) == SIGN_EXTRACT
6456 || GET_CODE (x
) == ZERO_EXTRACT
)
6457 && GET_MODE (x
) == DImode
6458 && GET_CODE (XEXP (x
, 0)) == MULT
6459 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
6460 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
6462 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
6463 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6464 index
= XEXP (XEXP (x
, 0), 0);
6465 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
6466 if (INTVAL (XEXP (x
, 1)) != 32 + shift
6467 || INTVAL (XEXP (x
, 2)) != 0)
6470 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6471 (const_int 0xffffffff<<shift)) */
6472 else if (GET_CODE (x
) == AND
6473 && GET_MODE (x
) == DImode
6474 && GET_CODE (XEXP (x
, 0)) == MULT
6475 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
6476 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
6477 && CONST_INT_P (XEXP (x
, 1)))
6479 type
= ADDRESS_REG_UXTW
;
6480 index
= XEXP (XEXP (x
, 0), 0);
6481 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
6482 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
6485 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6486 else if ((GET_CODE (x
) == SIGN_EXTRACT
6487 || GET_CODE (x
) == ZERO_EXTRACT
)
6488 && GET_MODE (x
) == DImode
6489 && GET_CODE (XEXP (x
, 0)) == ASHIFT
6490 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
6491 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
6493 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
6494 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6495 index
= XEXP (XEXP (x
, 0), 0);
6496 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
6497 if (INTVAL (XEXP (x
, 1)) != 32 + shift
6498 || INTVAL (XEXP (x
, 2)) != 0)
6501 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6502 (const_int 0xffffffff<<shift)) */
6503 else if (GET_CODE (x
) == AND
6504 && GET_MODE (x
) == DImode
6505 && GET_CODE (XEXP (x
, 0)) == ASHIFT
6506 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
6507 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
6508 && CONST_INT_P (XEXP (x
, 1)))
6510 type
= ADDRESS_REG_UXTW
;
6511 index
= XEXP (XEXP (x
, 0), 0);
6512 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
6513 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
6516 /* (mult:P (reg:P) (const_int scale)) */
6517 else if (GET_CODE (x
) == MULT
6518 && GET_MODE (x
) == Pmode
6519 && GET_MODE (XEXP (x
, 0)) == Pmode
6520 && CONST_INT_P (XEXP (x
, 1)))
6522 type
= ADDRESS_REG_REG
;
6523 index
= XEXP (x
, 0);
6524 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
6526 /* (ashift:P (reg:P) (const_int shift)) */
6527 else if (GET_CODE (x
) == ASHIFT
6528 && GET_MODE (x
) == Pmode
6529 && GET_MODE (XEXP (x
, 0)) == Pmode
6530 && CONST_INT_P (XEXP (x
, 1)))
6532 type
= ADDRESS_REG_REG
;
6533 index
= XEXP (x
, 0);
6534 shift
= INTVAL (XEXP (x
, 1));
6540 && GET_CODE (index
) == SUBREG
6541 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
6542 index
= SUBREG_REG (index
);
6544 if (aarch64_sve_data_mode_p (mode
))
6546 if (type
!= ADDRESS_REG_REG
6547 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
6553 && !(IN_RANGE (shift
, 1, 3)
6554 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
6559 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
6562 info
->offset
= index
;
6563 info
->shift
= shift
;
6570 /* Return true if MODE is one of the modes for which we
6571 support LDP/STP operations. */
6574 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
6576 return mode
== SImode
|| mode
== DImode
6577 || mode
== SFmode
|| mode
== DFmode
6578 || (aarch64_vector_mode_supported_p (mode
)
6579 && (known_eq (GET_MODE_SIZE (mode
), 8)
6580 || (known_eq (GET_MODE_SIZE (mode
), 16)
6581 && (aarch64_tune_params
.extra_tuning_flags
6582 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
) == 0)));
6585 /* Return true if REGNO is a virtual pointer register, or an eliminable
6586 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6587 include stack_pointer or hard_frame_pointer. */
6589 virt_or_elim_regno_p (unsigned regno
)
6591 return ((regno
>= FIRST_VIRTUAL_REGISTER
6592 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
6593 || regno
== FRAME_POINTER_REGNUM
6594 || regno
== ARG_POINTER_REGNUM
);
6597 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6598 If it is, fill in INFO appropriately. STRICT_P is true if
6599 REG_OK_STRICT is in effect. */
6602 aarch64_classify_address (struct aarch64_address_info
*info
,
6603 rtx x
, machine_mode mode
, bool strict_p
,
6604 aarch64_addr_query_type type
)
6606 enum rtx_code code
= GET_CODE (x
);
6610 HOST_WIDE_INT const_size
;
6612 /* On BE, we use load/store pair for all large int mode load/stores.
6613 TI/TFmode may also use a load/store pair. */
6614 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
6615 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
6616 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
6617 || type
== ADDR_QUERY_LDP_STP_N
6620 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
6622 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
6623 corresponds to the actual size of the memory being loaded/stored and the
6624 mode of the corresponding addressing mode is half of that. */
6625 if (type
== ADDR_QUERY_LDP_STP_N
6626 && known_eq (GET_MODE_SIZE (mode
), 16))
6629 bool allow_reg_index_p
= (!load_store_pair_p
6630 && (known_lt (GET_MODE_SIZE (mode
), 16)
6631 || vec_flags
== VEC_ADVSIMD
6632 || vec_flags
& VEC_SVE_DATA
));
6634 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6635 [Rn, #offset, MUL VL]. */
6636 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
6637 && (code
!= REG
&& code
!= PLUS
))
6640 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6642 if (advsimd_struct_p
6643 && !BYTES_BIG_ENDIAN
6644 && (code
!= POST_INC
&& code
!= REG
))
6647 gcc_checking_assert (GET_MODE (x
) == VOIDmode
6648 || SCALAR_INT_MODE_P (GET_MODE (x
)));
6654 info
->type
= ADDRESS_REG_IMM
;
6656 info
->offset
= const0_rtx
;
6657 info
->const_offset
= 0;
6658 return aarch64_base_register_rtx_p (x
, strict_p
);
6666 && virt_or_elim_regno_p (REGNO (op0
))
6667 && poly_int_rtx_p (op1
, &offset
))
6669 info
->type
= ADDRESS_REG_IMM
;
6672 info
->const_offset
= offset
;
6677 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
6678 && aarch64_base_register_rtx_p (op0
, strict_p
)
6679 && poly_int_rtx_p (op1
, &offset
))
6681 info
->type
= ADDRESS_REG_IMM
;
6684 info
->const_offset
= offset
;
6686 /* TImode and TFmode values are allowed in both pairs of X
6687 registers and individual Q registers. The available
6689 X,X: 7-bit signed scaled offset
6690 Q: 9-bit signed offset
6691 We conservatively require an offset representable in either mode.
6692 When performing the check for pairs of X registers i.e. LDP/STP
6693 pass down DImode since that is the natural size of the LDP/STP
6694 instruction memory accesses. */
6695 if (mode
== TImode
|| mode
== TFmode
)
6696 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
6697 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
6698 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
6700 /* A 7bit offset check because OImode will emit a ldp/stp
6701 instruction (only big endian will get here).
6702 For ldp/stp instructions, the offset is scaled for the size of a
6703 single element of the pair. */
6705 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
6707 /* Three 9/12 bit offsets checks because CImode will emit three
6708 ldr/str instructions (only big endian will get here). */
6710 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
6711 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode
,
6713 || offset_12bit_unsigned_scaled_p (V16QImode
,
6716 /* Two 7bit offsets checks because XImode will emit two ldp/stp
6717 instructions (only big endian will get here). */
6719 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
6720 && aarch64_offset_7bit_signed_scaled_p (TImode
,
6723 /* Make "m" use the LD1 offset range for SVE data modes, so
6724 that pre-RTL optimizers like ivopts will work to that
6725 instead of the wider LDR/STR range. */
6726 if (vec_flags
== VEC_SVE_DATA
)
6727 return (type
== ADDR_QUERY_M
6728 ? offset_4bit_signed_scaled_p (mode
, offset
)
6729 : offset_9bit_signed_scaled_p (mode
, offset
));
6731 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
6733 poly_int64 end_offset
= (offset
6734 + GET_MODE_SIZE (mode
)
6735 - BYTES_PER_SVE_VECTOR
);
6736 return (type
== ADDR_QUERY_M
6737 ? offset_4bit_signed_scaled_p (mode
, offset
)
6738 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
6739 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
6743 if (vec_flags
== VEC_SVE_PRED
)
6744 return offset_9bit_signed_scaled_p (mode
, offset
);
6746 if (load_store_pair_p
)
6747 return ((known_eq (GET_MODE_SIZE (mode
), 4)
6748 || known_eq (GET_MODE_SIZE (mode
), 8)
6749 || known_eq (GET_MODE_SIZE (mode
), 16))
6750 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
6752 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
6753 || offset_12bit_unsigned_scaled_p (mode
, offset
));
6756 if (allow_reg_index_p
)
6758 /* Look for base + (scaled/extended) index register. */
6759 if (aarch64_base_register_rtx_p (op0
, strict_p
)
6760 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
6765 if (aarch64_base_register_rtx_p (op1
, strict_p
)
6766 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
6779 info
->type
= ADDRESS_REG_WB
;
6780 info
->base
= XEXP (x
, 0);
6781 info
->offset
= NULL_RTX
;
6782 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
6786 info
->type
= ADDRESS_REG_WB
;
6787 info
->base
= XEXP (x
, 0);
6788 if (GET_CODE (XEXP (x
, 1)) == PLUS
6789 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
6790 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
6791 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
6793 info
->offset
= XEXP (XEXP (x
, 1), 1);
6794 info
->const_offset
= offset
;
6796 /* TImode and TFmode values are allowed in both pairs of X
6797 registers and individual Q registers. The available
6799 X,X: 7-bit signed scaled offset
6800 Q: 9-bit signed offset
6801 We conservatively require an offset representable in either mode.
6803 if (mode
== TImode
|| mode
== TFmode
)
6804 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
6805 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
6807 if (load_store_pair_p
)
6808 return ((known_eq (GET_MODE_SIZE (mode
), 4)
6809 || known_eq (GET_MODE_SIZE (mode
), 8)
6810 || known_eq (GET_MODE_SIZE (mode
), 16))
6811 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
6813 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
6820 /* load literal: pc-relative constant pool entry. Only supported
6821 for SI mode or larger. */
6822 info
->type
= ADDRESS_SYMBOLIC
;
6824 if (!load_store_pair_p
6825 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
6830 split_const (x
, &sym
, &addend
);
6831 return ((GET_CODE (sym
) == LABEL_REF
6832 || (GET_CODE (sym
) == SYMBOL_REF
6833 && CONSTANT_POOL_ADDRESS_P (sym
)
6834 && aarch64_pcrelative_literal_loads
)));
6839 info
->type
= ADDRESS_LO_SUM
;
6840 info
->base
= XEXP (x
, 0);
6841 info
->offset
= XEXP (x
, 1);
6842 if (allow_reg_index_p
6843 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
6846 split_const (info
->offset
, &sym
, &offs
);
6847 if (GET_CODE (sym
) == SYMBOL_REF
6848 && (aarch64_classify_symbol (sym
, INTVAL (offs
))
6849 == SYMBOL_SMALL_ABSOLUTE
))
6851 /* The symbol and offset must be aligned to the access size. */
6854 if (CONSTANT_POOL_ADDRESS_P (sym
))
6855 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
6856 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
6858 tree exp
= SYMBOL_REF_DECL (sym
);
6859 align
= TYPE_ALIGN (TREE_TYPE (exp
));
6860 align
= aarch64_constant_alignment (exp
, align
);
6862 else if (SYMBOL_REF_DECL (sym
))
6863 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
6864 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
6865 && SYMBOL_REF_BLOCK (sym
) != NULL
)
6866 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
6868 align
= BITS_PER_UNIT
;
6870 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
6871 if (known_eq (ref_size
, 0))
6872 ref_size
= GET_MODE_SIZE (DImode
);
6874 return (multiple_p (INTVAL (offs
), ref_size
)
6875 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
6885 /* Return true if the address X is valid for a PRFM instruction.
6886 STRICT_P is true if we should do strict checking with
6887 aarch64_classify_address. */
6890 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
6892 struct aarch64_address_info addr
;
6894 /* PRFM accepts the same addresses as DImode... */
6895 bool res
= aarch64_classify_address (&addr
, x
, DImode
, strict_p
);
6899 /* ... except writeback forms. */
6900 return addr
.type
!= ADDRESS_REG_WB
;
6904 aarch64_symbolic_address_p (rtx x
)
6908 split_const (x
, &x
, &offset
);
6909 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
6912 /* Classify the base of symbolic expression X. */
6914 enum aarch64_symbol_type
6915 aarch64_classify_symbolic_expression (rtx x
)
6919 split_const (x
, &x
, &offset
);
6920 return aarch64_classify_symbol (x
, INTVAL (offset
));
6924 /* Return TRUE if X is a legitimate address for accessing memory in
6927 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
6929 struct aarch64_address_info addr
;
6931 return aarch64_classify_address (&addr
, x
, mode
, strict_p
);
6934 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6935 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6937 aarch64_legitimate_address_p (machine_mode mode
, rtx x
, bool strict_p
,
6938 aarch64_addr_query_type type
)
6940 struct aarch64_address_info addr
;
6942 return aarch64_classify_address (&addr
, x
, mode
, strict_p
, type
);
6945 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6948 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
6949 poly_int64 orig_offset
,
6953 if (GET_MODE_SIZE (mode
).is_constant (&size
))
6955 HOST_WIDE_INT const_offset
, second_offset
;
6957 /* A general SVE offset is A * VQ + B. Remove the A component from
6958 coefficient 0 in order to get the constant B. */
6959 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
6961 /* Split an out-of-range address displacement into a base and
6962 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6963 range otherwise to increase opportunities for sharing the base
6964 address of different sizes. Unaligned accesses use the signed
6965 9-bit range, TImode/TFmode use the intersection of signed
6966 scaled 7-bit and signed 9-bit offset. */
6967 if (mode
== TImode
|| mode
== TFmode
)
6968 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
6969 else if ((const_offset
& (size
- 1)) != 0)
6970 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
6972 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
6974 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
6977 /* Split the offset into second_offset and the rest. */
6978 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
6979 *offset2
= gen_int_mode (second_offset
, Pmode
);
6984 /* Get the mode we should use as the basis of the range. For structure
6985 modes this is the mode of one vector. */
6986 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
6987 machine_mode step_mode
6988 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
6990 /* Get the "mul vl" multiplier we'd like to use. */
6991 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
6992 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
6993 if (vec_flags
& VEC_SVE_DATA
)
6994 /* LDR supports a 9-bit range, but the move patterns for
6995 structure modes require all vectors to be in range of the
6996 same base. The simplest way of accomodating that while still
6997 promoting reuse of anchor points between different modes is
6998 to use an 8-bit range unconditionally. */
6999 vnum
= ((vnum
+ 128) & 255) - 128;
7001 /* Predicates are only handled singly, so we might as well use
7003 vnum
= ((vnum
+ 256) & 511) - 256;
7007 /* Convert the "mul vl" multiplier into a byte offset. */
7008 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
7009 if (known_eq (second_offset
, orig_offset
))
7012 /* Split the offset into second_offset and the rest. */
7013 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
7014 *offset2
= gen_int_mode (second_offset
, Pmode
);
7019 /* Return the binary representation of floating point constant VALUE in INTVAL.
7020 If the value cannot be converted, return false without setting INTVAL.
7021 The conversion is done in the given MODE. */
7023 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
7026 /* We make a general exception for 0. */
7027 if (aarch64_float_const_zero_rtx_p (value
))
7033 scalar_float_mode mode
;
7034 if (GET_CODE (value
) != CONST_DOUBLE
7035 || !is_a
<scalar_float_mode
> (GET_MODE (value
), &mode
)
7036 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
7037 /* Only support up to DF mode. */
7038 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
7041 unsigned HOST_WIDE_INT ival
= 0;
7044 real_to_target (res
,
7045 CONST_DOUBLE_REAL_VALUE (value
),
7046 REAL_MODE_FORMAT (mode
));
7050 int order
= BYTES_BIG_ENDIAN
? 1 : 0;
7051 ival
= zext_hwi (res
[order
], 32);
7052 ival
|= (zext_hwi (res
[1 - order
], 32) << 32);
7055 ival
= zext_hwi (res
[0], 32);
7061 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7062 single MOV(+MOVK) followed by an FMOV. */
7064 aarch64_float_const_rtx_p (rtx x
)
7066 machine_mode mode
= GET_MODE (x
);
7067 if (mode
== VOIDmode
)
7070 /* Determine whether it's cheaper to write float constants as
7071 mov/movk pairs over ldr/adrp pairs. */
7072 unsigned HOST_WIDE_INT ival
;
7074 if (GET_CODE (x
) == CONST_DOUBLE
7075 && SCALAR_FLOAT_MODE_P (mode
)
7076 && aarch64_reinterpret_float_as_int (x
, &ival
))
7078 scalar_int_mode imode
= (mode
== HFmode
7080 : int_mode_for_mode (mode
).require ());
7081 int num_instr
= aarch64_internal_mov_immediate
7082 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
7083 return num_instr
< 3;
7089 /* Return TRUE if rtx X is immediate constant 0.0 */
7091 aarch64_float_const_zero_rtx_p (rtx x
)
7093 if (GET_MODE (x
) == VOIDmode
)
7096 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
7097 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
7098 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
7101 /* Return TRUE if rtx X is immediate constant that fits in a single
7102 MOVI immediate operation. */
7104 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
7110 scalar_int_mode imode
;
7111 unsigned HOST_WIDE_INT ival
;
7113 if (GET_CODE (x
) == CONST_DOUBLE
7114 && SCALAR_FLOAT_MODE_P (mode
))
7116 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
7119 /* We make a general exception for 0. */
7120 if (aarch64_float_const_zero_rtx_p (x
))
7123 imode
= int_mode_for_mode (mode
).require ();
7125 else if (GET_CODE (x
) == CONST_INT
7126 && is_a
<scalar_int_mode
> (mode
, &imode
))
7131 /* use a 64 bit mode for everything except for DI/DF mode, where we use
7132 a 128 bit vector mode. */
7133 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
7135 vmode
= aarch64_simd_container_mode (imode
, width
);
7136 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
7138 return aarch64_simd_valid_immediate (v_op
, NULL
);
7142 /* Return the fixed registers used for condition codes. */
7145 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
7148 *p2
= INVALID_REGNUM
;
7152 /* This function is used by the call expanders of the machine description.
7153 RESULT is the register in which the result is returned. It's NULL for
7154 "call" and "sibcall".
7155 MEM is the location of the function call.
7156 SIBCALL indicates whether this function call is normal call or sibling call.
7157 It will generate different pattern accordingly. */
7160 aarch64_expand_call (rtx result
, rtx mem
, bool sibcall
)
7162 rtx call
, callee
, tmp
;
7166 gcc_assert (MEM_P (mem
));
7167 callee
= XEXP (mem
, 0);
7168 mode
= GET_MODE (callee
);
7169 gcc_assert (mode
== Pmode
);
7171 /* Decide if we should generate indirect calls by loading the
7172 address of the callee into a register before performing
7173 the branch-and-link. */
7174 if (SYMBOL_REF_P (callee
)
7175 ? (aarch64_is_long_call_p (callee
)
7176 || aarch64_is_noplt_call_p (callee
))
7178 XEXP (mem
, 0) = force_reg (mode
, callee
);
7180 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
7182 if (result
!= NULL_RTX
)
7183 call
= gen_rtx_SET (result
, call
);
7188 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
7190 vec
= gen_rtvec (2, call
, tmp
);
7191 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
7193 aarch64_emit_call_insn (call
);
7196 /* Emit call insn with PAT and do aarch64-specific handling. */
7199 aarch64_emit_call_insn (rtx pat
)
7201 rtx insn
= emit_call_insn (pat
);
7203 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
7204 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
7205 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
7209 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
7211 machine_mode mode_x
= GET_MODE (x
);
7212 rtx_code code_x
= GET_CODE (x
);
7214 /* All floating point compares return CCFP if it is an equality
7215 comparison, and CCFPE otherwise. */
7216 if (GET_MODE_CLASS (mode_x
) == MODE_FLOAT
)
7243 /* Equality comparisons of short modes against zero can be performed
7244 using the TST instruction with the appropriate bitmask. */
7245 if (y
== const0_rtx
&& (REG_P (x
) || SUBREG_P (x
))
7246 && (code
== EQ
|| code
== NE
)
7247 && (mode_x
== HImode
|| mode_x
== QImode
))
7250 /* Similarly, comparisons of zero_extends from shorter modes can
7251 be performed using an ANDS with an immediate mask. */
7252 if (y
== const0_rtx
&& code_x
== ZERO_EXTEND
7253 && (mode_x
== SImode
|| mode_x
== DImode
)
7254 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
7255 && (code
== EQ
|| code
== NE
))
7258 if ((mode_x
== SImode
|| mode_x
== DImode
)
7260 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
7261 && (code_x
== PLUS
|| code_x
== MINUS
|| code_x
== AND
7263 || (code_x
== ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
7264 && CONST_INT_P (XEXP (x
, 2)))))
7267 /* A compare with a shifted operand. Because of canonicalization,
7268 the comparison will have to be swapped when we emit the assembly
7270 if ((mode_x
== SImode
|| mode_x
== DImode
)
7271 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
7272 && (code_x
== ASHIFT
|| code_x
== ASHIFTRT
7273 || code_x
== LSHIFTRT
7274 || code_x
== ZERO_EXTEND
|| code_x
== SIGN_EXTEND
))
7277 /* Similarly for a negated operand, but we can only do this for
7279 if ((mode_x
== SImode
|| mode_x
== DImode
)
7280 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
7281 && (code
== EQ
|| code
== NE
)
7285 /* A test for unsigned overflow from an addition. */
7286 if ((mode_x
== DImode
|| mode_x
== TImode
)
7287 && (code
== LTU
|| code
== GEU
)
7289 && rtx_equal_p (XEXP (x
, 0), y
))
7292 /* A test for unsigned overflow from an add with carry. */
7293 if ((mode_x
== DImode
|| mode_x
== TImode
)
7294 && (code
== LTU
|| code
== GEU
)
7296 && CONST_SCALAR_INT_P (y
)
7297 && (rtx_mode_t (y
, mode_x
)
7298 == (wi::shwi (1, mode_x
)
7299 << (GET_MODE_BITSIZE (mode_x
).to_constant () / 2))))
7302 /* A test for signed overflow. */
7303 if ((mode_x
== DImode
|| mode_x
== TImode
)
7306 && GET_CODE (y
) == SIGN_EXTEND
)
7309 /* For everything else, return CCmode. */
7314 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
7317 aarch64_get_condition_code (rtx x
)
7319 machine_mode mode
= GET_MODE (XEXP (x
, 0));
7320 enum rtx_code comp_code
= GET_CODE (x
);
7322 if (GET_MODE_CLASS (mode
) != MODE_CC
)
7323 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
7324 return aarch64_get_condition_code_1 (mode
, comp_code
);
7328 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
7336 case GE
: return AARCH64_GE
;
7337 case GT
: return AARCH64_GT
;
7338 case LE
: return AARCH64_LS
;
7339 case LT
: return AARCH64_MI
;
7340 case NE
: return AARCH64_NE
;
7341 case EQ
: return AARCH64_EQ
;
7342 case ORDERED
: return AARCH64_VC
;
7343 case UNORDERED
: return AARCH64_VS
;
7344 case UNLT
: return AARCH64_LT
;
7345 case UNLE
: return AARCH64_LE
;
7346 case UNGT
: return AARCH64_HI
;
7347 case UNGE
: return AARCH64_PL
;
7355 case NE
: return AARCH64_NE
;
7356 case EQ
: return AARCH64_EQ
;
7357 case GE
: return AARCH64_GE
;
7358 case GT
: return AARCH64_GT
;
7359 case LE
: return AARCH64_LE
;
7360 case LT
: return AARCH64_LT
;
7361 case GEU
: return AARCH64_CS
;
7362 case GTU
: return AARCH64_HI
;
7363 case LEU
: return AARCH64_LS
;
7364 case LTU
: return AARCH64_CC
;
7372 case NE
: return AARCH64_NE
;
7373 case EQ
: return AARCH64_EQ
;
7374 case GE
: return AARCH64_LE
;
7375 case GT
: return AARCH64_LT
;
7376 case LE
: return AARCH64_GE
;
7377 case LT
: return AARCH64_GT
;
7378 case GEU
: return AARCH64_LS
;
7379 case GTU
: return AARCH64_CC
;
7380 case LEU
: return AARCH64_CS
;
7381 case LTU
: return AARCH64_HI
;
7389 case NE
: return AARCH64_NE
;
7390 case EQ
: return AARCH64_EQ
;
7391 case GE
: return AARCH64_PL
;
7392 case LT
: return AARCH64_MI
;
7400 case NE
: return AARCH64_NE
;
7401 case EQ
: return AARCH64_EQ
;
7409 case LTU
: return AARCH64_CS
;
7410 case GEU
: return AARCH64_CC
;
7418 case GEU
: return AARCH64_CS
;
7419 case LTU
: return AARCH64_CC
;
7427 case NE
: return AARCH64_VS
;
7428 case EQ
: return AARCH64_VC
;
7441 aarch64_const_vec_all_same_in_range_p (rtx x
,
7442 HOST_WIDE_INT minval
,
7443 HOST_WIDE_INT maxval
)
7446 return (const_vec_duplicate_p (x
, &elt
)
7447 && CONST_INT_P (elt
)
7448 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
7452 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
7454 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
7457 /* Return true if VEC is a constant in which every element is in the range
7458 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7461 aarch64_const_vec_all_in_range_p (rtx vec
,
7462 HOST_WIDE_INT minval
,
7463 HOST_WIDE_INT maxval
)
7465 if (GET_CODE (vec
) != CONST_VECTOR
7466 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
7470 if (!CONST_VECTOR_STEPPED_P (vec
))
7471 nunits
= const_vector_encoded_nelts (vec
);
7472 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
7475 for (int i
= 0; i
< nunits
; i
++)
7477 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
7478 if (!CONST_INT_P (vec_elem
)
7479 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
7486 #define AARCH64_CC_V 1
7487 #define AARCH64_CC_C (1 << 1)
7488 #define AARCH64_CC_Z (1 << 2)
7489 #define AARCH64_CC_N (1 << 3)
7491 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7492 static const int aarch64_nzcv_codes
[] =
7494 0, /* EQ, Z == 1. */
7495 AARCH64_CC_Z
, /* NE, Z == 0. */
7496 0, /* CS, C == 1. */
7497 AARCH64_CC_C
, /* CC, C == 0. */
7498 0, /* MI, N == 1. */
7499 AARCH64_CC_N
, /* PL, N == 0. */
7500 0, /* VS, V == 1. */
7501 AARCH64_CC_V
, /* VC, V == 0. */
7502 0, /* HI, C ==1 && Z == 0. */
7503 AARCH64_CC_C
, /* LS, !(C == 1 && Z == 0). */
7504 AARCH64_CC_V
, /* GE, N == V. */
7505 0, /* LT, N != V. */
7506 AARCH64_CC_Z
, /* GT, Z == 0 && N == V. */
7507 0, /* LE, !(Z == 0 && N == V). */
7512 /* Print floating-point vector immediate operand X to F, negating it
7513 first if NEGATE is true. Return true on success, false if it isn't
7514 a constant we can handle. */
7517 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
7521 if (!const_vec_duplicate_p (x
, &elt
))
7524 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
7526 r
= real_value_negate (&r
);
7528 /* We only handle the SVE single-bit immediates here. */
7529 if (real_equal (&r
, &dconst0
))
7530 asm_fprintf (f
, "0.0");
7531 else if (real_equal (&r
, &dconst1
))
7532 asm_fprintf (f
, "1.0");
7533 else if (real_equal (&r
, &dconsthalf
))
7534 asm_fprintf (f
, "0.5");
7541 /* Return the equivalent letter for size. */
7543 sizetochar (int size
)
7547 case 64: return 'd';
7548 case 32: return 's';
7549 case 16: return 'h';
7550 case 8 : return 'b';
7551 default: gcc_unreachable ();
7555 /* Print operand X to file F in a target specific manner according to CODE.
7556 The acceptable formatting commands given by CODE are:
7557 'c': An integer or symbol address without a preceding #
7559 'C': Take the duplicated element in a vector constant
7560 and print it in hex.
7561 'D': Take the duplicated element in a vector constant
7562 and print it as an unsigned integer, in decimal.
7563 'e': Print the sign/zero-extend size as a character 8->b,
7565 'p': Prints N such that 2^N == X (X must be power of 2 and
7567 'P': Print the number of non-zero bits in X (a const_int).
7568 'H': Print the higher numbered register of a pair (TImode)
7570 'm': Print a condition (eq, ne, etc).
7571 'M': Same as 'm', but invert condition.
7572 'N': Take the duplicated element in a vector constant
7573 and print the negative of it in decimal.
7574 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7575 'S/T/U/V': Print a FP/SIMD register name for a register list.
7576 The register printed is the FP/SIMD register name
7577 of X + 0/1/2/3 for S/T/U/V.
7578 'R': Print a scalar FP/SIMD register name + 1.
7579 'X': Print bottom 16 bits of integer constant in hex.
7580 'w/x': Print a general register name or the zero register
7582 '0': Print a normal operand, if it's a general register,
7583 then we assume DImode.
7584 'k': Print NZCV for conditional compare instructions.
7585 'A': Output address constant representing the first
7586 argument of X, specifying a relocation offset
7588 'L': Output constant address specified by X
7589 with a relocation offset if appropriate.
7590 'G': Prints address of X, specifying a PC relative
7591 relocation mode if appropriate.
7592 'y': Output address of LDP or STP - this is used for
7593 some LDP/STPs which don't use a PARALLEL in their
7594 pattern (so the mode needs to be adjusted).
7595 'z': Output address of a typical LDP or STP. */
7598 aarch64_print_operand (FILE *f
, rtx x
, int code
)
7604 switch (GET_CODE (x
))
7607 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
7611 output_addr_const (f
, x
);
7615 if (GET_CODE (XEXP (x
, 0)) == PLUS
7616 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
7618 output_addr_const (f
, x
);
7624 output_operand_lossage ("unsupported operand for code '%c'", code
);
7632 if (!CONST_INT_P (x
)
7633 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
7635 output_operand_lossage ("invalid operand for '%%%c'", code
);
7651 output_operand_lossage ("invalid operand for '%%%c'", code
);
7661 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
7663 output_operand_lossage ("invalid operand for '%%%c'", code
);
7667 asm_fprintf (f
, "%d", n
);
7672 if (!CONST_INT_P (x
))
7674 output_operand_lossage ("invalid operand for '%%%c'", code
);
7678 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
7682 if (x
== const0_rtx
)
7684 asm_fprintf (f
, "xzr");
7688 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
7690 output_operand_lossage ("invalid operand for '%%%c'", code
);
7694 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
7701 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7702 if (x
== const_true_rtx
)
7709 if (!COMPARISON_P (x
))
7711 output_operand_lossage ("invalid operand for '%%%c'", code
);
7715 cond_code
= aarch64_get_condition_code (x
);
7716 gcc_assert (cond_code
>= 0);
7718 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
7719 fputs (aarch64_condition_codes
[cond_code
], f
);
7724 if (!const_vec_duplicate_p (x
, &elt
))
7726 output_operand_lossage ("invalid vector constant");
7730 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
7731 asm_fprintf (f
, "%wd", -INTVAL (elt
));
7732 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
7733 && aarch64_print_vector_float_operand (f
, x
, true))
7737 output_operand_lossage ("invalid vector constant");
7747 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
7749 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
7752 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
7759 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
7761 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
7764 asm_fprintf (f
, "%c%d",
7765 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
7766 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
7770 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
7772 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
7775 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
7779 if (!CONST_INT_P (x
))
7781 output_operand_lossage ("invalid operand for '%%%c'", code
);
7784 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
7789 /* Print a replicated constant in hex. */
7790 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
7792 output_operand_lossage ("invalid operand for '%%%c'", code
);
7795 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
7796 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
7802 /* Print a replicated constant in decimal, treating it as
7804 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
7806 output_operand_lossage ("invalid operand for '%%%c'", code
);
7809 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
7810 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
7817 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
7819 asm_fprintf (f
, "%czr", code
);
7823 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
7825 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
7829 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
7831 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
7840 output_operand_lossage ("missing operand");
7844 switch (GET_CODE (x
))
7847 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
7849 if (REG_NREGS (x
) == 1)
7850 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
7854 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
7855 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
7856 REGNO (x
) - V0_REGNUM
, suffix
,
7857 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
7861 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
7865 output_address (GET_MODE (x
), XEXP (x
, 0));
7870 output_addr_const (asm_out_file
, x
);
7874 asm_fprintf (f
, "%wd", INTVAL (x
));
7878 if (!VECTOR_MODE_P (GET_MODE (x
)))
7880 output_addr_const (asm_out_file
, x
);
7886 if (!const_vec_duplicate_p (x
, &elt
))
7888 output_operand_lossage ("invalid vector constant");
7892 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
7893 asm_fprintf (f
, "%wd", INTVAL (elt
));
7894 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
7895 && aarch64_print_vector_float_operand (f
, x
, false))
7899 output_operand_lossage ("invalid vector constant");
7905 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7906 be getting CONST_DOUBLEs holding integers. */
7907 gcc_assert (GET_MODE (x
) != VOIDmode
);
7908 if (aarch64_float_const_zero_rtx_p (x
))
7913 else if (aarch64_float_const_representable_p (x
))
7916 char float_buf
[buf_size
] = {'\0'};
7917 real_to_decimal_for_mode (float_buf
,
7918 CONST_DOUBLE_REAL_VALUE (x
),
7921 asm_fprintf (asm_out_file
, "%s", float_buf
);
7925 output_operand_lossage ("invalid constant");
7928 output_operand_lossage ("invalid operand");
7934 if (GET_CODE (x
) == HIGH
)
7937 switch (aarch64_classify_symbolic_expression (x
))
7939 case SYMBOL_SMALL_GOT_4G
:
7940 asm_fprintf (asm_out_file
, ":got:");
7943 case SYMBOL_SMALL_TLSGD
:
7944 asm_fprintf (asm_out_file
, ":tlsgd:");
7947 case SYMBOL_SMALL_TLSDESC
:
7948 asm_fprintf (asm_out_file
, ":tlsdesc:");
7951 case SYMBOL_SMALL_TLSIE
:
7952 asm_fprintf (asm_out_file
, ":gottprel:");
7955 case SYMBOL_TLSLE24
:
7956 asm_fprintf (asm_out_file
, ":tprel:");
7959 case SYMBOL_TINY_GOT
:
7966 output_addr_const (asm_out_file
, x
);
7970 switch (aarch64_classify_symbolic_expression (x
))
7972 case SYMBOL_SMALL_GOT_4G
:
7973 asm_fprintf (asm_out_file
, ":lo12:");
7976 case SYMBOL_SMALL_TLSGD
:
7977 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
7980 case SYMBOL_SMALL_TLSDESC
:
7981 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
7984 case SYMBOL_SMALL_TLSIE
:
7985 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
7988 case SYMBOL_TLSLE12
:
7989 asm_fprintf (asm_out_file
, ":tprel_lo12:");
7992 case SYMBOL_TLSLE24
:
7993 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
7996 case SYMBOL_TINY_GOT
:
7997 asm_fprintf (asm_out_file
, ":got:");
8000 case SYMBOL_TINY_TLSIE
:
8001 asm_fprintf (asm_out_file
, ":gottprel:");
8007 output_addr_const (asm_out_file
, x
);
8011 switch (aarch64_classify_symbolic_expression (x
))
8013 case SYMBOL_TLSLE24
:
8014 asm_fprintf (asm_out_file
, ":tprel_hi12:");
8019 output_addr_const (asm_out_file
, x
);
8024 HOST_WIDE_INT cond_code
;
8026 if (!CONST_INT_P (x
))
8028 output_operand_lossage ("invalid operand for '%%%c'", code
);
8032 cond_code
= INTVAL (x
);
8033 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
8034 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
8041 machine_mode mode
= GET_MODE (x
);
8043 if (GET_CODE (x
) != MEM
8044 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
8046 output_operand_lossage ("invalid operand for '%%%c'", code
);
8050 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
8052 ? ADDR_QUERY_LDP_STP_N
8053 : ADDR_QUERY_LDP_STP
))
8054 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
8059 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
8064 /* Print address 'x' of a memory access with mode 'mode'.
8065 'op' is the context required by aarch64_classify_address. It can either be
8066 MEM for a normal memory access or PARALLEL for LDP/STP. */
8068 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
8069 aarch64_addr_query_type type
)
8071 struct aarch64_address_info addr
;
8074 /* Check all addresses are Pmode - including ILP32. */
8075 if (GET_MODE (x
) != Pmode
8076 && (!CONST_INT_P (x
)
8077 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
8079 output_operand_lossage ("invalid address mode");
8083 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
8086 case ADDRESS_REG_IMM
:
8087 if (known_eq (addr
.const_offset
, 0))
8088 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
8089 else if (aarch64_sve_data_mode_p (mode
))
8092 = exact_div (addr
.const_offset
,
8093 BYTES_PER_SVE_VECTOR
).to_constant ();
8094 asm_fprintf (f
, "[%s, #%wd, mul vl]",
8095 reg_names
[REGNO (addr
.base
)], vnum
);
8097 else if (aarch64_sve_pred_mode_p (mode
))
8100 = exact_div (addr
.const_offset
,
8101 BYTES_PER_SVE_PRED
).to_constant ();
8102 asm_fprintf (f
, "[%s, #%wd, mul vl]",
8103 reg_names
[REGNO (addr
.base
)], vnum
);
8106 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
8107 INTVAL (addr
.offset
));
8110 case ADDRESS_REG_REG
:
8111 if (addr
.shift
== 0)
8112 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
8113 reg_names
[REGNO (addr
.offset
)]);
8115 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
8116 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
8119 case ADDRESS_REG_UXTW
:
8120 if (addr
.shift
== 0)
8121 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
8122 REGNO (addr
.offset
) - R0_REGNUM
);
8124 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
8125 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
8128 case ADDRESS_REG_SXTW
:
8129 if (addr
.shift
== 0)
8130 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
8131 REGNO (addr
.offset
) - R0_REGNUM
);
8133 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
8134 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
8137 case ADDRESS_REG_WB
:
8138 /* Writeback is only supported for fixed-width modes. */
8139 size
= GET_MODE_SIZE (mode
).to_constant ();
8140 switch (GET_CODE (x
))
8143 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
8146 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
8149 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
8152 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
8155 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
8156 INTVAL (addr
.offset
));
8159 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
8160 INTVAL (addr
.offset
));
8167 case ADDRESS_LO_SUM
:
8168 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
8169 output_addr_const (f
, addr
.offset
);
8170 asm_fprintf (f
, "]");
8173 case ADDRESS_SYMBOLIC
:
8174 output_addr_const (f
, x
);
8181 /* Print address 'x' of a memory access with mode 'mode'. */
8183 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
8185 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
8186 output_addr_const (f
, x
);
8190 aarch64_label_mentioned_p (rtx x
)
8195 if (GET_CODE (x
) == LABEL_REF
)
8198 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8199 referencing instruction, but they are constant offsets, not
8201 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
8204 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
8205 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
8211 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
8212 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
8215 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
8222 /* Implement REGNO_REG_CLASS. */
8225 aarch64_regno_regclass (unsigned regno
)
8227 if (GP_REGNUM_P (regno
))
8228 return GENERAL_REGS
;
8230 if (regno
== SP_REGNUM
)
8233 if (regno
== FRAME_POINTER_REGNUM
8234 || regno
== ARG_POINTER_REGNUM
)
8235 return POINTER_REGS
;
8237 if (FP_REGNUM_P (regno
))
8238 return FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
;
8240 if (PR_REGNUM_P (regno
))
8241 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
8246 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8247 If OFFSET is out of range, return an offset of an anchor point
8248 that is in range. Return 0 otherwise. */
8250 static HOST_WIDE_INT
8251 aarch64_anchor_offset (HOST_WIDE_INT offset
, HOST_WIDE_INT size
,
8254 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8256 return (offset
+ 0x400) & ~0x7f0;
8258 /* For offsets that aren't a multiple of the access size, the limit is
8260 if (offset
& (size
- 1))
8262 /* BLKmode typically uses LDP of X-registers. */
8263 if (mode
== BLKmode
)
8264 return (offset
+ 512) & ~0x3ff;
8265 return (offset
+ 0x100) & ~0x1ff;
8268 /* Small negative offsets are supported. */
8269 if (IN_RANGE (offset
, -256, 0))
8272 if (mode
== TImode
|| mode
== TFmode
)
8273 return (offset
+ 0x100) & ~0x1ff;
8275 /* Use 12-bit offset by access size. */
8276 return offset
& (~0xfff * size
);
8280 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
8282 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8283 where mask is selected by alignment and size of the offset.
8284 We try to pick as large a range for the offset as possible to
8285 maximize the chance of a CSE. However, for aligned addresses
8286 we limit the range to 4k so that structures with different sized
8287 elements are likely to use the same base. We need to be careful
8288 not to split a CONST for some forms of address expression, otherwise
8289 it will generate sub-optimal code. */
8291 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
8293 rtx base
= XEXP (x
, 0);
8294 rtx offset_rtx
= XEXP (x
, 1);
8295 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
8297 if (GET_CODE (base
) == PLUS
)
8299 rtx op0
= XEXP (base
, 0);
8300 rtx op1
= XEXP (base
, 1);
8302 /* Force any scaling into a temp for CSE. */
8303 op0
= force_reg (Pmode
, op0
);
8304 op1
= force_reg (Pmode
, op1
);
8306 /* Let the pointer register be in op0. */
8307 if (REG_POINTER (op1
))
8308 std::swap (op0
, op1
);
8310 /* If the pointer is virtual or frame related, then we know that
8311 virtual register instantiation or register elimination is going
8312 to apply a second constant. We want the two constants folded
8313 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8314 if (virt_or_elim_regno_p (REGNO (op0
)))
8316 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
8317 NULL_RTX
, true, OPTAB_DIRECT
);
8318 return gen_rtx_PLUS (Pmode
, base
, op1
);
8321 /* Otherwise, in order to encourage CSE (and thence loop strength
8322 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8323 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
8324 NULL_RTX
, true, OPTAB_DIRECT
);
8325 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
8329 if (GET_MODE_SIZE (mode
).is_constant (&size
))
8331 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
8333 if (base_offset
!= 0)
8335 base
= plus_constant (Pmode
, base
, base_offset
);
8336 base
= force_operand (base
, NULL_RTX
);
8337 return plus_constant (Pmode
, base
, offset
- base_offset
);
8346 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
8349 secondary_reload_info
*sri
)
8351 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8352 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8353 comment at the head of aarch64-sve.md for more details about the
8354 big-endian handling. */
8355 if (BYTES_BIG_ENDIAN
8356 && reg_class_subset_p (rclass
, FP_REGS
)
8357 && !((REG_P (x
) && HARD_REGISTER_P (x
))
8358 || aarch64_simd_valid_immediate (x
, NULL
))
8359 && aarch64_sve_data_mode_p (mode
))
8361 sri
->icode
= CODE_FOR_aarch64_sve_reload_be
;
8365 /* If we have to disable direct literal pool loads and stores because the
8366 function is too big, then we need a scratch register. */
8367 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
8368 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
8369 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
8370 && !aarch64_pcrelative_literal_loads
)
8372 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
8376 /* Without the TARGET_SIMD instructions we cannot move a Q register
8377 to a Q register directly. We need a scratch. */
8378 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
8379 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
8380 && reg_class_subset_p (rclass
, FP_REGS
))
8382 sri
->icode
= code_for_aarch64_reload_mov (mode
);
8386 /* A TFmode or TImode memory access should be handled via an FP_REGS
8387 because AArch64 has richer addressing modes for LDR/STR instructions
8388 than LDP/STP instructions. */
8389 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
8390 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
8393 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
8394 return GENERAL_REGS
;
8400 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED
, const int to
)
8402 gcc_assert (from
== ARG_POINTER_REGNUM
|| from
== FRAME_POINTER_REGNUM
);
8404 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8405 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8406 if (frame_pointer_needed
)
8407 return to
== HARD_FRAME_POINTER_REGNUM
;
8412 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
8414 if (to
== HARD_FRAME_POINTER_REGNUM
)
8416 if (from
== ARG_POINTER_REGNUM
)
8417 return cfun
->machine
->frame
.hard_fp_offset
;
8419 if (from
== FRAME_POINTER_REGNUM
)
8420 return cfun
->machine
->frame
.hard_fp_offset
8421 - cfun
->machine
->frame
.locals_offset
;
8424 if (to
== STACK_POINTER_REGNUM
)
8426 if (from
== FRAME_POINTER_REGNUM
)
8427 return cfun
->machine
->frame
.frame_size
8428 - cfun
->machine
->frame
.locals_offset
;
8431 return cfun
->machine
->frame
.frame_size
;
8434 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8438 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
8442 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
8447 aarch64_asm_trampoline_template (FILE *f
)
8452 if (aarch64_bti_enabled ())
8454 asm_fprintf (f
, "\thint\t34 // bti c\n");
8461 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", IP1_REGNUM
- R0_REGNUM
, offset1
);
8462 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
,
8467 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[IP1_REGNUM
], offset1
);
8468 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[STATIC_CHAIN_REGNUM
],
8471 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
8473 /* The trampoline needs an extra padding instruction. In case if BTI is
8474 enabled the padding instruction is replaced by the BTI instruction at
8476 if (!aarch64_bti_enabled ())
8477 assemble_aligned_integer (4, const0_rtx
);
8479 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
8480 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
8484 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
8486 rtx fnaddr
, mem
, a_tramp
;
8487 const int tramp_code_sz
= 16;
8489 /* Don't need to copy the trailing D-words, we fill those in below. */
8490 emit_block_move (m_tramp
, assemble_trampoline_template (),
8491 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
8492 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
8493 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
8494 if (GET_MODE (fnaddr
) != ptr_mode
)
8495 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
8496 emit_move_insn (mem
, fnaddr
);
8498 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
8499 emit_move_insn (mem
, chain_value
);
8501 /* XXX We should really define a "clear_cache" pattern and use
8502 gen_clear_cache(). */
8503 a_tramp
= XEXP (m_tramp
, 0);
8504 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
8505 LCT_NORMAL
, VOIDmode
, a_tramp
, ptr_mode
,
8506 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
8510 static unsigned char
8511 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
8513 /* ??? Logically we should only need to provide a value when
8514 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8515 can hold MODE, but at the moment we need to handle all modes.
8516 Just ignore any runtime parts for registers that can't store them. */
8517 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
8521 case TAILCALL_ADDR_REGS
:
8525 case POINTER_AND_FP_REGS
:
8528 if (aarch64_sve_data_mode_p (mode
)
8529 && constant_multiple_p (GET_MODE_SIZE (mode
),
8530 BYTES_PER_SVE_VECTOR
, &nregs
))
8532 return (aarch64_vector_data_mode_p (mode
)
8533 ? CEIL (lowest_size
, UNITS_PER_VREG
)
8534 : CEIL (lowest_size
, UNITS_PER_WORD
));
8551 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
8553 if (regclass
== POINTER_REGS
)
8554 return GENERAL_REGS
;
8556 if (regclass
== STACK_REG
)
8559 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
8565 /* Register eliminiation can result in a request for
8566 SP+constant->FP_REGS. We cannot support such operations which
8567 use SP as source and an FP_REG as destination, so reject out
8569 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
8571 rtx lhs
= XEXP (x
, 0);
8573 /* Look through a possible SUBREG introduced by ILP32. */
8574 if (GET_CODE (lhs
) == SUBREG
)
8575 lhs
= SUBREG_REG (lhs
);
8577 gcc_assert (REG_P (lhs
));
8578 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
8587 aarch64_asm_output_labelref (FILE* f
, const char *name
)
8589 asm_fprintf (f
, "%U%s", name
);
8593 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
8595 if (priority
== DEFAULT_INIT_PRIORITY
)
8596 default_ctor_section_asm_out_constructor (symbol
, priority
);
8600 /* While priority is known to be in range [0, 65535], so 18 bytes
8601 would be enough, the compiler might not know that. To avoid
8602 -Wformat-truncation false positive, use a larger size. */
8604 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
8605 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
8606 switch_to_section (s
);
8607 assemble_align (POINTER_SIZE
);
8608 assemble_aligned_integer (POINTER_BYTES
, symbol
);
8613 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
8615 if (priority
== DEFAULT_INIT_PRIORITY
)
8616 default_dtor_section_asm_out_destructor (symbol
, priority
);
8620 /* While priority is known to be in range [0, 65535], so 18 bytes
8621 would be enough, the compiler might not know that. To avoid
8622 -Wformat-truncation false positive, use a larger size. */
8624 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
8625 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
8626 switch_to_section (s
);
8627 assemble_align (POINTER_SIZE
);
8628 assemble_aligned_integer (POINTER_BYTES
, symbol
);
8633 aarch64_output_casesi (rtx
*operands
)
8637 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
8639 static const char *const patterns
[4][2] =
8642 "ldrb\t%w3, [%0,%w1,uxtw]",
8643 "add\t%3, %4, %w3, sxtb #2"
8646 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8647 "add\t%3, %4, %w3, sxth #2"
8650 "ldr\t%w3, [%0,%w1,uxtw #2]",
8651 "add\t%3, %4, %w3, sxtw #2"
8653 /* We assume that DImode is only generated when not optimizing and
8654 that we don't really need 64-bit address offsets. That would
8655 imply an object file with 8GB of code in a single function! */
8657 "ldr\t%w3, [%0,%w1,uxtw #2]",
8658 "add\t%3, %4, %w3, sxtw #2"
8662 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
8664 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
8665 index
= exact_log2 (GET_MODE_SIZE (mode
));
8667 gcc_assert (index
>= 0 && index
<= 3);
8669 /* Need to implement table size reduction, by chaning the code below. */
8670 output_asm_insn (patterns
[index
][0], operands
);
8671 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
8672 snprintf (buf
, sizeof (buf
),
8673 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
8674 output_asm_insn (buf
, operands
);
8675 output_asm_insn (patterns
[index
][1], operands
);
8676 output_asm_insn ("br\t%3", operands
);
8677 assemble_label (asm_out_file
, label
);
8682 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8683 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8687 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
8689 if (shift
>= 0 && shift
<= 3)
8692 for (size
= 8; size
<= 32; size
*= 2)
8694 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
8695 if (mask
== bits
<< shift
)
8702 /* Constant pools are per function only when PC relative
8703 literal loads are true or we are in the large memory
8707 aarch64_can_use_per_function_literal_pools_p (void)
8709 return (aarch64_pcrelative_literal_loads
8710 || aarch64_cmodel
== AARCH64_CMODEL_LARGE
);
8714 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
8716 /* We can't use blocks for constants when we're using a per-function
8718 return !aarch64_can_use_per_function_literal_pools_p ();
8721 /* Select appropriate section for constants depending
8722 on where we place literal pools. */
8725 aarch64_select_rtx_section (machine_mode mode
,
8727 unsigned HOST_WIDE_INT align
)
8729 if (aarch64_can_use_per_function_literal_pools_p ())
8730 return function_section (current_function_decl
);
8732 return default_elf_select_rtx_section (mode
, x
, align
);
8735 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8737 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
8738 HOST_WIDE_INT offset
)
8740 /* When using per-function literal pools, we must ensure that any code
8741 section is aligned to the minimal instruction length, lest we get
8742 errors from the assembler re "unaligned instructions". */
8743 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
8744 ASM_OUTPUT_ALIGN (f
, 2);
8749 /* Helper function for rtx cost calculation. Strip a shift expression
8750 from X. Returns the inner operand if successful, or the original
8751 expression on failure. */
8753 aarch64_strip_shift (rtx x
)
8757 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8758 we can convert both to ROR during final output. */
8759 if ((GET_CODE (op
) == ASHIFT
8760 || GET_CODE (op
) == ASHIFTRT
8761 || GET_CODE (op
) == LSHIFTRT
8762 || GET_CODE (op
) == ROTATERT
8763 || GET_CODE (op
) == ROTATE
)
8764 && CONST_INT_P (XEXP (op
, 1)))
8765 return XEXP (op
, 0);
8767 if (GET_CODE (op
) == MULT
8768 && CONST_INT_P (XEXP (op
, 1))
8769 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
8770 return XEXP (op
, 0);
8775 /* Helper function for rtx cost calculation. Strip an extend
8776 expression from X. Returns the inner operand if successful, or the
8777 original expression on failure. We deal with a number of possible
8778 canonicalization variations here. If STRIP_SHIFT is true, then
8779 we can strip off a shift also. */
8781 aarch64_strip_extend (rtx x
, bool strip_shift
)
8783 scalar_int_mode mode
;
8786 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
8789 /* Zero and sign extraction of a widened value. */
8790 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
8791 && XEXP (op
, 2) == const0_rtx
8792 && GET_CODE (XEXP (op
, 0)) == MULT
8793 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
8795 return XEXP (XEXP (op
, 0), 0);
8797 /* It can also be represented (for zero-extend) as an AND with an
8799 if (GET_CODE (op
) == AND
8800 && GET_CODE (XEXP (op
, 0)) == MULT
8801 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
8802 && CONST_INT_P (XEXP (op
, 1))
8803 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
8804 INTVAL (XEXP (op
, 1))) != 0)
8805 return XEXP (XEXP (op
, 0), 0);
8807 /* Now handle extended register, as this may also have an optional
8808 left shift by 1..4. */
8810 && GET_CODE (op
) == ASHIFT
8811 && CONST_INT_P (XEXP (op
, 1))
8812 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
8815 if (GET_CODE (op
) == ZERO_EXTEND
8816 || GET_CODE (op
) == SIGN_EXTEND
)
8825 /* Return true iff CODE is a shift supported in combination
8826 with arithmetic instructions. */
8829 aarch64_shift_p (enum rtx_code code
)
8831 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
8835 /* Return true iff X is a cheap shift without a sign extend. */
8838 aarch64_cheap_mult_shift_p (rtx x
)
8845 if (!(aarch64_tune_params
.extra_tuning_flags
8846 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
8849 if (GET_CODE (op0
) == SIGN_EXTEND
)
8852 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
8853 && UINTVAL (op1
) <= 4)
8856 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
8859 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
8861 if (l2
> 0 && l2
<= 4)
8867 /* Helper function for rtx cost calculation. Calculate the cost of
8868 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8869 Return the calculated cost of the expression, recursing manually in to
8870 operands where needed. */
8873 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
8876 const struct cpu_cost_table
*extra_cost
8877 = aarch64_tune_params
.insn_extra_cost
;
8879 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
8880 machine_mode mode
= GET_MODE (x
);
8882 gcc_checking_assert (code
== MULT
);
8887 if (VECTOR_MODE_P (mode
))
8888 mode
= GET_MODE_INNER (mode
);
8890 /* Integer multiply/fma. */
8891 if (GET_MODE_CLASS (mode
) == MODE_INT
)
8893 /* The multiply will be canonicalized as a shift, cost it as such. */
8894 if (aarch64_shift_p (GET_CODE (x
))
8895 || (CONST_INT_P (op1
)
8896 && exact_log2 (INTVAL (op1
)) > 0))
8898 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
8899 || GET_CODE (op0
) == SIGN_EXTEND
;
8904 /* If the shift is considered cheap,
8905 then don't add any cost. */
8906 if (aarch64_cheap_mult_shift_p (x
))
8908 else if (REG_P (op1
))
8909 /* ARITH + shift-by-register. */
8910 cost
+= extra_cost
->alu
.arith_shift_reg
;
8912 /* ARITH + extended register. We don't have a cost field
8913 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8914 cost
+= extra_cost
->alu
.extend_arith
;
8916 /* ARITH + shift-by-immediate. */
8917 cost
+= extra_cost
->alu
.arith_shift
;
8920 /* LSL (immediate). */
8921 cost
+= extra_cost
->alu
.shift
;
8924 /* Strip extends as we will have costed them in the case above. */
8926 op0
= aarch64_strip_extend (op0
, true);
8928 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
8933 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8934 compound and let the below cases handle it. After all, MNEG is a
8935 special-case alias of MSUB. */
8936 if (GET_CODE (op0
) == NEG
)
8938 op0
= XEXP (op0
, 0);
8942 /* Integer multiplies or FMAs have zero/sign extending variants. */
8943 if ((GET_CODE (op0
) == ZERO_EXTEND
8944 && GET_CODE (op1
) == ZERO_EXTEND
)
8945 || (GET_CODE (op0
) == SIGN_EXTEND
8946 && GET_CODE (op1
) == SIGN_EXTEND
))
8948 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
8949 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
8954 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8955 cost
+= extra_cost
->mult
[0].extend_add
;
8957 /* MUL/SMULL/UMULL. */
8958 cost
+= extra_cost
->mult
[0].extend
;
8964 /* This is either an integer multiply or a MADD. In both cases
8965 we want to recurse and cost the operands. */
8966 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
8967 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
8973 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
8976 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
8985 /* Floating-point FMA/FMUL can also support negations of the
8986 operands, unless the rounding mode is upward or downward in
8987 which case FNMUL is different than FMUL with operand negation. */
8988 bool neg0
= GET_CODE (op0
) == NEG
;
8989 bool neg1
= GET_CODE (op1
) == NEG
;
8990 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
8993 op0
= XEXP (op0
, 0);
8995 op1
= XEXP (op1
, 0);
8999 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9000 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
9003 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
9006 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
9007 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
9013 aarch64_address_cost (rtx x
,
9015 addr_space_t as ATTRIBUTE_UNUSED
,
9018 enum rtx_code c
= GET_CODE (x
);
9019 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
9020 struct aarch64_address_info info
;
9024 if (!aarch64_classify_address (&info
, x
, mode
, false))
9026 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
9028 /* This is a CONST or SYMBOL ref which will be split
9029 in a different way depending on the code model in use.
9030 Cost it through the generic infrastructure. */
9031 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
9032 /* Divide through by the cost of one instruction to
9033 bring it to the same units as the address costs. */
9034 cost_symbol_ref
/= COSTS_N_INSNS (1);
9035 /* The cost is then the cost of preparing the address,
9036 followed by an immediate (possibly 0) offset. */
9037 return cost_symbol_ref
+ addr_cost
->imm_offset
;
9041 /* This is most likely a jump table from a case
9043 return addr_cost
->register_offset
;
9049 case ADDRESS_LO_SUM
:
9050 case ADDRESS_SYMBOLIC
:
9051 case ADDRESS_REG_IMM
:
9052 cost
+= addr_cost
->imm_offset
;
9055 case ADDRESS_REG_WB
:
9056 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
9057 cost
+= addr_cost
->pre_modify
;
9058 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
9059 cost
+= addr_cost
->post_modify
;
9065 case ADDRESS_REG_REG
:
9066 cost
+= addr_cost
->register_offset
;
9069 case ADDRESS_REG_SXTW
:
9070 cost
+= addr_cost
->register_sextend
;
9073 case ADDRESS_REG_UXTW
:
9074 cost
+= addr_cost
->register_zextend
;
9084 /* For the sake of calculating the cost of the shifted register
9085 component, we can treat same sized modes in the same way. */
9086 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
9087 cost
+= addr_cost
->addr_scale_costs
.hi
;
9088 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
9089 cost
+= addr_cost
->addr_scale_costs
.si
;
9090 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
9091 cost
+= addr_cost
->addr_scale_costs
.di
;
9093 /* We can't tell, or this is a 128-bit vector. */
9094 cost
+= addr_cost
->addr_scale_costs
.ti
;
9100 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9101 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9105 aarch64_branch_cost (bool speed_p
, bool predictable_p
)
9107 /* When optimizing for speed, use the cost of unpredictable branches. */
9108 const struct cpu_branch_cost
*branch_costs
=
9109 aarch64_tune_params
.branch_costs
;
9111 if (!speed_p
|| predictable_p
)
9112 return branch_costs
->predictable
;
9114 return branch_costs
->unpredictable
;
9117 /* Return true if the RTX X in mode MODE is a zero or sign extract
9118 usable in an ADD or SUB (extended register) instruction. */
9120 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
9122 /* Catch add with a sign extract.
9123 This is add_<optab><mode>_multp2. */
9124 if (GET_CODE (x
) == SIGN_EXTRACT
9125 || GET_CODE (x
) == ZERO_EXTRACT
)
9127 rtx op0
= XEXP (x
, 0);
9128 rtx op1
= XEXP (x
, 1);
9129 rtx op2
= XEXP (x
, 2);
9131 if (GET_CODE (op0
) == MULT
9132 && CONST_INT_P (op1
)
9133 && op2
== const0_rtx
9134 && CONST_INT_P (XEXP (op0
, 1))
9135 && aarch64_is_extend_from_extract (mode
,
9142 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9144 else if (GET_CODE (x
) == SIGN_EXTEND
9145 || GET_CODE (x
) == ZERO_EXTEND
)
9146 return REG_P (XEXP (x
, 0));
9152 aarch64_frint_unspec_p (unsigned int u
)
9170 /* Return true iff X is an rtx that will match an extr instruction
9171 i.e. as described in the *extr<mode>5_insn family of patterns.
9172 OP0 and OP1 will be set to the operands of the shifts involved
9173 on success and will be NULL_RTX otherwise. */
9176 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
9179 scalar_int_mode mode
;
9180 if (!is_a
<scalar_int_mode
> (GET_MODE (x
), &mode
))
9183 *res_op0
= NULL_RTX
;
9184 *res_op1
= NULL_RTX
;
9186 if (GET_CODE (x
) != IOR
)
9192 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
9193 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
9195 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9196 if (GET_CODE (op1
) == ASHIFT
)
9197 std::swap (op0
, op1
);
9199 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
9202 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
9203 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
9205 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
9206 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
9208 *res_op0
= XEXP (op0
, 0);
9209 *res_op1
= XEXP (op1
, 0);
9217 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9218 storing it in *COST. Result is true if the total cost of the operation
9219 has now been calculated. */
9221 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
9225 enum rtx_code cmpcode
;
9227 if (COMPARISON_P (op0
))
9229 inner
= XEXP (op0
, 0);
9230 comparator
= XEXP (op0
, 1);
9231 cmpcode
= GET_CODE (op0
);
9236 comparator
= const0_rtx
;
9240 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
9242 /* Conditional branch. */
9243 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
9247 if (cmpcode
== NE
|| cmpcode
== EQ
)
9249 if (comparator
== const0_rtx
)
9251 /* TBZ/TBNZ/CBZ/CBNZ. */
9252 if (GET_CODE (inner
) == ZERO_EXTRACT
)
9254 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
9255 ZERO_EXTRACT
, 0, speed
);
9258 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
9263 else if (cmpcode
== LT
|| cmpcode
== GE
)
9266 if (comparator
== const0_rtx
)
9271 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
9274 if (GET_CODE (op1
) == COMPARE
)
9276 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9277 if (XEXP (op1
, 1) == const0_rtx
)
9281 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
9282 const struct cpu_cost_table
*extra_cost
9283 = aarch64_tune_params
.insn_extra_cost
;
9285 if (GET_MODE_CLASS (mode
) == MODE_INT
)
9286 *cost
+= extra_cost
->alu
.arith
;
9288 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
9293 /* It's a conditional operation based on the status flags,
9294 so it must be some flavor of CSEL. */
9296 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9297 if (GET_CODE (op1
) == NEG
9298 || GET_CODE (op1
) == NOT
9299 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
9300 op1
= XEXP (op1
, 0);
9301 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
9303 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9304 op1
= XEXP (op1
, 0);
9305 op2
= XEXP (op2
, 0);
9308 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
9309 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
9313 /* We don't know what this is, cost all operands. */
9317 /* Check whether X is a bitfield operation of the form shift + extend that
9318 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9319 operand to which the bitfield operation is applied. Otherwise return
9323 aarch64_extend_bitfield_pattern_p (rtx x
)
9325 rtx_code outer_code
= GET_CODE (x
);
9326 machine_mode outer_mode
= GET_MODE (x
);
9328 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
9329 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
9332 rtx inner
= XEXP (x
, 0);
9333 rtx_code inner_code
= GET_CODE (inner
);
9334 machine_mode inner_mode
= GET_MODE (inner
);
9340 if (CONST_INT_P (XEXP (inner
, 1))
9341 && (inner_mode
== QImode
|| inner_mode
== HImode
))
9342 op
= XEXP (inner
, 0);
9345 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
9346 && (inner_mode
== QImode
|| inner_mode
== HImode
))
9347 op
= XEXP (inner
, 0);
9350 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
9351 && (inner_mode
== QImode
|| inner_mode
== HImode
))
9352 op
= XEXP (inner
, 0);
9361 /* Return true if the mask and a shift amount from an RTX of the form
9362 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9363 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
9366 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode
, rtx mask
,
9369 return CONST_INT_P (mask
) && CONST_INT_P (shft_amnt
)
9370 && INTVAL (shft_amnt
) < GET_MODE_BITSIZE (mode
)
9371 && exact_log2 ((INTVAL (mask
) >> INTVAL (shft_amnt
)) + 1) >= 0
9373 & ((HOST_WIDE_INT_1U
<< INTVAL (shft_amnt
)) - 1)) == 0;
9376 /* Return true if the masks and a shift amount from an RTX of the form
9377 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
9378 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
9381 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode
,
9382 unsigned HOST_WIDE_INT mask1
,
9383 unsigned HOST_WIDE_INT shft_amnt
,
9384 unsigned HOST_WIDE_INT mask2
)
9386 unsigned HOST_WIDE_INT t
;
9388 /* Verify that there is no overlap in what bits are set in the two masks. */
9389 if (mask1
!= ~mask2
)
9392 /* Verify that mask2 is not all zeros or ones. */
9393 if (mask2
== 0 || mask2
== HOST_WIDE_INT_M1U
)
9396 /* The shift amount should always be less than the mode size. */
9397 gcc_assert (shft_amnt
< GET_MODE_BITSIZE (mode
));
9399 /* Verify that the mask being shifted is contiguous and would be in the
9400 least significant bits after shifting by shft_amnt. */
9401 t
= mask2
+ (HOST_WIDE_INT_1U
<< shft_amnt
);
9402 return (t
== (t
& -t
));
9405 /* Calculate the cost of calculating X, storing it in *COST. Result
9406 is true if the total cost of the operation has now been calculated. */
9408 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
9409 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
9412 const struct cpu_cost_table
*extra_cost
9413 = aarch64_tune_params
.insn_extra_cost
;
9414 int code
= GET_CODE (x
);
9415 scalar_int_mode int_mode
;
9417 /* By default, assume that everything has equivalent cost to the
9418 cheapest instruction. Any additional costs are applied as a delta
9419 above this default. */
9420 *cost
= COSTS_N_INSNS (1);
9425 /* The cost depends entirely on the operands to SET. */
9430 switch (GET_CODE (op0
))
9435 rtx address
= XEXP (op0
, 0);
9436 if (VECTOR_MODE_P (mode
))
9437 *cost
+= extra_cost
->ldst
.storev
;
9438 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9439 *cost
+= extra_cost
->ldst
.store
;
9440 else if (mode
== SFmode
)
9441 *cost
+= extra_cost
->ldst
.storef
;
9442 else if (mode
== DFmode
)
9443 *cost
+= extra_cost
->ldst
.stored
;
9446 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
9450 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
9454 if (! REG_P (SUBREG_REG (op0
)))
9455 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
9459 /* The cost is one per vector-register copied. */
9460 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
9462 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
9463 *cost
= COSTS_N_INSNS (nregs
);
9465 /* const0_rtx is in general free, but we will use an
9466 instruction to set a register to 0. */
9467 else if (REG_P (op1
) || op1
== const0_rtx
)
9469 /* The cost is 1 per register copied. */
9470 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
9471 *cost
= COSTS_N_INSNS (nregs
);
9474 /* Cost is just the cost of the RHS of the set. */
9475 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
9480 /* Bit-field insertion. Strip any redundant widening of
9481 the RHS to meet the width of the target. */
9482 if (GET_CODE (op1
) == SUBREG
)
9483 op1
= SUBREG_REG (op1
);
9484 if ((GET_CODE (op1
) == ZERO_EXTEND
9485 || GET_CODE (op1
) == SIGN_EXTEND
)
9486 && CONST_INT_P (XEXP (op0
, 1))
9487 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
9488 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
9489 op1
= XEXP (op1
, 0);
9491 if (CONST_INT_P (op1
))
9493 /* MOV immediate is assumed to always be cheap. */
9494 *cost
= COSTS_N_INSNS (1);
9500 *cost
+= extra_cost
->alu
.bfi
;
9501 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
9507 /* We can't make sense of this, assume default cost. */
9508 *cost
= COSTS_N_INSNS (1);
9514 /* If an instruction can incorporate a constant within the
9515 instruction, the instruction's expression avoids calling
9516 rtx_cost() on the constant. If rtx_cost() is called on a
9517 constant, then it is usually because the constant must be
9518 moved into a register by one or more instructions.
9520 The exception is constant 0, which can be expressed
9521 as XZR/WZR and is therefore free. The exception to this is
9522 if we have (set (reg) (const0_rtx)) in which case we must cost
9523 the move. However, we can catch that when we cost the SET, so
9524 we don't need to consider that here. */
9525 if (x
== const0_rtx
)
9529 /* To an approximation, building any other constant is
9530 proportionally expensive to the number of instructions
9531 required to build that constant. This is true whether we
9532 are compiling for SPEED or otherwise. */
9533 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
9534 int_mode
= word_mode
;
9535 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
9536 (NULL_RTX
, x
, false, int_mode
));
9542 /* First determine number of instructions to do the move
9543 as an integer constant. */
9544 if (!aarch64_float_const_representable_p (x
)
9545 && !aarch64_can_const_movi_rtx_p (x
, mode
)
9546 && aarch64_float_const_rtx_p (x
))
9548 unsigned HOST_WIDE_INT ival
;
9549 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
9550 gcc_assert (succeed
);
9552 scalar_int_mode imode
= (mode
== HFmode
9554 : int_mode_for_mode (mode
).require ());
9555 int ncost
= aarch64_internal_mov_immediate
9556 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
9557 *cost
+= COSTS_N_INSNS (ncost
);
9563 /* mov[df,sf]_aarch64. */
9564 if (aarch64_float_const_representable_p (x
))
9565 /* FMOV (scalar immediate). */
9566 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
9567 else if (!aarch64_float_const_zero_rtx_p (x
))
9569 /* This will be a load from memory. */
9571 *cost
+= extra_cost
->ldst
.loadd
;
9573 *cost
+= extra_cost
->ldst
.loadf
;
9576 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9577 or MOV v0.s[0], wzr - neither of which are modeled by the
9578 cost tables. Just use the default cost. */
9588 /* For loads we want the base cost of a load, plus an
9589 approximation for the additional cost of the addressing
9591 rtx address
= XEXP (x
, 0);
9592 if (VECTOR_MODE_P (mode
))
9593 *cost
+= extra_cost
->ldst
.loadv
;
9594 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9595 *cost
+= extra_cost
->ldst
.load
;
9596 else if (mode
== SFmode
)
9597 *cost
+= extra_cost
->ldst
.loadf
;
9598 else if (mode
== DFmode
)
9599 *cost
+= extra_cost
->ldst
.loadd
;
9602 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
9611 if (VECTOR_MODE_P (mode
))
9616 *cost
+= extra_cost
->vect
.alu
;
9621 if (GET_MODE_CLASS (mode
) == MODE_INT
)
9623 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
9624 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
9627 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
9631 /* Cost this as SUB wzr, X. */
9632 op0
= CONST0_RTX (mode
);
9637 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
9639 /* Support (neg(fma...)) as a single instruction only if
9640 sign of zeros is unimportant. This matches the decision
9641 making in aarch64.md. */
9642 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
9645 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
9648 if (GET_CODE (op0
) == MULT
)
9651 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
9656 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
9666 if (VECTOR_MODE_P (mode
))
9667 *cost
+= extra_cost
->vect
.alu
;
9669 *cost
+= extra_cost
->alu
.clz
;
9678 if (op1
== const0_rtx
9679 && GET_CODE (op0
) == AND
)
9682 mode
= GET_MODE (op0
);
9686 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
9688 /* TODO: A write to the CC flags possibly costs extra, this
9689 needs encoding in the cost tables. */
9691 mode
= GET_MODE (op0
);
9693 if (GET_CODE (op0
) == AND
)
9699 if (GET_CODE (op0
) == PLUS
)
9701 /* ADDS (and CMN alias). */
9706 if (GET_CODE (op0
) == MINUS
)
9713 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
9714 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
9715 && CONST_INT_P (XEXP (op0
, 2)))
9717 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9718 Handle it here directly rather than going to cost_logic
9719 since we know the immediate generated for the TST is valid
9720 so we can avoid creating an intermediate rtx for it only
9721 for costing purposes. */
9723 *cost
+= extra_cost
->alu
.logical
;
9725 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
9726 ZERO_EXTRACT
, 0, speed
);
9730 if (GET_CODE (op1
) == NEG
)
9734 *cost
+= extra_cost
->alu
.arith
;
9736 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
9737 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
9743 Compare can freely swap the order of operands, and
9744 canonicalization puts the more complex operation first.
9745 But the integer MINUS logic expects the shift/extend
9746 operation in op1. */
9748 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
9756 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
9760 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
9762 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
9764 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
9765 /* FCMP supports constant 0.0 for no extra cost. */
9771 if (VECTOR_MODE_P (mode
))
9773 /* Vector compare. */
9775 *cost
+= extra_cost
->vect
.alu
;
9777 if (aarch64_float_const_zero_rtx_p (op1
))
9779 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9793 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
9795 /* Detect valid immediates. */
9796 if ((GET_MODE_CLASS (mode
) == MODE_INT
9797 || (GET_MODE_CLASS (mode
) == MODE_CC
9798 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
9799 && CONST_INT_P (op1
)
9800 && aarch64_uimm12_shift (INTVAL (op1
)))
9803 /* SUB(S) (immediate). */
9804 *cost
+= extra_cost
->alu
.arith
;
9808 /* Look for SUB (extended register). */
9809 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
9810 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
9813 *cost
+= extra_cost
->alu
.extend_arith
;
9815 op1
= aarch64_strip_extend (op1
, true);
9816 *cost
+= rtx_cost (op1
, VOIDmode
,
9817 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
9821 rtx new_op1
= aarch64_strip_extend (op1
, false);
9823 /* Cost this as an FMA-alike operation. */
9824 if ((GET_CODE (new_op1
) == MULT
9825 || aarch64_shift_p (GET_CODE (new_op1
)))
9828 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
9829 (enum rtx_code
) code
,
9834 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
9838 if (VECTOR_MODE_P (mode
))
9841 *cost
+= extra_cost
->vect
.alu
;
9843 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9846 *cost
+= extra_cost
->alu
.arith
;
9848 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
9851 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9865 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
9866 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
9869 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
9870 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
9874 if (GET_MODE_CLASS (mode
) == MODE_INT
9875 && ((CONST_INT_P (op1
) && aarch64_uimm12_shift (INTVAL (op1
)))
9876 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
9878 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
9881 /* ADD (immediate). */
9882 *cost
+= extra_cost
->alu
.arith
;
9886 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
9888 /* Look for ADD (extended register). */
9889 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
9890 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
9893 *cost
+= extra_cost
->alu
.extend_arith
;
9895 op0
= aarch64_strip_extend (op0
, true);
9896 *cost
+= rtx_cost (op0
, VOIDmode
,
9897 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
9901 /* Strip any extend, leave shifts behind as we will
9902 cost them through mult_cost. */
9903 new_op0
= aarch64_strip_extend (op0
, false);
9905 if (GET_CODE (new_op0
) == MULT
9906 || aarch64_shift_p (GET_CODE (new_op0
)))
9908 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
9913 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
9917 if (VECTOR_MODE_P (mode
))
9920 *cost
+= extra_cost
->vect
.alu
;
9922 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9925 *cost
+= extra_cost
->alu
.arith
;
9927 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
9930 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9937 *cost
= COSTS_N_INSNS (1);
9941 if (VECTOR_MODE_P (mode
))
9942 *cost
+= extra_cost
->vect
.alu
;
9944 *cost
+= extra_cost
->alu
.rev
;
9949 if (aarch_rev16_p (x
))
9951 *cost
= COSTS_N_INSNS (1);
9955 if (VECTOR_MODE_P (mode
))
9956 *cost
+= extra_cost
->vect
.alu
;
9958 *cost
+= extra_cost
->alu
.rev
;
9963 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
9965 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
9966 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
9968 *cost
+= extra_cost
->alu
.shift
;
9979 if (VECTOR_MODE_P (mode
))
9982 *cost
+= extra_cost
->vect
.alu
;
9987 && GET_CODE (op0
) == MULT
9988 && CONST_INT_P (XEXP (op0
, 1))
9989 && CONST_INT_P (op1
)
9990 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
9993 /* This is a UBFM/SBFM. */
9994 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
9996 *cost
+= extra_cost
->alu
.bfx
;
10000 if (is_int_mode (mode
, &int_mode
))
10002 if (CONST_INT_P (op1
))
10004 /* We have a mask + shift version of a UBFIZ
10005 i.e. the *andim_ashift<mode>_bfiz pattern. */
10006 if (GET_CODE (op0
) == ASHIFT
10007 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
10010 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
10011 (enum rtx_code
) code
, 0, speed
);
10013 *cost
+= extra_cost
->alu
.bfx
;
10017 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
10019 /* We possibly get the immediate for free, this is not
10021 *cost
+= rtx_cost (op0
, int_mode
,
10022 (enum rtx_code
) code
, 0, speed
);
10024 *cost
+= extra_cost
->alu
.logical
;
10033 /* Handle ORN, EON, or BIC. */
10034 if (GET_CODE (op0
) == NOT
)
10035 op0
= XEXP (op0
, 0);
10037 new_op0
= aarch64_strip_shift (op0
);
10039 /* If we had a shift on op0 then this is a logical-shift-
10040 by-register/immediate operation. Otherwise, this is just
10041 a logical operation. */
10044 if (new_op0
!= op0
)
10046 /* Shift by immediate. */
10047 if (CONST_INT_P (XEXP (op0
, 1)))
10048 *cost
+= extra_cost
->alu
.log_shift
;
10050 *cost
+= extra_cost
->alu
.log_shift_reg
;
10053 *cost
+= extra_cost
->alu
.logical
;
10056 /* In both cases we want to cost both operands. */
10057 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
10059 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
10069 op0
= aarch64_strip_shift (x
);
10071 if (VECTOR_MODE_P (mode
))
10074 *cost
+= extra_cost
->vect
.alu
;
10078 /* MVN-shifted-reg. */
10081 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
10084 *cost
+= extra_cost
->alu
.log_shift
;
10088 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10089 Handle the second form here taking care that 'a' in the above can
10091 else if (GET_CODE (op0
) == XOR
)
10093 rtx newop0
= XEXP (op0
, 0);
10094 rtx newop1
= XEXP (op0
, 1);
10095 rtx op0_stripped
= aarch64_strip_shift (newop0
);
10097 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
10098 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
10102 if (op0_stripped
!= newop0
)
10103 *cost
+= extra_cost
->alu
.log_shift
;
10105 *cost
+= extra_cost
->alu
.logical
;
10112 *cost
+= extra_cost
->alu
.logical
;
10119 /* If a value is written in SI mode, then zero extended to DI
10120 mode, the operation will in general be free as a write to
10121 a 'w' register implicitly zeroes the upper bits of an 'x'
10122 register. However, if this is
10124 (set (reg) (zero_extend (reg)))
10126 we must cost the explicit register move. */
10128 && GET_MODE (op0
) == SImode
10131 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
10133 /* If OP_COST is non-zero, then the cost of the zero extend
10134 is effectively the cost of the inner operation. Otherwise
10135 we have a MOV instruction and we take the cost from the MOV
10136 itself. This is true independently of whether we are
10137 optimizing for space or time. */
10143 else if (MEM_P (op0
))
10145 /* All loads can zero extend to any size for free. */
10146 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
10150 op0
= aarch64_extend_bitfield_pattern_p (x
);
10153 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
10155 *cost
+= extra_cost
->alu
.bfx
;
10161 if (VECTOR_MODE_P (mode
))
10164 *cost
+= extra_cost
->vect
.alu
;
10168 /* We generate an AND instead of UXTB/UXTH. */
10169 *cost
+= extra_cost
->alu
.logical
;
10175 if (MEM_P (XEXP (x
, 0)))
10180 rtx address
= XEXP (XEXP (x
, 0), 0);
10181 *cost
+= extra_cost
->ldst
.load_sign_extend
;
10184 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10190 op0
= aarch64_extend_bitfield_pattern_p (x
);
10193 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
10195 *cost
+= extra_cost
->alu
.bfx
;
10201 if (VECTOR_MODE_P (mode
))
10202 *cost
+= extra_cost
->vect
.alu
;
10204 *cost
+= extra_cost
->alu
.extend
;
10212 if (CONST_INT_P (op1
))
10216 if (VECTOR_MODE_P (mode
))
10218 /* Vector shift (immediate). */
10219 *cost
+= extra_cost
->vect
.alu
;
10223 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
10225 *cost
+= extra_cost
->alu
.shift
;
10229 /* We can incorporate zero/sign extend for free. */
10230 if (GET_CODE (op0
) == ZERO_EXTEND
10231 || GET_CODE (op0
) == SIGN_EXTEND
)
10232 op0
= XEXP (op0
, 0);
10234 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
10239 if (VECTOR_MODE_P (mode
))
10242 /* Vector shift (register). */
10243 *cost
+= extra_cost
->vect
.alu
;
10249 *cost
+= extra_cost
->alu
.shift_reg
;
10251 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
10252 && CONST_INT_P (XEXP (op1
, 1))
10253 && known_eq (INTVAL (XEXP (op1
, 1)),
10254 GET_MODE_BITSIZE (mode
) - 1))
10256 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
10257 /* We already demanded XEXP (op1, 0) to be REG_P, so
10258 don't recurse into it. */
10262 return false; /* All arguments need to be in registers. */
10272 if (CONST_INT_P (op1
))
10274 /* ASR (immediate) and friends. */
10277 if (VECTOR_MODE_P (mode
))
10278 *cost
+= extra_cost
->vect
.alu
;
10280 *cost
+= extra_cost
->alu
.shift
;
10283 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
10288 if (VECTOR_MODE_P (mode
))
10291 /* Vector shift (register). */
10292 *cost
+= extra_cost
->vect
.alu
;
10297 /* ASR (register) and friends. */
10298 *cost
+= extra_cost
->alu
.shift_reg
;
10300 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
10301 && CONST_INT_P (XEXP (op1
, 1))
10302 && known_eq (INTVAL (XEXP (op1
, 1)),
10303 GET_MODE_BITSIZE (mode
) - 1))
10305 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
10306 /* We already demanded XEXP (op1, 0) to be REG_P, so
10307 don't recurse into it. */
10311 return false; /* All arguments need to be in registers. */
10316 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
10317 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
10321 *cost
+= extra_cost
->ldst
.load
;
10323 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
10324 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
10326 /* ADRP, followed by ADD. */
10327 *cost
+= COSTS_N_INSNS (1);
10329 *cost
+= 2 * extra_cost
->alu
.arith
;
10331 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
10332 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
10336 *cost
+= extra_cost
->alu
.arith
;
10341 /* One extra load instruction, after accessing the GOT. */
10342 *cost
+= COSTS_N_INSNS (1);
10344 *cost
+= extra_cost
->ldst
.load
;
10350 /* ADRP/ADD (immediate). */
10352 *cost
+= extra_cost
->alu
.arith
;
10360 if (VECTOR_MODE_P (mode
))
10361 *cost
+= extra_cost
->vect
.alu
;
10363 *cost
+= extra_cost
->alu
.bfx
;
10366 /* We can trust that the immediates used will be correct (there
10367 are no by-register forms), so we need only cost op0. */
10368 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
10372 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
10373 /* aarch64_rtx_mult_cost always handles recursion to its
10378 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10379 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
10380 an unconditional negate. This case should only ever be reached through
10381 the set_smod_pow2_cheap check in expmed.c. */
10382 if (CONST_INT_P (XEXP (x
, 1))
10383 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
10384 && (mode
== SImode
|| mode
== DImode
))
10386 /* We expand to 4 instructions. Reset the baseline. */
10387 *cost
= COSTS_N_INSNS (4);
10390 *cost
+= 2 * extra_cost
->alu
.logical
10391 + 2 * extra_cost
->alu
.arith
;
10396 /* Fall-through. */
10400 /* Slighly prefer UMOD over SMOD. */
10401 if (VECTOR_MODE_P (mode
))
10402 *cost
+= extra_cost
->vect
.alu
;
10403 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10404 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
10405 + extra_cost
->mult
[mode
== DImode
].idiv
10406 + (code
== MOD
? 1 : 0));
10408 return false; /* All arguments need to be in registers. */
10415 if (VECTOR_MODE_P (mode
))
10416 *cost
+= extra_cost
->vect
.alu
;
10417 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10418 /* There is no integer SQRT, so only DIV and UDIV can get
10420 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
10421 /* Slighly prefer UDIV over SDIV. */
10422 + (code
== DIV
? 1 : 0));
10424 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
10426 return false; /* All arguments need to be in registers. */
10429 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
10430 XEXP (x
, 2), cost
, speed
);
10443 return false; /* All arguments must be in registers. */
10452 if (VECTOR_MODE_P (mode
))
10453 *cost
+= extra_cost
->vect
.alu
;
10455 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
10458 /* FMSUB, FNMADD, and FNMSUB are free. */
10459 if (GET_CODE (op0
) == NEG
)
10460 op0
= XEXP (op0
, 0);
10462 if (GET_CODE (op2
) == NEG
)
10463 op2
= XEXP (op2
, 0);
10465 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10466 and the by-element operand as operand 0. */
10467 if (GET_CODE (op1
) == NEG
)
10468 op1
= XEXP (op1
, 0);
10470 /* Catch vector-by-element operations. The by-element operand can
10471 either be (vec_duplicate (vec_select (x))) or just
10472 (vec_select (x)), depending on whether we are multiplying by
10473 a vector or a scalar.
10475 Canonicalization is not very good in these cases, FMA4 will put the
10476 by-element operand as operand 0, FNMA4 will have it as operand 1. */
10477 if (GET_CODE (op0
) == VEC_DUPLICATE
)
10478 op0
= XEXP (op0
, 0);
10479 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
10480 op1
= XEXP (op1
, 0);
10482 if (GET_CODE (op0
) == VEC_SELECT
)
10483 op0
= XEXP (op0
, 0);
10484 else if (GET_CODE (op1
) == VEC_SELECT
)
10485 op1
= XEXP (op1
, 0);
10487 /* If the remaining parameters are not registers,
10488 get the cost to put them into registers. */
10489 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
10490 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
10491 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
10495 case UNSIGNED_FLOAT
:
10497 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
10503 if (VECTOR_MODE_P (mode
))
10505 /*Vector truncate. */
10506 *cost
+= extra_cost
->vect
.alu
;
10509 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
10513 case FLOAT_TRUNCATE
:
10516 if (VECTOR_MODE_P (mode
))
10518 /*Vector conversion. */
10519 *cost
+= extra_cost
->vect
.alu
;
10522 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
10529 /* Strip the rounding part. They will all be implemented
10530 by the fcvt* family of instructions anyway. */
10531 if (GET_CODE (x
) == UNSPEC
)
10533 unsigned int uns_code
= XINT (x
, 1);
10535 if (uns_code
== UNSPEC_FRINTA
10536 || uns_code
== UNSPEC_FRINTM
10537 || uns_code
== UNSPEC_FRINTN
10538 || uns_code
== UNSPEC_FRINTP
10539 || uns_code
== UNSPEC_FRINTZ
)
10540 x
= XVECEXP (x
, 0, 0);
10545 if (VECTOR_MODE_P (mode
))
10546 *cost
+= extra_cost
->vect
.alu
;
10548 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
10551 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10552 fixed-point fcvt. */
10553 if (GET_CODE (x
) == MULT
10554 && ((VECTOR_MODE_P (mode
)
10555 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
10556 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
10558 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
10563 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
10567 if (VECTOR_MODE_P (mode
))
10569 /* ABS (vector). */
10571 *cost
+= extra_cost
->vect
.alu
;
10573 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10577 /* FABD, which is analogous to FADD. */
10578 if (GET_CODE (op0
) == MINUS
)
10580 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
10581 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
10583 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10587 /* Simple FABS is analogous to FNEG. */
10589 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
10593 /* Integer ABS will either be split to
10594 two arithmetic instructions, or will be an ABS
10595 (scalar), which we don't model. */
10596 *cost
= COSTS_N_INSNS (2);
10598 *cost
+= 2 * extra_cost
->alu
.arith
;
10606 if (VECTOR_MODE_P (mode
))
10607 *cost
+= extra_cost
->vect
.alu
;
10610 /* FMAXNM/FMINNM/FMAX/FMIN.
10611 TODO: This may not be accurate for all implementations, but
10612 we do not model this in the cost tables. */
10613 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10619 /* The floating point round to integer frint* instructions. */
10620 if (aarch64_frint_unspec_p (XINT (x
, 1)))
10623 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
10628 if (XINT (x
, 1) == UNSPEC_RBIT
)
10631 *cost
+= extra_cost
->alu
.rev
;
10639 /* Decompose <su>muldi3_highpart. */
10640 if (/* (truncate:DI */
10643 && GET_MODE (XEXP (x
, 0)) == TImode
10644 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
10646 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
10647 /* (ANY_EXTEND:TI (reg:DI))
10648 (ANY_EXTEND:TI (reg:DI))) */
10649 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
10650 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
10651 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
10652 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
10653 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
10654 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
10655 /* (const_int 64) */
10656 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
10657 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
10661 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
10662 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
10663 mode
, MULT
, 0, speed
);
10664 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
10665 mode
, MULT
, 1, speed
);
10669 /* Fall through. */
10675 && flag_aarch64_verbose_cost
)
10676 fprintf (dump_file
,
10677 "\nFailed to cost RTX. Assuming default cost.\n");
10682 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10683 calculated for X. This cost is stored in *COST. Returns true
10684 if the total cost of X was calculated. */
10686 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
10687 int param
, int *cost
, bool speed
)
10689 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
10692 && flag_aarch64_verbose_cost
)
10694 print_rtl_single (dump_file
, x
);
10695 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
10696 speed
? "Hot" : "Cold",
10697 *cost
, result
? "final" : "partial");
/* Implement TARGET_REGISTER_MOVE_COST: return the cost of moving a value
   of MODE from register class FROM_I to register class TO_I, using the
   regmove costs from the active tuning structure.  */
10704 aarch64_register_move_cost (machine_mode mode
,
10705 reg_class_t from_i
, reg_class_t to_i
)
10707 enum reg_class from
= (enum reg_class
) from_i
;
10708 enum reg_class to
= (enum reg_class
) to_i
;
10709 const struct cpu_regmove_cost
*regmove_cost
10710 = aarch64_tune_params
.regmove_cost
;
10712 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10713 if (to
== TAILCALL_ADDR_REGS
|| to
== POINTER_REGS
)
/* NOTE(review): the "to = GENERAL_REGS;" body for the branch above is
   missing from this extraction.  */
10716 if (from
== TAILCALL_ADDR_REGS
|| from
== POINTER_REGS
)
10717 from
= GENERAL_REGS
;
10719 /* Moving between GPR and stack cost is the same as GP2GP. */
10720 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
10721 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
10722 return regmove_cost
->GP2GP
;
10724 /* To/From the stack register, we move via the gprs. */
10725 if (to
== STACK_REG
|| from
== STACK_REG
)
10726 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
10727 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
/* 128-bit (e.g. TImode / vector Q-register) values need special costs.  */
10729 if (known_eq (GET_MODE_SIZE (mode
), 16))
10731 /* 128-bit operations on general registers require 2 instructions. */
10732 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
10733 return regmove_cost
->GP2GP
* 2;
10734 else if (from
== GENERAL_REGS
)
10735 return regmove_cost
->GP2FP
* 2;
10736 else if (to
== GENERAL_REGS
)
10737 return regmove_cost
->FP2GP
* 2;
10739 /* When AdvSIMD instructions are disabled it is not possible to move
10740 a 128-bit value directly between Q registers. This is handled in
10741 secondary reload. A general register is used as a scratch to move
10742 the upper DI value and the lower DI value is moved directly,
10743 hence the cost is the sum of three moves. */
10745 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
10747 return regmove_cost
->FP2FP
;
/* Scalar-sized moves: pick the cost table entry by direction.  */
10750 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
10751 return regmove_cost
->GP2GP
;
10752 else if (from
== GENERAL_REGS
)
10753 return regmove_cost
->GP2FP
;
10754 else if (to
== GENERAL_REGS
)
10755 return regmove_cost
->FP2GP
;
10757 return regmove_cost
->FP2FP
;
/* Implement TARGET_MEMORY_MOVE_COST: a single tuned cost is used for all
   modes, register classes and directions.  */
10761 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
10762 reg_class_t rclass ATTRIBUTE_UNUSED
,
10763 bool in ATTRIBUTE_UNUSED
)
10765 return aarch64_tune_params
.memmov_cost
;
10768 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10769 to optimize 1.0/sqrt. */
10772 use_rsqrt_p (machine_mode mode
)
/* The approximation is only usable with trapping math off and unsafe math
   optimizations on, and either the tuning tables enable reciprocal-sqrt
   approximation for MODE or the user explicitly asked for the
   low-precision variant.  */
10774 return (!flag_trapping_math
10775 && flag_unsafe_math_optimizations
10776 && ((aarch64_tune_params
.approx_modes
->recip_sqrt
10777 & AARCH64_APPROX_MODE (mode
))
10778 || flag_mrecip_low_precision_sqrt
));
10781 /* Function to decide when to use the approximate reciprocal square root
   builtin.  Returns the rsqrt builtin for FNDECL's mode when the
   approximation is allowed (see use_rsqrt_p).  */
10785 aarch64_builtin_reciprocal (tree fndecl
)
10787 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
/* Bail out when the approximation is not safe/beneficial for MODE.  */
10789 if (!use_rsqrt_p (mode
))
10791 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl
));
10794 /* Emit instruction sequence to compute either the approximate square root
10795 or its approximate reciprocal, depending on the flag RECP, and return
10796 whether the sequence was emitted or not. */
10799 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
10801 machine_mode mode
= GET_MODE (dst
);
/* Half-float only supports the non-reciprocal form here.  */
10803 if (GET_MODE_INNER (mode
) == HFmode
)
10805 gcc_assert (!recp
);
/* Require either the explicit low-precision-sqrt flag or tuning-table
   approval of sqrt approximation for MODE.  */
10811 if (!(flag_mlow_precision_sqrt
10812 || (aarch64_tune_params
.approx_modes
->sqrt
10813 & AARCH64_APPROX_MODE (mode
))))
/* Approximation is invalid under these math/size constraints.  */
10816 if (flag_finite_math_only
10817 || flag_trapping_math
10818 || !flag_unsafe_math_optimizations
10819 || optimize_function_for_size_p (cfun
))
10823 /* Caller assumes we cannot fail. */
10824 gcc_assert (use_rsqrt_p (mode
));
/* Integer vector mode of the same size, used for the zero mask.  */
10826 machine_mode mmsk
= mode_for_int_vector (mode
).require ();
10827 rtx xmsk
= gen_reg_rtx (mmsk
);
10829 /* When calculating the approximate square root, compare the
10830 argument with 0.0 and create a mask. */
10831 emit_insn (gen_rtx_SET (xmsk
,
10833 gen_rtx_EQ (mmsk
, src
,
10834 CONST0_RTX (mode
)))));
10836 /* Estimate the approximate reciprocal square root. */
10837 rtx xdst
= gen_reg_rtx (mode
);
10838 emit_insn (gen_aarch64_rsqrte (mode
, xdst
, src
));
10840 /* Iterate over the series twice for SF and thrice for DF. */
10841 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
10843 /* Optionally iterate over the series once less for faster performance
10844 while sacrificing the accuracy. */
10845 if ((recp
&& flag_mrecip_low_precision_sqrt
)
10846 || (!recp
&& flag_mlow_precision_sqrt
))
10849 /* Iterate over the series to calculate the approximate reciprocal square
   root (Newton-Raphson refinement via rsqrts).  */
10851 rtx x1
= gen_reg_rtx (mode
);
10852 while (iterations
--)
10854 rtx x2
= gen_reg_rtx (mode
);
10855 emit_set_insn (x2
, gen_rtx_MULT (mode
, xdst
, xdst
));
10857 emit_insn (gen_aarch64_rsqrts (mode
, x1
, src
, x2
));
10859 if (iterations
> 0)
10860 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, x1
));
10865 /* Qualify the approximate reciprocal square root when the argument is
10866 0.0 by squashing the intermediary result to 0.0. */
10867 rtx xtmp
= gen_reg_rtx (mmsk
);
10868 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
10869 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
10870 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
10872 /* Calculate the approximate square root. */
10873 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, src
));
10876 /* Finalize the approximation. */
10877 emit_set_insn (dst
, gen_rtx_MULT (mode
, xdst
, x1
));
10882 /* Emit the instruction sequence to compute the approximation for the division
10883 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10886 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
10888 machine_mode mode
= GET_MODE (quo
);
/* Half-float is not handled by this approximation path.  */
10890 if (GET_MODE_INNER (mode
) == HFmode
)
/* Approximate division must be enabled either by the explicit flag or by
   the tuning tables for MODE.  */
10893 bool use_approx_division_p
= (flag_mlow_precision_div
10894 || (aarch64_tune_params
.approx_modes
->division
10895 & AARCH64_APPROX_MODE (mode
)));
/* The approximation is only valid under these math/size constraints.  */
10897 if (!flag_finite_math_only
10898 || flag_trapping_math
10899 || !flag_unsafe_math_optimizations
10900 || optimize_function_for_size_p (cfun
)
10901 || !use_approx_division_p
)
/* Vector forms need AdvSIMD.  */
10904 if (!TARGET_SIMD
&& VECTOR_MODE_P (mode
))
10907 /* Estimate the approximate reciprocal. */
10908 rtx xrcp
= gen_reg_rtx (mode
);
10909 emit_insn (gen_aarch64_frecpe (mode
, xrcp
, den
));
10911 /* Iterate over the series twice for SF and thrice for DF. */
10912 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
10914 /* Optionally iterate over the series once less for faster performance,
10915 while sacrificing the accuracy. */
10916 if (flag_mlow_precision_div
)
10919 /* Iterate over the series to calculate the approximate reciprocal. */
10920 rtx xtmp
= gen_reg_rtx (mode
);
10921 while (iterations
--)
10923 emit_insn (gen_aarch64_frecps (mode
, xtmp
, xrcp
, den
));
10925 if (iterations
> 0)
10926 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
10929 if (num
!= CONST1_RTX (mode
))
10931 /* As the approximate reciprocal of DEN is already calculated, only
10932 calculate the approximate division when NUM is not 1.0. */
10933 rtx xnum
= force_reg (mode
, num
);
10934 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xnum
));
10937 /* Finalize the approximation. */
10938 emit_set_insn (quo
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
10942 /* Return the number of instructions that can be issued per cycle. */
10944 aarch64_sched_issue_rate (void)
10946 return aarch64_tune_params
.issue_rate
;
/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD: use the
   issue rate as the lookahead, except when only one instruction can issue
   per cycle or scheduling fusion is active, where lookahead is disabled.  */
10950 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10952 int issue_rate
= aarch64_sched_issue_rate ();
10954 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
10958 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10959 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10960 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10963 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
/* NOTE(review): the second parameter declaration (ready_index) is missing
   from this extraction of the file.  */
10966 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
10970 /* Vectorizer cost model target hooks. */
10972 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10974 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
10976 int misalign ATTRIBUTE_UNUSED
)
/* Costs come from the active tuning structure's vector cost table.  */
10979 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
/* FP statements are costed differently from integer statements.  */
10982 if (vectype
!= NULL
)
10983 fp
= FLOAT_TYPE_P (vectype
)
;
10985 switch (type_of_cost
)
10988 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
10991 return costs
->scalar_load_cost
;
10994 return costs
->scalar_store_cost
;
10997 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
11000 return costs
->vec_align_load_cost
;
11003 return costs
->vec_store_cost
;
11005 case vec_to_scalar
:
11006 return costs
->vec_to_scalar_cost
;
11008 case scalar_to_vec
:
11009 return costs
->scalar_to_vec_cost
;
11011 case unaligned_load
:
11012 case vector_gather_load
:
11013 return costs
->vec_unalign_load_cost
;
11015 case unaligned_store
:
11016 case vector_scatter_store
:
11017 return costs
->vec_unalign_store_cost
;
11019 case cond_branch_taken
:
11020 return costs
->cond_taken_branch_cost
;
11022 case cond_branch_not_taken
:
11023 return costs
->cond_not_taken_branch_cost
;
11026 return costs
->vec_permute_cost
;
11028 case vec_promote_demote
:
11029 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
/* Construction cost grows with the (estimated) element count.  */
11031 case vec_construct
:
11032 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
11033 return elements
/ 2 + 1;
11036 gcc_unreachable ();
11040 /* Implement targetm.vectorize.add_stmt_cost. */
11042 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
11043 struct _stmt_vec_info
*stmt_info
, int misalign
,
11044 enum vect_cost_model_location where
)
/* DATA is the accumulator array indexed by cost-model location.  */
11046 unsigned *cost
= (unsigned *) data
;
11047 unsigned retval
= 0;
11049 if (flag_vect_cost_model
)
11051 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
/* Per-statement cost from the target cost table.  NOTE(review): the
   "int stmt_cost =" part of this statement is missing from this
   extraction of the file.  */
11053 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
11055 /* Statements in an inner loop relative to the loop being
11056 vectorized are weighted more heavily. The value here is
11057 arbitrary and could potentially be improved with analysis. */
11058 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
11059 count
*= 50; /* FIXME */
11061 retval
= (unsigned) (count
* stmt_cost
);
11062 cost
[where
] += retval
;
11068 static void initialize_aarch64_code_model (struct gcc_options
*);
11070 /* Parse the TO_PARSE string and put the architecture struct that it
11071 selects into RES and the architectural features into ISA_FLAGS.
11072 Return an aarch64_parse_opt_result describing the parse result.
11073 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11074 When the TO_PARSE string contains an invalid extension,
11075 a copy of the string is created and stored to INVALID_EXTENSION. */
11077 static enum aarch64_parse_opt_result
11078 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
11079 uint64_t *isa_flags
, std::string
*invalid_extension
)
11082 const struct processor
*arch
;
/* Split off the "+ext..." suffix, if any; LEN is the bare arch name.  */
11085 ext
= strchr (to_parse
, '+');
11088 len
= ext
- to_parse
;
11090 len
= strlen (to_parse
);
11093 return AARCH64_PARSE_MISSING_ARG
;
11096 /* Loop through the list of supported ARCHes to find a match. */
11097 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
11099 if (strlen (arch
->name
) == len
11100 && strncmp (arch
->name
, to_parse
, len
) == 0)
/* Start from the architecture's base ISA flags, then apply extensions.  */
11102 uint64_t isa_temp
= arch
->flags
;
11106 /* TO_PARSE string contains at least one extension. */
11107 enum aarch64_parse_opt_result ext_res
11108 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
11110 if (ext_res
!= AARCH64_PARSE_OK
)
11113 /* Extension parsing was successful. Confirm the result
11114 arch and ISA flags. */
11116 *isa_flags
= isa_temp
;
11117 return AARCH64_PARSE_OK
;
11121 /* ARCH name not found in list. */
11122 return AARCH64_PARSE_INVALID_ARG
;
11125 /* Parse the TO_PARSE string and put the result tuning in RES and the
11126 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11127 describing the parse result. If there is an error parsing, RES and
11128 ISA_FLAGS are left unchanged.
11129 When the TO_PARSE string contains an invalid extension,
11130 a copy of the string is created and stored to INVALID_EXTENSION. */
11132 static enum aarch64_parse_opt_result
11133 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
11134 uint64_t *isa_flags
, std::string
*invalid_extension
)
11137 const struct processor
*cpu
;
/* Split off the "+ext..." suffix, if any; LEN is the bare CPU name.  */
11140 ext
= strchr (to_parse
, '+');
11143 len
= ext
- to_parse
;
11145 len
= strlen (to_parse
);
11148 return AARCH64_PARSE_MISSING_ARG
;
11151 /* Loop through the list of supported CPUs to find a match. */
11152 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
11154 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
/* Start from the CPU's base ISA flags, then apply extensions.  */
11156 uint64_t isa_temp
= cpu
->flags
;
11161 /* TO_PARSE string contains at least one extension. */
11162 enum aarch64_parse_opt_result ext_res
11163 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
11165 if (ext_res
!= AARCH64_PARSE_OK
)
11168 /* Extension parsing was successful. Confirm the result
11169 cpu and ISA flags. */
11171 *isa_flags
= isa_temp
;
11172 return AARCH64_PARSE_OK
;
11176 /* CPU name not found in list. */
11177 return AARCH64_PARSE_INVALID_ARG
;
11180 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11181 Return an aarch64_parse_opt_result describing the parse result.
11182 If the parsing fails the RES does not change. */
11184 static enum aarch64_parse_opt_result
11185 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
11187 const struct processor
*cpu
;
11189 /* Loop through the list of supported CPUs to find a match. */
11190 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
/* Exact (full-string) name match, unlike -mcpu which allows "+ext".  */
11192 if (strcmp (cpu
->name
, to_parse
) == 0)
11195 return AARCH64_PARSE_OK
;
11199 /* CPU name not found in list. */
11200 return AARCH64_PARSE_INVALID_ARG
;
11203 /* Parse TOKEN, which has length LENGTH to see if it is an option
11204 described in FLAG. If it is, return the index bit for that fusion type.
11205 If not, error (printing OPTION_NAME) and return zero. */
11207 static unsigned int
11208 aarch64_parse_one_option_token (const char *token
,
11210 const struct aarch64_flag_desc
*flag
,
11211 const char *option_name
)
/* Scan the NULL-terminated flag table for a name of matching length.  */
11213 for (; flag
->name
!= NULL
; flag
++)
11215 if (length
== strlen (flag
->name
)
11216 && !strncmp (flag
->name
, token
, length
))
11220 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name
, token
);
11224 /* Parse OPTION which is a comma-separated list of flags to enable.
11225 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11226 default state we inherit from the CPU tuning structures. OPTION_NAME
11227 gives the top-level option we are parsing in the -moverride string,
11228 for use in error messages. */
11230 static unsigned int
11231 aarch64_parse_boolean_options (const char *option
,
11232 const struct aarch64_flag_desc
*flags
,
11233 unsigned int initial_state
,
11234 const char *option_name
)
11236 const char separator
= '.';
11237 const char* specs
= option
;
11238 const char* ntoken
= option
;
11239 unsigned int found_flags
= initial_state
;
/* Walk '.'-separated tokens; SPECS tracks the current token start.  */
11241 while ((ntoken
= strchr (specs
, separator
)))
11243 size_t token_length
= ntoken
- specs
;
11244 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
11248 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11249 in the token stream, reset the supported operations. So:
11251 adrp+add.cmp+branch.none.adrp+add
11253 would have the result of turning on only adrp+add fusion. */
11257 found_flags
|= token_ops
;
11261 /* We ended with a comma, print something. */
11264 error ("%s string ill-formed\n", option_name
);
11268 /* We still have one more token to parse. */
11269 size_t token_length
= strlen (specs
);
11270 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
11277 found_flags
|= token_ops
;
11278 return found_flags
;
11281 /* Support for overriding instruction fusion.  Parse FUSE_STRING (from
   -moverride=fuse=...) and store the resulting fusion mask in TUNE.  */
11284 aarch64_parse_fuse_string (const char *fuse_string
,
11285 struct tune_params
*tune
)
11287 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
11288 aarch64_fusible_pairs
,
11293 /* Support for overriding other tuning flags.  Parse TUNE_STRING and
   update TUNE's extra tuning flags accordingly.  */
11296 aarch64_parse_tune_string (const char *tune_string
,
11297 struct tune_params
*tune
)
11299 tune
->extra_tuning_flags
11300 = aarch64_parse_boolean_options (tune_string
,
11301 aarch64_tuning_flags
,
11302 tune
->extra_tuning_flags
,
11306 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11307 Accept the valid SVE vector widths allowed by
11308 aarch64_sve_vector_bits_enum and use it to override sve_width
   in TUNE.  */
11312 aarch64_parse_sve_width_string (const char *tune_string
,
11313 struct tune_params
*tune
)
/* Expect a single decimal integer.  */
11317 int n
= sscanf (tune_string
, "%d", &width
);
11320 error ("invalid format for sve_width");
11332 error ("invalid sve_width value: %d", width
);
11334 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
11337 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
11338 we understand. If it is, extract the option string and handoff to
11339 the appropriate function. */
11342 aarch64_parse_one_override_token (const char* token
,
11344 struct tune_params
*tune
)
11346 const struct aarch64_tuning_override_function
*fn
11347 = aarch64_tuning_override_functions
;
/* Each override token has the form "name=value".  */
11349 const char *option_part
= strchr (token
, '=');
11352 error ("tuning string missing in option (%s)", token
);
11356 /* Get the length of the option name. */
11357 length
= option_part
- token
;
11358 /* Skip the '=' to get to the option string. */
/* Dispatch to the matching override parser from the function table.  */
11361 for (; fn
->name
!= NULL
; fn
++)
11363 if (!strncmp (fn
->name
, token
, length
))
11365 fn
->parse_override (option_part
, tune
);
11370 error ("unknown tuning option (%s)",token
);
11374 /* A checking mechanism for the implementation of the tls size.  Clamp
   aarch64_tls_size to the maximum the selected code model supports and
   default it to 24 bits when unset.  */
11377 initialize_aarch64_tls_size (struct gcc_options
*opts
)
11379 if (aarch64_tls_size
== 0)
11380 aarch64_tls_size
= 24;
11382 switch (opts
->x_aarch64_cmodel_var
)
11384 case AARCH64_CMODEL_TINY
:
11385 /* Both the default and maximum TLS size allowed under tiny is 1M which
11386 needs two instructions to address, so we clamp the size to 24. */
11387 if (aarch64_tls_size
> 24)
11388 aarch64_tls_size
= 24;
11390 case AARCH64_CMODEL_SMALL
:
11391 /* The maximum TLS size allowed under small is 4G. */
11392 if (aarch64_tls_size
> 32)
11393 aarch64_tls_size
= 32;
11395 case AARCH64_CMODEL_LARGE
:
11396 /* The maximum TLS size allowed under large is 16E.
11397 FIXME: 16E should be 64bit, we only support 48bit offset now. */
11398 if (aarch64_tls_size
> 48)
11399 aarch64_tls_size
= 48;
11402 gcc_unreachable ();
11408 /* Parse STRING looking for options in the format:
11409 string :: option:string
11410 option :: name=substring
11412 substring :: defined by option. */
11415 aarch64_parse_override_string (const char* input_string
,
11416 struct tune_params
* tune
)
11418 const char separator
= ':';
/* Work on a NUL-terminated private copy so tokens can be split in place.  */
11419 size_t string_length
= strlen (input_string
) + 1;
11420 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
11421 char *string
= string_root
;
11422 strncpy (string
, input_string
, string_length
);
11423 string
[string_length
- 1] = '\0';
11425 char* ntoken
= string
;
/* Process each ':'-separated "name=value" token in turn.  */
11427 while ((ntoken
= strchr (string
, separator
)))
11429 size_t token_length
= ntoken
- string
;
11430 /* Make this substring look like a string. */
11432 aarch64_parse_one_override_token (string
, token_length
, tune
);
11436 /* One last option to parse. */
11437 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
11438 free (string_root
);
/* Re-apply option-dependent settings in OPTS after an option change.
   Called both at startup and when options change per-function.  */
11443 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
/* Re-install any previously validated -mbranch-protection= string.  */
11445 if (accepted_branch_protection_string
)
11447 opts
->x_aarch64_branch_protection_string
11448 = xstrdup (accepted_branch_protection_string
);
11451 /* PR 70044: We have to be careful about being called multiple times for the
11452 same function. This means all changes should be repeatable. */
11454 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11455 Disable the frame pointer flag so the mid-end will not use a frame
11456 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11457 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11458 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11459 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
11460 if (opts
->x_flag_omit_frame_pointer
== 0)
11461 opts
->x_flag_omit_frame_pointer
= 2;
11463 /* If not optimizing for size, set the default
11464 alignment to what the target wants. */
11465 if (!opts
->x_optimize_size
)
11467 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
11468 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
11469 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
11470 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
11471 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
11472 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
11475 /* We default to no pc-relative literal loads. */
11477 aarch64_pcrelative_literal_loads
= false;
11479 /* If -mpc-relative-literal-loads is set on the command line, this
11480 implies that the user asked for PC relative literal loads. */
11481 if (opts
->x_pcrelative_literal_loads
== 1)
11482 aarch64_pcrelative_literal_loads
= true;
11484 /* In the tiny memory model it makes no sense to disallow PC relative
11485 literal pool loads. */
11486 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
11487 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
11488 aarch64_pcrelative_literal_loads
= true;
11490 /* When enabling the lower precision Newton series for the square root, also
11491 enable it for the reciprocal square root, since the latter is an
11492 intermediary step for the former. */
11493 if (flag_mlow_precision_sqrt
)
11494 flag_mrecip_low_precision_sqrt
= true;
11497 /* 'Unpack' up the internal tuning structs and update the options
11498 in OPTS. The caller must have set up selected_tune and selected_arch
11499 as all the other target-specific codegen decisions are
11500 derived from them. */
11503 aarch64_override_options_internal (struct gcc_options
*opts
)
11505 aarch64_tune_flags
= selected_tune
->flags
;
11506 aarch64_tune
= selected_tune
->sched_core
;
11507 /* Make a copy of the tuning parameters attached to the core, which
11508 we may later overwrite. */
11509 aarch64_tune_params
= *(selected_tune
->tune
);
11510 aarch64_architecture_version
= selected_arch
->architecture_version
;
/* Apply any -moverride= tuning overrides on top of the copied params.  */
11512 if (opts
->x_aarch64_override_tune_string
)
11513 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
11514 &aarch64_tune_params
);
11516 /* This target defaults to strict volatile bitfields. */
11517 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
11518 opts
->x_flag_strict_volatile_bitfields
= 1;
/* Validate the stack-protector guard options.  */
11520 if (aarch64_stack_protector_guard
== SSP_GLOBAL
11521 && opts
->x_aarch64_stack_protector_guard_offset_str
)
11523 error ("incompatible options %<-mstack-protector-guard=global%> and "
11524 "%<-mstack-protector-guard-offset=%s%>",
11525 aarch64_stack_protector_guard_offset_str
);
11528 if (aarch64_stack_protector_guard
== SSP_SYSREG
11529 && !(opts
->x_aarch64_stack_protector_guard_offset_str
11530 && opts
->x_aarch64_stack_protector_guard_reg_str
))
11532 error ("both %<-mstack-protector-guard-offset%> and "
11533 "%<-mstack-protector-guard-reg%> must be used "
11534 "with %<-mstack-protector-guard=sysreg%>");
11537 if (opts
->x_aarch64_stack_protector_guard_reg_str
)
11539 if (strlen (opts
->x_aarch64_stack_protector_guard_reg_str
) > 100)
11540 error ("specify a system register with a small string length.");
11543 if (opts
->x_aarch64_stack_protector_guard_offset_str
)
11546 const char *str
= aarch64_stack_protector_guard_offset_str
;
/* strtol reports failure via ENDPTR and errno; reject empty or trailing
   garbage as well.  */
11548 long offs
= strtol (aarch64_stack_protector_guard_offset_str
, &end
, 0);
11549 if (!*str
|| *end
|| errno
)
11550 error ("%qs is not a valid offset in %qs", str
,
11551 "-mstack-protector-guard-offset=");
11552 aarch64_stack_protector_guard_offset
= offs
;
11555 initialize_aarch64_code_model (opts
);
11556 initialize_aarch64_tls_size (opts
);
/* Translate the tuning autoprefetcher model into a scheduler queue depth.  */
11558 int queue_depth
= 0;
11559 switch (aarch64_tune_params
.autoprefetcher_model
)
11561 case tune_params::AUTOPREFETCHER_OFF
:
11564 case tune_params::AUTOPREFETCHER_WEAK
:
11567 case tune_params::AUTOPREFETCHER_STRONG
:
11568 queue_depth
= max_insn_queue_index
+ 1;
11571 gcc_unreachable ();
11574 /* We don't mind passing in global_options_set here as we don't use
11575 the *options_set structs anyway. */
11576 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
,
11578 opts
->x_param_values
,
11579 global_options_set
.x_param_values
);
11581 /* Set up parameters to be used in prefetching algorithm. Do not
11582 override the defaults unless we are tuning for a core we have
11583 researched values for. */
11584 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
11585 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES
,
11586 aarch64_tune_params
.prefetch
->num_slots
,
11587 opts
->x_param_values
,
11588 global_options_set
.x_param_values
);
11589 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
11590 maybe_set_param_value (PARAM_L1_CACHE_SIZE
,
11591 aarch64_tune_params
.prefetch
->l1_cache_size
,
11592 opts
->x_param_values
,
11593 global_options_set
.x_param_values
);
11594 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
11595 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE
,
11596 aarch64_tune_params
.prefetch
->l1_cache_line_size
,
11597 opts
->x_param_values
,
11598 global_options_set
.x_param_values
);
11599 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
11600 maybe_set_param_value (PARAM_L2_CACHE_SIZE
,
11601 aarch64_tune_params
.prefetch
->l2_cache_size
,
11602 opts
->x_param_values
,
11603 global_options_set
.x_param_values
);
11604 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
11605 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES
,
11607 opts
->x_param_values
,
11608 global_options_set
.x_param_values
);
11609 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
11610 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE
,
11611 aarch64_tune_params
.prefetch
->minimum_stride
,
11612 opts
->x_param_values
,
11613 global_options_set
.x_param_values
);
11615 /* Use the alternative scheduling-pressure algorithm by default. */
11616 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM
, SCHED_PRESSURE_MODEL
,
11617 opts
->x_param_values
,
11618 global_options_set
.x_param_values
);
11620 /* If the user hasn't changed it via configure then set the default to 64 KB
11621 for the backend. */
11622 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
,
11623 DEFAULT_STK_CLASH_GUARD_SIZE
== 0
11624 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE
,
11625 opts
->x_param_values
,
11626 global_options_set
.x_param_values
);
11628 /* Validate the guard size. */
11629 int guard_size
= PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
11631 /* Enforce that interval is the same size as size so the mid-end does the
   right thing.  */
11633 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
,
11635 opts
->x_param_values
,
11636 global_options_set
.x_param_values
);
11638 /* The maybe_set calls won't update the value if the user has explicitly set
11639 one. Which means we need to validate that probing interval and guard size
   are equal.  */
11642 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
);
11643 if (guard_size
!= probe_interval
)
11644 error ("stack clash guard size %<%d%> must be equal to probing interval "
11645 "%<%d%>", guard_size
, probe_interval
);
11647 /* Enable sw prefetching at specified optimization level for
11648 CPUS that have prefetch. Lower optimization level threshold by 1
11649 when profiling is enabled. */
11650 if (opts
->x_flag_prefetch_loop_arrays
< 0
11651 && !opts
->x_optimize_size
11652 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
11653 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
11654 opts
->x_flag_prefetch_loop_arrays
= 1;
/* Fill in defaults for any arch/cpu/tune strings not given explicitly.  */
11656 if (opts
->x_aarch64_arch_string
== NULL
)
11657 opts
->x_aarch64_arch_string
= selected_arch
->name
;
11658 if (opts
->x_aarch64_cpu_string
== NULL
)
11659 opts
->x_aarch64_cpu_string
= selected_cpu
->name
;
11660 if (opts
->x_aarch64_tune_string
== NULL
)
11661 opts
->x_aarch64_tune_string
= selected_tune
->name
;
11663 aarch64_override_options_after_change_1 (opts
);
11666 /* Print a hint with a suggestion for a core or architecture name that
11667 most closely resembles what the user passed in STR. ARCH is true if
11668 the user is asking for an architecture name. ARCH is false if the user
11669 is asking for a core name. */
11672 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
/* Collect all valid names, then ask the spell-checker for the closest.  */
11674 auto_vec
<const char *> candidates
;
11675 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
11676 for (; entry
->name
!= NULL
; entry
++)
11677 candidates
.safe_push (entry
->name
);
11679 #ifdef HAVE_LOCAL_CPU_DETECT
11680 /* Add also "native" as possible value. */
11682 candidates
.safe_push ("native");
11686 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
/* With a close match, suggest it; otherwise just list the valid names.  */
11688 inform (input_location
, "valid arguments are: %s;"
11689 " did you mean %qs?", s
, hint
);
11691 inform (input_location
, "valid arguments are: %s", s
);
11696 /* Print a hint with a suggestion for a core name that most closely resembles
11697 what the user passed in STR. */
11700 aarch64_print_hint_for_core (const char *str
)
11702 aarch64_print_hint_for_core_or_arch (str
, false);
11705 /* Print a hint with a suggestion for an architecture name that most closely
11706 resembles what the user passed in STR. */
11709 aarch64_print_hint_for_arch (const char *str
)
11711 aarch64_print_hint_for_core_or_arch (str
, true);
11715 /* Print a hint with a suggestion for an extension name
11716 that most closely resembles what the user passed in STR. */
11719 aarch64_print_hint_for_extensions (const std::string
&str
)
/* Collect all known extension names and spell-check against them.  */
11721 auto_vec
<const char *> candidates
;
11722 aarch64_get_all_extension_candidates (&candidates
);
11724 const char *hint
= candidates_list_and_hint (str
.c_str (), s
, candidates
);
/* With a close match, suggest it; otherwise just list the valid names.  */
11726 inform (input_location
, "valid arguments are: %s;"
11727 " did you mean %qs?", s
, hint
);
11729 inform (input_location
, "valid arguments are: %s;", s
);
11734 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11735 specified in STR and throw errors if appropriate. Put the results if
11736 they are valid in RES and ISA_FLAGS. Return whether the option is
   valid.  */
11740 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
11741 uint64_t *isa_flags
)
11743 std::string invalid_extension
;
11744 enum aarch64_parse_opt_result parse_res
11745 = aarch64_parse_cpu (str
, res
, isa_flags
, &invalid_extension
);
11747 if (parse_res
== AARCH64_PARSE_OK
)
/* Report a specific diagnostic for each kind of parse failure.  */
11752 case AARCH64_PARSE_MISSING_ARG
:
11753 error ("missing cpu name in %<-mcpu=%s%>", str
);
11755 case AARCH64_PARSE_INVALID_ARG
:
11756 error ("unknown value %qs for %<-mcpu%>", str
);
11757 aarch64_print_hint_for_core (str
);
11759 case AARCH64_PARSE_INVALID_FEATURE
:
11760 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11761 invalid_extension
.c_str (), str
);
11762 aarch64_print_hint_for_extensions (invalid_extension
);
11765 gcc_unreachable ();
11771 /* Parses CONST_STR for branch protection features specified in
11772 aarch64_branch_protect_types, and set any global variables required. Returns
11773 the parsing result and assigns LAST_STR to the last processed token from
11774 CONST_STR so that it can be used for error reporting. */
11777 aarch64_parse_opt_result
aarch64_parse_branch_protection (const char *const_str
,
11780 char *str_root
= xstrdup (const_str
);
11781 char* token_save
= NULL
;
11782 char *str
= strtok_r (str_root
, "+", &token_save
);
11783 enum aarch64_parse_opt_result res
= AARCH64_PARSE_OK
;
11785 res
= AARCH64_PARSE_MISSING_ARG
;
11788 char *next_str
= strtok_r (NULL
, "+", &token_save
);
11789 /* Reset the branch protection features to their defaults. */
11790 aarch64_handle_no_branch_protection (NULL
, NULL
);
11792 while (str
&& res
== AARCH64_PARSE_OK
)
11794 const aarch64_branch_protect_type
* type
= aarch64_branch_protect_types
;
11795 bool found
= false;
11796 /* Search for this type. */
11797 while (type
&& type
->name
&& !found
&& res
== AARCH64_PARSE_OK
)
11799 if (strcmp (str
, type
->name
) == 0)
11802 res
= type
->handler (str
, next_str
);
11804 next_str
= strtok_r (NULL
, "+", &token_save
);
11809 if (found
&& res
== AARCH64_PARSE_OK
)
11811 bool found_subtype
= true;
11812 /* Loop through each token until we find one that isn't a
11814 while (found_subtype
)
11816 found_subtype
= false;
11817 const aarch64_branch_protect_type
*subtype
= type
->subtypes
;
11818 /* Search for the subtype. */
11819 while (str
&& subtype
&& subtype
->name
&& !found_subtype
11820 && res
== AARCH64_PARSE_OK
)
11822 if (strcmp (str
, subtype
->name
) == 0)
11824 found_subtype
= true;
11825 res
= subtype
->handler (str
, next_str
);
11827 next_str
= strtok_r (NULL
, "+", &token_save
);
11835 res
= AARCH64_PARSE_INVALID_ARG
;
11838 /* Copy the last processed token into the argument to pass it back.
11839 Used by option and attribute validation to print the offending token. */
11842 if (str
) strcpy (*last_str
, str
);
11843 else *last_str
= NULL
;
11845 if (res
== AARCH64_PARSE_OK
)
11847 /* If needed, alloc the accepted string then copy in const_str.
11848 Used by override_option_after_change_1. */
11849 if (!accepted_branch_protection_string
)
11850 accepted_branch_protection_string
= (char *) xmalloc (
11851 BRANCH_PROTECT_STR_MAX
11853 strncpy (accepted_branch_protection_string
, const_str
,
11854 BRANCH_PROTECT_STR_MAX
+ 1);
11855 /* Forcibly null-terminate. */
11856 accepted_branch_protection_string
[BRANCH_PROTECT_STR_MAX
] = '\0';
11862 aarch64_validate_mbranch_protection (const char *const_str
)
11864 char *str
= (char *) xmalloc (strlen (const_str
));
11865 enum aarch64_parse_opt_result res
=
11866 aarch64_parse_branch_protection (const_str
, &str
);
11867 if (res
== AARCH64_PARSE_INVALID_ARG
)
11868 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str
);
11869 else if (res
== AARCH64_PARSE_MISSING_ARG
)
11870 error ("missing argument for %<-mbranch-protection=%>");
11872 return res
== AARCH64_PARSE_OK
;
11875 /* Validate a command-line -march option. Parse the arch and extensions
11876 (if any) specified in STR and throw errors if appropriate. Put the
11877 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11878 option is valid. */
11881 aarch64_validate_march (const char *str
, const struct processor
**res
,
11882 uint64_t *isa_flags
)
11884 std::string invalid_extension
;
11885 enum aarch64_parse_opt_result parse_res
11886 = aarch64_parse_arch (str
, res
, isa_flags
, &invalid_extension
);
11888 if (parse_res
== AARCH64_PARSE_OK
)
11893 case AARCH64_PARSE_MISSING_ARG
:
11894 error ("missing arch name in %<-march=%s%>", str
);
11896 case AARCH64_PARSE_INVALID_ARG
:
11897 error ("unknown value %qs for %<-march%>", str
);
11898 aarch64_print_hint_for_arch (str
);
11900 case AARCH64_PARSE_INVALID_FEATURE
:
11901 error ("invalid feature modifier %qs in %<-march=%s%>",
11902 invalid_extension
.c_str (), str
);
11903 aarch64_print_hint_for_extensions (invalid_extension
);
11906 gcc_unreachable ();
11912 /* Validate a command-line -mtune option. Parse the cpu
11913 specified in STR and throw errors if appropriate. Put the
11914 result, if it is valid, in RES. Return whether the option is
11918 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
11920 enum aarch64_parse_opt_result parse_res
11921 = aarch64_parse_tune (str
, res
);
11923 if (parse_res
== AARCH64_PARSE_OK
)
11928 case AARCH64_PARSE_MISSING_ARG
:
11929 error ("missing cpu name in %<-mtune=%s%>", str
);
11931 case AARCH64_PARSE_INVALID_ARG
:
11932 error ("unknown value %qs for %<-mtune%>", str
);
11933 aarch64_print_hint_for_core (str
);
11936 gcc_unreachable ();
11941 /* Return the CPU corresponding to the enum CPU.
11942 If it doesn't specify a cpu, return the default. */
11944 static const struct processor
*
11945 aarch64_get_tune_cpu (enum aarch64_processor cpu
)
11947 if (cpu
!= aarch64_none
)
11948 return &all_cores
[cpu
];
11950 /* The & 0x3f is to extract the bottom 6 bits that encode the
11951 default cpu as selected by the --with-cpu GCC configure option
11953 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11954 flags mechanism should be reworked to make it more sane. */
11955 return &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
11958 /* Return the architecture corresponding to the enum ARCH.
11959 If it doesn't specify a valid architecture, return the default. */
11961 static const struct processor
*
11962 aarch64_get_arch (enum aarch64_arch arch
)
11964 if (arch
!= aarch64_no_arch
)
11965 return &all_architectures
[arch
];
11967 const struct processor
*cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
11969 return &all_architectures
[cpu
->arch
];
11972 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
11975 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value
)
11977 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
11978 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
11979 deciding which .md file patterns to use and when deciding whether
11980 something is a legitimate address or constant. */
11981 if (value
== SVE_SCALABLE
|| value
== SVE_128
)
11982 return poly_uint16 (2, 2);
11984 return (int) value
/ 64;
11987 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
11988 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
11989 tuning structs. In particular it must set selected_tune and
11990 aarch64_isa_flags that define the available ISA features and tuning
11991 decisions. It must also set selected_arch as this will be used to
11992 output the .arch asm tags for each function. */
11995 aarch64_override_options (void)
11997 uint64_t cpu_isa
= 0;
11998 uint64_t arch_isa
= 0;
11999 aarch64_isa_flags
= 0;
12001 bool valid_cpu
= true;
12002 bool valid_tune
= true;
12003 bool valid_arch
= true;
12005 selected_cpu
= NULL
;
12006 selected_arch
= NULL
;
12007 selected_tune
= NULL
;
12009 if (aarch64_branch_protection_string
)
12010 aarch64_validate_mbranch_protection (aarch64_branch_protection_string
);
12012 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12013 If either of -march or -mtune is given, they override their
12014 respective component of -mcpu. */
12015 if (aarch64_cpu_string
)
12016 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
12019 if (aarch64_arch_string
)
12020 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
12023 if (aarch64_tune_string
)
12024 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
12026 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12027 SUBTARGET_OVERRIDE_OPTIONS
;
12030 /* If the user did not specify a processor, choose the default
12031 one for them. This will be the CPU set during configuration using
12032 --with-cpu, otherwise it is "generic". */
12037 selected_cpu
= &all_cores
[selected_arch
->ident
];
12038 aarch64_isa_flags
= arch_isa
;
12039 explicit_arch
= selected_arch
->arch
;
12043 /* Get default configure-time CPU. */
12044 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
12045 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
12049 explicit_tune_core
= selected_tune
->ident
;
12051 /* If both -mcpu and -march are specified check that they are architecturally
12052 compatible, warn if they're not and prefer the -march ISA flags. */
12053 else if (selected_arch
)
12055 if (selected_arch
->arch
!= selected_cpu
->arch
)
12057 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12058 all_architectures
[selected_cpu
->arch
].name
,
12059 selected_arch
->name
);
12061 aarch64_isa_flags
= arch_isa
;
12062 explicit_arch
= selected_arch
->arch
;
12063 explicit_tune_core
= selected_tune
? selected_tune
->ident
12064 : selected_cpu
->ident
;
12068 /* -mcpu but no -march. */
12069 aarch64_isa_flags
= cpu_isa
;
12070 explicit_tune_core
= selected_tune
? selected_tune
->ident
12071 : selected_cpu
->ident
;
12072 gcc_assert (selected_cpu
);
12073 selected_arch
= &all_architectures
[selected_cpu
->arch
];
12074 explicit_arch
= selected_arch
->arch
;
12077 /* Set the arch as well as we will need it when outputing
12078 the .arch directive in assembly. */
12079 if (!selected_arch
)
12081 gcc_assert (selected_cpu
);
12082 selected_arch
= &all_architectures
[selected_cpu
->arch
];
12085 if (!selected_tune
)
12086 selected_tune
= selected_cpu
;
12088 if (aarch64_enable_bti
== 2)
12090 #ifdef TARGET_ENABLE_BTI
12091 aarch64_enable_bti
= 1;
12093 aarch64_enable_bti
= 0;
12097 /* Return address signing is currently not supported for ILP32 targets. For
12098 LP64 targets use the configured option in the absence of a command-line
12099 option for -mbranch-protection. */
12100 if (!TARGET_ILP32
&& accepted_branch_protection_string
== NULL
)
12102 #ifdef TARGET_ENABLE_PAC_RET
12103 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
12105 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
12109 #ifndef HAVE_AS_MABI_OPTION
12110 /* The compiler may have been configured with 2.23.* binutils, which does
12111 not have support for ILP32. */
12113 error ("assembler does not support %<-mabi=ilp32%>");
12116 /* Convert -msve-vector-bits to a VG count. */
12117 aarch64_sve_vg
= aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits
);
12119 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
12120 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12122 /* Make sure we properly set up the explicit options. */
12123 if ((aarch64_cpu_string
&& valid_cpu
)
12124 || (aarch64_tune_string
&& valid_tune
))
12125 gcc_assert (explicit_tune_core
!= aarch64_none
);
12127 if ((aarch64_cpu_string
&& valid_cpu
)
12128 || (aarch64_arch_string
&& valid_arch
))
12129 gcc_assert (explicit_arch
!= aarch64_no_arch
);
12131 /* The pass to insert speculation tracking runs before
12132 shrink-wrapping and the latter does not know how to update the
12133 tracking status. So disable it in this case. */
12134 if (aarch64_track_speculation
)
12135 flag_shrink_wrap
= 0;
12137 aarch64_override_options_internal (&global_options
);
12139 /* Save these options as the default ones in case we push and pop them later
12140 while processing functions with potential target attributes. */
12141 target_option_default_node
= target_option_current_node
12142 = build_target_option_node (&global_options
);
12145 /* Implement targetm.override_options_after_change. */
12148 aarch64_override_options_after_change (void)
12150 aarch64_override_options_after_change_1 (&global_options
);
12153 static struct machine_function
*
12154 aarch64_init_machine_status (void)
12156 struct machine_function
*machine
;
12157 machine
= ggc_cleared_alloc
<machine_function
> ();
12162 aarch64_init_expanders (void)
12164 init_machine_status
= aarch64_init_machine_status
;
12167 /* A checking mechanism for the implementation of the various code models. */
12169 initialize_aarch64_code_model (struct gcc_options
*opts
)
12171 if (opts
->x_flag_pic
)
12173 switch (opts
->x_aarch64_cmodel_var
)
12175 case AARCH64_CMODEL_TINY
:
12176 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
12178 case AARCH64_CMODEL_SMALL
:
12179 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12180 aarch64_cmodel
= (flag_pic
== 2
12181 ? AARCH64_CMODEL_SMALL_PIC
12182 : AARCH64_CMODEL_SMALL_SPIC
);
12184 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
12187 case AARCH64_CMODEL_LARGE
:
12188 sorry ("code model %qs with %<-f%s%>", "large",
12189 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
12192 gcc_unreachable ();
12196 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
12199 /* Implement TARGET_OPTION_SAVE. */
12202 aarch64_option_save (struct cl_target_option
*ptr
, struct gcc_options
*opts
)
12204 ptr
->x_aarch64_override_tune_string
= opts
->x_aarch64_override_tune_string
;
12205 ptr
->x_aarch64_branch_protection_string
12206 = opts
->x_aarch64_branch_protection_string
;
12209 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12210 using the information saved in PTR. */
12213 aarch64_option_restore (struct gcc_options
*opts
, struct cl_target_option
*ptr
)
12215 opts
->x_explicit_tune_core
= ptr
->x_explicit_tune_core
;
12216 selected_tune
= aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
12217 opts
->x_explicit_arch
= ptr
->x_explicit_arch
;
12218 selected_arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
12219 opts
->x_aarch64_override_tune_string
= ptr
->x_aarch64_override_tune_string
;
12220 opts
->x_aarch64_branch_protection_string
12221 = ptr
->x_aarch64_branch_protection_string
;
12222 if (opts
->x_aarch64_branch_protection_string
)
12224 aarch64_parse_branch_protection (opts
->x_aarch64_branch_protection_string
,
12228 aarch64_override_options_internal (opts
);
12231 /* Implement TARGET_OPTION_PRINT. */
12234 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
12236 const struct processor
*cpu
12237 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
12238 uint64_t isa_flags
= ptr
->x_aarch64_isa_flags
;
12239 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
12240 std::string extension
12241 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
12243 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
12244 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
12245 arch
->name
, extension
.c_str ());
12248 static GTY(()) tree aarch64_previous_fndecl
;
12251 aarch64_reset_previous_fndecl (void)
12253 aarch64_previous_fndecl
= NULL
;
12256 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12257 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12258 make sure optab availability predicates are recomputed when necessary. */
12261 aarch64_save_restore_target_globals (tree new_tree
)
12263 if (TREE_TARGET_GLOBALS (new_tree
))
12264 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
12265 else if (new_tree
== target_option_default_node
)
12266 restore_target_globals (&default_target_globals
);
12268 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
12271 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12272 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12273 of the function, if such exists. This function may be called multiple
12274 times on a single function so use aarch64_previous_fndecl to avoid
12275 setting up identical state. */
12278 aarch64_set_current_function (tree fndecl
)
12280 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
12283 tree old_tree
= (aarch64_previous_fndecl
12284 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
12287 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
12289 /* If current function has no attributes but the previous one did,
12290 use the default node. */
12291 if (!new_tree
&& old_tree
)
12292 new_tree
= target_option_default_node
;
12294 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12295 the default have been handled by aarch64_save_restore_target_globals from
12296 aarch64_pragma_target_parse. */
12297 if (old_tree
== new_tree
)
12300 aarch64_previous_fndecl
= fndecl
;
12302 /* First set the target options. */
12303 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
12305 aarch64_save_restore_target_globals (new_tree
);
12308 /* Enum describing the various ways we can handle attributes.
12309 In many cases we can reuse the generic option handling machinery. */
12311 enum aarch64_attr_opt_type
12313 aarch64_attr_mask
, /* Attribute should set a bit in target_flags. */
12314 aarch64_attr_bool
, /* Attribute sets or unsets a boolean variable. */
12315 aarch64_attr_enum
, /* Attribute sets an enum variable. */
12316 aarch64_attr_custom
/* Attribute requires a custom handling function. */
12319 /* All the information needed to handle a target attribute.
12320 NAME is the name of the attribute.
12321 ATTR_TYPE specifies the type of behavior of the attribute as described
12322 in the definition of enum aarch64_attr_opt_type.
12323 ALLOW_NEG is true if the attribute supports a "no-" form.
12324 HANDLER is the function that takes the attribute string as an argument
12325 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12326 OPT_NUM is the enum specifying the option that the attribute modifies.
12327 This is needed for attributes that mirror the behavior of a command-line
12328 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12329 aarch64_attr_enum. */
12331 struct aarch64_attribute_info
12334 enum aarch64_attr_opt_type attr_type
;
12336 bool (*handler
) (const char *);
12337 enum opt_code opt_num
;
12340 /* Handle the ARCH_STR argument to the arch= target attribute. */
12343 aarch64_handle_attr_arch (const char *str
)
12345 const struct processor
*tmp_arch
= NULL
;
12346 std::string invalid_extension
;
12347 enum aarch64_parse_opt_result parse_res
12348 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
, &invalid_extension
);
12350 if (parse_res
== AARCH64_PARSE_OK
)
12352 gcc_assert (tmp_arch
);
12353 selected_arch
= tmp_arch
;
12354 explicit_arch
= selected_arch
->arch
;
12360 case AARCH64_PARSE_MISSING_ARG
:
12361 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12363 case AARCH64_PARSE_INVALID_ARG
:
12364 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str
);
12365 aarch64_print_hint_for_arch (str
);
12367 case AARCH64_PARSE_INVALID_FEATURE
:
12368 error ("invalid feature modifier %s of value (\"%s\") in "
12369 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
12370 aarch64_print_hint_for_extensions (invalid_extension
);
12373 gcc_unreachable ();
12379 /* Handle the argument CPU_STR to the cpu= target attribute. */
12382 aarch64_handle_attr_cpu (const char *str
)
12384 const struct processor
*tmp_cpu
= NULL
;
12385 std::string invalid_extension
;
12386 enum aarch64_parse_opt_result parse_res
12387 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
, &invalid_extension
);
12389 if (parse_res
== AARCH64_PARSE_OK
)
12391 gcc_assert (tmp_cpu
);
12392 selected_tune
= tmp_cpu
;
12393 explicit_tune_core
= selected_tune
->ident
;
12395 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
12396 explicit_arch
= selected_arch
->arch
;
12402 case AARCH64_PARSE_MISSING_ARG
:
12403 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12405 case AARCH64_PARSE_INVALID_ARG
:
12406 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str
);
12407 aarch64_print_hint_for_core (str
);
12409 case AARCH64_PARSE_INVALID_FEATURE
:
12410 error ("invalid feature modifier %s of value (\"%s\") in "
12411 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
12412 aarch64_print_hint_for_extensions (invalid_extension
);
12415 gcc_unreachable ();
12421 /* Handle the argument STR to the branch-protection= attribute. */
12424 aarch64_handle_attr_branch_protection (const char* str
)
12426 char *err_str
= (char *) xmalloc (strlen (str
));
12427 enum aarch64_parse_opt_result res
= aarch64_parse_branch_protection (str
,
12429 bool success
= false;
12432 case AARCH64_PARSE_MISSING_ARG
:
12433 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12436 case AARCH64_PARSE_INVALID_ARG
:
12437 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12438 "=\")%> pragma or attribute", err_str
);
12440 case AARCH64_PARSE_OK
:
12442 /* Fall through. */
12443 case AARCH64_PARSE_INVALID_FEATURE
:
12446 gcc_unreachable ();
12452 /* Handle the argument STR to the tune= target attribute. */
12455 aarch64_handle_attr_tune (const char *str
)
12457 const struct processor
*tmp_tune
= NULL
;
12458 enum aarch64_parse_opt_result parse_res
12459 = aarch64_parse_tune (str
, &tmp_tune
);
12461 if (parse_res
== AARCH64_PARSE_OK
)
12463 gcc_assert (tmp_tune
);
12464 selected_tune
= tmp_tune
;
12465 explicit_tune_core
= selected_tune
->ident
;
12471 case AARCH64_PARSE_INVALID_ARG
:
12472 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str
);
12473 aarch64_print_hint_for_core (str
);
12476 gcc_unreachable ();
12482 /* Parse an architecture extensions target attribute string specified in STR.
12483 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12484 if successful. Update aarch64_isa_flags to reflect the ISA features
12488 aarch64_handle_attr_isa_flags (char *str
)
12490 enum aarch64_parse_opt_result parse_res
;
12491 uint64_t isa_flags
= aarch64_isa_flags
;
12493 /* We allow "+nothing" in the beginning to clear out all architectural
12494 features if the user wants to handpick specific features. */
12495 if (strncmp ("+nothing", str
, 8) == 0)
12501 std::string invalid_extension
;
12502 parse_res
= aarch64_parse_extension (str
, &isa_flags
, &invalid_extension
);
12504 if (parse_res
== AARCH64_PARSE_OK
)
12506 aarch64_isa_flags
= isa_flags
;
12512 case AARCH64_PARSE_MISSING_ARG
:
12513 error ("missing value in %<target()%> pragma or attribute");
12516 case AARCH64_PARSE_INVALID_FEATURE
:
12517 error ("invalid feature modifier %s of value (\"%s\") in "
12518 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
12522 gcc_unreachable ();
12528 /* The target attributes that we support. On top of these we also support just
12529 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12530 handled explicitly in aarch64_process_one_target_attr. */
12532 static const struct aarch64_attribute_info aarch64_attributes
[] =
12534 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
12535 OPT_mgeneral_regs_only
},
12536 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
12537 OPT_mfix_cortex_a53_835769
},
12538 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
12539 OPT_mfix_cortex_a53_843419
},
12540 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
12541 { "strict-align", aarch64_attr_mask
, true, NULL
, OPT_mstrict_align
},
12542 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
12543 OPT_momit_leaf_frame_pointer
},
12544 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
12545 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
12547 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
12548 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
12550 { "branch-protection", aarch64_attr_custom
, false,
12551 aarch64_handle_attr_branch_protection
, OPT_mbranch_protection_
},
12552 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
12553 OPT_msign_return_address_
},
12554 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
12557 /* Parse ARG_STR which contains the definition of one target attribute.
12558 Show appropriate errors if any or return true if the attribute is valid. */
12561 aarch64_process_one_target_attr (char *arg_str
)
12563 bool invert
= false;
12565 size_t len
= strlen (arg_str
);
12569 error ("malformed %<target()%> pragma or attribute");
12573 char *str_to_check
= (char *) alloca (len
+ 1);
12574 strcpy (str_to_check
, arg_str
);
12576 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12577 It is easier to detect and handle it explicitly here rather than going
12578 through the machinery for the rest of the target attributes in this
12580 if (*str_to_check
== '+')
12581 return aarch64_handle_attr_isa_flags (str_to_check
);
12583 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
12588 char *arg
= strchr (str_to_check
, '=');
12590 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12591 and point ARG to "foo". */
12597 const struct aarch64_attribute_info
*p_attr
;
12598 bool found
= false;
12599 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
12601 /* If the names don't match up, or the user has given an argument
12602 to an attribute that doesn't accept one, or didn't give an argument
12603 to an attribute that expects one, fail to match. */
12604 if (strcmp (str_to_check
, p_attr
->name
) != 0)
12608 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
12609 || p_attr
->attr_type
== aarch64_attr_enum
;
12611 if (attr_need_arg_p
^ (arg
!= NULL
))
12613 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
12617 /* If the name matches but the attribute does not allow "no-" versions
12618 then we can't match. */
12619 if (invert
&& !p_attr
->allow_neg
)
12621 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
12625 switch (p_attr
->attr_type
)
12627 /* Has a custom handler registered.
12628 For example, cpu=, arch=, tune=. */
12629 case aarch64_attr_custom
:
12630 gcc_assert (p_attr
->handler
);
12631 if (!p_attr
->handler (arg
))
12635 /* Either set or unset a boolean option. */
12636 case aarch64_attr_bool
:
12638 struct cl_decoded_option decoded
;
12640 generate_option (p_attr
->opt_num
, NULL
, !invert
,
12641 CL_TARGET
, &decoded
);
12642 aarch64_handle_option (&global_options
, &global_options_set
,
12643 &decoded
, input_location
);
12646 /* Set or unset a bit in the target_flags. aarch64_handle_option
12647 should know what mask to apply given the option number. */
12648 case aarch64_attr_mask
:
12650 struct cl_decoded_option decoded
;
12651 /* We only need to specify the option number.
12652 aarch64_handle_option will know which mask to apply. */
12653 decoded
.opt_index
= p_attr
->opt_num
;
12654 decoded
.value
= !invert
;
12655 aarch64_handle_option (&global_options
, &global_options_set
,
12656 &decoded
, input_location
);
12659 /* Use the option setting machinery to set an option to an enum. */
12660 case aarch64_attr_enum
:
12665 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
12666 &value
, CL_TARGET
);
12669 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
12670 NULL
, DK_UNSPECIFIED
, input_location
,
12675 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
12680 gcc_unreachable ();
12684 /* If we reached here we either have found an attribute and validated
12685 it or didn't match any. If we matched an attribute but its arguments
12686 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
12708 /* Parse the tree in ARGS that contains the target attribute information
12709 and update the global target options space. */
12712 aarch64_process_target_attr (tree args
)
12714 if (TREE_CODE (args
) == TREE_LIST
)
12718 tree head
= TREE_VALUE (args
);
12721 if (!aarch64_process_target_attr (head
))
12724 args
= TREE_CHAIN (args
);
12730 if (TREE_CODE (args
) != STRING_CST
)
12732 error ("attribute %<target%> argument not a string");
12736 size_t len
= strlen (TREE_STRING_POINTER (args
));
12737 char *str_to_check
= (char *) alloca (len
+ 1);
12738 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
12742 error ("malformed %<target()%> pragma or attribute");
12746 /* Used to catch empty spaces between commas i.e.
12747 attribute ((target ("attr1,,attr2"))). */
12748 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
12750 /* Handle multiple target attributes separated by ','. */
12751 char *token
= strtok_r (str_to_check
, ",", &str_to_check
);
12753 unsigned int num_attrs
= 0;
12757 if (!aarch64_process_one_target_attr (token
))
12759 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
12763 token
= strtok_r (NULL
, ",", &str_to_check
);
12766 if (num_attrs
!= num_commas
+ 1)
12768 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
12775 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12776 process attribute ((target ("..."))). */
12779 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
12781 struct cl_target_option cur_target
;
12784 tree new_target
, new_optimize
;
12785 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
12787 /* If what we're processing is the current pragma string then the
12788 target option node is already stored in target_option_current_node
12789 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12790 having to re-parse the string. This is especially useful to keep
12791 arm_neon.h compile times down since that header contains a lot
12792 of intrinsics enclosed in pragmas. */
12793 if (!existing_target
&& args
== current_target_pragma
)
12795 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
12798 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
12800 old_optimize
= build_optimization_node (&global_options
);
12801 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
12803 /* If the function changed the optimization levels as well as setting
12804 target options, start with the optimizations specified. */
12805 if (func_optimize
&& func_optimize
!= old_optimize
)
12806 cl_optimization_restore (&global_options
,
12807 TREE_OPTIMIZATION (func_optimize
));
12809 /* Save the current target options to restore at the end. */
12810 cl_target_option_save (&cur_target
, &global_options
);
12812 /* If fndecl already has some target attributes applied to it, unpack
12813 them so that we add this attribute on top of them, rather than
12814 overwriting them. */
12815 if (existing_target
)
12817 struct cl_target_option
*existing_options
12818 = TREE_TARGET_OPTION (existing_target
);
12820 if (existing_options
)
12821 cl_target_option_restore (&global_options
, existing_options
);
12824 cl_target_option_restore (&global_options
,
12825 TREE_TARGET_OPTION (target_option_current_node
));
12827 ret
= aarch64_process_target_attr (args
);
12829 /* Set up any additional state. */
12832 aarch64_override_options_internal (&global_options
);
12833 /* Initialize SIMD builtins if we haven't already.
12834 Set current_target_pragma to NULL for the duration so that
12835 the builtin initialization code doesn't try to tag the functions
12836 being built with the attributes specified by any current pragma, thus
12837 going into an infinite recursion. */
12840 tree saved_current_target_pragma
= current_target_pragma
;
12841 current_target_pragma
= NULL
;
12842 aarch64_init_simd_builtins ();
12843 current_target_pragma
= saved_current_target_pragma
;
12845 new_target
= build_target_option_node (&global_options
);
12850 new_optimize
= build_optimization_node (&global_options
);
12854 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
12856 if (old_optimize
!= new_optimize
)
12857 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
12860 cl_target_option_restore (&global_options
, &cur_target
);
12862 if (old_optimize
!= new_optimize
)
12863 cl_optimization_restore (&global_options
,
12864 TREE_OPTIMIZATION (old_optimize
));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
12889 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12890 to inline CALLEE into CALLER based on target-specific info.
12891 Make sure that the caller and callee have compatible architectural
12892 features. Then go through the other possible target attributes
12893 and see if they can block inlining. Try not to reject always_inline
12894 callees unless they are incompatible architecturally. */
12897 aarch64_can_inline_p (tree caller
, tree callee
)
12899 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
12900 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
12902 struct cl_target_option
*caller_opts
12903 = TREE_TARGET_OPTION (caller_tree
? caller_tree
12904 : target_option_default_node
);
12906 struct cl_target_option
*callee_opts
12907 = TREE_TARGET_OPTION (callee_tree
? callee_tree
12908 : target_option_default_node
);
12910 /* Callee's ISA flags should be a subset of the caller's. */
12911 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
12912 != callee_opts
->x_aarch64_isa_flags
)
12915 /* Allow non-strict aligned functions inlining into strict
12917 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
12918 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
12919 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
12920 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
12923 bool always_inline
= lookup_attribute ("always_inline",
12924 DECL_ATTRIBUTES (callee
));
12926 /* If the architectural features match up and the callee is always_inline
12927 then the other attributes don't matter. */
12931 if (caller_opts
->x_aarch64_cmodel_var
12932 != callee_opts
->x_aarch64_cmodel_var
)
12935 if (caller_opts
->x_aarch64_tls_dialect
12936 != callee_opts
->x_aarch64_tls_dialect
)
12939 /* Honour explicit requests to workaround errata. */
12940 if (!aarch64_tribools_ok_for_inlining_p (
12941 caller_opts
->x_aarch64_fix_a53_err835769
,
12942 callee_opts
->x_aarch64_fix_a53_err835769
,
12943 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
12946 if (!aarch64_tribools_ok_for_inlining_p (
12947 caller_opts
->x_aarch64_fix_a53_err843419
,
12948 callee_opts
->x_aarch64_fix_a53_err843419
,
12949 2, TARGET_FIX_ERR_A53_843419
))
12952 /* If the user explicitly specified -momit-leaf-frame-pointer for the
12953 caller and calle and they don't match up, reject inlining. */
12954 if (!aarch64_tribools_ok_for_inlining_p (
12955 caller_opts
->x_flag_omit_leaf_frame_pointer
,
12956 callee_opts
->x_flag_omit_leaf_frame_pointer
,
12960 /* If the callee has specific tuning overrides, respect them. */
12961 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
12962 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
12965 /* If the user specified tuning override strings for the
12966 caller and callee and they don't match up, reject inlining.
12967 We just do a string compare here, we don't analyze the meaning
12968 of the string, as it would be too costly for little gain. */
12969 if (callee_opts
->x_aarch64_override_tune_string
12970 && caller_opts
->x_aarch64_override_tune_string
12971 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
12972 caller_opts
->x_aarch64_override_tune_string
) != 0))
12978 /* Return true if SYMBOL_REF X binds locally. */
12981 aarch64_symbol_binds_local_p (const_rtx x
)
12983 return (SYMBOL_REF_DECL (x
)
12984 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
12985 : SYMBOL_REF_LOCAL_P (x
));
12988 /* Return true if SYMBOL_REF X is thread local */
12990 aarch64_tls_symbol_p (rtx x
)
12992 if (! TARGET_HAVE_TLS
)
12995 if (GET_CODE (x
) != SYMBOL_REF
)
12998 return SYMBOL_REF_TLS_MODEL (x
) != 0;
13001 /* Classify a TLS symbol into one of the TLS kinds. */
13002 enum aarch64_symbol_type
13003 aarch64_classify_tls_symbol (rtx x
)
13005 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
13009 case TLS_MODEL_GLOBAL_DYNAMIC
:
13010 case TLS_MODEL_LOCAL_DYNAMIC
:
13011 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
13013 case TLS_MODEL_INITIAL_EXEC
:
13014 switch (aarch64_cmodel
)
13016 case AARCH64_CMODEL_TINY
:
13017 case AARCH64_CMODEL_TINY_PIC
:
13018 return SYMBOL_TINY_TLSIE
;
13020 return SYMBOL_SMALL_TLSIE
;
13023 case TLS_MODEL_LOCAL_EXEC
:
13024 if (aarch64_tls_size
== 12)
13025 return SYMBOL_TLSLE12
;
13026 else if (aarch64_tls_size
== 24)
13027 return SYMBOL_TLSLE24
;
13028 else if (aarch64_tls_size
== 32)
13029 return SYMBOL_TLSLE32
;
13030 else if (aarch64_tls_size
== 48)
13031 return SYMBOL_TLSLE48
;
13033 gcc_unreachable ();
13035 case TLS_MODEL_EMULATED
:
13036 case TLS_MODEL_NONE
:
13037 return SYMBOL_FORCE_TO_MEM
;
13040 gcc_unreachable ();
13044 /* Return the correct method for accessing X + OFFSET, where X is either
13045 a SYMBOL_REF or LABEL_REF. */
13047 enum aarch64_symbol_type
13048 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
13050 if (GET_CODE (x
) == LABEL_REF
)
13052 switch (aarch64_cmodel
)
13054 case AARCH64_CMODEL_LARGE
:
13055 return SYMBOL_FORCE_TO_MEM
;
13057 case AARCH64_CMODEL_TINY_PIC
:
13058 case AARCH64_CMODEL_TINY
:
13059 return SYMBOL_TINY_ABSOLUTE
;
13061 case AARCH64_CMODEL_SMALL_SPIC
:
13062 case AARCH64_CMODEL_SMALL_PIC
:
13063 case AARCH64_CMODEL_SMALL
:
13064 return SYMBOL_SMALL_ABSOLUTE
;
13067 gcc_unreachable ();
13071 if (GET_CODE (x
) == SYMBOL_REF
)
13073 if (aarch64_tls_symbol_p (x
))
13074 return aarch64_classify_tls_symbol (x
);
13076 switch (aarch64_cmodel
)
13078 case AARCH64_CMODEL_TINY
:
13079 /* When we retrieve symbol + offset address, we have to make sure
13080 the offset does not cause overflow of the final address. But
13081 we have no way of knowing the address of symbol at compile time
13082 so we can't accurately say if the distance between the PC and
13083 symbol + offset is outside the addressible range of +/-1M in the
13084 TINY code model. So we rely on images not being greater than
13085 1M and cap the offset at 1M and anything beyond 1M will have to
13086 be loaded using an alternative mechanism. Furthermore if the
13087 symbol is a weak reference to something that isn't known to
13088 resolve to a symbol in this module, then force to memory. */
13089 if ((SYMBOL_REF_WEAK (x
)
13090 && !aarch64_symbol_binds_local_p (x
))
13091 || !IN_RANGE (offset
, -1048575, 1048575))
13092 return SYMBOL_FORCE_TO_MEM
;
13093 return SYMBOL_TINY_ABSOLUTE
;
13095 case AARCH64_CMODEL_SMALL
:
13096 /* Same reasoning as the tiny code model, but the offset cap here is
13098 if ((SYMBOL_REF_WEAK (x
)
13099 && !aarch64_symbol_binds_local_p (x
))
13100 || !IN_RANGE (offset
, HOST_WIDE_INT_C (-4294967263),
13101 HOST_WIDE_INT_C (4294967264)))
13102 return SYMBOL_FORCE_TO_MEM
;
13103 return SYMBOL_SMALL_ABSOLUTE
;
13105 case AARCH64_CMODEL_TINY_PIC
:
13106 if (!aarch64_symbol_binds_local_p (x
))
13107 return SYMBOL_TINY_GOT
;
13108 return SYMBOL_TINY_ABSOLUTE
;
13110 case AARCH64_CMODEL_SMALL_SPIC
:
13111 case AARCH64_CMODEL_SMALL_PIC
:
13112 if (!aarch64_symbol_binds_local_p (x
))
13113 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
13114 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
13115 return SYMBOL_SMALL_ABSOLUTE
;
13117 case AARCH64_CMODEL_LARGE
:
13118 /* This is alright even in PIC code as the constant
13119 pool reference is always PC relative and within
13120 the same translation unit. */
13121 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
13122 return SYMBOL_SMALL_ABSOLUTE
;
13124 return SYMBOL_FORCE_TO_MEM
;
13127 gcc_unreachable ();
13131 /* By default push everything into the constant pool. */
13132 return SYMBOL_FORCE_TO_MEM
;
13136 aarch64_constant_address_p (rtx x
)
13138 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
13142 aarch64_legitimate_pic_operand_p (rtx x
)
13144 if (GET_CODE (x
) == SYMBOL_REF
13145 || (GET_CODE (x
) == CONST
13146 && GET_CODE (XEXP (x
, 0)) == PLUS
13147 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
13153 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13154 that should be rematerialized rather than spilled. */
13157 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
13159 /* Support CSE and rematerialization of common constants. */
13160 if (CONST_INT_P (x
)
13161 || (CONST_DOUBLE_P (x
) && GET_MODE_CLASS (mode
) == MODE_FLOAT
)
13162 || GET_CODE (x
) == CONST_VECTOR
)
13165 /* Do not allow vector struct mode constants for Advanced SIMD.
13166 We could support 0 and -1 easily, but they need support in
13167 aarch64-simd.md. */
13168 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
13169 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
13172 /* Only accept variable-length vector constants if they can be
13175 ??? It would be possible to handle rematerialization of other
13176 constants via secondary reloads. */
13177 if (vec_flags
& VEC_ANY_SVE
)
13178 return aarch64_simd_valid_immediate (x
, NULL
);
13180 if (GET_CODE (x
) == HIGH
)
13183 /* Accept polynomial constants that can be calculated by using the
13184 destination of a move as the sole temporary. Constants that
13185 require a second temporary cannot be rematerialized (they can't be
13186 forced to memory and also aren't legitimate constants). */
13188 if (poly_int_rtx_p (x
, &offset
))
13189 return aarch64_offset_temporaries (false, offset
) <= 1;
13191 /* If an offset is being added to something else, we need to allow the
13192 base to be moved into the destination register, meaning that there
13193 are no free temporaries for the offset. */
13194 x
= strip_offset (x
, &offset
);
13195 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
13198 /* Do not allow const (plus (anchor_symbol, const_int)). */
13199 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
13202 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13203 so spilling them is better than rematerialization. */
13204 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
13207 /* Label references are always constant. */
13208 if (GET_CODE (x
) == LABEL_REF
)
13215 aarch64_load_tp (rtx target
)
13218 || GET_MODE (target
) != Pmode
13219 || !register_operand (target
, Pmode
))
13220 target
= gen_reg_rtx (Pmode
);
13222 /* Can return in any reg. */
13223 emit_insn (gen_aarch64_load_tp_hard (target
));
13227 /* On AAPCS systems, this is the "struct __va_list". */
13228 static GTY(()) tree va_list_type
;
13230 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13231 Return the type to use as __builtin_va_list.
13233 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13245 aarch64_build_builtin_va_list (void)
13248 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
13250 /* Create the type. */
13251 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
13252 /* Give it the required name. */
13253 va_list_name
= build_decl (BUILTINS_LOCATION
,
13255 get_identifier ("__va_list"),
13257 DECL_ARTIFICIAL (va_list_name
) = 1;
13258 TYPE_NAME (va_list_type
) = va_list_name
;
13259 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
13261 /* Create the fields. */
13262 f_stack
= build_decl (BUILTINS_LOCATION
,
13263 FIELD_DECL
, get_identifier ("__stack"),
13265 f_grtop
= build_decl (BUILTINS_LOCATION
,
13266 FIELD_DECL
, get_identifier ("__gr_top"),
13268 f_vrtop
= build_decl (BUILTINS_LOCATION
,
13269 FIELD_DECL
, get_identifier ("__vr_top"),
13271 f_groff
= build_decl (BUILTINS_LOCATION
,
13272 FIELD_DECL
, get_identifier ("__gr_offs"),
13273 integer_type_node
);
13274 f_vroff
= build_decl (BUILTINS_LOCATION
,
13275 FIELD_DECL
, get_identifier ("__vr_offs"),
13276 integer_type_node
);
13278 /* Tell tree-stdarg pass about our internal offset fields.
13279 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
13280 purpose to identify whether the code is updating va_list internal
13281 offset fields through irregular way. */
13282 va_list_gpr_counter_field
= f_groff
;
13283 va_list_fpr_counter_field
= f_vroff
;
13285 DECL_ARTIFICIAL (f_stack
) = 1;
13286 DECL_ARTIFICIAL (f_grtop
) = 1;
13287 DECL_ARTIFICIAL (f_vrtop
) = 1;
13288 DECL_ARTIFICIAL (f_groff
) = 1;
13289 DECL_ARTIFICIAL (f_vroff
) = 1;
13291 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
13292 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
13293 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
13294 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
13295 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
13297 TYPE_FIELDS (va_list_type
) = f_stack
;
13298 DECL_CHAIN (f_stack
) = f_grtop
;
13299 DECL_CHAIN (f_grtop
) = f_vrtop
;
13300 DECL_CHAIN (f_vrtop
) = f_groff
;
13301 DECL_CHAIN (f_groff
) = f_vroff
;
13303 /* Compute its layout. */
13304 layout_type (va_list_type
);
13306 return va_list_type
;
13309 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13311 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
13313 const CUMULATIVE_ARGS
*cum
;
13314 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
13315 tree stack
, grtop
, vrtop
, groff
, vroff
;
13317 int gr_save_area_size
= cfun
->va_list_gpr_size
;
13318 int vr_save_area_size
= cfun
->va_list_fpr_size
;
13321 cum
= &crtl
->args
.info
;
13322 if (cfun
->va_list_gpr_size
)
13323 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
13324 cfun
->va_list_gpr_size
);
13325 if (cfun
->va_list_fpr_size
)
13326 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
13327 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
13331 gcc_assert (cum
->aapcs_nvrn
== 0);
13332 vr_save_area_size
= 0;
13335 f_stack
= TYPE_FIELDS (va_list_type_node
);
13336 f_grtop
= DECL_CHAIN (f_stack
);
13337 f_vrtop
= DECL_CHAIN (f_grtop
);
13338 f_groff
= DECL_CHAIN (f_vrtop
);
13339 f_vroff
= DECL_CHAIN (f_groff
);
13341 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
13343 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
13345 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
13347 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
13349 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
13352 /* Emit code to initialize STACK, which points to the next varargs stack
13353 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13354 by named arguments. STACK is 8-byte aligned. */
13355 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
13356 if (cum
->aapcs_stack_size
> 0)
13357 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
13358 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
13359 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13361 /* Emit code to initialize GRTOP, the top of the GR save area.
13362 virtual_incoming_args_rtx should have been 16 byte aligned. */
13363 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
13364 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
13365 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13367 /* Emit code to initialize VRTOP, the top of the VR save area.
13368 This address is gr_save_area_bytes below GRTOP, rounded
13369 down to the next 16-byte boundary. */
13370 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
13371 vr_offset
= ROUND_UP (gr_save_area_size
,
13372 STACK_BOUNDARY
/ BITS_PER_UNIT
);
13375 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
13376 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
13377 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13379 /* Emit code to initialize GROFF, the offset from GRTOP of the
13380 next GPR argument. */
13381 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
13382 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
13383 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13385 /* Likewise emit code to initialize VROFF, the offset from FTOP
13386 of the next VR argument. */
13387 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
13388 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
13389 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13392 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13395 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
13396 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
13400 bool is_ha
; /* is HFA or HVA. */
13401 bool dw_align
; /* double-word align. */
13402 machine_mode ag_mode
= VOIDmode
;
13406 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
13407 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
13408 HOST_WIDE_INT size
, rsize
, adjust
, align
;
13409 tree t
, u
, cond1
, cond2
;
13411 indirect_p
= pass_by_reference (NULL
, TYPE_MODE (type
), type
, false);
13413 type
= build_pointer_type (type
);
13415 mode
= TYPE_MODE (type
);
13417 f_stack
= TYPE_FIELDS (va_list_type_node
);
13418 f_grtop
= DECL_CHAIN (f_stack
);
13419 f_vrtop
= DECL_CHAIN (f_grtop
);
13420 f_groff
= DECL_CHAIN (f_vrtop
);
13421 f_vroff
= DECL_CHAIN (f_groff
);
13423 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
13424 f_stack
, NULL_TREE
);
13425 size
= int_size_in_bytes (type
);
13429 = aarch64_function_arg_alignment (mode
, type
, &abi_break
) / BITS_PER_UNIT
;
13433 if (aarch64_vfp_is_call_or_return_candidate (mode
,
13439 /* No frontends can create types with variable-sized modes, so we
13440 shouldn't be asked to pass or return them. */
13441 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
13443 /* TYPE passed in fp/simd registers. */
13445 aarch64_err_no_fpadvsimd (mode
);
13447 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
13448 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
13449 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
13450 unshare_expr (valist
), f_vroff
, NULL_TREE
);
13452 rsize
= nregs
* UNITS_PER_VREG
;
13456 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
13457 adjust
= UNITS_PER_VREG
- ag_size
;
13459 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
13460 && size
< UNITS_PER_VREG
)
13462 adjust
= UNITS_PER_VREG
- size
;
13467 /* TYPE passed in general registers. */
13468 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
13469 unshare_expr (valist
), f_grtop
, NULL_TREE
);
13470 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
13471 unshare_expr (valist
), f_groff
, NULL_TREE
);
13472 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
13473 nregs
= rsize
/ UNITS_PER_WORD
;
13477 if (abi_break
&& warn_psabi
)
13478 inform (input_location
, "parameter passing for argument of type "
13479 "%qT changed in GCC 9.1", type
);
13483 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
13484 && size
< UNITS_PER_WORD
)
13486 adjust
= UNITS_PER_WORD
- size
;
13490 /* Get a local temporary for the field value. */
13491 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
13493 /* Emit code to branch if off >= 0. */
13494 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
13495 build_int_cst (TREE_TYPE (off
), 0));
13496 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
13500 /* Emit: offs = (offs + 15) & -16. */
13501 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
13502 build_int_cst (TREE_TYPE (off
), 15));
13503 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
13504 build_int_cst (TREE_TYPE (off
), -16));
13505 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
13510 /* Update ap.__[g|v]r_offs */
13511 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
13512 build_int_cst (TREE_TYPE (off
), rsize
));
13513 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
13517 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
13519 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13520 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
13521 build_int_cst (TREE_TYPE (f_off
), 0));
13522 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
13524 /* String up: make sure the assignment happens before the use. */
13525 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
13526 COND_EXPR_ELSE (cond1
) = t
;
13528 /* Prepare the trees handling the argument that is passed on the stack;
13529 the top level node will store in ON_STACK. */
13530 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
13533 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13534 t
= fold_build_pointer_plus_hwi (arg
, 15);
13535 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
13536 build_int_cst (TREE_TYPE (t
), -16));
13537 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
13541 /* Advance ap.__stack */
13542 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
13543 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
13544 build_int_cst (TREE_TYPE (t
), -8));
13545 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
13546 /* String up roundup and advance. */
13548 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
13549 /* String up with arg */
13550 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
13551 /* Big-endianness related address adjustment. */
13552 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
13553 && size
< UNITS_PER_WORD
)
13555 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
13556 size_int (UNITS_PER_WORD
- size
));
13557 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
13560 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
13561 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
13563 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13566 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
13567 build_int_cst (TREE_TYPE (off
), adjust
));
13569 t
= fold_convert (sizetype
, t
);
13570 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
13574 /* type ha; // treat as "struct {ftype field[n];}"
13575 ... [computing offs]
13576 for (i = 0; i <nregs; ++i, offs += 16)
13577 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13580 tree tmp_ha
, field_t
, field_ptr_t
;
13582 /* Declare a local variable. */
13583 tmp_ha
= create_tmp_var_raw (type
, "ha");
13584 gimple_add_tmp_var (tmp_ha
);
13586 /* Establish the base type. */
13590 field_t
= float_type_node
;
13591 field_ptr_t
= float_ptr_type_node
;
13594 field_t
= double_type_node
;
13595 field_ptr_t
= double_ptr_type_node
;
13598 field_t
= long_double_type_node
;
13599 field_ptr_t
= long_double_ptr_type_node
;
13602 field_t
= aarch64_fp16_type_node
;
13603 field_ptr_t
= aarch64_fp16_ptr_type_node
;
13608 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
13609 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
13610 field_ptr_t
= build_pointer_type (field_t
);
13617 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
13618 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
13620 t
= fold_convert (field_ptr_t
, addr
);
13621 t
= build2 (MODIFY_EXPR
, field_t
,
13622 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
13623 build1 (INDIRECT_REF
, field_t
, t
));
13625 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13626 for (i
= 1; i
< nregs
; ++i
)
13628 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
13629 u
= fold_convert (field_ptr_t
, addr
);
13630 u
= build2 (MODIFY_EXPR
, field_t
,
13631 build2 (MEM_REF
, field_t
, tmp_ha
,
13632 build_int_cst (field_ptr_t
,
13634 int_size_in_bytes (field_t
)))),
13635 build1 (INDIRECT_REF
, field_t
, u
));
13636 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
13639 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
13640 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
13643 COND_EXPR_ELSE (cond2
) = t
;
13644 addr
= fold_convert (build_pointer_type (type
), cond1
);
13645 addr
= build_va_arg_indirect_ref (addr
);
13648 addr
= build_va_arg_indirect_ref (addr
);
13653 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13656 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
, machine_mode mode
,
13657 tree type
, int *pretend_size ATTRIBUTE_UNUSED
,
13660 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
13661 CUMULATIVE_ARGS local_cum
;
13662 int gr_saved
= cfun
->va_list_gpr_size
;
13663 int vr_saved
= cfun
->va_list_fpr_size
;
13665 /* The caller has advanced CUM up to, but not beyond, the last named
13666 argument. Advance a local copy of CUM past the last "real" named
13667 argument, to find out how many registers are left over. */
13669 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), mode
, type
, true);
13671 /* Found out how many registers we need to save.
13672 Honor tree-stdvar analysis results. */
13673 if (cfun
->va_list_gpr_size
)
13674 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
13675 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
13676 if (cfun
->va_list_fpr_size
)
13677 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
13678 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
13682 gcc_assert (local_cum
.aapcs_nvrn
== 0);
13692 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13693 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
13694 - gr_saved
* UNITS_PER_WORD
);
13695 mem
= gen_frame_mem (BLKmode
, ptr
);
13696 set_mem_alias_set (mem
, get_varargs_alias_set ());
13698 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
13703 /* We can't use move_block_from_reg, because it will use
13704 the wrong mode, storing D regs only. */
13705 machine_mode mode
= TImode
;
13706 int off
, i
, vr_start
;
13708 /* Set OFF to the offset from virtual_incoming_args_rtx of
13709 the first vector register. The VR save area lies below
13710 the GR one, and is aligned to 16 bytes. */
13711 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
13712 STACK_BOUNDARY
/ BITS_PER_UNIT
);
13713 off
-= vr_saved
* UNITS_PER_VREG
;
13715 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
13716 for (i
= 0; i
< vr_saved
; ++i
)
13720 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
13721 mem
= gen_frame_mem (mode
, ptr
);
13722 set_mem_alias_set (mem
, get_varargs_alias_set ());
13723 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
13724 off
+= UNITS_PER_VREG
;
13729 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13730 any complication of having crtl->args.pretend_args_size changed. */
13731 cfun
->machine
->frame
.saved_varargs_size
13732 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
13733 STACK_BOUNDARY
/ BITS_PER_UNIT
)
13734 + vr_saved
* UNITS_PER_VREG
);
13738 aarch64_conditional_register_usage (void)
13743 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
13746 call_used_regs
[i
] = 1;
13750 for (i
= P0_REGNUM
; i
<= P15_REGNUM
; i
++)
13753 call_used_regs
[i
] = 1;
13756 /* When tracking speculation, we need a couple of call-clobbered registers
13757 to track the speculation state. It would be nice to just use
13758 IP0 and IP1, but currently there are numerous places that just
13759 assume these registers are free for other uses (eg pointer
13760 authentication). */
13761 if (aarch64_track_speculation
)
13763 fixed_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
13764 call_used_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
13765 fixed_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
13766 call_used_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
13770 /* Walk down the type tree of TYPE counting consecutive base elements.
13771 If *MODEP is VOIDmode, then set it to the first valid floating point
13772 type. If a non-floating point type is found, or if a floating point
13773 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13774 otherwise return the count in the sub-tree. */
13776 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
13779 HOST_WIDE_INT size
;
13781 switch (TREE_CODE (type
))
13784 mode
= TYPE_MODE (type
);
13785 if (mode
!= DFmode
&& mode
!= SFmode
13786 && mode
!= TFmode
&& mode
!= HFmode
)
13789 if (*modep
== VOIDmode
)
13792 if (*modep
== mode
)
13798 mode
= TYPE_MODE (TREE_TYPE (type
));
13799 if (mode
!= DFmode
&& mode
!= SFmode
13800 && mode
!= TFmode
&& mode
!= HFmode
)
13803 if (*modep
== VOIDmode
)
13806 if (*modep
== mode
)
13812 /* Use V2SImode and V4SImode as representatives of all 64-bit
13813 and 128-bit vector types. */
13814 size
= int_size_in_bytes (type
);
13827 if (*modep
== VOIDmode
)
13830 /* Vector modes are considered to be opaque: two vectors are
13831 equivalent for the purposes of being homogeneous aggregates
13832 if they are the same size. */
13833 if (*modep
== mode
)
13841 tree index
= TYPE_DOMAIN (type
);
13843 /* Can't handle incomplete types nor sizes that are not
13845 if (!COMPLETE_TYPE_P (type
)
13846 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
13849 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
13852 || !TYPE_MAX_VALUE (index
)
13853 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
13854 || !TYPE_MIN_VALUE (index
)
13855 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
13859 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
13860 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
13862 /* There must be no padding. */
13863 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
13864 count
* GET_MODE_BITSIZE (*modep
)))
13876 /* Can't handle incomplete types nor sizes that are not
13878 if (!COMPLETE_TYPE_P (type
)
13879 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
13882 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
13884 if (TREE_CODE (field
) != FIELD_DECL
)
13887 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
13890 count
+= sub_count
;
13893 /* There must be no padding. */
13894 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
13895 count
* GET_MODE_BITSIZE (*modep
)))
13902 case QUAL_UNION_TYPE
:
13904 /* These aren't very interesting except in a degenerate case. */
13909 /* Can't handle incomplete types nor sizes that are not
13911 if (!COMPLETE_TYPE_P (type
)
13912 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
13915 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
13917 if (TREE_CODE (field
) != FIELD_DECL
)
13920 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
13923 count
= count
> sub_count
? count
: sub_count
;
13926 /* There must be no padding. */
13927 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
13928 count
* GET_MODE_BITSIZE (*modep
)))
13941 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13942 type as described in AAPCS64 \S 4.1.2.
13944 See the comment above aarch64_composite_type_p for the notes on MODE. */
13947 aarch64_short_vector_p (const_tree type
,
13950 poly_int64 size
= -1;
13952 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
13953 size
= int_size_in_bytes (type
);
13954 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
13955 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
13956 size
= GET_MODE_SIZE (mode
);
13958 return known_eq (size
, 8) || known_eq (size
, 16);
13961 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
13962 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
13963 array types. The C99 floating-point complex types are also considered
13964 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
13965 types, which are GCC extensions and out of the scope of AAPCS64, are
13966 treated as composite types here as well.
13968 Note that MODE itself is not sufficient in determining whether a type
13969 is such a composite type or not. This is because
13970 stor-layout.c:compute_record_mode may have already changed the MODE
13971 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
13972 structure with only one field may have its MODE set to the mode of the
13973 field. Also an integer mode whose size matches the size of the
13974 RECORD_TYPE type may be used to substitute the original mode
13975 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
13976 solely relied on. */
13979 aarch64_composite_type_p (const_tree type
,
13982 if (aarch64_short_vector_p (type
, mode
))
13985 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
13988 if (mode
== BLKmode
13989 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
13990 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
13996 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
13997 shall be passed or returned in simd/fp register(s) (providing these
13998 parameter passing registers are available).
14000 Upon successful return, *COUNT returns the number of needed registers,
14001 *BASE_MODE returns the mode of the individual register and when IS_HAF
14002 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14003 floating-point aggregate or a homogeneous short-vector aggregate. */
14006 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
14008 machine_mode
*base_mode
,
14012 machine_mode new_mode
= VOIDmode
;
14013 bool composite_p
= aarch64_composite_type_p (type
, mode
);
14015 if (is_ha
!= NULL
) *is_ha
= false;
14017 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14018 || aarch64_short_vector_p (type
, mode
))
14023 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
14025 if (is_ha
!= NULL
) *is_ha
= true;
14027 new_mode
= GET_MODE_INNER (mode
);
14029 else if (type
&& composite_p
)
14031 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
14033 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
14035 if (is_ha
!= NULL
) *is_ha
= true;
14044 *base_mode
= new_mode
;
14048 /* Implement TARGET_STRUCT_VALUE_RTX. */
14051 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
14052 int incoming ATTRIBUTE_UNUSED
)
14054 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
14057 /* Implements target hook vector_mode_supported_p. */
14059 aarch64_vector_mode_supported_p (machine_mode mode
)
14061 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14062 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
14065 /* Return appropriate SIMD container
14066 for MODE within a vector of WIDTH bits. */
14067 static machine_mode
14068 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
14070 if (TARGET_SVE
&& known_eq (width
, BITS_PER_SVE_VECTOR
))
14086 return VNx16QImode
;
14091 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
14094 if (known_eq (width
, 128))
14134 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14135 static machine_mode
14136 aarch64_preferred_simd_mode (scalar_mode mode
)
14138 poly_int64 bits
= TARGET_SVE
? BITS_PER_SVE_VECTOR
: 128;
14139 return aarch64_simd_container_mode (mode
, bits
);
14142 /* Return a list of possible vector sizes for the vectorizer
14143 to iterate over. */
14145 aarch64_autovectorize_vector_sizes (vector_sizes
*sizes
, bool)
14148 sizes
->safe_push (BYTES_PER_SVE_VECTOR
);
14149 sizes
->safe_push (16);
14150 sizes
->safe_push (8);
14153 /* Implement TARGET_MANGLE_TYPE. */
14155 static const char *
14156 aarch64_mangle_type (const_tree type
)
14158 /* The AArch64 ABI documents say that "__va_list" has to be
14159 mangled as if it is in the "std" namespace. */
14160 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
14161 return "St9__va_list";
14163 /* Half-precision float. */
14164 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
14167 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14169 if (TYPE_NAME (type
) != NULL
)
14170 return aarch64_mangle_builtin_type (type
);
14172 /* Use the default mangling. */
14176 /* Find the first rtx_insn before insn that will generate an assembly
14180 aarch64_prev_real_insn (rtx_insn
*insn
)
14187 insn
= prev_real_insn (insn
);
14189 while (insn
&& recog_memoized (insn
) < 0);
14195 is_madd_op (enum attr_type t1
)
14198 /* A number of these may be AArch32 only. */
14199 enum attr_type mlatypes
[] = {
14200 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
14201 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
14202 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
14205 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
14207 if (t1
== mlatypes
[i
])
14214 /* Check if there is a register dependency between a load and the insn
14215 for which we hold recog_data. */
14218 dep_between_memop_and_curr (rtx memop
)
14223 gcc_assert (GET_CODE (memop
) == SET
);
14225 if (!REG_P (SET_DEST (memop
)))
14228 load_reg
= SET_DEST (memop
);
14229 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
14231 rtx operand
= recog_data
.operand
[opno
];
14232 if (REG_P (operand
)
14233 && reg_overlap_mentioned_p (load_reg
, operand
))
14241 /* When working around the Cortex-A53 erratum 835769,
14242 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14243 instruction and has a preceding memory instruction such that a NOP
14244 should be inserted between them. */
14247 aarch64_madd_needs_nop (rtx_insn
* insn
)
14249 enum attr_type attr_type
;
14253 if (!TARGET_FIX_ERR_A53_835769
)
14256 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
14259 attr_type
= get_attr_type (insn
);
14260 if (!is_madd_op (attr_type
))
14263 prev
= aarch64_prev_real_insn (insn
);
14264 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14265 Restore recog state to INSN to avoid state corruption. */
14266 extract_constrain_insn_cached (insn
);
14268 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
14271 body
= single_set (prev
);
14273 /* If the previous insn is a memory op and there is no dependency between
14274 it and the DImode madd, emit a NOP between them. If body is NULL then we
14275 have a complex memory operation, probably a load/store pair.
14276 Be conservative for now and emit a NOP. */
14277 if (GET_MODE (recog_data
.operand
[0]) == DImode
14278 && (!body
|| !dep_between_memop_and_curr (body
)))
14286 /* Implement FINAL_PRESCAN_INSN. */
14289 aarch64_final_prescan_insn (rtx_insn
*insn
)
14291 if (aarch64_madd_needs_nop (insn
))
14292 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
14296 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14300 aarch64_sve_index_immediate_p (rtx base_or_step
)
14302 return (CONST_INT_P (base_or_step
)
14303 && IN_RANGE (INTVAL (base_or_step
), -16, 15));
14306 /* Return true if X is a valid immediate for the SVE ADD and SUB
14307 instructions. Negate X first if NEGATE_P is true. */
14310 aarch64_sve_arith_immediate_p (rtx x
, bool negate_p
)
14314 if (!const_vec_duplicate_p (x
, &elt
)
14315 || !CONST_INT_P (elt
))
14318 HOST_WIDE_INT val
= INTVAL (elt
);
14321 val
&= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x
)));
14324 return IN_RANGE (val
, 0, 0xff);
14325 return IN_RANGE (val
, 0, 0xff00);
14328 /* Return true if X is a valid immediate operand for an SVE logical
14329 instruction such as AND. */
14332 aarch64_sve_bitmask_immediate_p (rtx x
)
14336 return (const_vec_duplicate_p (x
, &elt
)
14337 && CONST_INT_P (elt
)
14338 && aarch64_bitmask_imm (INTVAL (elt
),
14339 GET_MODE_INNER (GET_MODE (x
))));
14342 /* Return true if X is a valid immediate for the SVE DUP and CPY
14346 aarch64_sve_dup_immediate_p (rtx x
)
14350 if (!const_vec_duplicate_p (x
, &elt
)
14351 || !CONST_INT_P (elt
))
14354 HOST_WIDE_INT val
= INTVAL (elt
);
14356 return IN_RANGE (val
, -0x80, 0x7f);
14357 return IN_RANGE (val
, -0x8000, 0x7f00);
14360 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14361 SIGNED_P says whether the operand is signed rather than unsigned. */
14364 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
14368 return (const_vec_duplicate_p (x
, &elt
)
14369 && CONST_INT_P (elt
)
14371 ? IN_RANGE (INTVAL (elt
), -16, 15)
14372 : IN_RANGE (INTVAL (elt
), 0, 127)));
14375 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14376 instruction. Negate X first if NEGATE_P is true. */
14379 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
14384 if (!const_vec_duplicate_p (x
, &elt
)
14385 || GET_CODE (elt
) != CONST_DOUBLE
)
14388 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
14391 r
= real_value_negate (&r
);
14393 if (real_equal (&r
, &dconst1
))
14395 if (real_equal (&r
, &dconsthalf
))
14400 /* Return true if X is a valid immediate operand for an SVE FMUL
14404 aarch64_sve_float_mul_immediate_p (rtx x
)
14408 /* GCC will never generate a multiply with an immediate of 2, so there is no
14409 point testing for it (even though it is a valid constant). */
14410 return (const_vec_duplicate_p (x
, &elt
)
14411 && GET_CODE (elt
) == CONST_DOUBLE
14412 && real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
));
14415 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14416 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14417 is nonnull, use it to describe valid immediates. */
14419 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
14420 simd_immediate_info
*info
,
14421 enum simd_immediate_check which
,
14422 simd_immediate_info::insn_type insn
)
14424 /* Try a 4-byte immediate with LSL. */
14425 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
14426 if ((val32
& (0xff << shift
)) == val32
)
14429 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
14430 simd_immediate_info::LSL
, shift
);
14434 /* Try a 2-byte immediate with LSL. */
14435 unsigned int imm16
= val32
& 0xffff;
14436 if (imm16
== (val32
>> 16))
14437 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
14438 if ((imm16
& (0xff << shift
)) == imm16
)
14441 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
14442 simd_immediate_info::LSL
, shift
);
14446 /* Try a 4-byte immediate with MSL, except for cases that MVN
14448 if (which
== AARCH64_CHECK_MOV
)
14449 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
14451 unsigned int low
= (1 << shift
) - 1;
14452 if (((val32
& (0xff << shift
)) | low
) == val32
)
14455 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
14456 simd_immediate_info::MSL
, shift
);
14464 /* Return true if replicating VAL64 is a valid immediate for the
14465 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14466 use it to describe valid immediates. */
14468 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
14469 simd_immediate_info
*info
,
14470 enum simd_immediate_check which
)
14472 unsigned int val32
= val64
& 0xffffffff;
14473 unsigned int val16
= val64
& 0xffff;
14474 unsigned int val8
= val64
& 0xff;
14476 if (val32
== (val64
>> 32))
14478 if ((which
& AARCH64_CHECK_ORR
) != 0
14479 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
14480 simd_immediate_info::MOV
))
14483 if ((which
& AARCH64_CHECK_BIC
) != 0
14484 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
14485 simd_immediate_info::MVN
))
14488 /* Try using a replicated byte. */
14489 if (which
== AARCH64_CHECK_MOV
14490 && val16
== (val32
>> 16)
14491 && val8
== (val16
>> 8))
14494 *info
= simd_immediate_info (QImode
, val8
);
14499 /* Try using a bit-to-bytemask. */
14500 if (which
== AARCH64_CHECK_MOV
)
14503 for (i
= 0; i
< 64; i
+= 8)
14505 unsigned char byte
= (val64
>> i
) & 0xff;
14506 if (byte
!= 0 && byte
!= 0xff)
14512 *info
= simd_immediate_info (DImode
, val64
);
14519 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14520 instruction. If INFO is nonnull, use it to describe valid immediates. */
14523 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
14524 simd_immediate_info
*info
)
14526 scalar_int_mode mode
= DImode
;
14527 unsigned int val32
= val64
& 0xffffffff;
14528 if (val32
== (val64
>> 32))
14531 unsigned int val16
= val32
& 0xffff;
14532 if (val16
== (val32
>> 16))
14535 unsigned int val8
= val16
& 0xff;
14536 if (val8
== (val16
>> 8))
14540 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
14541 if (IN_RANGE (val
, -0x80, 0x7f))
14543 /* DUP with no shift. */
14545 *info
= simd_immediate_info (mode
, val
);
14548 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
14550 /* DUP with LSL #8. */
14552 *info
= simd_immediate_info (mode
, val
);
14555 if (aarch64_bitmask_imm (val64
, mode
))
14559 *info
= simd_immediate_info (mode
, val
);
14565 /* Return true if OP is a valid SIMD immediate for the operation
14566 described by WHICH. If INFO is nonnull, use it to describe valid
14569 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
14570 enum simd_immediate_check which
)
14572 machine_mode mode
= GET_MODE (op
);
14573 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14574 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
14577 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
14579 unsigned int n_elts
;
14580 if (GET_CODE (op
) == CONST_VECTOR
14581 && CONST_VECTOR_DUPLICATE_P (op
))
14582 n_elts
= CONST_VECTOR_NPATTERNS (op
);
14583 else if ((vec_flags
& VEC_SVE_DATA
)
14584 && const_vec_series_p (op
, &base
, &step
))
14586 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
14587 if (!aarch64_sve_index_immediate_p (base
)
14588 || !aarch64_sve_index_immediate_p (step
))
14592 *info
= simd_immediate_info (elt_mode
, base
, step
);
14595 else if (GET_CODE (op
) == CONST_VECTOR
14596 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
14597 /* N_ELTS set above. */;
14601 /* Handle PFALSE and PTRUE. */
14602 if (vec_flags
& VEC_SVE_PRED
)
14603 return (op
== CONST0_RTX (mode
)
14604 || op
== CONSTM1_RTX (mode
));
14606 scalar_float_mode elt_float_mode
;
14608 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
14610 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
14611 if (aarch64_float_const_zero_rtx_p (elt
)
14612 || aarch64_float_const_representable_p (elt
))
14615 *info
= simd_immediate_info (elt_float_mode
, elt
);
14620 unsigned int elt_size
= GET_MODE_SIZE (elt_mode
);
14624 scalar_int_mode elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
14626 /* Expand the vector constant out into a byte vector, with the least
14627 significant byte of the register first. */
14628 auto_vec
<unsigned char, 16> bytes
;
14629 bytes
.reserve (n_elts
* elt_size
);
14630 for (unsigned int i
= 0; i
< n_elts
; i
++)
14632 /* The vector is provided in gcc endian-neutral fashion.
14633 For aarch64_be Advanced SIMD, it must be laid out in the vector
14634 register in reverse order. */
14635 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
14636 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
14638 if (elt_mode
!= elt_int_mode
)
14639 elt
= gen_lowpart (elt_int_mode
, elt
);
14641 if (!CONST_INT_P (elt
))
14644 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
14645 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
14647 bytes
.quick_push (elt_val
& 0xff);
14648 elt_val
>>= BITS_PER_UNIT
;
14652 /* The immediate must repeat every eight bytes. */
14653 unsigned int nbytes
= bytes
.length ();
14654 for (unsigned i
= 8; i
< nbytes
; ++i
)
14655 if (bytes
[i
] != bytes
[i
- 8])
14658 /* Get the repeating 8-byte value as an integer. No endian correction
14659 is needed here because bytes is already in lsb-first order. */
14660 unsigned HOST_WIDE_INT val64
= 0;
14661 for (unsigned int i
= 0; i
< 8; i
++)
14662 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
14663 << (i
* BITS_PER_UNIT
));
14665 if (vec_flags
& VEC_SVE_DATA
)
14666 return aarch64_sve_valid_immediate (val64
, info
);
14668 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
14671 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14672 has a step in the range of INDEX. Return the index expression if so,
14673 otherwise return null. */
14675 aarch64_check_zero_based_sve_index_immediate (rtx x
)
14678 if (const_vec_series_p (x
, &base
, &step
)
14679 && base
== const0_rtx
14680 && aarch64_sve_index_immediate_p (step
))
14685 /* Check of immediate shift constants are within range. */
14687 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
14689 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
14691 return aarch64_const_vec_all_same_in_range_p (x
, 0, bit_width
- 1);
14693 return aarch64_const_vec_all_same_in_range_p (x
, 1, bit_width
);
14696 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14697 operation of width WIDTH at bit position POS. */
14700 aarch64_mask_from_zextract_ops (rtx width
, rtx pos
)
14702 gcc_assert (CONST_INT_P (width
));
14703 gcc_assert (CONST_INT_P (pos
));
14705 unsigned HOST_WIDE_INT mask
14706 = ((unsigned HOST_WIDE_INT
) 1 << UINTVAL (width
)) - 1;
14707 return GEN_INT (mask
<< UINTVAL (pos
));
14711 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
14713 if (GET_CODE (x
) == HIGH
14714 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
14717 if (CONST_INT_P (x
))
14720 if (VECTOR_MODE_P (GET_MODE (x
)))
14721 return aarch64_simd_valid_immediate (x
, NULL
);
14723 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
14726 if (aarch64_sve_cnt_immediate_p (x
))
14729 return aarch64_classify_symbolic_expression (x
)
14730 == SYMBOL_TINY_ABSOLUTE
;
14733 /* Return a const_int vector of VAL. */
14735 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
14737 rtx c
= gen_int_mode (val
, GET_MODE_INNER (mode
));
14738 return gen_const_vec_duplicate (mode
, c
);
14741 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14744 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
14746 machine_mode vmode
;
14748 vmode
= aarch64_simd_container_mode (mode
, 64);
14749 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
14750 return aarch64_simd_valid_immediate (op_v
, NULL
);
14753 /* Construct and return a PARALLEL RTX vector with elements numbering the
14754 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14755 the vector - from the perspective of the architecture. This does not
14756 line up with GCC's perspective on lane numbers, so we end up with
14757 different masks depending on our target endian-ness. The diagram
14758 below may help. We must draw the distinction when building masks
14759 which select one half of the vector. An instruction selecting
14760 architectural low-lanes for a big-endian target, must be described using
14761 a mask selecting GCC high-lanes.
14763 Big-Endian Little-Endian
14765 GCC 0 1 2 3 3 2 1 0
14766 | x | x | x | x | | x | x | x | x |
14767 Architecture 3 2 1 0 3 2 1 0
14769 Low Mask: { 2, 3 } { 0, 1 }
14770 High Mask: { 0, 1 } { 2, 3 }
14772 MODE Is the mode of the vector and NUNITS is the number of units in it. */
14775 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
14777 rtvec v
= rtvec_alloc (nunits
/ 2);
14778 int high_base
= nunits
/ 2;
14784 if (BYTES_BIG_ENDIAN
)
14785 base
= high
? low_base
: high_base
;
14787 base
= high
? high_base
: low_base
;
14789 for (i
= 0; i
< nunits
/ 2; i
++)
14790 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
14792 t1
= gen_rtx_PARALLEL (mode
, v
);
14796 /* Check OP for validity as a PARALLEL RTX vector with elements
14797 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14798 from the perspective of the architecture. See the diagram above
14799 aarch64_simd_vect_par_cnst_half for more details. */
14802 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
14806 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
14809 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
14810 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
14811 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
14814 if (count_op
!= count_ideal
)
14817 for (i
= 0; i
< count_ideal
; i
++)
14819 rtx elt_op
= XVECEXP (op
, 0, i
);
14820 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
14822 if (!CONST_INT_P (elt_op
)
14823 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
14829 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14830 HIGH (exclusive). */
14832 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
14835 HOST_WIDE_INT lane
;
14836 gcc_assert (CONST_INT_P (operand
));
14837 lane
= INTVAL (operand
);
14839 if (lane
< low
|| lane
>= high
)
14842 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
14844 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
14848 /* Peform endian correction on lane number N, which indexes a vector
14849 of mode MODE, and return the result as an SImode rtx. */
14852 aarch64_endian_lane_rtx (machine_mode mode
, unsigned int n
)
14854 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode
), n
), SImode
);
14857 /* Return TRUE if OP is a valid vector addressing mode. */
14860 aarch64_simd_mem_operand_p (rtx op
)
14862 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
14863 || REG_P (XEXP (op
, 0)));
14866 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
14869 aarch64_sve_ld1r_operand_p (rtx op
)
14871 struct aarch64_address_info addr
;
14875 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
14876 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
14877 && addr
.type
== ADDRESS_REG_IMM
14878 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
14881 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14882 The conditions for STR are the same. */
14884 aarch64_sve_ldr_operand_p (rtx op
)
14886 struct aarch64_address_info addr
;
14889 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
14890 false, ADDR_QUERY_ANY
)
14891 && addr
.type
== ADDRESS_REG_IMM
);
14894 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14895 We need to be able to access the individual pieces, so the range
14896 is different from LD[234] and ST[234]. */
14898 aarch64_sve_struct_memory_operand_p (rtx op
)
14903 machine_mode mode
= GET_MODE (op
);
14904 struct aarch64_address_info addr
;
14905 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
14907 || addr
.type
!= ADDRESS_REG_IMM
)
14910 poly_int64 first
= addr
.const_offset
;
14911 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
14912 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
14913 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
14916 /* Emit a register copy from operand to operand, taking care not to
14917 early-clobber source registers in the process.
14919 COUNT is the number of components into which the copy needs to be
14922 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
14923 unsigned int count
)
14926 int rdest
= REGNO (operands
[0]);
14927 int rsrc
= REGNO (operands
[1]);
14929 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
14931 for (i
= 0; i
< count
; i
++)
14932 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
14933 gen_rtx_REG (mode
, rsrc
+ i
));
14935 for (i
= 0; i
< count
; i
++)
14936 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
14937 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
14940 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14941 one of VSTRUCT modes: OI, CI, or XI. */
14943 aarch64_simd_attr_length_rglist (machine_mode mode
)
14945 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
14946 return (GET_MODE_SIZE (mode
).to_constant () / UNITS_PER_VREG
) * 4;
14949 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
14950 alignment of a vector to 128 bits. SVE predicates have an alignment of
14952 static HOST_WIDE_INT
14953 aarch64_simd_vector_alignment (const_tree type
)
14955 if (TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14956 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
14957 be set for non-predicate vectors of booleans. Modes are the most
14958 direct way we have of identifying real SVE predicate types. */
14959 return GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
? 16 : 128;
14960 return wi::umin (wi::to_wide (TYPE_SIZE (type
)), 128).to_uhwi ();
14963 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
14965 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
14967 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
14969 /* If the length of the vector is fixed, try to align to that length,
14970 otherwise don't try to align at all. */
14971 HOST_WIDE_INT result
;
14972 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
14973 result
= TYPE_ALIGN (TREE_TYPE (type
));
14976 return TYPE_ALIGN (type
);
14979 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
14981 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
14986 /* For fixed-length vectors, check that the vectorizer will aim for
14987 full-vector alignment. This isn't true for generic GCC vectors
14988 that are wider than the ABI maximum of 128 bits. */
14989 poly_uint64 preferred_alignment
=
14990 aarch64_vectorize_preferred_vector_alignment (type
);
14991 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
14992 && maybe_ne (wi::to_widest (TYPE_SIZE (type
)),
14993 preferred_alignment
))
14996 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15000 /* Return true if the vector misalignment factor is supported by the
15003 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
15004 const_tree type
, int misalignment
,
15007 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
15009 /* Return if movmisalign pattern is not supported for this mode. */
15010 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
15013 /* Misalignment factor is unknown at compile time. */
15014 if (misalignment
== -1)
15017 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
15021 /* If VALS is a vector constant that can be loaded into a register
15022 using DUP, generate instructions to do so and return an RTX to
15023 assign to the register. Otherwise return NULL_RTX. */
15025 aarch64_simd_dup_constant (rtx vals
)
15027 machine_mode mode
= GET_MODE (vals
);
15028 machine_mode inner_mode
= GET_MODE_INNER (mode
);
15031 if (!const_vec_duplicate_p (vals
, &x
))
15034 /* We can load this constant by using DUP and a constant in a
15035 single ARM register. This will be cheaper than a vector
15037 x
= copy_to_mode_reg (inner_mode
, x
);
15038 return gen_vec_duplicate (mode
, x
);
15042 /* Generate code to load VALS, which is a PARALLEL containing only
15043 constants (for vec_init) or CONST_VECTOR, efficiently into a
15044 register. Returns an RTX to copy into the register, or NULL_RTX
15045 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15047 aarch64_simd_make_constant (rtx vals
)
15049 machine_mode mode
= GET_MODE (vals
);
15051 rtx const_vec
= NULL_RTX
;
15055 if (GET_CODE (vals
) == CONST_VECTOR
)
15057 else if (GET_CODE (vals
) == PARALLEL
)
15059 /* A CONST_VECTOR must contain only CONST_INTs and
15060 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15061 Only store valid constants in a CONST_VECTOR. */
15062 int n_elts
= XVECLEN (vals
, 0);
15063 for (i
= 0; i
< n_elts
; ++i
)
15065 rtx x
= XVECEXP (vals
, 0, i
);
15066 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
15069 if (n_const
== n_elts
)
15070 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
15073 gcc_unreachable ();
15075 if (const_vec
!= NULL_RTX
15076 && aarch64_simd_valid_immediate (const_vec
, NULL
))
15077 /* Load using MOVI/MVNI. */
15079 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
15080 /* Loaded using DUP. */
15082 else if (const_vec
!= NULL_RTX
)
15083 /* Load from constant pool. We cannot take advantage of single-cycle
15084 LD1 because we need a PC-relative addressing mode. */
15087 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15088 We cannot construct an initializer. */
15092 /* Expand a vector initialisation sequence, such that TARGET is
15093 initialised to contain VALS. */
15096 aarch64_expand_vector_init (rtx target
, rtx vals
)
15098 machine_mode mode
= GET_MODE (target
);
15099 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
15100 /* The number of vector elements. */
15101 int n_elts
= XVECLEN (vals
, 0);
15102 /* The number of vector elements which are not constant. */
15104 rtx any_const
= NULL_RTX
;
15105 /* The first element of vals. */
15106 rtx v0
= XVECEXP (vals
, 0, 0);
15107 bool all_same
= true;
15109 /* This is a special vec_init<M><N> where N is not an element mode but a
15110 vector mode with half the elements of M. We expect to find two entries
15111 of mode N in VALS and we must put their concatentation into TARGET. */
15112 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
15114 gcc_assert (known_eq (GET_MODE_SIZE (mode
),
15115 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals
, 0, 0)))));
15116 rtx lo
= XVECEXP (vals
, 0, 0);
15117 rtx hi
= XVECEXP (vals
, 0, 1);
15118 machine_mode narrow_mode
= GET_MODE (lo
);
15119 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
);
15120 gcc_assert (narrow_mode
== GET_MODE (hi
));
15122 /* When we want to concatenate a half-width vector with zeroes we can
15123 use the aarch64_combinez[_be] patterns. Just make sure that the
15124 zeroes are in the right half. */
15125 if (BYTES_BIG_ENDIAN
15126 && aarch64_simd_imm_zero (lo
, narrow_mode
)
15127 && general_operand (hi
, narrow_mode
))
15128 emit_insn (gen_aarch64_combinez_be (narrow_mode
, target
, hi
, lo
));
15129 else if (!BYTES_BIG_ENDIAN
15130 && aarch64_simd_imm_zero (hi
, narrow_mode
)
15131 && general_operand (lo
, narrow_mode
))
15132 emit_insn (gen_aarch64_combinez (narrow_mode
, target
, lo
, hi
));
15135 /* Else create the two half-width registers and combine them. */
15137 lo
= force_reg (GET_MODE (lo
), lo
);
15139 hi
= force_reg (GET_MODE (hi
), hi
);
15141 if (BYTES_BIG_ENDIAN
)
15142 std::swap (lo
, hi
);
15143 emit_insn (gen_aarch64_simd_combine (narrow_mode
, target
, lo
, hi
));
15148 /* Count the number of variable elements to initialise. */
15149 for (int i
= 0; i
< n_elts
; ++i
)
15151 rtx x
= XVECEXP (vals
, 0, i
);
15152 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
15157 all_same
&= rtx_equal_p (x
, v0
);
15160 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15161 how best to handle this. */
15164 rtx constant
= aarch64_simd_make_constant (vals
);
15165 if (constant
!= NULL_RTX
)
15167 emit_move_insn (target
, constant
);
15172 /* Splat a single non-constant element if we can. */
15175 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
15176 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
15180 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
15181 gcc_assert (icode
!= CODE_FOR_nothing
);
15183 /* If there are only variable elements, try to optimize
15184 the insertion using dup for the most common element
15185 followed by insertions. */
15187 /* The algorithm will fill matches[*][0] with the earliest matching element,
15188 and matches[X][1] with the count of duplicate elements (if X is the
15189 earliest element which has duplicates). */
15191 if (n_var
== n_elts
&& n_elts
<= 16)
15193 int matches
[16][2] = {0};
15194 for (int i
= 0; i
< n_elts
; i
++)
15196 for (int j
= 0; j
<= i
; j
++)
15198 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
15206 int maxelement
= 0;
15208 for (int i
= 0; i
< n_elts
; i
++)
15209 if (matches
[i
][1] > maxv
)
15212 maxv
= matches
[i
][1];
15215 /* Create a duplicate of the most common element, unless all elements
15216 are equally useless to us, in which case just immediately set the
15217 vector register using the first element. */
15221 /* For vectors of two 64-bit elements, we can do even better. */
15223 && (inner_mode
== E_DImode
15224 || inner_mode
== E_DFmode
))
15227 rtx x0
= XVECEXP (vals
, 0, 0);
15228 rtx x1
= XVECEXP (vals
, 0, 1);
15229 /* Combine can pick up this case, but handling it directly
15230 here leaves clearer RTL.
15232 This is load_pair_lanes<mode>, and also gives us a clean-up
15233 for store_pair_lanes<mode>. */
15234 if (memory_operand (x0
, inner_mode
)
15235 && memory_operand (x1
, inner_mode
)
15236 && !STRICT_ALIGNMENT
15237 && rtx_equal_p (XEXP (x1
, 0),
15238 plus_constant (Pmode
,
15240 GET_MODE_SIZE (inner_mode
))))
15243 if (inner_mode
== DFmode
)
15244 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
15246 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
15251 /* The subreg-move sequence below will move into lane zero of the
15252 vector register. For big-endian we want that position to hold
15253 the last element of VALS. */
15254 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
15255 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
15256 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
15260 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
15261 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
15264 /* Insert the rest. */
15265 for (int i
= 0; i
< n_elts
; i
++)
15267 rtx x
= XVECEXP (vals
, 0, i
);
15268 if (matches
[i
][0] == maxelement
)
15270 x
= copy_to_mode_reg (inner_mode
, x
);
15271 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
15276 /* Initialise a vector which is part-variable. We want to first try
15277 to build those lanes which are constant in the most efficient way we
15279 if (n_var
!= n_elts
)
15281 rtx copy
= copy_rtx (vals
);
15283 /* Load constant part of vector. We really don't care what goes into the
15284 parts we will overwrite, but we're more likely to be able to load the
15285 constant efficiently if it has fewer, larger, repeating parts
15286 (see aarch64_simd_valid_immediate). */
15287 for (int i
= 0; i
< n_elts
; i
++)
15289 rtx x
= XVECEXP (vals
, 0, i
);
15290 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
15292 rtx subst
= any_const
;
15293 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
15295 /* Look in the copied vector, as more elements are const. */
15296 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
15297 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
15303 XVECEXP (copy
, 0, i
) = subst
;
15305 aarch64_expand_vector_init (target
, copy
);
15308 /* Insert the variable lanes directly. */
15309 for (int i
= 0; i
< n_elts
; i
++)
15311 rtx x
= XVECEXP (vals
, 0, i
);
15312 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
15314 x
= copy_to_mode_reg (inner_mode
, x
);
15315 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
15319 /* Emit RTL corresponding to:
15320 insr TARGET, ELEM. */
15323 emit_insr (rtx target
, rtx elem
)
15325 machine_mode mode
= GET_MODE (target
);
15326 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
15327 elem
= force_reg (elem_mode
, elem
);
15329 insn_code icode
= optab_handler (vec_shl_insert_optab
, mode
);
15330 gcc_assert (icode
!= CODE_FOR_nothing
);
15331 emit_insn (GEN_FCN (icode
) (target
, target
, elem
));
15334 /* Subroutine of aarch64_sve_expand_vector_init for handling
15335 trailing constants.
15336 This function works as follows:
15337 (a) Create a new vector consisting of trailing constants.
15338 (b) Initialize TARGET with the constant vector using emit_move_insn.
15339 (c) Insert remaining elements in TARGET using insr.
15340 NELTS is the total number of elements in original vector while
15341 while NELTS_REQD is the number of elements that are actually
15344 ??? The heuristic used is to do above only if number of constants
15345 is at least half the total number of elements. May need fine tuning. */
15348 aarch64_sve_expand_vector_init_handle_trailing_constants
15349 (rtx target
, const rtx_vector_builder
&builder
, int nelts
, int nelts_reqd
)
15351 machine_mode mode
= GET_MODE (target
);
15352 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
15353 int n_trailing_constants
= 0;
15355 for (int i
= nelts_reqd
- 1;
15356 i
>= 0 && aarch64_legitimate_constant_p (elem_mode
, builder
.elt (i
));
15358 n_trailing_constants
++;
15360 if (n_trailing_constants
>= nelts_reqd
/ 2)
15362 rtx_vector_builder
v (mode
, 1, nelts
);
15363 for (int i
= 0; i
< nelts
; i
++)
15364 v
.quick_push (builder
.elt (i
+ nelts_reqd
- n_trailing_constants
));
15365 rtx const_vec
= v
.build ();
15366 emit_move_insn (target
, const_vec
);
15368 for (int i
= nelts_reqd
- n_trailing_constants
- 1; i
>= 0; i
--)
15369 emit_insr (target
, builder
.elt (i
));
15377 /* Subroutine of aarch64_sve_expand_vector_init.
15379 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
15380 (b) Skip trailing elements from BUILDER, which are the same as
15381 element NELTS_REQD - 1.
15382 (c) Insert earlier elements in reverse order in TARGET using insr. */
15385 aarch64_sve_expand_vector_init_insert_elems (rtx target
,
15386 const rtx_vector_builder
&builder
,
15389 machine_mode mode
= GET_MODE (target
);
15390 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
15392 struct expand_operand ops
[2];
15393 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
15394 gcc_assert (icode
!= CODE_FOR_nothing
);
15396 create_output_operand (&ops
[0], target
, mode
);
15397 create_input_operand (&ops
[1], builder
.elt (nelts_reqd
- 1), elem_mode
);
15398 expand_insn (icode
, 2, ops
);
15400 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
15401 for (int i
= nelts_reqd
- ndups
- 1; i
>= 0; i
--)
15402 emit_insr (target
, builder
.elt (i
));
15405 /* Subroutine of aarch64_sve_expand_vector_init to handle case
15406 when all trailing elements of builder are same.
15407 This works as follows:
15408 (a) Use expand_insn interface to broadcast last vector element in TARGET.
15409 (b) Insert remaining elements in TARGET using insr.
15411 ??? The heuristic used is to do above if number of same trailing elements
15412 is at least 3/4 of total number of elements, loosely based on
15413 heuristic from mostly_zeros_p. May need fine-tuning. */
15416 aarch64_sve_expand_vector_init_handle_trailing_same_elem
15417 (rtx target
, const rtx_vector_builder
&builder
, int nelts_reqd
)
15419 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
15420 if (ndups
>= (3 * nelts_reqd
) / 4)
15422 aarch64_sve_expand_vector_init_insert_elems (target
, builder
,
15423 nelts_reqd
- ndups
+ 1);
15430 /* Initialize register TARGET from BUILDER. NELTS is the constant number
15431 of elements in BUILDER.
15433 The function tries to initialize TARGET from BUILDER if it fits one
15434 of the special cases outlined below.
15436 Failing that, the function divides BUILDER into two sub-vectors:
15437 v_even = even elements of BUILDER;
15438 v_odd = odd elements of BUILDER;
15440 and recursively calls itself with v_even and v_odd.
15442 if (recursive call succeeded for v_even or v_odd)
15443 TARGET = zip (v_even, v_odd)
15445 The function returns true if it managed to build TARGET from BUILDER
15446 with one of the special cases, false otherwise.
15448 Example: {a, 1, b, 2, c, 3, d, 4}
15450 The vector gets divided into:
15451 v_even = {a, b, c, d}
15452 v_odd = {1, 2, 3, 4}
15454 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
15455 initialize tmp2 from constant vector v_odd using emit_move_insn.
15457 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
15458 4 elements, so we construct tmp1 from v_even using insr:
15465 TARGET = zip (tmp1, tmp2)
15466 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
15469 aarch64_sve_expand_vector_init (rtx target
, const rtx_vector_builder
&builder
,
15470 int nelts
, int nelts_reqd
)
15472 machine_mode mode
= GET_MODE (target
);
15474 /* Case 1: Vector contains trailing constants. */
15476 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15477 (target
, builder
, nelts
, nelts_reqd
))
15480 /* Case 2: Vector contains leading constants. */
15482 rtx_vector_builder
rev_builder (mode
, 1, nelts_reqd
);
15483 for (int i
= 0; i
< nelts_reqd
; i
++)
15484 rev_builder
.quick_push (builder
.elt (nelts_reqd
- i
- 1));
15485 rev_builder
.finalize ();
15487 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15488 (target
, rev_builder
, nelts
, nelts_reqd
))
15490 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
15494 /* Case 3: Vector contains trailing same element. */
15496 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
15497 (target
, builder
, nelts_reqd
))
15500 /* Case 4: Vector contains leading same element. */
15502 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
15503 (target
, rev_builder
, nelts_reqd
) && nelts_reqd
== nelts
)
15505 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
15509 /* Avoid recursing below 4-elements.
15510 ??? The threshold 4 may need fine-tuning. */
15512 if (nelts_reqd
<= 4)
15515 rtx_vector_builder
v_even (mode
, 1, nelts
);
15516 rtx_vector_builder
v_odd (mode
, 1, nelts
);
15518 for (int i
= 0; i
< nelts
* 2; i
+= 2)
15520 v_even
.quick_push (builder
.elt (i
));
15521 v_odd
.quick_push (builder
.elt (i
+ 1));
15524 v_even
.finalize ();
15527 rtx tmp1
= gen_reg_rtx (mode
);
15528 bool did_even_p
= aarch64_sve_expand_vector_init (tmp1
, v_even
,
15529 nelts
, nelts_reqd
/ 2);
15531 rtx tmp2
= gen_reg_rtx (mode
);
15532 bool did_odd_p
= aarch64_sve_expand_vector_init (tmp2
, v_odd
,
15533 nelts
, nelts_reqd
/ 2);
15535 if (!did_even_p
&& !did_odd_p
)
15538 /* Initialize v_even and v_odd using INSR if it didn't match any of the
15539 special cases and zip v_even, v_odd. */
15542 aarch64_sve_expand_vector_init_insert_elems (tmp1
, v_even
, nelts_reqd
/ 2);
15545 aarch64_sve_expand_vector_init_insert_elems (tmp2
, v_odd
, nelts_reqd
/ 2);
15547 rtvec v
= gen_rtvec (2, tmp1
, tmp2
);
15548 emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
15552 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
15555 aarch64_sve_expand_vector_init (rtx target
, rtx vals
)
15557 machine_mode mode
= GET_MODE (target
);
15558 int nelts
= XVECLEN (vals
, 0);
15560 rtx_vector_builder
v (mode
, 1, nelts
);
15561 for (int i
= 0; i
< nelts
; i
++)
15562 v
.quick_push (XVECEXP (vals
, 0, i
));
15565 /* If neither sub-vectors of v could be initialized specially,
15566 then use INSR to insert all elements from v into TARGET.
15567 ??? This might not be optimal for vectors with large
15568 initializers like 16-element or above.
15569 For nelts < 4, it probably isn't useful to handle specially. */
15572 || !aarch64_sve_expand_vector_init (target
, v
, nelts
, nelts
))
15573 aarch64_sve_expand_vector_init_insert_elems (target
, v
, nelts
);
15576 static unsigned HOST_WIDE_INT
15577 aarch64_shift_truncation_mask (machine_mode mode
)
15579 if (!SHIFT_COUNT_TRUNCATED
|| aarch64_vector_data_mode_p (mode
))
15581 return GET_MODE_UNIT_BITSIZE (mode
) - 1;
15584 /* Select a format to encode pointers in exception handling data. */
15586 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
15589 switch (aarch64_cmodel
)
15591 case AARCH64_CMODEL_TINY
:
15592 case AARCH64_CMODEL_TINY_PIC
:
15593 case AARCH64_CMODEL_SMALL
:
15594 case AARCH64_CMODEL_SMALL_PIC
:
15595 case AARCH64_CMODEL_SMALL_SPIC
:
15596 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15598 type
= DW_EH_PE_sdata4
;
15601 /* No assumptions here. 8-byte relocs required. */
15602 type
= DW_EH_PE_sdata8
;
15605 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
15608 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
15611 aarch64_asm_output_variant_pcs (FILE *stream
, const tree decl
, const char* name
)
15613 if (aarch64_simd_decl_p (decl
))
15615 fprintf (stream
, "\t.variant_pcs\t");
15616 assemble_name (stream
, name
);
15617 fprintf (stream
, "\n");
15621 /* The last .arch and .tune assembly strings that we printed. */
15622 static std::string aarch64_last_printed_arch_string
;
15623 static std::string aarch64_last_printed_tune_string
;
15625 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15626 by the function fndecl. */
15629 aarch64_declare_function_name (FILE *stream
, const char* name
,
15632 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
15634 struct cl_target_option
*targ_options
;
15636 targ_options
= TREE_TARGET_OPTION (target_parts
);
15638 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
15639 gcc_assert (targ_options
);
15641 const struct processor
*this_arch
15642 = aarch64_get_arch (targ_options
->x_explicit_arch
);
15644 uint64_t isa_flags
= targ_options
->x_aarch64_isa_flags
;
15645 std::string extension
15646 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
15648 /* Only update the assembler .arch string if it is distinct from the last
15649 such string we printed. */
15650 std::string to_print
= this_arch
->name
+ extension
;
15651 if (to_print
!= aarch64_last_printed_arch_string
)
15653 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
15654 aarch64_last_printed_arch_string
= to_print
;
15657 /* Print the cpu name we're tuning for in the comments, might be
15658 useful to readers of the generated asm. Do it only when it changes
15659 from function to function and verbose assembly is requested. */
15660 const struct processor
*this_tune
15661 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
15663 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
15665 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
15667 aarch64_last_printed_tune_string
= this_tune
->name
;
15670 aarch64_asm_output_variant_pcs (stream
, fndecl
, name
);
15672 /* Don't forget the type directive for ELF. */
15673 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
15674 ASM_OUTPUT_LABEL (stream
, name
);
15677 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
15680 aarch64_asm_output_alias (FILE *stream
, const tree decl
, const tree target
)
15682 const char *name
= XSTR (XEXP (DECL_RTL (decl
), 0), 0);
15683 const char *value
= IDENTIFIER_POINTER (target
);
15684 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
15685 ASM_OUTPUT_DEF (stream
, name
, value
);
15688 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
15689 function symbol references. */
15692 aarch64_asm_output_external (FILE *stream
, tree decl
, const char* name
)
15694 default_elf_asm_output_external (stream
, decl
, name
);
15695 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
15698 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
15699 Used to output the .cfi_b_key_frame directive when signing the current
15700 function with the B key. */
15703 aarch64_post_cfi_startproc (FILE *f
, tree ignored ATTRIBUTE_UNUSED
)
15705 if (!cfun
->is_thunk
&& aarch64_return_address_signing_enabled ()
15706 && aarch64_ra_sign_key
== AARCH64_KEY_B
)
15707 asm_fprintf (f
, "\t.cfi_b_key_frame\n");
15710 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15713 aarch64_start_file (void)
15715 struct cl_target_option
*default_options
15716 = TREE_TARGET_OPTION (target_option_default_node
);
15718 const struct processor
*default_arch
15719 = aarch64_get_arch (default_options
->x_explicit_arch
);
15720 uint64_t default_isa_flags
= default_options
->x_aarch64_isa_flags
;
15721 std::string extension
15722 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
15723 default_arch
->flags
);
15725 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
15726 aarch64_last_printed_tune_string
= "";
15727 asm_fprintf (asm_out_file
, "\t.arch %s\n",
15728 aarch64_last_printed_arch_string
.c_str ());
15730 default_file_start ();
15733 /* Emit load exclusive. */
15736 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
15737 rtx mem
, rtx model_rtx
)
15739 emit_insn (gen_aarch64_load_exclusive (mode
, rval
, mem
, model_rtx
));
15742 /* Emit store exclusive. */
15745 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
15746 rtx rval
, rtx mem
, rtx model_rtx
)
15748 emit_insn (gen_aarch64_store_exclusive (mode
, bval
, rval
, mem
, model_rtx
));
15751 /* Mark the previous jump instruction as unlikely. */
15754 aarch64_emit_unlikely_jump (rtx insn
)
15756 rtx_insn
*jump
= emit_jump_insn (insn
);
15757 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
15760 /* Expand a compare and swap pattern. */
15763 aarch64_expand_compare_and_swap (rtx operands
[])
15765 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
, cc_reg
;
15766 machine_mode mode
, r_mode
;
15768 bval
= operands
[0];
15769 rval
= operands
[1];
15771 oldval
= operands
[3];
15772 newval
= operands
[4];
15773 is_weak
= operands
[5];
15774 mod_s
= operands
[6];
15775 mod_f
= operands
[7];
15776 mode
= GET_MODE (mem
);
15778 /* Normally the succ memory model must be stronger than fail, but in the
15779 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15780 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15781 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
15782 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
15783 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
15786 if (mode
== QImode
|| mode
== HImode
)
15789 rval
= gen_reg_rtx (r_mode
);
15794 /* The CAS insn requires oldval and rval overlap, but we need to
15795 have a copy of oldval saved across the operation to tell if
15796 the operation is successful. */
15797 if (reg_overlap_mentioned_p (rval
, oldval
))
15798 rval
= copy_to_mode_reg (r_mode
, oldval
);
15800 emit_move_insn (rval
, gen_lowpart (r_mode
, oldval
));
15802 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
,
15804 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
15808 /* The oldval predicate varies by mode. Test it and force to reg. */
15809 insn_code code
= code_for_aarch64_compare_and_swap (mode
);
15810 if (!insn_data
[code
].operand
[2].predicate (oldval
, mode
))
15811 oldval
= force_reg (mode
, oldval
);
15813 emit_insn (GEN_FCN (code
) (rval
, mem
, oldval
, newval
,
15814 is_weak
, mod_s
, mod_f
));
15815 cc_reg
= gen_rtx_REG (CCmode
, CC_REGNUM
);
15818 if (r_mode
!= mode
)
15819 rval
= gen_lowpart (mode
, rval
);
15820 emit_move_insn (operands
[1], rval
);
15822 x
= gen_rtx_EQ (SImode
, cc_reg
, const0_rtx
);
15823 emit_insn (gen_rtx_SET (bval
, x
));
15826 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
15827 sequence implementing an atomic operation. */
15830 aarch64_emit_post_barrier (enum memmodel model
)
15832 const enum memmodel base_model
= memmodel_base (model
);
15834 if (is_mm_sync (model
)
15835 && (base_model
== MEMMODEL_ACQUIRE
15836 || base_model
== MEMMODEL_ACQ_REL
15837 || base_model
== MEMMODEL_SEQ_CST
))
15839 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
15843 /* Split a compare and swap pattern. */
15846 aarch64_split_compare_and_swap (rtx operands
[])
15848 rtx rval
, mem
, oldval
, newval
, scratch
;
15851 rtx_code_label
*label1
, *label2
;
15853 enum memmodel model
;
15856 rval
= operands
[0];
15858 oldval
= operands
[2];
15859 newval
= operands
[3];
15860 is_weak
= (operands
[4] != const0_rtx
);
15861 model_rtx
= operands
[5];
15862 scratch
= operands
[7];
15863 mode
= GET_MODE (mem
);
15864 model
= memmodel_from_int (INTVAL (model_rtx
));
15866 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15869 LD[A]XR rval, [mem]
15871 ST[L]XR scratch, newval, [mem]
15872 CBNZ scratch, .label1
15875 bool strong_zero_p
= !is_weak
&& oldval
== const0_rtx
;
15880 label1
= gen_label_rtx ();
15881 emit_label (label1
);
15883 label2
= gen_label_rtx ();
15885 /* The initial load can be relaxed for a __sync operation since a final
15886 barrier will be emitted to stop code hoisting. */
15887 if (is_mm_sync (model
))
15888 aarch64_emit_load_exclusive (mode
, rval
, mem
,
15889 GEN_INT (MEMMODEL_RELAXED
));
15891 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
15895 if (aarch64_track_speculation
)
15897 /* Emit an explicit compare instruction, so that we can correctly
15898 track the condition codes. */
15899 rtx cc_reg
= aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
15900 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
15903 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
15905 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
15906 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
15907 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
15911 cond
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
15912 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
15913 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
15914 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
15915 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
15918 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
15922 if (aarch64_track_speculation
)
15924 /* Emit an explicit compare instruction, so that we can correctly
15925 track the condition codes. */
15926 rtx cc_reg
= aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
15927 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
15930 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
15932 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
15933 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
15934 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
15938 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
15939 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
15940 emit_insn (gen_rtx_SET (cond
, x
));
15943 emit_label (label2
);
15944 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
15945 to set the condition flags. If this is not used it will be removed by
15949 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
15950 x
= gen_rtx_COMPARE (CCmode
, rval
, const0_rtx
);
15951 emit_insn (gen_rtx_SET (cond
, x
));
15953 /* Emit any final barrier needed for a __sync operation. */
15954 if (is_mm_sync (model
))
15955 aarch64_emit_post_barrier (model
);
15958 /* Split an atomic operation. */
15961 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
15962 rtx value
, rtx model_rtx
, rtx cond
)
15964 machine_mode mode
= GET_MODE (mem
);
15965 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
15966 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
15967 const bool is_sync
= is_mm_sync (model
);
15968 rtx_code_label
*label
;
15971 /* Split the atomic operation into a sequence. */
15972 label
= gen_label_rtx ();
15973 emit_label (label
);
15976 new_out
= gen_lowpart (wmode
, new_out
);
15978 old_out
= gen_lowpart (wmode
, old_out
);
15981 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
15983 /* The initial load can be relaxed for a __sync operation since a final
15984 barrier will be emitted to stop code hoisting. */
15986 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
15987 GEN_INT (MEMMODEL_RELAXED
));
15989 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
15998 x
= gen_rtx_AND (wmode
, old_out
, value
);
15999 emit_insn (gen_rtx_SET (new_out
, x
));
16000 x
= gen_rtx_NOT (wmode
, new_out
);
16001 emit_insn (gen_rtx_SET (new_out
, x
));
16005 if (CONST_INT_P (value
))
16007 value
= GEN_INT (-INTVAL (value
));
16010 /* Fall through. */
16013 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
16014 emit_insn (gen_rtx_SET (new_out
, x
));
16018 aarch64_emit_store_exclusive (mode
, cond
, mem
,
16019 gen_lowpart (mode
, new_out
), model_rtx
);
16021 if (aarch64_track_speculation
)
16023 /* Emit an explicit compare instruction, so that we can correctly
16024 track the condition codes. */
16025 rtx cc_reg
= aarch64_gen_compare_reg (NE
, cond
, const0_rtx
);
16026 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
16029 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
16031 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16032 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
16033 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
16035 /* Emit any final barrier needed for a __sync operation. */
16037 aarch64_emit_post_barrier (model
);
16041 aarch64_init_libfuncs (void)
16043 /* Half-precision float operations. The compiler handles all operations
16044 with NULL libfuncs by converting to SFmode. */
16047 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
16048 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
16051 set_optab_libfunc (add_optab
, HFmode
, NULL
);
16052 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
16053 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
16054 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
16055 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
16058 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
16059 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
16060 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
16061 set_optab_libfunc (le_optab
, HFmode
, NULL
);
16062 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
16063 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
16064 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
16067 /* Target hook for c_mode_for_suffix. */
16068 static machine_mode
16069 aarch64_c_mode_for_suffix (char suffix
)
16077 /* We can only represent floating point constants which will fit in
16078 "quarter-precision" values. These values are characterised by
16079 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
16082 (-1)^s * (n/16) * 2^r
16085 's' is the sign bit.
16086 'n' is an integer in the range 16 <= n <= 31.
16087 'r' is an integer in the range -3 <= r <= 4. */
16089 /* Return true iff X can be represented by a quarter-precision
16090 floating point immediate operand X. Note, we cannot represent 0.0. */
16092 aarch64_float_const_representable_p (rtx x
)
16094 /* This represents our current view of how many bits
16095 make up the mantissa. */
16096 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
16098 unsigned HOST_WIDE_INT mantissa
, mask
;
16099 REAL_VALUE_TYPE r
, m
;
16102 if (!CONST_DOUBLE_P (x
))
16105 if (GET_MODE (x
) == VOIDmode
16106 || (GET_MODE (x
) == HFmode
&& !TARGET_FP_F16INST
))
16109 r
= *CONST_DOUBLE_REAL_VALUE (x
);
16111 /* We cannot represent infinities, NaNs or +/-zero. We won't
16112 know if we have +zero until we analyse the mantissa, but we
16113 can reject the other invalid values. */
16114 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
16115 || REAL_VALUE_MINUS_ZERO (r
))
16118 /* Extract exponent. */
16119 r
= real_value_abs (&r
);
16120 exponent
= REAL_EXP (&r
);
16122 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
16123 highest (sign) bit, with a fixed binary point at bit point_pos.
16124 m1 holds the low part of the mantissa, m2 the high part.
16125 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
16126 bits for the mantissa, this can fail (low bits will be lost). */
16127 real_ldexp (&m
, &r
, point_pos
- exponent
);
16128 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
16130 /* If the low part of the mantissa has bits set we cannot represent
16132 if (w
.ulow () != 0)
16134 /* We have rejected the lower HOST_WIDE_INT, so update our
16135 understanding of how many bits lie in the mantissa and
16136 look only at the high HOST_WIDE_INT. */
16137 mantissa
= w
.elt (1);
16138 point_pos
-= HOST_BITS_PER_WIDE_INT
;
16140 /* We can only represent values with a mantissa of the form 1.xxxx. */
16141 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
16142 if ((mantissa
& mask
) != 0)
16145 /* Having filtered unrepresentable values, we may now remove all
16146 but the highest 5 bits. */
16147 mantissa
>>= point_pos
- 5;
16149 /* We cannot represent the value 0.0, so reject it. This is handled
16154 /* Then, as bit 4 is always set, we can mask it off, leaving
16155 the mantissa in the range [0, 15]. */
16156 mantissa
&= ~(1 << 4);
16157 gcc_assert (mantissa
<= 15);
16159 /* GCC internally does not use IEEE754-like encoding (where normalized
16160 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
16161 Our mantissa values are shifted 4 places to the left relative to
16162 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
16163 by 5 places to correct for GCC's representation. */
16164 exponent
= 5 - exponent
;
16166 return (exponent
>= 0 && exponent
<= 7);
16169 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
16170 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
16171 output MOVI/MVNI, ORR or BIC immediate. */
16173 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
16174 enum simd_immediate_check which
)
16177 static char templ
[40];
16178 const char *mnemonic
;
16179 const char *shift_op
;
16180 unsigned int lane_count
= 0;
16183 struct simd_immediate_info info
;
16185 /* This will return true to show const_vector is legal for use as either
16186 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
16187 It will also update INFO to show how the immediate should be generated.
16188 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
16189 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
16190 gcc_assert (is_valid
);
16192 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
16193 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
16195 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
16197 gcc_assert (info
.shift
== 0 && info
.insn
== simd_immediate_info::MOV
);
16198 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
16199 move immediate path. */
16200 if (aarch64_float_const_zero_rtx_p (info
.value
))
16201 info
.value
= GEN_INT (0);
16204 const unsigned int buf_size
= 20;
16205 char float_buf
[buf_size
] = {'\0'};
16206 real_to_decimal_for_mode (float_buf
,
16207 CONST_DOUBLE_REAL_VALUE (info
.value
),
16208 buf_size
, buf_size
, 1, info
.elt_mode
);
16210 if (lane_count
== 1)
16211 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
16213 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
16214 lane_count
, element_char
, float_buf
);
16219 gcc_assert (CONST_INT_P (info
.value
));
16221 if (which
== AARCH64_CHECK_MOV
)
16223 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
16224 shift_op
= info
.modifier
== simd_immediate_info::MSL
? "msl" : "lsl";
16225 if (lane_count
== 1)
16226 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
16227 mnemonic
, UINTVAL (info
.value
));
16228 else if (info
.shift
)
16229 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
16230 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
16231 element_char
, UINTVAL (info
.value
), shift_op
, info
.shift
);
16233 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
16234 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
16235 element_char
, UINTVAL (info
.value
));
16239 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
16240 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
16242 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
16243 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
16244 element_char
, UINTVAL (info
.value
), "lsl", info
.shift
);
16246 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
16247 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
16248 element_char
, UINTVAL (info
.value
));
16254 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
16257 /* If a floating point number was passed and we desire to use it in an
16258 integer mode do the conversion to integer. */
16259 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
16261 unsigned HOST_WIDE_INT ival
;
16262 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
16263 gcc_unreachable ();
16264 immediate
= gen_int_mode (ival
, mode
);
16267 machine_mode vmode
;
16268 /* use a 64 bit mode for everything except for DI/DF mode, where we use
16269 a 128 bit vector mode. */
16270 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
16272 vmode
= aarch64_simd_container_mode (mode
, width
);
16273 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
16274 return aarch64_output_simd_mov_immediate (v_op
, width
);
16277 /* Return the output string to use for moving immediate CONST_VECTOR
16278 into an SVE register. */
16281 aarch64_output_sve_mov_immediate (rtx const_vector
)
16283 static char templ
[40];
16284 struct simd_immediate_info info
;
16287 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
16288 gcc_assert (is_valid
);
16290 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
16294 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
16295 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
16296 element_char
, INTVAL (info
.value
), INTVAL (info
.step
));
16300 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
16302 if (aarch64_float_const_zero_rtx_p (info
.value
))
16303 info
.value
= GEN_INT (0);
16306 const int buf_size
= 20;
16307 char float_buf
[buf_size
] = {};
16308 real_to_decimal_for_mode (float_buf
,
16309 CONST_DOUBLE_REAL_VALUE (info
.value
),
16310 buf_size
, buf_size
, 1, info
.elt_mode
);
16312 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
16313 element_char
, float_buf
);
16318 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
16319 element_char
, INTVAL (info
.value
));
16323 /* Return the asm format for a PTRUE instruction whose destination has
16324 mode MODE. SUFFIX is the element size suffix. */
16327 aarch64_output_ptrue (machine_mode mode
, char suffix
)
16329 unsigned int nunits
;
16330 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
16331 if (GET_MODE_NUNITS (mode
).is_constant (&nunits
))
16332 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", suffix
, nunits
);
16334 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, all", suffix
);
16338 /* Split operands into moves from op[1] + op[2] into op[0]. */
16341 aarch64_split_combinev16qi (rtx operands
[3])
16343 unsigned int dest
= REGNO (operands
[0]);
16344 unsigned int src1
= REGNO (operands
[1]);
16345 unsigned int src2
= REGNO (operands
[2]);
16346 machine_mode halfmode
= GET_MODE (operands
[1]);
16347 unsigned int halfregs
= REG_NREGS (operands
[1]);
16348 rtx destlo
, desthi
;
16350 gcc_assert (halfmode
== V16QImode
);
16352 if (src1
== dest
&& src2
== dest
+ halfregs
)
16354 /* No-op move. Can't split to nothing; emit something. */
16355 emit_note (NOTE_INSN_DELETED
);
16359 /* Preserve register attributes for variable tracking. */
16360 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
16361 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
16362 GET_MODE_SIZE (halfmode
));
16364 /* Special case of reversed high/low parts. */
16365 if (reg_overlap_mentioned_p (operands
[2], destlo
)
16366 && reg_overlap_mentioned_p (operands
[1], desthi
))
16368 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
16369 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
16370 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
16372 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
16374 /* Try to avoid unnecessary moves if part of the result
16375 is in the right place already. */
16377 emit_move_insn (destlo
, operands
[1]);
16378 if (src2
!= dest
+ halfregs
)
16379 emit_move_insn (desthi
, operands
[2]);
16383 if (src2
!= dest
+ halfregs
)
16384 emit_move_insn (desthi
, operands
[2]);
16386 emit_move_insn (destlo
, operands
[1]);
16390 /* vec_perm support. */
16392 struct expand_vec_perm_d
16394 rtx target
, op0
, op1
;
16395 vec_perm_indices perm
;
16396 machine_mode vmode
;
16397 unsigned int vec_flags
;
16402 /* Generate a variable permutation. */
16405 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
16407 machine_mode vmode
= GET_MODE (target
);
16408 bool one_vector_p
= rtx_equal_p (op0
, op1
);
16410 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
16411 gcc_checking_assert (GET_MODE (op0
) == vmode
);
16412 gcc_checking_assert (GET_MODE (op1
) == vmode
);
16413 gcc_checking_assert (GET_MODE (sel
) == vmode
);
16414 gcc_checking_assert (TARGET_SIMD
);
16418 if (vmode
== V8QImode
)
16420 /* Expand the argument to a V16QI mode by duplicating it. */
16421 rtx pair
= gen_reg_rtx (V16QImode
);
16422 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
16423 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
16427 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
16434 if (vmode
== V8QImode
)
16436 pair
= gen_reg_rtx (V16QImode
);
16437 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
16438 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
16442 pair
= gen_reg_rtx (OImode
);
16443 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
16444 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
16449 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
16450 NELT is the number of elements in the vector. */
16453 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
16456 machine_mode vmode
= GET_MODE (target
);
16457 bool one_vector_p
= rtx_equal_p (op0
, op1
);
16460 /* The TBL instruction does not use a modulo index, so we must take care
16461 of that ourselves. */
16462 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
16463 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
16464 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
16466 /* For big-endian, we also need to reverse the index within the vector
16467 (but not which vector). */
16468 if (BYTES_BIG_ENDIAN
)
16470 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
16472 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
16473 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
16474 NULL
, 0, OPTAB_LIB_WIDEN
);
16476 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
16479 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
16482 emit_unspec2 (rtx target
, int code
, rtx op0
, rtx op1
)
16484 emit_insn (gen_rtx_SET (target
,
16485 gen_rtx_UNSPEC (GET_MODE (target
),
16486 gen_rtvec (2, op0
, op1
), code
)));
16489 /* Expand an SVE vec_perm with the given operands. */
16492 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
16494 machine_mode data_mode
= GET_MODE (target
);
16495 machine_mode sel_mode
= GET_MODE (sel
);
16496 /* Enforced by the pattern condition. */
16497 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
16499 /* Note: vec_perm indices are supposed to wrap when they go beyond the
16500 size of the two value vectors, i.e. the upper bits of the indices
16501 are effectively ignored. SVE TBL instead produces 0 for any
16502 out-of-range indices, so we need to modulo all the vec_perm indices
16503 to ensure they are all in range. */
16504 rtx sel_reg
= force_reg (sel_mode
, sel
);
16506 /* Check if the sel only references the first values vector. */
16507 if (GET_CODE (sel
) == CONST_VECTOR
16508 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
16510 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
16514 /* Check if the two values vectors are the same. */
16515 if (rtx_equal_p (op0
, op1
))
16517 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
16518 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
16519 NULL
, 0, OPTAB_DIRECT
);
16520 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
16524 /* Run TBL on for each value vector and combine the results. */
16526 rtx res0
= gen_reg_rtx (data_mode
);
16527 rtx res1
= gen_reg_rtx (data_mode
);
16528 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
16529 if (GET_CODE (sel
) != CONST_VECTOR
16530 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
16532 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
16534 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
16535 NULL
, 0, OPTAB_DIRECT
);
16537 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
16538 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
16539 NULL
, 0, OPTAB_DIRECT
);
16540 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
16541 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
16542 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
16544 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
16547 /* Recognize patterns suitable for the TRN instructions. */
16549 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
16552 poly_uint64 nelt
= d
->perm
.length ();
16553 rtx out
, in0
, in1
, x
;
16554 machine_mode vmode
= d
->vmode
;
16556 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
16559 /* Note that these are little-endian tests.
16560 We correct for big-endian later. */
16561 if (!d
->perm
[0].is_constant (&odd
)
16562 || (odd
!= 0 && odd
!= 1)
16563 || !d
->perm
.series_p (0, 2, odd
, 2)
16564 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
16573 /* We don't need a big-endian lane correction for SVE; see the comment
16574 at the head of aarch64-sve.md for details. */
16575 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
16577 x
= in0
, in0
= in1
, in1
= x
;
16582 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
16583 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
16587 /* Recognize patterns suitable for the UZP instructions. */
16589 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
16592 rtx out
, in0
, in1
, x
;
16593 machine_mode vmode
= d
->vmode
;
16595 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
16598 /* Note that these are little-endian tests.
16599 We correct for big-endian later. */
16600 if (!d
->perm
[0].is_constant (&odd
)
16601 || (odd
!= 0 && odd
!= 1)
16602 || !d
->perm
.series_p (0, 1, odd
, 2))
16611 /* We don't need a big-endian lane correction for SVE; see the comment
16612 at the head of aarch64-sve.md for details. */
16613 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
16615 x
= in0
, in0
= in1
, in1
= x
;
16620 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
16621 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
16625 /* Recognize patterns suitable for the ZIP instructions. */
16627 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
16630 poly_uint64 nelt
= d
->perm
.length ();
16631 rtx out
, in0
, in1
, x
;
16632 machine_mode vmode
= d
->vmode
;
16634 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
16637 /* Note that these are little-endian tests.
16638 We correct for big-endian later. */
16639 poly_uint64 first
= d
->perm
[0];
16640 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
16641 || !d
->perm
.series_p (0, 2, first
, 1)
16642 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
16644 high
= maybe_ne (first
, 0U);
16652 /* We don't need a big-endian lane correction for SVE; see the comment
16653 at the head of aarch64-sve.md for details. */
16654 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
16656 x
= in0
, in0
= in1
, in1
= x
;
16661 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
16662 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
16666 /* Recognize patterns for the EXT insn. */
16669 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
16671 HOST_WIDE_INT location
;
16674 /* The first element always refers to the first vector.
16675 Check if the extracted indices are increasing by one. */
16676 if (d
->vec_flags
== VEC_SVE_PRED
16677 || !d
->perm
[0].is_constant (&location
)
16678 || !d
->perm
.series_p (0, 1, location
, 1))
16685 /* The case where (location == 0) is a no-op for both big- and little-endian,
16686 and is removed by the mid-end at optimization levels -O1 and higher.
16688 We don't need a big-endian lane correction for SVE; see the comment
16689 at the head of aarch64-sve.md for details. */
16690 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
16692 /* After setup, we want the high elements of the first vector (stored
16693 at the LSB end of the register), and the low elements of the second
16694 vector (stored at the MSB end of the register). So swap. */
16695 std::swap (d
->op0
, d
->op1
);
16696 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16697 to_constant () is safe since this is restricted to Advanced SIMD
16699 location
= d
->perm
.length ().to_constant () - location
;
16702 offset
= GEN_INT (location
);
16703 emit_set_insn (d
->target
,
16704 gen_rtx_UNSPEC (d
->vmode
,
16705 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
16710 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16711 within each 64-bit, 32-bit or 16-bit granule. */
16714 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
16716 HOST_WIDE_INT diff
;
16717 unsigned int i
, size
, unspec
;
16718 machine_mode pred_mode
;
16720 if (d
->vec_flags
== VEC_SVE_PRED
16721 || !d
->one_vector_p
16722 || !d
->perm
[0].is_constant (&diff
))
16725 size
= (diff
+ 1) * GET_MODE_UNIT_SIZE (d
->vmode
);
16728 unspec
= UNSPEC_REV64
;
16729 pred_mode
= VNx2BImode
;
16731 else if (size
== 4)
16733 unspec
= UNSPEC_REV32
;
16734 pred_mode
= VNx4BImode
;
16736 else if (size
== 2)
16738 unspec
= UNSPEC_REV16
;
16739 pred_mode
= VNx8BImode
;
16744 unsigned int step
= diff
+ 1;
16745 for (i
= 0; i
< step
; ++i
)
16746 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
16753 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
16754 if (d
->vec_flags
== VEC_SVE_DATA
)
16756 rtx pred
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
16757 src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (2, pred
, src
),
16758 UNSPEC_MERGE_PTRUE
);
16760 emit_set_insn (d
->target
, src
);
16764 /* Recognize patterns for the REV insn, which reverses elements within
16768 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
16770 poly_uint64 nelt
= d
->perm
.length ();
16772 if (!d
->one_vector_p
|| d
->vec_flags
!= VEC_SVE_DATA
)
16775 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
16782 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
16783 emit_set_insn (d
->target
, src
);
16788 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
16790 rtx out
= d
->target
;
16793 machine_mode vmode
= d
->vmode
;
16796 if (d
->vec_flags
== VEC_SVE_PRED
16797 || d
->perm
.encoding ().encoded_nelts () != 1
16798 || !d
->perm
[0].is_constant (&elt
))
16801 if (d
->vec_flags
== VEC_SVE_DATA
&& elt
>= 64 * GET_MODE_UNIT_SIZE (vmode
))
16808 /* The generic preparation in aarch64_expand_vec_perm_const_1
16809 swaps the operand order and the permute indices if it finds
16810 d->perm[0] to be in the second operand. Thus, we can always
16811 use d->op0 and need not do any extra arithmetic to get the
16812 correct lane number. */
16814 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
16816 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
16817 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
16818 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
16823 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
16825 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
16826 machine_mode vmode
= d
->vmode
;
16828 /* Make sure that the indices are constant. */
16829 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
16830 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
16831 if (!d
->perm
[i
].is_constant ())
16837 /* Generic code will try constant permutation twice. Once with the
16838 original mode and again with the elements lowered to QImode.
16839 So wait and don't do the selector expansion ourselves. */
16840 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
16843 /* to_constant is safe since this routine is specific to Advanced SIMD
16845 unsigned int nelt
= d
->perm
.length ().to_constant ();
16846 for (unsigned int i
= 0; i
< nelt
; ++i
)
16847 /* If big-endian and two vectors we end up with a weird mixed-endian
16848 mode on NEON. Reverse the index within each word but not the word
16849 itself. to_constant is safe because we checked is_constant above. */
16850 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
16851 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
16852 : d
->perm
[i
].to_constant ());
16854 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
16855 sel
= force_reg (vmode
, sel
);
16857 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
16861 /* Try to implement D using an SVE TBL instruction. */
16864 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
16866 unsigned HOST_WIDE_INT nelt
;
16868 /* Permuting two variable-length vectors could overflow the
16870 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
16876 machine_mode sel_mode
= mode_for_int_vector (d
->vmode
).require ();
16877 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
16878 if (d
->one_vector_p
)
16879 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
16881 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
16886 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
16888 /* The pattern matching functions above are written to look for a small
16889 number to begin the sequence (0, 1, N/2). If we begin with an index
16890 from the second operand, we can swap the operands. */
16891 poly_int64 nelt
= d
->perm
.length ();
16892 if (known_ge (d
->perm
[0], nelt
))
16894 d
->perm
.rotate_inputs (1);
16895 std::swap (d
->op0
, d
->op1
);
16898 if ((d
->vec_flags
== VEC_ADVSIMD
16899 || d
->vec_flags
== VEC_SVE_DATA
16900 || d
->vec_flags
== VEC_SVE_PRED
)
16901 && known_gt (nelt
, 1))
16903 if (aarch64_evpc_rev_local (d
))
16905 else if (aarch64_evpc_rev_global (d
))
16907 else if (aarch64_evpc_ext (d
))
16909 else if (aarch64_evpc_dup (d
))
16911 else if (aarch64_evpc_zip (d
))
16913 else if (aarch64_evpc_uzp (d
))
16915 else if (aarch64_evpc_trn (d
))
16917 if (d
->vec_flags
== VEC_SVE_DATA
)
16918 return aarch64_evpc_sve_tbl (d
);
16919 else if (d
->vec_flags
== VEC_ADVSIMD
)
16920 return aarch64_evpc_tbl (d
);
16925 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
16928 aarch64_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
16929 rtx op1
, const vec_perm_indices
&sel
)
16931 struct expand_vec_perm_d d
;
16933 /* Check whether the mask can be applied to a single vector. */
16934 if (sel
.ninputs () == 1
16935 || (op0
&& rtx_equal_p (op0
, op1
)))
16936 d
.one_vector_p
= true;
16937 else if (sel
.all_from_input_p (0))
16939 d
.one_vector_p
= true;
16942 else if (sel
.all_from_input_p (1))
16944 d
.one_vector_p
= true;
16948 d
.one_vector_p
= false;
16950 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
16951 sel
.nelts_per_input ());
16953 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
16957 d
.testing_p
= !target
;
16960 return aarch64_expand_vec_perm_const_1 (&d
);
16962 rtx_insn
*last
= get_last_insn ();
16963 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
16964 gcc_assert (last
== get_last_insn ());
16969 /* Generate a byte permute mask for a register of mode MODE,
16970 which has NUNITS units. */
16973 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
16975 /* We have to reverse each vector because we dont have
16976 a permuted load that can reverse-load according to ABI rules. */
16978 rtvec v
= rtvec_alloc (16);
16980 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
16982 gcc_assert (BYTES_BIG_ENDIAN
);
16983 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
16985 for (i
= 0; i
< nunits
; i
++)
16986 for (j
= 0; j
< usize
; j
++)
16987 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
16988 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
16989 return force_reg (V16QImode
, mask
);
16992 /* Return true if X is a valid second operand for the SVE instruction
16993 that implements integer comparison OP_CODE. */
16996 aarch64_sve_cmp_operand_p (rtx_code op_code
, rtx x
)
16998 if (register_operand (x
, VOIDmode
))
17007 return aarch64_sve_cmp_immediate_p (x
, false);
17014 return aarch64_sve_cmp_immediate_p (x
, true);
17016 gcc_unreachable ();
17020 /* Use predicated SVE instructions to implement the equivalent of:
17024 given that PTRUE is an all-true predicate of the appropriate mode. */
17027 aarch64_emit_sve_ptrue_op (rtx target
, rtx ptrue
, rtx op
)
17029 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
17030 gen_rtvec (2, ptrue
, op
),
17031 UNSPEC_MERGE_PTRUE
);
17032 rtx_insn
*insn
= emit_set_insn (target
, unspec
);
17033 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
17036 /* Likewise, but also clobber the condition codes. */
17039 aarch64_emit_sve_ptrue_op_cc (rtx target
, rtx ptrue
, rtx op
)
17041 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
17042 gen_rtvec (2, ptrue
, op
),
17043 UNSPEC_MERGE_PTRUE
);
17044 rtx_insn
*insn
= emit_insn (gen_set_clobber_cc (target
, unspec
));
17045 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
17048 /* Return the UNSPEC_COND_* code for comparison CODE. */
17050 static unsigned int
17051 aarch64_unspec_cond_code (rtx_code code
)
17056 return UNSPEC_COND_NE
;
17058 return UNSPEC_COND_EQ
;
17060 return UNSPEC_COND_LT
;
17062 return UNSPEC_COND_GT
;
17064 return UNSPEC_COND_LE
;
17066 return UNSPEC_COND_GE
;
17068 gcc_unreachable ();
17074 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
17076 where <X> is the operation associated with comparison CODE. This form
17077 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
17078 semantics, such as when PRED might not be all-true and when comparing
17079 inactive lanes could have side effects. */
17082 aarch64_emit_sve_predicated_cond (rtx target
, rtx_code code
,
17083 rtx pred
, rtx op0
, rtx op1
)
17085 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (pred
),
17086 gen_rtvec (3, pred
, op0
, op1
),
17087 aarch64_unspec_cond_code (code
));
17088 emit_set_insn (target
, unspec
);
17091 /* Expand an SVE integer comparison using the SVE equivalent of:
17093 (set TARGET (CODE OP0 OP1)). */
17096 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
17098 machine_mode pred_mode
= GET_MODE (target
);
17099 machine_mode data_mode
= GET_MODE (op0
);
17101 if (!aarch64_sve_cmp_operand_p (code
, op1
))
17102 op1
= force_reg (data_mode
, op1
);
17104 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
17105 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
17106 aarch64_emit_sve_ptrue_op_cc (target
, ptrue
, cond
);
17109 /* Emit the SVE equivalent of:
17111 (set TMP1 (CODE1 OP0 OP1))
17112 (set TMP2 (CODE2 OP0 OP1))
17113 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17115 PTRUE is an all-true predicate with the same mode as TARGET. */
17118 aarch64_emit_sve_or_conds (rtx target
, rtx_code code1
, rtx_code code2
,
17119 rtx ptrue
, rtx op0
, rtx op1
)
17121 machine_mode pred_mode
= GET_MODE (ptrue
);
17122 rtx tmp1
= gen_reg_rtx (pred_mode
);
17123 aarch64_emit_sve_ptrue_op (tmp1
, ptrue
,
17124 gen_rtx_fmt_ee (code1
, pred_mode
, op0
, op1
));
17125 rtx tmp2
= gen_reg_rtx (pred_mode
);
17126 aarch64_emit_sve_ptrue_op (tmp2
, ptrue
,
17127 gen_rtx_fmt_ee (code2
, pred_mode
, op0
, op1
));
17128 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
17131 /* Emit the SVE equivalent of:
17133 (set TMP (CODE OP0 OP1))
17134 (set TARGET (not TMP))
17136 PTRUE is an all-true predicate with the same mode as TARGET. */
17139 aarch64_emit_sve_inverted_cond (rtx target
, rtx ptrue
, rtx_code code
,
17142 machine_mode pred_mode
= GET_MODE (ptrue
);
17143 rtx tmp
= gen_reg_rtx (pred_mode
);
17144 aarch64_emit_sve_ptrue_op (tmp
, ptrue
,
17145 gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
));
17146 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
17149 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17151 (set TARGET (CODE OP0 OP1))
17153 If CAN_INVERT_P is true, the caller can also handle inverted results;
17154 return true if the result is in fact inverted. */
17157 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
17158 rtx op0
, rtx op1
, bool can_invert_p
)
17160 machine_mode pred_mode
= GET_MODE (target
);
17161 machine_mode data_mode
= GET_MODE (op0
);
17163 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
17167 /* UNORDERED has no immediate form. */
17168 op1
= force_reg (data_mode
, op1
);
17177 /* There is native support for the comparison. */
17178 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
17179 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
17184 /* This is a trapping operation (LT or GT). */
17185 aarch64_emit_sve_or_conds (target
, LT
, GT
, ptrue
, op0
, op1
);
17189 if (!flag_trapping_math
)
17191 /* This would trap for signaling NaNs. */
17192 op1
= force_reg (data_mode
, op1
);
17193 aarch64_emit_sve_or_conds (target
, UNORDERED
, EQ
, ptrue
, op0
, op1
);
17201 if (flag_trapping_math
)
17203 /* Work out which elements are ordered. */
17204 rtx ordered
= gen_reg_rtx (pred_mode
);
17205 op1
= force_reg (data_mode
, op1
);
17206 aarch64_emit_sve_inverted_cond (ordered
, ptrue
, UNORDERED
, op0
, op1
);
17208 /* Test the opposite condition for the ordered elements,
17209 then invert the result. */
17213 code
= reverse_condition_maybe_unordered (code
);
17216 aarch64_emit_sve_predicated_cond (target
, code
,
17217 ordered
, op0
, op1
);
17220 rtx tmp
= gen_reg_rtx (pred_mode
);
17221 aarch64_emit_sve_predicated_cond (tmp
, code
, ordered
, op0
, op1
);
17222 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
17228 /* ORDERED has no immediate form. */
17229 op1
= force_reg (data_mode
, op1
);
17233 gcc_unreachable ();
17236 /* There is native support for the inverse comparison. */
17237 code
= reverse_condition_maybe_unordered (code
);
17240 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
17241 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
17244 aarch64_emit_sve_inverted_cond (target
, ptrue
, code
, op0
, op1
);
17248 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
17249 of the data being selected and CMP_MODE is the mode of the values being
17253 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
17256 machine_mode pred_mode
17257 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode
),
17258 GET_MODE_SIZE (cmp_mode
)).require ();
17259 rtx pred
= gen_reg_rtx (pred_mode
);
17260 if (FLOAT_MODE_P (cmp_mode
))
17262 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
17263 ops
[4], ops
[5], true))
17264 std::swap (ops
[1], ops
[2]);
17267 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
17269 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
17270 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
17273 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
17274 true. However due to issues with register allocation it is preferable
17275 to avoid tieing integer scalar and FP scalar modes. Executing integer
17276 operations in general registers is better than treating them as scalar
17277 vector operations. This reduces latency and avoids redundant int<->FP
17278 moves. So tie modes if they are either the same class, or vector modes
17279 with other vector modes, vector structs or any scalar mode. */
17282 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
17284 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
17287 /* We specifically want to allow elements of "structure" modes to
17288 be tieable to the structure. This more general condition allows
17289 other rarer situations too. The reason we don't extend this to
17290 predicate modes is that there are no predicate structure modes
17291 nor any specific instructions for extracting part of a predicate
17293 if (aarch64_vector_data_mode_p (mode1
)
17294 && aarch64_vector_data_mode_p (mode2
))
17297 /* Also allow any scalar modes with vectors. */
17298 if (aarch64_vector_mode_supported_p (mode1
)
17299 || aarch64_vector_mode_supported_p (mode2
))
17305 /* Return a new RTX holding the result of moving POINTER forward by
17309 aarch64_move_pointer (rtx pointer
, poly_int64 amount
)
17311 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
17313 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
17317 /* Return a new RTX holding the result of moving POINTER forward by the
17318 size of the mode it points to. */
17321 aarch64_progress_pointer (rtx pointer
)
17323 return aarch64_move_pointer (pointer
, GET_MODE_SIZE (GET_MODE (pointer
)));
17326 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
17330 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
17333 rtx reg
= gen_reg_rtx (mode
);
17335 /* "Cast" the pointers to the correct mode. */
17336 *src
= adjust_address (*src
, mode
, 0);
17337 *dst
= adjust_address (*dst
, mode
, 0);
17338 /* Emit the memcpy. */
17339 emit_move_insn (reg
, *src
);
17340 emit_move_insn (*dst
, reg
);
17341 /* Move the pointers forward. */
17342 *src
= aarch64_progress_pointer (*src
);
17343 *dst
= aarch64_progress_pointer (*dst
);
17346 /* Expand movmem, as if from a __builtin_memcpy. Return true if
17347 we succeed, otherwise return false. */
17350 aarch64_expand_movmem (rtx
*operands
)
17353 rtx dst
= operands
[0];
17354 rtx src
= operands
[1];
17356 machine_mode cur_mode
= BLKmode
, next_mode
;
17357 bool speed_p
= !optimize_function_for_size_p (cfun
);
17359 /* When optimizing for size, give a better estimate of the length of a
17360 memcpy call, but use the default otherwise. Moves larger than 8 bytes
17361 will always require an even number of instructions to do now. And each
17362 operation requires both a load+store, so devide the max number by 2. */
17363 int max_num_moves
= (speed_p
? 16 : AARCH64_CALL_RATIO
) / 2;
17365 /* We can't do anything smart if the amount to copy is not constant. */
17366 if (!CONST_INT_P (operands
[2]))
17369 n
= INTVAL (operands
[2]);
17371 /* Try to keep the number of instructions low. For all cases we will do at
17372 most two moves for the residual amount, since we'll always overlap the
17374 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_num_moves
)
17377 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
17378 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
17380 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
17381 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
17383 /* Convert n to bits to make the rest of the code simpler. */
17384 n
= n
* BITS_PER_UNIT
;
17386 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
17387 larger than TImode, but we should not use them for loads/stores here. */
17388 const int copy_limit
= GET_MODE_BITSIZE (TImode
);
17392 /* Find the largest mode in which to do the copy in without over reading
17394 opt_scalar_int_mode mode_iter
;
17395 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
17396 if (GET_MODE_BITSIZE (mode_iter
.require ()) <= MIN (n
, copy_limit
))
17397 cur_mode
= mode_iter
.require ();
17399 gcc_assert (cur_mode
!= BLKmode
);
17401 mode_bits
= GET_MODE_BITSIZE (cur_mode
).to_constant ();
17402 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, cur_mode
);
17406 /* Do certain trailing copies as overlapping if it's going to be
17407 cheaper. i.e. less instructions to do so. For instance doing a 15
17408 byte copy it's more efficient to do two overlapping 8 byte copies than
17410 if (n
> 0 && n
<= 8 * BITS_PER_UNIT
)
17412 next_mode
= smallest_mode_for_size (n
, MODE_INT
);
17413 int n_bits
= GET_MODE_BITSIZE (next_mode
).to_constant ();
17414 src
= aarch64_move_pointer (src
, (n
- n_bits
) / BITS_PER_UNIT
);
17415 dst
= aarch64_move_pointer (dst
, (n
- n_bits
) / BITS_PER_UNIT
);
17423 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
17424 SImode stores. Handle the case when the constant has identical
17425 bottom and top halves. This is beneficial when the two stores can be
17426 merged into an STP and we avoid synthesising potentially expensive
17427 immediates twice. Return true if such a split is possible. */
17430 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
17432 rtx lo
= gen_lowpart (SImode
, src
);
17433 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
17435 bool size_p
= optimize_function_for_size_p (cfun
);
17437 if (!rtx_equal_p (lo
, hi
))
17440 unsigned int orig_cost
17441 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
17442 unsigned int lo_cost
17443 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
17445 /* We want to transform:
17447 MOVK x1, 0x140, lsl 16
17448 MOVK x1, 0xc0da, lsl 32
17449 MOVK x1, 0x140, lsl 48
17453 MOVK w1, 0x140, lsl 16
17455 So we want to perform this only when we save two instructions
17456 or more. When optimizing for size, however, accept any code size
17458 if (size_p
&& orig_cost
<= lo_cost
)
17462 && (orig_cost
<= lo_cost
+ 1))
17465 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
17466 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
17469 rtx tmp_reg
= gen_reg_rtx (SImode
);
17470 aarch64_expand_mov_immediate (tmp_reg
, lo
);
17471 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
17472 /* Don't emit an explicit store pair as this may not be always profitable.
17473 Let the sched-fusion logic decide whether to merge them. */
17474 emit_move_insn (mem_lo
, tmp_reg
);
17475 emit_move_insn (mem_hi
, tmp_reg
);
17480 /* Generate RTL for a conditional branch with rtx comparison CODE in
17481 mode CC_MODE. The destination of the unlikely conditional branch
17485 aarch64_gen_unlikely_cbranch (enum rtx_code code
, machine_mode cc_mode
,
17489 x
= gen_rtx_fmt_ee (code
, VOIDmode
,
17490 gen_rtx_REG (cc_mode
, CC_REGNUM
),
17493 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
17494 gen_rtx_LABEL_REF (VOIDmode
, label_ref
),
17496 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
17499 /* Generate DImode scratch registers for 128-bit (TImode) addition.
17501 OP1 represents the TImode destination operand 1
17502 OP2 represents the TImode destination operand 2
17503 LOW_DEST represents the low half (DImode) of TImode operand 0
17504 LOW_IN1 represents the low half (DImode) of TImode operand 1
17505 LOW_IN2 represents the low half (DImode) of TImode operand 2
17506 HIGH_DEST represents the high half (DImode) of TImode operand 0
17507 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17508 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17511 aarch64_addti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
17512 rtx
*low_in1
, rtx
*low_in2
,
17513 rtx
*high_dest
, rtx
*high_in1
,
17516 *low_dest
= gen_reg_rtx (DImode
);
17517 *low_in1
= gen_lowpart (DImode
, op1
);
17518 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
17519 subreg_lowpart_offset (DImode
, TImode
));
17520 *high_dest
= gen_reg_rtx (DImode
);
17521 *high_in1
= gen_highpart (DImode
, op1
);
17522 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
17523 subreg_highpart_offset (DImode
, TImode
));
17526 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
17528 This function differs from 'arch64_addti_scratch_regs' in that
17529 OP1 can be an immediate constant (zero). We must call
17530 subreg_highpart_offset with DImode and TImode arguments, otherwise
17531 VOIDmode will be used for the const_int which generates an internal
17532 error from subreg_size_highpart_offset which does not expect a size of zero.
17534 OP1 represents the TImode destination operand 1
17535 OP2 represents the TImode destination operand 2
17536 LOW_DEST represents the low half (DImode) of TImode operand 0
17537 LOW_IN1 represents the low half (DImode) of TImode operand 1
17538 LOW_IN2 represents the low half (DImode) of TImode operand 2
17539 HIGH_DEST represents the high half (DImode) of TImode operand 0
17540 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17541 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17545 aarch64_subvti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
17546 rtx
*low_in1
, rtx
*low_in2
,
17547 rtx
*high_dest
, rtx
*high_in1
,
17550 *low_dest
= gen_reg_rtx (DImode
);
17551 *low_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
17552 subreg_lowpart_offset (DImode
, TImode
));
17554 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
17555 subreg_lowpart_offset (DImode
, TImode
));
17556 *high_dest
= gen_reg_rtx (DImode
);
17558 *high_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
17559 subreg_highpart_offset (DImode
, TImode
));
17560 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
17561 subreg_highpart_offset (DImode
, TImode
));
17564 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
17566 OP0 represents the TImode destination operand 0
17567 LOW_DEST represents the low half (DImode) of TImode operand 0
17568 LOW_IN1 represents the low half (DImode) of TImode operand 1
17569 LOW_IN2 represents the low half (DImode) of TImode operand 2
17570 HIGH_DEST represents the high half (DImode) of TImode operand 0
17571 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17572 HIGH_IN2 represents the high half (DImode) of TImode operand 2
17573 UNSIGNED_P is true if the operation is being performed on unsigned
17576 aarch64_expand_subvti (rtx op0
, rtx low_dest
, rtx low_in1
,
17577 rtx low_in2
, rtx high_dest
, rtx high_in1
,
17578 rtx high_in2
, bool unsigned_p
)
17580 if (low_in2
== const0_rtx
)
17582 low_dest
= low_in1
;
17583 high_in2
= force_reg (DImode
, high_in2
);
17585 emit_insn (gen_subdi3_compare1 (high_dest
, high_in1
, high_in2
));
17587 emit_insn (gen_subvdi_insn (high_dest
, high_in1
, high_in2
));
17591 if (CONST_INT_P (low_in2
))
17593 high_in2
= force_reg (DImode
, high_in2
);
17594 emit_insn (gen_subdi3_compare1_imm (low_dest
, low_in1
, low_in2
,
17595 GEN_INT (-INTVAL (low_in2
))));
17598 emit_insn (gen_subdi3_compare1 (low_dest
, low_in1
, low_in2
));
17601 emit_insn (gen_usubdi3_carryinC (high_dest
, high_in1
, high_in2
));
17603 emit_insn (gen_subdi3_carryinV (high_dest
, high_in1
, high_in2
));
17606 emit_move_insn (gen_lowpart (DImode
, op0
), low_dest
);
17607 emit_move_insn (gen_highpart (DImode
, op0
), high_dest
);
17611 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
17613 static unsigned HOST_WIDE_INT
17614 aarch64_asan_shadow_offset (void)
17617 return (HOST_WIDE_INT_1
<< 29);
17619 return (HOST_WIDE_INT_1
<< 36);
17623 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
17624 int code
, tree treeop0
, tree treeop1
)
17626 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
17628 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
17630 struct expand_operand ops
[4];
17633 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
17635 op_mode
= GET_MODE (op0
);
17636 if (op_mode
== VOIDmode
)
17637 op_mode
= GET_MODE (op1
);
17645 icode
= CODE_FOR_cmpsi
;
17650 icode
= CODE_FOR_cmpdi
;
17655 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
17656 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
17661 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
17662 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
17670 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
17671 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
17677 *prep_seq
= get_insns ();
17680 create_fixed_operand (&ops
[0], op0
);
17681 create_fixed_operand (&ops
[1], op1
);
17684 if (!maybe_expand_insn (icode
, 2, ops
))
17689 *gen_seq
= get_insns ();
17692 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
17693 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
17697 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
17698 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
17700 rtx op0
, op1
, target
;
17701 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
17702 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
17704 struct expand_operand ops
[6];
17707 push_to_sequence (*prep_seq
);
17708 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
17710 op_mode
= GET_MODE (op0
);
17711 if (op_mode
== VOIDmode
)
17712 op_mode
= GET_MODE (op1
);
17720 icode
= CODE_FOR_ccmpsi
;
17725 icode
= CODE_FOR_ccmpdi
;
17730 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
17731 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
17736 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
17737 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
17745 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
17746 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
17752 *prep_seq
= get_insns ();
17755 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
17756 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
17758 if (bit_code
!= AND
)
17760 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
17761 GET_MODE (XEXP (prev
, 0))),
17762 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
17763 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
17766 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
17767 create_fixed_operand (&ops
[1], target
);
17768 create_fixed_operand (&ops
[2], op0
);
17769 create_fixed_operand (&ops
[3], op1
);
17770 create_fixed_operand (&ops
[4], prev
);
17771 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
17773 push_to_sequence (*gen_seq
);
17774 if (!maybe_expand_insn (icode
, 6, ops
))
17780 *gen_seq
= get_insns ();
17783 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
17786 #undef TARGET_GEN_CCMP_FIRST
17787 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17789 #undef TARGET_GEN_CCMP_NEXT
17790 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17792 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
17793 instruction fusion of some sort. */
17796 aarch64_macro_fusion_p (void)
17798 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
17802 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17803 should be kept together during scheduling. */
17806 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
17809 rtx prev_set
= single_set (prev
);
17810 rtx curr_set
= single_set (curr
);
17811 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
17812 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
17814 if (!aarch64_macro_fusion_p ())
17817 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
17819 /* We are trying to match:
17820 prev (mov) == (set (reg r0) (const_int imm16))
17821 curr (movk) == (set (zero_extract (reg r0)
17824 (const_int imm16_1)) */
17826 set_dest
= SET_DEST (curr_set
);
17828 if (GET_CODE (set_dest
) == ZERO_EXTRACT
17829 && CONST_INT_P (SET_SRC (curr_set
))
17830 && CONST_INT_P (SET_SRC (prev_set
))
17831 && CONST_INT_P (XEXP (set_dest
, 2))
17832 && INTVAL (XEXP (set_dest
, 2)) == 16
17833 && REG_P (XEXP (set_dest
, 0))
17834 && REG_P (SET_DEST (prev_set
))
17835 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
17841 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
17844 /* We're trying to match:
17845 prev (adrp) == (set (reg r1)
17846 (high (symbol_ref ("SYM"))))
17847 curr (add) == (set (reg r0)
17849 (symbol_ref ("SYM"))))
17850 Note that r0 need not necessarily be the same as r1, especially
17851 during pre-regalloc scheduling. */
17853 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
17854 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
17856 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
17857 && REG_P (XEXP (SET_SRC (curr_set
), 0))
17858 && REGNO (XEXP (SET_SRC (curr_set
), 0))
17859 == REGNO (SET_DEST (prev_set
))
17860 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
17861 XEXP (SET_SRC (curr_set
), 1)))
17866 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
17869 /* We're trying to match:
17870 prev (movk) == (set (zero_extract (reg r0)
17873 (const_int imm16_1))
17874 curr (movk) == (set (zero_extract (reg r0)
17877 (const_int imm16_2)) */
17879 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
17880 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
17881 && REG_P (XEXP (SET_DEST (prev_set
), 0))
17882 && REG_P (XEXP (SET_DEST (curr_set
), 0))
17883 && REGNO (XEXP (SET_DEST (prev_set
), 0))
17884 == REGNO (XEXP (SET_DEST (curr_set
), 0))
17885 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
17886 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
17887 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
17888 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
17889 && CONST_INT_P (SET_SRC (prev_set
))
17890 && CONST_INT_P (SET_SRC (curr_set
)))
17894 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
17896 /* We're trying to match:
17897 prev (adrp) == (set (reg r0)
17898 (high (symbol_ref ("SYM"))))
17899 curr (ldr) == (set (reg r1)
17900 (mem (lo_sum (reg r0)
17901 (symbol_ref ("SYM")))))
17903 curr (ldr) == (set (reg r1)
17906 (symbol_ref ("SYM")))))) */
17907 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
17908 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
17910 rtx curr_src
= SET_SRC (curr_set
);
17912 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
17913 curr_src
= XEXP (curr_src
, 0);
17915 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
17916 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
17917 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
17918 == REGNO (SET_DEST (prev_set
))
17919 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
17920 XEXP (SET_SRC (prev_set
), 0)))
17925 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC
)
17926 && aarch_crypto_can_dual_issue (prev
, curr
))
17929 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
17930 && any_condjump_p (curr
))
17932 unsigned int condreg1
, condreg2
;
17934 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
17935 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
17937 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
17939 && modified_in_p (cc_reg_1
, prev
))
17941 enum attr_type prev_type
= get_attr_type (prev
);
17943 /* FIXME: this misses some which is considered simple arthematic
17944 instructions for ThunderX. Simple shifts are missed here. */
17945 if (prev_type
== TYPE_ALUS_SREG
17946 || prev_type
== TYPE_ALUS_IMM
17947 || prev_type
== TYPE_LOGICS_REG
17948 || prev_type
== TYPE_LOGICS_IMM
)
17955 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
17956 && any_condjump_p (curr
))
17958 /* We're trying to match:
17959 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17960 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
17962 (label_ref ("SYM"))
17964 if (SET_DEST (curr_set
) == (pc_rtx
)
17965 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
17966 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
17967 && REG_P (SET_DEST (prev_set
))
17968 && REGNO (SET_DEST (prev_set
))
17969 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
17971 /* Fuse ALU operations followed by conditional branch instruction. */
17972 switch (get_attr_type (prev
))
17975 case TYPE_ALU_SREG
:
17978 case TYPE_ADCS_REG
:
17979 case TYPE_ADCS_IMM
:
17980 case TYPE_LOGIC_REG
:
17981 case TYPE_LOGIC_IMM
:
17985 case TYPE_SHIFT_REG
:
17986 case TYPE_SHIFT_IMM
:
18001 /* Return true iff the instruction fusion described by OP is enabled. */
18004 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op
)
18006 return (aarch64_tune_params
.fusible_ops
& op
) != 0;
18009 /* If MEM is in the form of [base+offset], extract the two parts
18010 of address and set to BASE and OFFSET, otherwise return false
18011 after clearing BASE and OFFSET. */
18014 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
18018 gcc_assert (MEM_P (mem
));
18020 addr
= XEXP (mem
, 0);
18025 *offset
= const0_rtx
;
18029 if (GET_CODE (addr
) == PLUS
18030 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
18032 *base
= XEXP (addr
, 0);
18033 *offset
= XEXP (addr
, 1);
18038 *offset
= NULL_RTX
;
18043 /* Types for scheduling fusion. */
18044 enum sched_fusion_type
18046 SCHED_FUSION_NONE
= 0,
18047 SCHED_FUSION_LD_SIGN_EXTEND
,
18048 SCHED_FUSION_LD_ZERO_EXTEND
,
18054 /* If INSN is a load or store of address in the form of [base+offset],
18055 extract the two parts and set to BASE and OFFSET. Return scheduling
18056 fusion type this INSN is. */
18058 static enum sched_fusion_type
18059 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
18062 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
18064 gcc_assert (INSN_P (insn
));
18065 x
= PATTERN (insn
);
18066 if (GET_CODE (x
) != SET
)
18067 return SCHED_FUSION_NONE
;
18070 dest
= SET_DEST (x
);
18072 machine_mode dest_mode
= GET_MODE (dest
);
18074 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
18075 return SCHED_FUSION_NONE
;
18077 if (GET_CODE (src
) == SIGN_EXTEND
)
18079 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
18080 src
= XEXP (src
, 0);
18081 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
18082 return SCHED_FUSION_NONE
;
18084 else if (GET_CODE (src
) == ZERO_EXTEND
)
18086 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
18087 src
= XEXP (src
, 0);
18088 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
18089 return SCHED_FUSION_NONE
;
18092 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
18093 extract_base_offset_in_addr (src
, base
, offset
);
18094 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
18096 fusion
= SCHED_FUSION_ST
;
18097 extract_base_offset_in_addr (dest
, base
, offset
);
18100 return SCHED_FUSION_NONE
;
18102 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
18103 fusion
= SCHED_FUSION_NONE
;
18108 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18110 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
18111 and PRI are only calculated for these instructions. For other instruction,
18112 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
18113 type instruction fusion can be added by returning different priorities.
18115 It's important that irrelevant instructions get the largest FUSION_PRI. */
18118 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
18119 int *fusion_pri
, int *pri
)
18123 enum sched_fusion_type fusion
;
18125 gcc_assert (INSN_P (insn
));
18128 fusion
= fusion_load_store (insn
, &base
, &offset
);
18129 if (fusion
== SCHED_FUSION_NONE
)
18136 /* Set FUSION_PRI according to fusion type and base register. */
18137 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
18139 /* Calculate PRI. */
18142 /* INSN with smaller offset goes first. */
18143 off_val
= (int)(INTVAL (offset
));
18145 tmp
-= (off_val
& 0xfffff);
18147 tmp
+= ((- off_val
) & 0xfffff);
18153 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18154 Adjust priority of sha1h instructions so they are scheduled before
18155 other SHA1 instructions. */
18158 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
18160 rtx x
= PATTERN (insn
);
18162 if (GET_CODE (x
) == SET
)
18166 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
18167 return priority
+ 10;
18173 /* Given OPERANDS of consecutive load/store, check if we can merge
18174 them into ldp/stp. LOAD is true if they are load instructions.
18175 MODE is the mode of memory operands. */
18178 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
18181 HOST_WIDE_INT offval_1
, offval_2
, msize
;
18182 enum reg_class rclass_1
, rclass_2
;
18183 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
18187 mem_1
= operands
[1];
18188 mem_2
= operands
[3];
18189 reg_1
= operands
[0];
18190 reg_2
= operands
[2];
18191 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
18192 if (REGNO (reg_1
) == REGNO (reg_2
))
18197 mem_1
= operands
[0];
18198 mem_2
= operands
[2];
18199 reg_1
= operands
[1];
18200 reg_2
= operands
[3];
18203 /* The mems cannot be volatile. */
18204 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
18207 /* If we have SImode and slow unaligned ldp,
18208 check the alignment to be at least 8 byte. */
18210 && (aarch64_tune_params
.extra_tuning_flags
18211 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
18213 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
18216 /* Check if the addresses are in the form of [base+offset]. */
18217 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
18218 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
18220 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
18221 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
18224 /* Check if the bases are same. */
18225 if (!rtx_equal_p (base_1
, base_2
))
18228 /* The operands must be of the same size. */
18229 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
18230 GET_MODE_SIZE (GET_MODE (mem_2
))));
18232 offval_1
= INTVAL (offset_1
);
18233 offval_2
= INTVAL (offset_2
);
18234 /* We should only be trying this for fixed-sized modes. There is no
18235 SVE LDP/STP instruction. */
18236 msize
= GET_MODE_SIZE (mode
).to_constant ();
18237 /* Check if the offsets are consecutive. */
18238 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
18241 /* Check if the addresses are clobbered by load. */
18244 if (reg_mentioned_p (reg_1
, mem_1
))
18247 /* In increasing order, the last load can clobber the address. */
18248 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
18252 /* One of the memory accesses must be a mempair operand.
18253 If it is not the first one, they need to be swapped by the
18255 if (!aarch64_mem_pair_operand (mem_1
, GET_MODE (mem_1
))
18256 && !aarch64_mem_pair_operand (mem_2
, GET_MODE (mem_2
)))
18259 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
18260 rclass_1
= FP_REGS
;
18262 rclass_1
= GENERAL_REGS
;
18264 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
18265 rclass_2
= FP_REGS
;
18267 rclass_2
= GENERAL_REGS
;
18269 /* Check if the registers are of same class. */
18270 if (rclass_1
!= rclass_2
)
18276 /* Given OPERANDS of consecutive load/store that can be merged,
18277 swap them if they are not in ascending order. */
18279 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
18281 rtx mem_1
, mem_2
, base_1
, base_2
, offset_1
, offset_2
;
18282 HOST_WIDE_INT offval_1
, offval_2
;
18286 mem_1
= operands
[1];
18287 mem_2
= operands
[3];
18291 mem_1
= operands
[0];
18292 mem_2
= operands
[2];
18295 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
18296 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
18298 offval_1
= INTVAL (offset_1
);
18299 offval_2
= INTVAL (offset_2
);
18301 if (offval_1
> offval_2
)
18303 /* Irrespective of whether this is a load or a store,
18304 we do the same swap. */
18305 std::swap (operands
[0], operands
[2]);
18306 std::swap (operands
[1], operands
[3]);
18310 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
18311 comparison between the two. */
18313 aarch64_host_wide_int_compare (const void *x
, const void *y
)
18315 return wi::cmps (* ((const HOST_WIDE_INT
*) x
),
18316 * ((const HOST_WIDE_INT
*) y
));
18319 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
18320 other pointing to a REG rtx containing an offset, compare the offsets
18325 1 iff offset (X) > offset (Y)
18326 0 iff offset (X) == offset (Y)
18327 -1 iff offset (X) < offset (Y) */
18329 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
18331 const rtx
* operands_1
= (const rtx
*) x
;
18332 const rtx
* operands_2
= (const rtx
*) y
;
18333 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
18335 if (MEM_P (operands_1
[0]))
18336 mem_1
= operands_1
[0];
18338 mem_1
= operands_1
[1];
18340 if (MEM_P (operands_2
[0]))
18341 mem_2
= operands_2
[0];
18343 mem_2
= operands_2
[1];
18345 /* Extract the offsets. */
18346 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
18347 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
18349 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
18351 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
18354 /* Given OPERANDS of consecutive load/store, check if we can merge
18355 them into ldp/stp by adjusting the offset. LOAD is true if they
18356 are load instructions. MODE is the mode of memory operands.
18358 Given below consecutive stores:
18360 str w1, [xb, 0x100]
18361 str w1, [xb, 0x104]
18362 str w1, [xb, 0x108]
18363 str w1, [xb, 0x10c]
18365 Though the offsets are out of the range supported by stp, we can
18366 still pair them after adjusting the offset, like:
18368 add scratch, xb, 0x100
18369 stp w1, w1, [scratch]
18370 stp w1, w1, [scratch, 0x8]
18372 The peephole patterns detecting this opportunity should guarantee
18373 the scratch register is avaliable. */
18376 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
18379 const int num_insns
= 4;
18380 enum reg_class rclass
;
18381 HOST_WIDE_INT offvals
[num_insns
], msize
;
18382 rtx mem
[num_insns
], reg
[num_insns
], base
[num_insns
], offset
[num_insns
];
18386 for (int i
= 0; i
< num_insns
; i
++)
18388 reg
[i
] = operands
[2 * i
];
18389 mem
[i
] = operands
[2 * i
+ 1];
18391 gcc_assert (REG_P (reg
[i
]));
18394 /* Do not attempt to merge the loads if the loads clobber each other. */
18395 for (int i
= 0; i
< 8; i
+= 2)
18396 for (int j
= i
+ 2; j
< 8; j
+= 2)
18397 if (reg_overlap_mentioned_p (operands
[i
], operands
[j
]))
18401 for (int i
= 0; i
< num_insns
; i
++)
18403 mem
[i
] = operands
[2 * i
];
18404 reg
[i
] = operands
[2 * i
+ 1];
18407 /* Skip if memory operand is by itself valid for ldp/stp. */
18408 if (!MEM_P (mem
[0]) || aarch64_mem_pair_operand (mem
[0], mode
))
18411 for (int i
= 0; i
< num_insns
; i
++)
18413 /* The mems cannot be volatile. */
18414 if (MEM_VOLATILE_P (mem
[i
]))
18417 /* Check if the addresses are in the form of [base+offset]. */
18418 extract_base_offset_in_addr (mem
[i
], base
+ i
, offset
+ i
);
18419 if (base
[i
] == NULL_RTX
|| offset
[i
] == NULL_RTX
)
18423 /* Check if the registers are of same class. */
18424 rclass
= REG_P (reg
[0]) && FP_REGNUM_P (REGNO (reg
[0]))
18425 ? FP_REGS
: GENERAL_REGS
;
18427 for (int i
= 1; i
< num_insns
; i
++)
18428 if (REG_P (reg
[i
]) && FP_REGNUM_P (REGNO (reg
[i
])))
18430 if (rclass
!= FP_REGS
)
18435 if (rclass
!= GENERAL_REGS
)
18439 /* Only the last register in the order in which they occur
18440 may be clobbered by the load. */
18441 if (rclass
== GENERAL_REGS
&& load
)
18442 for (int i
= 0; i
< num_insns
- 1; i
++)
18443 if (reg_mentioned_p (reg
[i
], mem
[i
]))
18446 /* Check if the bases are same. */
18447 for (int i
= 0; i
< num_insns
- 1; i
++)
18448 if (!rtx_equal_p (base
[i
], base
[i
+ 1]))
18451 for (int i
= 0; i
< num_insns
; i
++)
18452 offvals
[i
] = INTVAL (offset
[i
]);
18454 msize
= GET_MODE_SIZE (mode
);
18456 /* Check if the offsets can be put in the right order to do a ldp/stp. */
18457 qsort (offvals
, num_insns
, sizeof (HOST_WIDE_INT
),
18458 aarch64_host_wide_int_compare
);
18460 if (!(offvals
[1] == offvals
[0] + msize
18461 && offvals
[3] == offvals
[2] + msize
))
18464 /* Check that offsets are within range of each other. The ldp/stp
18465 instructions have 7 bit immediate offsets, so use 0x80. */
18466 if (offvals
[2] - offvals
[0] >= msize
* 0x80)
18469 /* The offsets must be aligned with respect to each other. */
18470 if (offvals
[0] % msize
!= offvals
[2] % msize
)
18473 /* If we have SImode and slow unaligned ldp,
18474 check the alignment to be at least 8 byte. */
18476 && (aarch64_tune_params
.extra_tuning_flags
18477 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
18479 && MEM_ALIGN (mem
[0]) < 8 * BITS_PER_UNIT
)
18485 /* Given OPERANDS of consecutive load/store, this function pairs them
18486 into LDP/STP after adjusting the offset. It depends on the fact
18487 that the operands can be sorted so the offsets are correct for STP.
18488 MODE is the mode of memory operands. CODE is the rtl operator
18489 which should be applied to all memory operands, it's SIGN_EXTEND,
18490 ZERO_EXTEND or UNKNOWN. */
18493 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
18494 scalar_mode mode
, RTX_CODE code
)
18496 rtx base
, offset_1
, offset_3
, t1
, t2
;
18497 rtx mem_1
, mem_2
, mem_3
, mem_4
;
18498 rtx temp_operands
[8];
18499 HOST_WIDE_INT off_val_1
, off_val_3
, base_off
, new_off_1
, new_off_3
,
18500 stp_off_upper_limit
, stp_off_lower_limit
, msize
;
18502 /* We make changes on a copy as we may still bail out. */
18503 for (int i
= 0; i
< 8; i
++)
18504 temp_operands
[i
] = operands
[i
];
18506 /* Sort the operands. */
18507 qsort (temp_operands
, 4, 2 * sizeof (rtx
*), aarch64_ldrstr_offset_compare
);
18511 mem_1
= temp_operands
[1];
18512 mem_2
= temp_operands
[3];
18513 mem_3
= temp_operands
[5];
18514 mem_4
= temp_operands
[7];
18518 mem_1
= temp_operands
[0];
18519 mem_2
= temp_operands
[2];
18520 mem_3
= temp_operands
[4];
18521 mem_4
= temp_operands
[6];
18522 gcc_assert (code
== UNKNOWN
);
18525 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
18526 extract_base_offset_in_addr (mem_3
, &base
, &offset_3
);
18527 gcc_assert (base
!= NULL_RTX
&& offset_1
!= NULL_RTX
18528 && offset_3
!= NULL_RTX
);
18530 /* Adjust offset so it can fit in LDP/STP instruction. */
18531 msize
= GET_MODE_SIZE (mode
);
18532 stp_off_upper_limit
= msize
* (0x40 - 1);
18533 stp_off_lower_limit
= - msize
* 0x40;
18535 off_val_1
= INTVAL (offset_1
);
18536 off_val_3
= INTVAL (offset_3
);
18538 /* The base offset is optimally half way between the two STP/LDP offsets. */
18540 base_off
= (off_val_1
+ off_val_3
) / 2;
18542 /* However, due to issues with negative LDP/STP offset generation for
18543 larger modes, for DF, DI and vector modes. we must not use negative
18544 addresses smaller than 9 signed unadjusted bits can store. This
18545 provides the most range in this case. */
18546 base_off
= off_val_1
;
18548 /* Adjust the base so that it is aligned with the addresses but still
18550 if (base_off
% msize
!= off_val_1
% msize
)
18551 /* Fix the offset, bearing in mind we want to make it bigger not
18553 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
18554 else if (msize
<= 4)
18555 /* The negative range of LDP/STP is one larger than the positive range. */
18558 /* Check if base offset is too big or too small. We can attempt to resolve
18559 this issue by setting it to the maximum value and seeing if the offsets
18561 if (base_off
>= 0x1000)
18563 base_off
= 0x1000 - 1;
18564 /* We must still make sure that the base offset is aligned with respect
18565 to the address. But it may may not be made any bigger. */
18566 base_off
-= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
18569 /* Likewise for the case where the base is too small. */
18570 if (base_off
<= -0x1000)
18572 base_off
= -0x1000 + 1;
18573 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
18576 /* Offset of the first STP/LDP. */
18577 new_off_1
= off_val_1
- base_off
;
18579 /* Offset of the second STP/LDP. */
18580 new_off_3
= off_val_3
- base_off
;
18582 /* The offsets must be within the range of the LDP/STP instructions. */
18583 if (new_off_1
> stp_off_upper_limit
|| new_off_1
< stp_off_lower_limit
18584 || new_off_3
> stp_off_upper_limit
|| new_off_3
< stp_off_lower_limit
)
18587 replace_equiv_address_nv (mem_1
, plus_constant (Pmode
, operands
[8],
18589 replace_equiv_address_nv (mem_2
, plus_constant (Pmode
, operands
[8],
18590 new_off_1
+ msize
), true);
18591 replace_equiv_address_nv (mem_3
, plus_constant (Pmode
, operands
[8],
18593 replace_equiv_address_nv (mem_4
, plus_constant (Pmode
, operands
[8],
18594 new_off_3
+ msize
), true);
18596 if (!aarch64_mem_pair_operand (mem_1
, mode
)
18597 || !aarch64_mem_pair_operand (mem_3
, mode
))
18600 if (code
== ZERO_EXTEND
)
18602 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
18603 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
18604 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
18605 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
18607 else if (code
== SIGN_EXTEND
)
18609 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
18610 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
18611 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
18612 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
18617 operands
[0] = temp_operands
[0];
18618 operands
[1] = mem_1
;
18619 operands
[2] = temp_operands
[2];
18620 operands
[3] = mem_2
;
18621 operands
[4] = temp_operands
[4];
18622 operands
[5] = mem_3
;
18623 operands
[6] = temp_operands
[6];
18624 operands
[7] = mem_4
;
18628 operands
[0] = mem_1
;
18629 operands
[1] = temp_operands
[1];
18630 operands
[2] = mem_2
;
18631 operands
[3] = temp_operands
[3];
18632 operands
[4] = mem_3
;
18633 operands
[5] = temp_operands
[5];
18634 operands
[6] = mem_4
;
18635 operands
[7] = temp_operands
[7];
18638 /* Emit adjusting instruction. */
18639 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, base_off
)));
18640 /* Emit ldp/stp instructions. */
18641 t1
= gen_rtx_SET (operands
[0], operands
[1]);
18642 t2
= gen_rtx_SET (operands
[2], operands
[3]);
18643 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
18644 t1
= gen_rtx_SET (operands
[4], operands
[5]);
18645 t2
= gen_rtx_SET (operands
[6], operands
[7]);
18646 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
   it isn't worth branching around empty masked ops (including masked
   stores).  */

static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}
18660 /* Return 1 if pseudo register should be created and used to hold
18661 GOT address for PIC code. */
18664 aarch64_use_pseudo_pic_reg (void)
18666 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
;
18669 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18672 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
18674 switch (XINT (x
, 1))
18676 case UNSPEC_GOTSMALLPIC
:
18677 case UNSPEC_GOTSMALLPIC28K
:
18678 case UNSPEC_GOTTINYPIC
:
18684 return default_unspec_may_trap_p (x
, flags
);
18688 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18689 return the log2 of that value. Otherwise return -1. */
18692 aarch64_fpconst_pow_of_2 (rtx x
)
18694 const REAL_VALUE_TYPE
*r
;
18696 if (!CONST_DOUBLE_P (x
))
18699 r
= CONST_DOUBLE_REAL_VALUE (x
);
18701 if (REAL_VALUE_NEGATIVE (*r
)
18702 || REAL_VALUE_ISNAN (*r
)
18703 || REAL_VALUE_ISINF (*r
)
18704 || !real_isinteger (r
, DFmode
))
18707 return exact_log2 (real_to_integer (r
));
18710 /* If X is a vector of equal CONST_DOUBLE values and that value is
18711 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18714 aarch64_vec_fpconst_pow_of_2 (rtx x
)
18717 if (GET_CODE (x
) != CONST_VECTOR
18718 || !CONST_VECTOR_NUNITS (x
).is_constant (&nelts
))
18721 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
18724 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
18728 for (int i
= 1; i
< nelts
; i
++)
18729 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
18735 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18738 __fp16 always promotes through this hook.
18739 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18740 through the generic excess precision logic rather than here. */
18743 aarch64_promoted_type (const_tree t
)
18745 if (SCALAR_FLOAT_TYPE_P (t
)
18746 && TYPE_MAIN_VARIANT (t
) == aarch64_fp16_type_node
)
18747 return float_type_node
;
18752 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18755 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
18756 optimization_type opt_type
)
18761 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
18768 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18770 static unsigned int
18771 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
18774 /* Polynomial invariant 1 == (VG / 2) - 1. */
18775 gcc_assert (i
== 1);
18778 return AARCH64_DWARF_VG
;
18781 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
18782 if MODE is HFmode, and punt to the generic implementation otherwise. */
18785 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
18787 return (mode
== HFmode
18789 : default_libgcc_floating_mode_supported_p (mode
));
18792 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18793 if MODE is HFmode, and punt to the generic implementation otherwise. */
18796 aarch64_scalar_mode_supported_p (scalar_mode mode
)
18798 return (mode
== HFmode
18800 : default_scalar_mode_supported_p (mode
));
18803 /* Set the value of FLT_EVAL_METHOD.
18804 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18806 0: evaluate all operations and constants, whose semantic type has at
18807 most the range and precision of type float, to the range and
18808 precision of float; evaluate all other operations and constants to
18809 the range and precision of the semantic type;
18811 N, where _FloatN is a supported interchange floating type
18812 evaluate all operations and constants, whose semantic type has at
18813 most the range and precision of _FloatN type, to the range and
18814 precision of the _FloatN type; evaluate all other operations and
18815 constants to the range and precision of the semantic type;
18817 If we have the ARMv8.2-A extensions then we support _Float16 in native
18818 precision, so we should set this to 16. Otherwise, we support the type,
18819 but want to evaluate expressions in float precision, so set this to
18822 static enum flt_eval_method
18823 aarch64_excess_precision (enum excess_precision_type type
)
18827 case EXCESS_PRECISION_TYPE_FAST
:
18828 case EXCESS_PRECISION_TYPE_STANDARD
:
18829 /* We can calculate either in 16-bit range and precision or
18830 32-bit range and precision. Make that decision based on whether
18831 we have native support for the ARMv8.2-A 16-bit floating-point
18832 instructions or not. */
18833 return (TARGET_FP_F16INST
18834 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18835 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
18836 case EXCESS_PRECISION_TYPE_IMPLICIT
:
18837 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
18839 gcc_unreachable ();
18841 return FLT_EVAL_METHOD_UNPREDICTABLE
;
18844 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18845 scheduled for speculative execution. Reject the long-running division
18846 and square-root instructions. */
18849 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
18851 switch (get_attr_type (insn
))
18859 case TYPE_NEON_FP_SQRT_S
:
18860 case TYPE_NEON_FP_SQRT_D
:
18861 case TYPE_NEON_FP_SQRT_S_Q
:
18862 case TYPE_NEON_FP_SQRT_D_Q
:
18863 case TYPE_NEON_FP_DIV_S
:
18864 case TYPE_NEON_FP_DIV_D
:
18865 case TYPE_NEON_FP_DIV_S_Q
:
18866 case TYPE_NEON_FP_DIV_D_Q
:
18873 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18876 aarch64_compute_pressure_classes (reg_class
*classes
)
18879 classes
[i
++] = GENERAL_REGS
;
18880 classes
[i
++] = FP_REGS
;
18881 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18882 registers need to go in PR_LO_REGS at some point during their
18883 lifetime. Splitting it into two halves has the effect of making
18884 all predicates count against PR_LO_REGS, so that we try whenever
18885 possible to restrict the number of live predicates to 8. This
18886 greatly reduces the amount of spilling in certain loops. */
18887 classes
[i
++] = PR_LO_REGS
;
18888 classes
[i
++] = PR_HI_REGS
;
18892 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18895 aarch64_can_change_mode_class (machine_mode from
,
18896 machine_mode to
, reg_class_t
)
18898 if (BYTES_BIG_ENDIAN
)
18900 bool from_sve_p
= aarch64_sve_data_mode_p (from
);
18901 bool to_sve_p
= aarch64_sve_data_mode_p (to
);
18903 /* Don't allow changes between SVE data modes and non-SVE modes.
18904 See the comment at the head of aarch64-sve.md for details. */
18905 if (from_sve_p
!= to_sve_p
)
18908 /* Don't allow changes in element size: lane 0 of the new vector
18909 would not then be lane 0 of the old vector. See the comment
18910 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18913 In the worst case, this forces a register to be spilled in
18914 one mode and reloaded in the other, which handles the
18915 endianness correctly. */
18916 if (from_sve_p
&& GET_MODE_UNIT_SIZE (from
) != GET_MODE_UNIT_SIZE (to
))
18922 /* Implement TARGET_EARLY_REMAT_MODES. */
18925 aarch64_select_early_remat_modes (sbitmap modes
)
18927 /* SVE values are not normally live across a call, so it should be
18928 worth doing early rematerialization even in VL-specific mode. */
18929 for (int i
= 0; i
< NUM_MACHINE_MODES
; ++i
)
18931 machine_mode mode
= (machine_mode
) i
;
18932 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
18933 if (vec_flags
& VEC_ANY_SVE
)
18934 bitmap_set_bit (modes
, i
);
18938 /* Override the default target speculation_safe_value. */
18940 aarch64_speculation_safe_value (machine_mode mode
,
18941 rtx result
, rtx val
, rtx failval
)
18943 /* Maybe we should warn if falling back to hard barriers. They are
18944 likely to be noticably more expensive than the alternative below. */
18945 if (!aarch64_track_speculation
)
18946 return default_speculation_safe_value (mode
, result
, val
, failval
);
18949 val
= copy_to_mode_reg (mode
, val
);
18951 if (!aarch64_reg_or_zero (failval
, mode
))
18952 failval
= copy_to_mode_reg (mode
, failval
);
18954 emit_insn (gen_despeculate_copy (mode
, result
, val
, failval
));
18958 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18959 Look into the tuning structure for an estimate.
18960 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18961 Advanced SIMD 128 bits. */
18963 static HOST_WIDE_INT
18964 aarch64_estimated_poly_value (poly_int64 val
)
18966 enum aarch64_sve_vector_bits_enum width_source
18967 = aarch64_tune_params
.sve_width
;
18969 /* If we still don't have an estimate, use the default. */
18970 if (width_source
== SVE_SCALABLE
)
18971 return default_estimated_poly_value (val
);
18973 HOST_WIDE_INT over_128
= width_source
- 128;
18974 return val
.coeffs
[0] + val
.coeffs
[1] * over_128
/ 128;
18978 /* Return true for types that could be supported as SIMD return or
18982 supported_simd_type (tree t
)
18984 if (SCALAR_FLOAT_TYPE_P (t
) || INTEGRAL_TYPE_P (t
) || POINTER_TYPE_P (t
))
18986 HOST_WIDE_INT s
= tree_to_shwi (TYPE_SIZE_UNIT (t
));
18987 return s
== 1 || s
== 2 || s
== 4 || s
== 8;
18992 /* Return true for types that currently are supported as SIMD return
18993 or argument types. */
18996 currently_supported_simd_type (tree t
, tree b
)
18998 if (COMPLEX_FLOAT_TYPE_P (t
))
19001 if (TYPE_SIZE (t
) != TYPE_SIZE (b
))
19004 return supported_simd_type (t
);
19007 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
19010 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node
*node
,
19011 struct cgraph_simd_clone
*clonei
,
19012 tree base_type
, int num
)
19014 tree t
, ret_type
, arg_type
;
19015 unsigned int elt_bits
, vec_bits
, count
;
19020 if (clonei
->simdlen
19021 && (clonei
->simdlen
< 2
19022 || clonei
->simdlen
> 1024
19023 || (clonei
->simdlen
& (clonei
->simdlen
- 1)) != 0))
19025 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19026 "unsupported simdlen %d", clonei
->simdlen
);
19030 ret_type
= TREE_TYPE (TREE_TYPE (node
->decl
));
19031 if (TREE_CODE (ret_type
) != VOID_TYPE
19032 && !currently_supported_simd_type (ret_type
, base_type
))
19034 if (TYPE_SIZE (ret_type
) != TYPE_SIZE (base_type
))
19035 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19036 "GCC does not currently support mixed size types "
19037 "for %<simd%> functions");
19038 else if (supported_simd_type (ret_type
))
19039 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19040 "GCC does not currently support return type %qT "
19041 "for %<simd%> functions", ret_type
);
19043 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19044 "unsupported return type %qT for %<simd%> functions",
19049 for (t
= DECL_ARGUMENTS (node
->decl
); t
; t
= DECL_CHAIN (t
))
19051 arg_type
= TREE_TYPE (t
);
19053 if (!currently_supported_simd_type (arg_type
, base_type
))
19055 if (TYPE_SIZE (arg_type
) != TYPE_SIZE (base_type
))
19056 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19057 "GCC does not currently support mixed size types "
19058 "for %<simd%> functions");
19060 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19061 "GCC does not currently support argument type %qT "
19062 "for %<simd%> functions", arg_type
);
19067 clonei
->vecsize_mangle
= 'n';
19068 clonei
->mask_mode
= VOIDmode
;
19069 elt_bits
= GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type
));
19070 if (clonei
->simdlen
== 0)
19073 vec_bits
= (num
== 0 ? 64 : 128);
19074 clonei
->simdlen
= vec_bits
/ elt_bits
;
19079 vec_bits
= clonei
->simdlen
* elt_bits
;
19080 if (vec_bits
!= 64 && vec_bits
!= 128)
19082 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19083 "GCC does not currently support simdlen %d for type %qT",
19084 clonei
->simdlen
, base_type
);
19088 clonei
->vecsize_int
= vec_bits
;
19089 clonei
->vecsize_float
= vec_bits
;
19093 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19096 aarch64_simd_clone_adjust (struct cgraph_node
*node
)
19098 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19099 use the correct ABI. */
19101 tree t
= TREE_TYPE (node
->decl
);
19102 TYPE_ATTRIBUTES (t
) = make_attribute ("aarch64_vector_pcs", "default",
19103 TYPE_ATTRIBUTES (t
));
19106 /* Implement TARGET_SIMD_CLONE_USABLE. */
19109 aarch64_simd_clone_usable (struct cgraph_node
*node
)
19111 switch (node
->simdclone
->vecsize_mangle
)
19118 gcc_unreachable ();
19122 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
19125 aarch64_comp_type_attributes (const_tree type1
, const_tree type2
)
19127 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1
))
19128 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2
)))
19133 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
19135 static const char *
19136 aarch64_get_multilib_abi_name (void)
19138 if (TARGET_BIG_END
)
19139 return TARGET_ILP32
? "aarch64_be_ilp32" : "aarch64_be";
19140 return TARGET_ILP32
? "aarch64_ilp32" : "aarch64";
19143 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
19144 global variable based guard use the default else
19145 return a null tree. */
19147 aarch64_stack_protect_guard (void)
19149 if (aarch64_stack_protector_guard
== SSP_GLOBAL
)
19150 return default_stack_protect_guard ();
19155 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
19156 section at the end if needed. */
19157 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
19158 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
19159 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
19161 aarch64_file_end_indicate_exec_stack ()
19163 file_end_indicate_exec_stack ();
19165 unsigned feature_1_and
= 0;
19166 if (aarch64_bti_enabled ())
19167 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_BTI
;
19169 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
)
19170 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_PAC
;
19174 /* Generate .note.gnu.property section. */
19175 switch_to_section (get_section (".note.gnu.property",
19176 SECTION_NOTYPE
, NULL
));
19178 /* PT_NOTE header: namesz, descsz, type.
19179 namesz = 4 ("GNU\0")
19180 descsz = 16 (Size of the program property array)
19181 [(12 + padding) * Number of array elements]
19182 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
19183 assemble_align (POINTER_SIZE
);
19184 assemble_integer (GEN_INT (4), 4, 32, 1);
19185 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES
)), 4, 32, 1);
19186 assemble_integer (GEN_INT (5), 4, 32, 1);
19188 /* PT_NOTE name. */
19189 assemble_string ("GNU", 4);
19191 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
19192 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
19194 data = feature_1_and. */
19195 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND
), 4, 32, 1);
19196 assemble_integer (GEN_INT (4), 4, 32, 1);
19197 assemble_integer (GEN_INT (feature_1_and
), 4, 32, 1);
19199 /* Pad the size of the note to the required alignment. */
19200 assemble_align (POINTER_SIZE
);
19203 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
19204 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
19205 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */
19253 #undef TARGET_STACK_PROTECT_GUARD
19254 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
19256 #undef TARGET_ADDRESS_COST
19257 #define TARGET_ADDRESS_COST aarch64_address_cost
19259 /* This hook will determines whether unnamed bitfields affect the alignment
19260 of the containing structure. The hook returns true if the structure
19261 should inherit the alignment requirements of an unnamed bitfield's
19263 #undef TARGET_ALIGN_ANON_BITFIELD
19264 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
19266 #undef TARGET_ASM_ALIGNED_DI_OP
19267 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
19269 #undef TARGET_ASM_ALIGNED_HI_OP
19270 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
19272 #undef TARGET_ASM_ALIGNED_SI_OP
19273 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
19275 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
19276 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
19277 hook_bool_const_tree_hwi_hwi_const_tree_true
19279 #undef TARGET_ASM_FILE_START
19280 #define TARGET_ASM_FILE_START aarch64_start_file
19282 #undef TARGET_ASM_OUTPUT_MI_THUNK
19283 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
19285 #undef TARGET_ASM_SELECT_RTX_SECTION
19286 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
19288 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
19289 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
19291 #undef TARGET_BUILD_BUILTIN_VA_LIST
19292 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
19294 #undef TARGET_CALLEE_COPIES
19295 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
19297 #undef TARGET_CAN_ELIMINATE
19298 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
19300 #undef TARGET_CAN_INLINE_P
19301 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
19303 #undef TARGET_CANNOT_FORCE_CONST_MEM
19304 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
19306 #undef TARGET_CASE_VALUES_THRESHOLD
19307 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
19309 #undef TARGET_CONDITIONAL_REGISTER_USAGE
19310 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
19312 /* Only the least significant bit is used for initialization guard
19314 #undef TARGET_CXX_GUARD_MASK_BIT
19315 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
19317 #undef TARGET_C_MODE_FOR_SUFFIX
19318 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
19320 #ifdef TARGET_BIG_ENDIAN_DEFAULT
19321 #undef TARGET_DEFAULT_TARGET_FLAGS
19322 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
19325 #undef TARGET_CLASS_MAX_NREGS
19326 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
19328 #undef TARGET_BUILTIN_DECL
19329 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
19331 #undef TARGET_BUILTIN_RECIPROCAL
19332 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
19334 #undef TARGET_C_EXCESS_PRECISION
19335 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
19337 #undef TARGET_EXPAND_BUILTIN
19338 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
19340 #undef TARGET_EXPAND_BUILTIN_VA_START
19341 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
19343 #undef TARGET_FOLD_BUILTIN
19344 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
19346 #undef TARGET_FUNCTION_ARG
19347 #define TARGET_FUNCTION_ARG aarch64_function_arg
19349 #undef TARGET_FUNCTION_ARG_ADVANCE
19350 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
19352 #undef TARGET_FUNCTION_ARG_BOUNDARY
19353 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
19355 #undef TARGET_FUNCTION_ARG_PADDING
19356 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
19358 #undef TARGET_GET_RAW_RESULT_MODE
19359 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
19360 #undef TARGET_GET_RAW_ARG_MODE
19361 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
19363 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
19364 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
19366 #undef TARGET_FUNCTION_VALUE
19367 #define TARGET_FUNCTION_VALUE aarch64_function_value
19369 #undef TARGET_FUNCTION_VALUE_REGNO_P
19370 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
19372 #undef TARGET_GIMPLE_FOLD_BUILTIN
19373 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
19375 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
19376 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
19378 #undef TARGET_INIT_BUILTINS
19379 #define TARGET_INIT_BUILTINS aarch64_init_builtins
19381 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
19382 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
19383 aarch64_ira_change_pseudo_allocno_class
19385 #undef TARGET_LEGITIMATE_ADDRESS_P
19386 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
19388 #undef TARGET_LEGITIMATE_CONSTANT_P
19389 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
19391 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
19392 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
19393 aarch64_legitimize_address_displacement
19395 #undef TARGET_LIBGCC_CMP_RETURN_MODE
19396 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
19398 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
19399 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
19400 aarch64_libgcc_floating_mode_supported_p
19402 #undef TARGET_MANGLE_TYPE
19403 #define TARGET_MANGLE_TYPE aarch64_mangle_type
19405 #undef TARGET_MEMORY_MOVE_COST
19406 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
19408 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
19409 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
19411 #undef TARGET_MUST_PASS_IN_STACK
19412 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
19414 /* This target hook should return true if accesses to volatile bitfields
19415 should use the narrowest mode possible. It should return false if these
19416 accesses should use the bitfield container type. */
19417 #undef TARGET_NARROW_VOLATILE_BITFIELD
19418 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
19420 #undef TARGET_OPTION_OVERRIDE
19421 #define TARGET_OPTION_OVERRIDE aarch64_override_options
19423 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
19424 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
19425 aarch64_override_options_after_change
19427 #undef TARGET_OPTION_SAVE
19428 #define TARGET_OPTION_SAVE aarch64_option_save
19430 #undef TARGET_OPTION_RESTORE
19431 #define TARGET_OPTION_RESTORE aarch64_option_restore
19433 #undef TARGET_OPTION_PRINT
19434 #define TARGET_OPTION_PRINT aarch64_option_print
19436 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
19437 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
19439 #undef TARGET_SET_CURRENT_FUNCTION
19440 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
19442 #undef TARGET_PASS_BY_REFERENCE
19443 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
19445 #undef TARGET_PREFERRED_RELOAD_CLASS
19446 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
19448 #undef TARGET_SCHED_REASSOCIATION_WIDTH
19449 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
19451 #undef TARGET_PROMOTED_TYPE
19452 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
19454 #undef TARGET_SECONDARY_RELOAD
19455 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
19457 #undef TARGET_SHIFT_TRUNCATION_MASK
19458 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
19460 #undef TARGET_SETUP_INCOMING_VARARGS
19461 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
19463 #undef TARGET_STRUCT_VALUE_RTX
19464 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
19466 #undef TARGET_REGISTER_MOVE_COST
19467 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
19469 #undef TARGET_RETURN_IN_MEMORY
19470 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
19472 #undef TARGET_RETURN_IN_MSB
19473 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
19475 #undef TARGET_RTX_COSTS
19476 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
19478 #undef TARGET_SCALAR_MODE_SUPPORTED_P
19479 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
19481 #undef TARGET_SCHED_ISSUE_RATE
19482 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
19484 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
19485 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
19486 aarch64_sched_first_cycle_multipass_dfa_lookahead
19488 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
19489 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
19490 aarch64_first_cycle_multipass_dfa_lookahead_guard
19492 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
19493 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
19494 aarch64_get_separate_components
19496 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
19497 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
19498 aarch64_components_for_bb
19500 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
19501 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
19502 aarch64_disqualify_components
19504 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
19505 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
19506 aarch64_emit_prologue_components
19508 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
19509 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
19510 aarch64_emit_epilogue_components
19512 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
19513 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
19514 aarch64_set_handled_components
19516 #undef TARGET_TRAMPOLINE_INIT
19517 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
19519 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
19520 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
19522 #undef TARGET_VECTOR_MODE_SUPPORTED_P
19523 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
19525 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
19526 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
19527 aarch64_builtin_support_vector_misalignment
19529 #undef TARGET_ARRAY_MODE
19530 #define TARGET_ARRAY_MODE aarch64_array_mode
19532 #undef TARGET_ARRAY_MODE_SUPPORTED_P
19533 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
19535 #undef TARGET_VECTORIZE_ADD_STMT_COST
19536 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
19538 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
19539 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
19540 aarch64_builtin_vectorization_cost
19542 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
19543 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
19545 #undef TARGET_VECTORIZE_BUILTINS
19546 #define TARGET_VECTORIZE_BUILTINS
19548 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
19549 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
19550 aarch64_builtin_vectorized_function
19552 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
19553 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
19554 aarch64_autovectorize_vector_sizes
19556 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
19557 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
19558 aarch64_atomic_assign_expand_fenv
19560 /* Section anchor support. */
19562 #undef TARGET_MIN_ANCHOR_OFFSET
19563 #define TARGET_MIN_ANCHOR_OFFSET -256
19565 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
19566 byte offset; we can do much more for larger data types, but have no way
19567 to determine the size of the access. We assume accesses are aligned. */
19568 #undef TARGET_MAX_ANCHOR_OFFSET
19569 #define TARGET_MAX_ANCHOR_OFFSET 4095
19571 #undef TARGET_VECTOR_ALIGNMENT
19572 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
19574 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
19575 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
19576 aarch64_vectorize_preferred_vector_alignment
19577 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
19578 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
19579 aarch64_simd_vector_alignment_reachable
19581 /* vec_perm support. */
19583 #undef TARGET_VECTORIZE_VEC_PERM_CONST
19584 #define TARGET_VECTORIZE_VEC_PERM_CONST \
19585 aarch64_vectorize_vec_perm_const
19587 #undef TARGET_VECTORIZE_GET_MASK_MODE
19588 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
19589 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
19590 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
19591 aarch64_empty_mask_is_expensive
19592 #undef TARGET_PREFERRED_ELSE_VALUE
19593 #define TARGET_PREFERRED_ELSE_VALUE \
19594 aarch64_preferred_else_value
19596 #undef TARGET_INIT_LIBFUNCS
19597 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
19599 #undef TARGET_FIXED_CONDITION_CODE_REGS
19600 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
19602 #undef TARGET_FLAGS_REGNUM
19603 #define TARGET_FLAGS_REGNUM CC_REGNUM
19605 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
19606 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
19608 #undef TARGET_ASAN_SHADOW_OFFSET
19609 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
19611 #undef TARGET_LEGITIMIZE_ADDRESS
19612 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
19614 #undef TARGET_SCHED_CAN_SPECULATE_INSN
19615 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
19617 #undef TARGET_CAN_USE_DOLOOP_P
19618 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
/* Instruction-scheduling priority and macro-fusion hooks.  */
#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

/* Note the "aarch_" (not "aarch64_") prefix: this helper is shared
   with the 32-bit arm port.  */
#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
/* Assembly output of operands and addresses.  */
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

/* The struct-return register need not be preserved across the call.  */
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

/* DWARF encoding of poly_int indeterminates (variable runtime sizes).  */
#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
/* Which hard registers can hold which modes, and how many registers a
   mode needs.  */
#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

/* Call-clobbering information that can vary with the mode or callee.  */
#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
#define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
  aarch64_remove_extra_call_preserved_regs

#undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
#define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
  aarch64_return_call_with_max_clobbers
#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

/* Probing interval for -fstack-clash-protection alloca/VLA allocations.  */
#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

/* Register-pressure classes for the scheduler/allocator.  */
#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

/* Modes considered by the early-rematerialization pass.  */
#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

/* __builtin_speculation_safe_value expansion.  */
#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

/* Compile-time estimate for a poly_int value not known until runtime.  */
#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
/* Machine-specific attribute table.  */
#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

/* SIMD clone ("declare simd") support hooks.  */
#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

/* Type-compatibility check for machine attributes.  */
#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
/* Self-tests exist only in checking builds; the guard below was missing,
   leaving the trailing #endif unbalanced.  */
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */
/* Extra assembly output immediately after .cfi_startproc.  */
#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
19727 struct gcc_target targetm
= TARGET_INITIALIZER
;
19729 #include "gt-aarch64.h"