/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2018 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "langhooks.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"

/* This file should be included last.  */
#include "target-def.h"
/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
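/* For example, POINTER_BYTES is 8 under the LP64 ABI and 4 under ILP32.  */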
/* Classifies an address.

   ADDRESS_REG_IMM
       A simple base register plus immediate offset.

   ADDRESS_REG_WB
       A base register indexed by immediate offset with writeback.

   ADDRESS_REG_REG
       A base register indexed by (optionally scaled) register.

   ADDRESS_REG_UXTW
       A base register indexed by (optionally scaled) zero-extended register.

   ADDRESS_REG_SXTW
       A base register indexed by (optionally scaled) sign-extended register.

   ADDRESS_LO_SUM
       A LO_SUM rtx with a base register and "LO12" symbol relocation.

   ADDRESS_SYMBOLIC
       A constant symbolic address, in pc-relative literal pool.  */

enum aarch64_address_type
{
  ADDRESS_REG_IMM,
  ADDRESS_REG_WB,
  ADDRESS_REG_REG,
  ADDRESS_REG_UXTW,
  ADDRESS_REG_SXTW,
  ADDRESS_LO_SUM,
  ADDRESS_SYMBOLIC
};

struct aarch64_address_info
{
  enum aarch64_address_type type;
  poly_int64 const_offset;
  enum aarch64_symbol_type symbol_type;
};
/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The value of each element if all elements are the same, or the
     first value if the constant is a series.  */
  rtx value;

  /* The value of the step if the constant is a series, null otherwise.  */
  rtx step;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  /* The kind of shift modifier to use, and the number of bits to shift.
     This is (LSL, 0) if no shift is needed.  */
  modifier_type modifier;
  unsigned int shift;
};

/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
    modifier (LSL), shift (0)
{}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
    step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
{}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to VALUE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
  : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
    modifier (LSL), shift (0)
{}
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1

static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);

/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
unsigned long aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;
/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};
/* Tuning parameters.  */

static const struct cpu_addrcost_table generic_addrcost_table =
{
  0, /* register_offset */
  0, /* register_sextend */
  0, /* register_zextend */
};

static const struct cpu_addrcost_table exynosm1_addrcost_table =
{
  1, /* register_offset */
  1, /* register_sextend */
  2, /* register_zextend */
};

static const struct cpu_addrcost_table xgene1_addrcost_table =
{
  0, /* register_offset */
  1, /* register_sextend */
  1, /* register_zextend */
};

static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
{
  2, /* register_offset */
  3, /* register_sextend */
  3, /* register_zextend */
};

static const struct cpu_addrcost_table qdf24xx_addrcost_table =
{
  3, /* register_offset */
  4, /* register_sextend */
  3, /* register_zextend */
};
static const struct cpu_regmove_cost generic_regmove_cost =
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */

static const struct cpu_regmove_cost cortexa57_regmove_cost =
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */

static const struct cpu_regmove_cost cortexa53_regmove_cost =
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */

static const struct cpu_regmove_cost exynosm1_regmove_cost =
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost (actual, 4 and 9).  */

static const struct cpu_regmove_cost thunderx_regmove_cost =

static const struct cpu_regmove_cost xgene1_regmove_cost =
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */

static const struct cpu_regmove_cost qdf24xx_regmove_cost =
  /* Avoid the use of int<->fp moves for spilling.  */

static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
  /* Avoid the use of int<->fp moves for spilling.  */
/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost generic_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  1, /* scalar_load_cost */
  1, /* scalar_store_cost */
  1, /* vec_int_stmt_cost */
  1, /* vec_fp_stmt_cost */
  2, /* vec_permute_cost */
  1, /* vec_to_scalar_cost */
  1, /* scalar_to_vec_cost */
  1, /* vec_align_load_cost */
  1, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  3, /* cond_taken_branch_cost */
  1  /* cond_not_taken_branch_cost */
};

/* ThunderX costs for vector insn classes.  */
static const struct cpu_vector_cost thunderx_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  3, /* scalar_load_cost */
  1, /* scalar_store_cost */
  4, /* vec_int_stmt_cost */
  1, /* vec_fp_stmt_cost */
  4, /* vec_permute_cost */
  2, /* vec_to_scalar_cost */
  2, /* scalar_to_vec_cost */
  3, /* vec_align_load_cost */
  5, /* vec_unalign_load_cost */
  5, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  3, /* cond_taken_branch_cost */
  3  /* cond_not_taken_branch_cost */
};

/* Cortex-A57 costs for vector insn classes.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  2, /* vec_int_stmt_cost */
  2, /* vec_fp_stmt_cost */
  3, /* vec_permute_cost */
  8, /* vec_to_scalar_cost */
  8, /* scalar_to_vec_cost */
  4, /* vec_align_load_cost */
  4, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  1, /* cond_taken_branch_cost */
  1  /* cond_not_taken_branch_cost */
};

static const struct cpu_vector_cost exynosm1_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  3, /* vec_int_stmt_cost */
  3, /* vec_fp_stmt_cost */
  3, /* vec_permute_cost */
  3, /* vec_to_scalar_cost */
  3, /* scalar_to_vec_cost */
  5, /* vec_align_load_cost */
  5, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  1, /* cond_taken_branch_cost */
  1  /* cond_not_taken_branch_cost */
};

/* X-Gene 1 costs for vector insn classes.  */
static const struct cpu_vector_cost xgene1_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  2, /* vec_int_stmt_cost */
  2, /* vec_fp_stmt_cost */
  2, /* vec_permute_cost */
  4, /* vec_to_scalar_cost */
  4, /* scalar_to_vec_cost */
  10, /* vec_align_load_cost */
  10, /* vec_unalign_load_cost */
  2, /* vec_unalign_store_cost */
  2, /* vec_store_cost */
  2, /* cond_taken_branch_cost */
  1  /* cond_not_taken_branch_cost */
};

/* Costs for vector insn classes for Vulcan.  */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  6, /* scalar_fp_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  5, /* vec_int_stmt_cost */
  6, /* vec_fp_stmt_cost */
  3, /* vec_permute_cost */
  6, /* vec_to_scalar_cost */
  5, /* scalar_to_vec_cost */
  8, /* vec_align_load_cost */
  8, /* vec_unalign_load_cost */
  4, /* vec_unalign_store_cost */
  4, /* vec_store_cost */
  2, /* cond_taken_branch_cost */
  1  /* cond_not_taken_branch_cost */
};
/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
{
  1,  /* Predictable.  */
  3   /* Unpredictable.  */
};

/* Generic approximation modes.  */
static const cpu_approx_modes generic_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_NONE,	/* sqrt  */
  AARCH64_APPROX_NONE	/* recip_sqrt  */
};

/* Approximation modes for Exynos M1.  */
static const cpu_approx_modes exynosm1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_ALL,	/* sqrt  */
  AARCH64_APPROX_ALL	/* recip_sqrt  */
};

/* Approximation modes for X-Gene 1.  */
static const cpu_approx_modes xgene1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_NONE,	/* sqrt  */
  AARCH64_APPROX_ALL	/* recip_sqrt  */
};
/* Generic prefetch settings (which disable prefetch).  */
static const cpu_prefetch_tune generic_prefetch_tune =
{
  -1,	/* l1_cache_size  */
  -1,	/* l1_cache_line_size  */
  -1,	/* l2_cache_size  */
  true,	/* prefetch_dynamic_strides */
  -1,	/* minimum_stride */
  -1	/* default_opt_level  */
};

static const cpu_prefetch_tune exynosm1_prefetch_tune =
{
  -1,	/* l1_cache_size  */
  64,	/* l1_cache_line_size  */
  -1,	/* l2_cache_size  */
  true,	/* prefetch_dynamic_strides */
  -1,	/* minimum_stride */
  -1	/* default_opt_level  */
};

static const cpu_prefetch_tune qdf24xx_prefetch_tune =
{
  32,	/* l1_cache_size  */
  64,	/* l1_cache_line_size  */
  512,	/* l2_cache_size  */
  false,	/* prefetch_dynamic_strides */
  2048,	/* minimum_stride */
  3	/* default_opt_level  */
};

static const cpu_prefetch_tune thunderxt88_prefetch_tune =
{
  32,	/* l1_cache_size  */
  128,	/* l1_cache_line_size  */
  16*1024,	/* l2_cache_size  */
  true,	/* prefetch_dynamic_strides */
  -1,	/* minimum_stride */
  3	/* default_opt_level  */
};

static const cpu_prefetch_tune thunderx_prefetch_tune =
{
  32,	/* l1_cache_size  */
  128,	/* l1_cache_line_size  */
  -1,	/* l2_cache_size  */
  true,	/* prefetch_dynamic_strides */
  -1,	/* minimum_stride */
  -1	/* default_opt_level  */
};

static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
{
  32,	/* l1_cache_size  */
  64,	/* l1_cache_line_size  */
  256,	/* l2_cache_size  */
  true,	/* prefetch_dynamic_strides */
  -1,	/* minimum_stride */
  -1	/* default_opt_level  */
};
static const struct tune_params generic_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
  "8",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa35_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params cortexa53_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa57_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params cortexa72_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa73_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params exynosm1_tunings =
{
  &exynosm1_extra_costs,
  &exynosm1_addrcost_table,
  &exynosm1_regmove_cost,
  &exynosm1_vector_cost,
  &generic_branch_cost,
  &exynosm1_approx_modes,
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
  "4",	/* function_align.  */
  "4",	/* jump_align.  */
  "4",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  48,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &exynosm1_prefetch_tune
};

static const struct tune_params thunderxt88_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
  "8",	/* function_align.  */
  "8",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),	/* tune_flags.  */
  &thunderxt88_prefetch_tune
};
static const struct tune_params thunderx_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
  "8",	/* function_align.  */
  "8",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
  &thunderx_prefetch_tune
};

static const struct tune_params xgene1_tunings =
{
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params qdf24xx_tunings =
{
  &qdf24xx_extra_costs,
  &qdf24xx_addrcost_table,
  &qdf24xx_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &qdf24xx_prefetch_tune
};

/* Tuning structure for the Qualcomm Saphira core.  Default to falkor
   values.  */
static const struct tune_params saphira_tunings =
{
  &generic_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params thunderx2t99_tunings =
{
  &thunderx2t99_extra_costs,
  &thunderx2t99_addrcost_table,
  &thunderx2t99_regmove_cost,
  &thunderx2t99_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  3,	/* int_reassoc_width.  */
  2,	/* fp_reassoc_width.  */
  2,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &thunderx2t99_prefetch_tune
};
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char* name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
};
/* A processor implementing AArch64.  */
struct processor
{
  const char *const name;
  enum aarch64_processor ident;
  enum aarch64_processor sched_core;
  enum aarch64_arch arch;
  unsigned architecture_version;
  const unsigned long flags;
  const struct tune_params *const tune;
};

/* Architectures implementing AArch64.  */
static const struct processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
   all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
   FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
   AARCH64_FL_FOR_ARCH8, &generic_tunings},
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};

/* Target specification.  These are populated by the -march, -mtune, -mcpu
   handling code or by target attributes.  */
static const struct processor *selected_arch;
static const struct processor *selected_cpu;
static const struct processor *selected_tune;

/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;
#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)

/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
{
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;
};

typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
} aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
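/* The enumerators above are deliberately arranged in pairs (EQ/NE, CS/CC,
   MI/PL, ...) whose encodings differ only in the low bit, so XORing that
   bit yields the inverse condition.  */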
/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
/* Generate code to enable conditional branches in functions over 1 MiB.  */
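/* The sequence emitted below places the conditional branch (typically with
   its sense already inverted by the caller via BRANCH_FORMAT) onto a local
   label just past an unconditional branch to the real destination, so only
   the unconditional branch needs the long range.  */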
aarch64_gen_far_branch (rtx *operands, int pos_label, const char *dest,
			const char *branch_format)
{
  rtx_code_label *tmp_label = gen_label_rtx ();
  char label_buf[256];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
}
aarch64_err_no_fpadvsimd (machine_mode mode)
{
  if (TARGET_GENERAL_REGS_ONLY)
    {
      if (FLOAT_MODE_P (mode))
	error ("%qs is incompatible with the use of floating-point types",
	       "-mgeneral-regs-only");
      else
	error ("%qs is incompatible with the use of vector types",
	       "-mgeneral-regs-only");
    }
  else
    {
      if (FLOAT_MODE_P (mode))
	error ("%qs feature modifier is incompatible with the use of"
	       " floating-point types", "+nofp");
      else
	error ("%qs feature modifier is incompatible with the use of"
	       " vector types", "+nofp");
    }
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
   GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
   higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
   and GENERAL_REGS is lower than the memory cost (in this case the best class
   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
   cost results in bad allocations with many redundant int<->FP moves which
   are expensive on various cores.
   To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
   force a decision between FP_REGS and GENERAL_REGS.  We use the allocno
   class if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if
   it isn't POINTER_AND_FP_REGS.  Otherwise set the allocno class depending
   on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.  */

aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
      || !reg_class_subset_p (FP_REGS, allocno_class))
    return allocno_class;

  if (!reg_class_subset_p (GENERAL_REGS, best_class)
      || !reg_class_subset_p (FP_REGS, best_class))
    return best_class;

  machine_mode mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}
aarch64_min_divisions_for_recip_mul (machine_mode mode)
{
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
}

/* Return the reassociation width of treeop OPC with mode MODE.  */

aarch64_reassociation_width (unsigned opc, machine_mode mode)
{
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  /* Avoid reassociating floating point addition so we emit more FMAs.  */
  if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
    return aarch64_tune_params.fp_reassoc_width;
}
/* Provide a mapping from gcc register numbers to dwarf register numbers.  */

aarch64_dbx_register_number (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return AARCH64_DWARF_R0 + regno - R0_REGNUM;
  else if (regno == SP_REGNUM)
    return AARCH64_DWARF_SP;
  else if (FP_REGNUM_P (regno))
    return AARCH64_DWARF_V0 + regno - V0_REGNUM;
  else if (PR_REGNUM_P (regno))
    return AARCH64_DWARF_P0 + regno - P0_REGNUM;
  else if (regno == VG_REGNUM)
    return AARCH64_DWARF_VG;

  /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
     equivalent DWARF register.  */
  return DWARF_FRAME_REGISTERS;
}
/* Return true if MODE is any of the Advanced SIMD structure modes.  */

aarch64_advsimd_struct_mode_p (machine_mode mode)
{
  return (TARGET_SIMD
	  && (mode == OImode || mode == CImode || mode == XImode));
}

/* Return true if MODE is an SVE predicate mode.  */

aarch64_sve_pred_mode_p (machine_mode mode)
{
  return (TARGET_SVE
	  && (mode == VNx16BImode
	      || mode == VNx8BImode
	      || mode == VNx4BImode
	      || mode == VNx2BImode));
}
/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
   a structure of 2, 3 or 4 vectors.  */
const unsigned int VEC_STRUCT   = 8;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
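/* For example, an Advanced SIMD structure mode such as OImode classifies
   as VEC_ADVSIMD | VEC_STRUCT, while a single SVE data vector classifies
   as VEC_SVE_DATA.  */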
/* Return a set of flags describing the vector properties of mode MODE.
   Ignore modes that are not supported by the current target.  */

aarch64_classify_vector_mode (machine_mode mode)
{
  if (aarch64_advsimd_struct_mode_p (mode))
    return VEC_ADVSIMD | VEC_STRUCT;

  if (aarch64_sve_pred_mode_p (mode))
    return VEC_SVE_PRED;

  scalar_mode inner = GET_MODE_INNER (mode);
  if (VECTOR_MODE_P (mode)
      || inner == DFmode))

  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
    return VEC_SVE_DATA;
  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
    return VEC_SVE_DATA | VEC_STRUCT;

  /* This includes V1DF but not V1DI (which doesn't exist).  */
      && (known_eq (GET_MODE_BITSIZE (mode), 64)
	  || known_eq (GET_MODE_BITSIZE (mode), 128)))
/* Return true if MODE is any of the data vector modes, including
   structure modes.  */

aarch64_vector_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
}

/* Return true if MODE is an SVE data vector mode; either a single vector
   or a structure of vectors.  */

aarch64_sve_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
}
/* Implement target hook TARGET_ARRAY_MODE.  */
static opt_machine_mode
aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
{
  if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
      && IN_RANGE (nelems, 2, 4))
    return mode_for_vector (GET_MODE_INNER (mode),
			    GET_MODE_NUNITS (mode) * nelems);

  return opt_machine_mode ();
}

/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */

aarch64_array_mode_supported_p (machine_mode mode,
				unsigned HOST_WIDE_INT nelems)
      && (AARCH64_VALID_SIMD_QREG_MODE (mode)
	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
      && (nelems >= 2 && nelems <= 4))
/* Return the SVE predicate mode to use for elements that have
   ELEM_NBYTES bytes, if such a mode exists.  */

aarch64_sve_pred_mode (unsigned int elem_nbytes)
{
  if (elem_nbytes == 1)
  if (elem_nbytes == 2)
  if (elem_nbytes == 4)
  if (elem_nbytes == 8)

  return opt_machine_mode ();
}
/* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */

static opt_machine_mode
aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
{
  if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
    {
      unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
      machine_mode pred_mode;
      if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
    }

  return default_get_mask_mode (nunits, nbytes);
}
/* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
   prefer to use the first arithmetic operand as the else value if
   the else value doesn't matter, since that exactly matches the SVE
   destructive merging form.  For ternary operations we could either
   pick the first operand and use FMAD-like instructions or the last
   operand and use FMLA-like instructions; the latter seems more
   natural.  */

aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
{
  return nops == 3 ? ops[2] : ops[0];
}
/* Implement TARGET_HARD_REGNO_NREGS.  */

aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
{
  /* ??? Logically we should only need to provide a value when
     HARD_REGNO_MODE_OK says that the combination is valid,
     but at the moment we need to handle all modes.  Just ignore
     any runtime parts for registers that can't store them.  */
  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
  switch (aarch64_regno_regclass (regno))
    {
      if (aarch64_sve_data_mode_p (mode))
	return exact_div (GET_MODE_SIZE (mode),
			  BYTES_PER_SVE_VECTOR).to_constant ();
      return CEIL (lowest_size, UNITS_PER_VREG);

      return CEIL (lowest_size, UNITS_PER_WORD);
    }
}
/* Implement TARGET_HARD_REGNO_MODE_OK.  */

aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
{
  if (GET_MODE_CLASS (mode) == MODE_CC)
    return regno == CC_REGNUM;

  if (regno == VG_REGNUM)
    /* This must have the same size as _Unwind_Word.  */
    return mode == DImode;

  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags & VEC_SVE_PRED)
    return PR_REGNUM_P (regno);

  if (PR_REGNUM_P (regno))

  if (regno == SP_REGNUM)
    /* The purpose of comparing with ptr_mode is to support the
       global register variable associated with the stack pointer
       register via the syntax of asm ("wsp") in ILP32.  */
    return mode == Pmode || mode == ptr_mode;

  if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
    return mode == Pmode;

  if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))

  if (FP_REGNUM_P (regno))
    {
      if (vec_flags & VEC_STRUCT)
	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;

      return !VECTOR_MODE_P (mode) || vec_flags != 0;
    }
}
/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
   the lower 64 bits of a 128-bit register.  Tell the compiler the callee
   clobbers the top 64 bits when restoring the bottom 64 bits.  */

aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
{
  return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
}
/* Implement REGMODE_NATURAL_SIZE.  */

aarch64_regmode_natural_size (machine_mode mode)
{
  /* The natural size for SVE data modes is one SVE data vector,
     and similarly for predicates.  We can't independently modify
     anything smaller than that.  */
  /* ??? For now, only do this for variable-width SVE registers.
     Doing it for constant-sized registers breaks lower-subreg.c.  */
  /* ??? And once that's fixed, we should probably have similar
     code for Advanced SIMD.  */
  if (!aarch64_sve_vg.is_constant ())
    {
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags & VEC_SVE_PRED)
	return BYTES_PER_SVE_PRED;
      if (vec_flags & VEC_SVE_DATA)
	return BYTES_PER_SVE_VECTOR;
    }
  return UNITS_PER_WORD;
}
/* Implement HARD_REGNO_CALLER_SAVE_MODE.  */

aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
				     machine_mode mode)
  /* The predicate mode determines which bits are significant and
     which are "don't care".  Decreasing the number of lanes would
     lose data while increasing the number of lanes would make bits
     unnecessarily significant.  */
  if (PR_REGNUM_P (regno))
  if (known_ge (GET_MODE_SIZE (mode), 4))
/* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
   that strcpy from constants will be faster.  */

static HOST_WIDE_INT
aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
{
  if (TREE_CODE (exp) == STRING_CST && !optimize_size)
    return MAX (align, BITS_PER_WORD);
}
/* Return true if calls to DECL should be treated as
   long-calls (i.e. called via a register).  */

aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)

/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (i.e. called via a register).  */

aarch64_is_long_call_p (rtx sym)
{
  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
}

/* Return true if calls to symbol-ref SYM should not go through
   plt entries.  */

aarch64_is_noplt_call_p (rtx sym)
{
  const_tree decl = SYMBOL_REF_DECL (sym);

	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
      && !targetm.binds_local_p (decl))
/* Return true if the offsets to a zero/sign-extract operation
   represent an expression that matches an extend operation.  The
   operands represent the parameters from

     (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */

aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
				rtx extract_imm)
{
  HOST_WIDE_INT mult_val, extract_val;

  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))

  mult_val = INTVAL (mult_imm);
  extract_val = INTVAL (extract_imm);

      && extract_val < GET_MODE_BITSIZE (mode)
      && exact_log2 (extract_val & ~7) > 0
      && (extract_val & 7) <= 4
      && mult_val == (1 << (extract_val & 7)))
/* Emit an insn that's a simple single-set.  Both the operands must be
   known to be valid.  */
inline static rtx_insn *
emit_set_insn (rtx x, rtx y)
{
  return emit_insn (gen_rtx_SET (x, y));
}

/* X and Y are two things to compare using CODE.  Emit the compare insn and
   return the rtx for register 0 in the proper mode.  */

aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
{
  machine_mode mode = SELECT_CC_MODE (code, x, y);
  rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);

  emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
}
/* Build the SYMBOL_REF for __tls_get_addr.  */

static GTY(()) rtx tls_get_addr_libfunc;

aarch64_tls_get_addr (void)
{
  if (!tls_get_addr_libfunc)
    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
  return tls_get_addr_libfunc;
}
/* Return the TLS model to use for ADDR.  */

static enum tls_model
tls_symbolic_operand_type (rtx addr)
{
  enum tls_model tls_kind = TLS_MODEL_NONE;
  if (GET_CODE (addr) == CONST)
    {
      rtx sym = strip_offset (addr, &addend);
      if (GET_CODE (sym) == SYMBOL_REF)
	tls_kind = SYMBOL_REF_TLS_MODEL (sym);
    }
  else if (GET_CODE (addr) == SYMBOL_REF)
    tls_kind = SYMBOL_REF_TLS_MODEL (addr);
}
/* We'll allow lo_sum's in addresses in our legitimate addresses
   so that combine would take care of combining addresses where
   necessary, but for generation purposes, we'll generate the address

	tmp = hi (symbol_ref);			adrp  x1, foo
	dest = lo_sum (tmp, symbol_ref);	add   dest, x1, :lo_12:foo

	adrp x1, :got:foo			adrp  tmp, :tlsgd:foo
	ldr  x1, [:got_lo12:foo]		add   dest, tmp, :tlsgd_lo12:foo

   Load TLS symbol, depending on TLS mechanism and TLS access model.

   Global Dynamic - Traditional TLS:
	adrp tmp, :tlsgd:imm
	add  dest, tmp, #:tlsgd_lo12:imm

   Global Dynamic - TLS Descriptors:
	adrp dest, :tlsdesc:imm
	ldr  tmp, [dest, #:tlsdesc_lo12:imm]
	add  dest, dest, #:tlsdesc_lo12:imm

	adrp tmp, :gottprel:imm
	ldr  dest, [tmp, #:gottprel_lo12:imm]

	add  t0, tp, #:tprel_hi12:imm, lsl #12
	add  t0, t0, #:tprel_lo12_nc:imm  */
aarch64_load_symref_appropriately (rtx dest, rtx imm,
				   enum aarch64_symbol_type type)
{
  switch (type)
    {
    case SYMBOL_SMALL_ABSOLUTE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode.  */
	machine_mode mode = GET_MODE (dest);

	gcc_assert (mode == Pmode || mode == ptr_mode);

	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	emit_insn (gen_add_losym (dest, tmp_reg, imm));
      }

    case SYMBOL_TINY_ABSOLUTE:
      emit_insn (gen_rtx_SET (dest, imm));
    case SYMBOL_SMALL_GOT_28K:
      {
	machine_mode mode = GET_MODE (dest);
	rtx gp_rtx = pic_offset_table_rtx;

	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
	   here before rtl expand.  Tree IVOPT will generate rtl pattern to
	   decide rtx costs, in which case pic_offset_table_rtx is not
	   initialized.  For that case no need to generate the first adrp
	   instruction as the final cost for global variable access is
	   one instruction.  */

	/* -fpic for -mcmodel=small allows a 32K GOT table size (but we are
	   using the page base as GOT base, the first page may be wasted,
	   in the worst scenario, there is only 28K space for GOT).

	   The generated instruction sequence for accessing a global variable

	     ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]

	   Only one instruction needed.  But we must initialize
	   pic_offset_table_rtx properly.  We generate an initialization insn
	   for every global access, and allow CSE to remove all redundant.

	   The final instruction sequences will look like the following
	   for multiple global variable accesses.

	     adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_

	     ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
	     ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
	     ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]  */

	rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
	crtl->uses_pic_offset_table = 1;
	emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));

	if (mode != GET_MODE (gp_rtx))
	  gp_rtx = gen_lowpart (mode, gp_rtx);

	if (mode == ptr_mode)
	    insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
	    insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
	    mem = XVECEXP (SET_SRC (insn), 0, 0);

	    gcc_assert (mode == Pmode);
	    insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);

	/* The operand is expected to be MEM.  Whenever the related insn
	   pattern changed, above code which calculate mem should be
	   updated.  */
	gcc_assert (GET_CODE (mem) == MEM);
	MEM_READONLY_P (mem) = 1;
	MEM_NOTRAP_P (mem) = 1;
      }
    case SYMBOL_SMALL_GOT_4G:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the got entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
	   This is why we have to handle three different ldr_got_small
	   patterns here (two patterns for ILP32).  */

	machine_mode mode = GET_MODE (dest);

	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	if (mode == ptr_mode)
	    insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
	    insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
	    mem = XVECEXP (SET_SRC (insn), 0, 0);

	    gcc_assert (mode == Pmode);
	    insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);

	gcc_assert (GET_CODE (mem) == MEM);
	MEM_READONLY_P (mem) = 1;
	MEM_NOTRAP_P (mem) = 1;
      }
    case SYMBOL_SMALL_TLSGD:
      {
	machine_mode mode = GET_MODE (dest);
	rtx result = gen_rtx_REG (mode, R0_REGNUM);

	aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
	aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
	insns = get_insns ();

	RTL_CONST_CALL_P (insns) = 1;
	emit_libcall_block (insns, dest, result, imm);
      }
    case SYMBOL_SMALL_TLSDESC:
      {
	machine_mode mode = GET_MODE (dest);
	rtx x0 = gen_rtx_REG (mode, R0_REGNUM);

	gcc_assert (mode == Pmode || mode == ptr_mode);

	/* In ILP32, the got entry is always of SImode size.  Unlike
	   small GOT, the dest is fixed at reg 0.  */
	emit_insn (gen_tlsdesc_small_si (imm));
	emit_insn (gen_tlsdesc_small_di (imm));
	tp = aarch64_load_tp (NULL);

	tp = gen_lowpart (mode, tp);

	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
	set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
      }
    case SYMBOL_SMALL_TLSIE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the got entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
	   This is why we have to handle three different tlsie_small
	   patterns here (two patterns for ILP32).  */
	machine_mode mode = GET_MODE (dest);
	rtx tmp_reg = gen_reg_rtx (mode);
	rtx tp = aarch64_load_tp (NULL);

	if (mode == ptr_mode)
	    emit_insn (gen_tlsie_small_di (tmp_reg, imm));
	    emit_insn (gen_tlsie_small_si (tmp_reg, imm));
	    tp = gen_lowpart (mode, tp);

	    gcc_assert (mode == Pmode);
	    emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));

	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
	set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
      }
    case SYMBOL_TLSLE12:
    case SYMBOL_TLSLE24:
    case SYMBOL_TLSLE32:
    case SYMBOL_TLSLE48:
      {
	machine_mode mode = GET_MODE (dest);
	rtx tp = aarch64_load_tp (NULL);

	tp = gen_lowpart (mode, tp);

	case SYMBOL_TLSLE12:
	  emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)

	case SYMBOL_TLSLE24:
	  emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)

	case SYMBOL_TLSLE32:
	  emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
	  emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)

	case SYMBOL_TLSLE48:
	  emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
	  emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)

	set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
      }
    case SYMBOL_TINY_GOT:
      emit_insn (gen_ldr_got_tiny (dest, imm));

    case SYMBOL_TINY_TLSIE:
      {
	machine_mode mode = GET_MODE (dest);
	rtx tp = aarch64_load_tp (NULL);

	if (mode == ptr_mode)
	    emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
	    tp = gen_lowpart (mode, tp);
	    emit_insn (gen_tlsie_tiny_si (dest, imm, tp));

	    gcc_assert (mode == Pmode);
	    emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));

	set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
      }
/* Emit a move from SRC to DEST.  Assume that the move expanders can
   handle all moves if !can_create_pseudo_p ().  The distinction is
   important because, unlike emit_move_insn, the move expanders know
   how to force Pmode objects into the constant pool even when the
   constant pool address is not itself legitimate.  */

aarch64_emit_move (rtx dest, rtx src)
{
  return (can_create_pseudo_p ()
	  ? emit_move_insn (dest, src)
	  : emit_move_insn_1 (dest, src));
}
/* Apply UNOPTAB to OP and store the result in DEST.  */

aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
{
  rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
    emit_move_insn (dest, tmp);
}

/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */

aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
{
  rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
    emit_move_insn (dest, tmp);
}
/* Split a 128-bit move operation into two 64-bit move operations,
   taking care to handle partial overlap of register to register
   copies.  Special cases are needed when moving between GP regs and
   FP regs.  SRC can be a register, constant or memory; DST a register
   or memory.  If either operand is memory it must not have any side
   effects.  */

aarch64_split_128bit_move (rtx dst, rtx src)
{
  machine_mode mode = GET_MODE (dst);

  gcc_assert (mode == TImode || mode == TFmode);
  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);

  if (REG_P (dst) && REG_P (src))
    {
      int src_regno = REGNO (src);
      int dst_regno = REGNO (dst);

      /* Handle FP <-> GP regs.  */
      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
	{
	  src_lo = gen_lowpart (word_mode, src);
	  src_hi = gen_highpart (word_mode, src);

	      emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
	      emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));

	      emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
	      emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
	}
      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
	{
	  dst_lo = gen_lowpart (word_mode, dst);
	  dst_hi = gen_highpart (word_mode, dst);

	      emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
	      emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));

	      emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
	      emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
	}
    }

  dst_lo = gen_lowpart (word_mode, dst);
  dst_hi = gen_highpart (word_mode, dst);
  src_lo = gen_lowpart (word_mode, src);
  src_hi = gen_highpart_mode (word_mode, mode, src);

  /* At most one pairing may overlap.  */
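  /* If the low half of the destination overlaps the high half of the
     source, copy the high half first so that it is not clobbered.  */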
  if (reg_overlap_mentioned_p (dst_lo, src_hi))
    {
      aarch64_emit_move (dst_hi, src_hi);
      aarch64_emit_move (dst_lo, src_lo);
    }
  else
    {
      aarch64_emit_move (dst_lo, src_lo);
      aarch64_emit_move (dst_hi, src_hi);
    }
}

aarch64_split_128bit_move_p (rtx dst, rtx src)
{
  return (! REG_P (src)
	  || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
}
/* Split a complex SIMD combine.  */

aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
{
  machine_mode src_mode = GET_MODE (src1);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));
  gcc_assert (register_operand (dst, dst_mode)
	      && register_operand (src1, src_mode)
	      && register_operand (src2, src_mode));

  rtx (*gen) (rtx, rtx, rtx);

      gen = gen_aarch64_simd_combinev8qi;
      gen = gen_aarch64_simd_combinev4hi;
      gen = gen_aarch64_simd_combinev2si;
      gen = gen_aarch64_simd_combinev4hf;
      gen = gen_aarch64_simd_combinev2sf;
      gen = gen_aarch64_simd_combinedi;
      gen = gen_aarch64_simd_combinedf;

  emit_insn (gen (dst, src1, src2));
}
/* Split a complex SIMD move.  */

aarch64_split_simd_move (rtx dst, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src))
    {
      rtx (*gen) (rtx, rtx);

      gcc_assert (VECTOR_MODE_P (src_mode));

	  gen = gen_aarch64_split_simd_movv16qi;
	  gen = gen_aarch64_split_simd_movv8hi;
	  gen = gen_aarch64_split_simd_movv4si;
	  gen = gen_aarch64_split_simd_movv2di;
	  gen = gen_aarch64_split_simd_movv8hf;
	  gen = gen_aarch64_split_simd_movv4sf;
	  gen = gen_aarch64_split_simd_movv2df;

      emit_insn (gen (dst, src));
    }
}
aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
			      machine_mode ymode, rtx y)
{
  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
  gcc_assert (r != NULL);
  return rtx_equal_p (x, r);
}

aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
{
  if (can_create_pseudo_p ())
    return force_reg (mode, value);

  aarch64_emit_move (x, value);
}
/* Return true if we can move VALUE into a register using a single
   CNT[BHWD] instruction.  */

aarch64_sve_cnt_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
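  /* For example, a factor of 2 (two elements per 128-bit quadword) maps to
     CNTD with MUL #1, and a factor of 256 maps to CNTB with MUL #16; odd
     factors, or factors needing a multiplier above 16, are rejected.  */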
  return (value.coeffs[1] == factor
	  && IN_RANGE (factor, 2, 16 * 16)
	  && (factor & 1) == 0
	  && factor <= 16 * (factor & -factor));
}
/* Likewise for rtx X.  */

aarch64_sve_cnt_immediate_p (rtx x)
{
  return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
}
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  FACTOR is the number of quadwords.
   NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
   If it is zero, we can use any element size.  */

aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  unsigned int factor,
				  unsigned int nelts_per_vq)
{
  static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];

  if (nelts_per_vq == 0)
    /* There is some overlap in the ranges of the four CNT instructions.
       Here we always use the smallest possible element size, so that the
       multiplier is 1 wherever possible.  */
    nelts_per_vq = factor & -factor;
  int shift = std::min (exact_log2 (nelts_per_vq), 4);
  gcc_assert (IN_RANGE (shift, 1, 4));
2201 char suffix
= "dwhb"[shift
- 1];
2204 unsigned int written
;
2206 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s",
2207 prefix
, suffix
, operands
);
2209 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, all, mul #%d",
2210 prefix
, suffix
, operands
, factor
);
2211 gcc_assert (written
< sizeof (buffer
));
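/* A worked example (illustrative): PREFIX "inc", OPERANDS "%x0",
   FACTOR 16 and NELTS_PER_VQ 2 select the "d" suffix and leave a
   residual multiplier of 8, giving "incd\t%x0, all, mul #8"; with
   FACTOR 2 the multiplier is 1 and the output is simply "incd\t%x0".  */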
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  X is the value of the vector size operand,
   as a polynomial integer rtx.  */

aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  rtx x)
{
  poly_int64 value = rtx_to_poly_int64 (x);
  gcc_assert (aarch64_sve_cnt_immediate_p (value));
  return aarch64_output_sve_cnt_immediate (prefix, operands,
					   value.coeffs[1], 0);
}
/* Return true if we can add VALUE to a register using a single ADDVL
   or ADDPL instruction.  */

aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  if (factor == 0 || value.coeffs[1] != factor)
    return false;
  /* FACTOR counts VG / 2, so a value of 2 is one predicate width
     and a value of 16 is one vector width.  */
  return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
	  || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
}
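/* Illustrative examples (added commentary): (16, 16) is one vector width
   and can be added with ADDVL #1; (2, 2) is one predicate width and maps
   to ADDPL #1; (17, 17) is representable by neither instruction because
   it is not a multiple of a predicate width.  */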
/* Likewise for rtx X.  */

aarch64_sve_addvl_addpl_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && aarch64_sve_addvl_addpl_immediate_p (value));
}
2257 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2258 and storing the result in operand 0. */
aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
{
  static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));

  /* Use INC or DEC if possible.  */
  if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
    {
      if (aarch64_sve_cnt_immediate_p (offset_value))
	return aarch64_output_sve_cnt_immediate ("inc", "%x0",
						 offset_value.coeffs[1], 0);
      if (aarch64_sve_cnt_immediate_p (-offset_value))
	return aarch64_output_sve_cnt_immediate ("dec", "%x0",
						 -offset_value.coeffs[1], 0);
    }

  int factor = offset_value.coeffs[1];
  if ((factor & 15) == 0)
    snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
  else
    snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
  return buffer;
}
/* Return true if X is a valid immediate for an SVE vector INC or DEC
   instruction.  If it is, store the number of elements in each vector
   quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
   factor in *FACTOR_OUT (if nonnull).  */

aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
				 unsigned int *nelts_per_vq_out)
{
  rtx elt;
  poly_int64 value;

  if (!const_vec_duplicate_p (x, &elt)
      || !poly_int_rtx_p (elt, &value))
    return false;

  unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
  if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
    /* There's no vector INCB.  */
    return false;

  HOST_WIDE_INT factor = value.coeffs[0];
  if (value.coeffs[1] != factor)
    return false;

  /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
  if ((factor % nelts_per_vq) != 0
      || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
    return false;

  if (factor_out)
    *factor_out = factor;
  if (nelts_per_vq_out)
    *nelts_per_vq_out = nelts_per_vq;
  return true;
}
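/* For example (illustrative): a VNx8HI duplicate of the value (8, 8)
   has 8 halfword elements per quadword and a multiplication factor of 8,
   i.e. INCH with an implicit MUL #1, while a factor of 24 would use
   MUL #3.  A duplicate of (3, 3) is rejected because 3 is not a
   multiple of the element count.  */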
2323 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2327 aarch64_sve_inc_dec_immediate_p (rtx x
)
2329 return aarch64_sve_inc_dec_immediate_p (x
, NULL
, NULL
);
2332 /* Return the asm template for an SVE vector INC or DEC instruction.
2333 OPERANDS gives the operands before the vector count and X is the
2334 value of the vector count operand itself. */
2337 aarch64_output_sve_inc_dec_immediate (const char *operands
, rtx x
)
2340 unsigned int nelts_per_vq
;
2341 if (!aarch64_sve_inc_dec_immediate_p (x
, &factor
, &nelts_per_vq
))
2344 return aarch64_output_sve_cnt_immediate ("dec", operands
, -factor
,
2347 return aarch64_output_sve_cnt_immediate ("inc", operands
, factor
,
2352 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
2353 scalar_int_mode mode
)
2356 unsigned HOST_WIDE_INT val
, val2
, mask
;
2357 int one_match
, zero_match
;
2362 if (aarch64_move_imm (val
, mode
))
2365 emit_insn (gen_rtx_SET (dest
, imm
));
2369 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2370 (with XXXX non-zero). In that case check to see if the move can be done in
2372 val2
= val
& 0xffffffff;
2374 && aarch64_move_imm (val2
, SImode
)
2375 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
2378 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
2380 /* Check if we have to emit a second instruction by checking to see
2381 if any of the upper 32 bits of the original DI mode value is set. */
2385 i
= (val
>> 48) ? 48 : 32;
2388 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2389 GEN_INT ((val
>> i
) & 0xffff)));
2394 if ((val
>> 32) == 0 || mode
== SImode
)
2398 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
2400 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
2401 GEN_INT ((val
>> 16) & 0xffff)));
2403 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
2404 GEN_INT ((val
>> 16) & 0xffff)));
2409 /* Remaining cases are all for DImode. */
2412 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
2413 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
2414 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
2415 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
2417 if (zero_match
!= 2 && one_match
!= 2)
2419 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2420 For a 64-bit bitmask try whether changing 16 bits to all ones or
2421 zeroes creates a valid bitmask. To check any repeated bitmask,
2422 try using 16 bits from the other 32-bit half of val. */
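      /* A hypothetical example (added commentary): 0x00ffffff1234ff00
	 becomes valid once the 16 bits at [31:16] are set to ones
	 (0x00ffffffffffff00 is a bitmask immediate), so it can be built
	 as that bitmask MOV followed by a MOVK of 0x1234 into
	 bits [31:16].  */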
2424 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
2427 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2430 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2432 val2
= val2
& ~mask
;
2433 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
2434 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2441 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
2442 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2443 GEN_INT ((val
>> i
) & 0xffff)));
2449 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2450 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2451 otherwise skip zero bits. */
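  /* Worked example (illustrative): for 0x1234ffff5678ffff, one_match
     is 2, so the inverted value is scanned; the first instruction
     materialises 0xffffffff5678ffff (a MOVN-able value) and a single
     MOVK then inserts 0x1234 into bits [63:48], giving two instructions
     in total.  */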
2455 val2
= one_match
> zero_match
? ~val
: val
;
2456 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
2459 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
2460 ? (val
| ~(mask
<< i
))
2461 : (val
& (mask
<< i
)))));
2462 for (i
+= 16; i
< 64; i
+= 16)
2464 if ((val2
& (mask
<< i
)) == 0)
2467 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2468 GEN_INT ((val
>> i
) & 0xffff)));
2475 /* Return whether imm is a 128-bit immediate which is simple enough to
2478 aarch64_mov128_immediate (rtx imm
)
2480 if (GET_CODE (imm
) == CONST_INT
)
2483 gcc_assert (CONST_WIDE_INT_NUNITS (imm
) == 2);
2485 rtx lo
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 0));
2486 rtx hi
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 1));
2488 return aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, DImode
)
2489 + aarch64_internal_mov_immediate (NULL_RTX
, hi
, false, DImode
) <= 4;
/* Return the number of temporary registers that aarch64_add_offset_1
   would need to add OFFSET to a register.  */

aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
{
  return abs_hwi (offset) < 0x1000000 ? 0 : 1;
}
2502 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2503 a non-polynomial OFFSET. MODE is the mode of the addition.
2504 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2505 be set and CFA adjustments added to the generated instructions.
2507 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2508 temporary if register allocation is already complete. This temporary
2509 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2510 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2511 the immediate again.
2513 Since this function may be used to adjust the stack pointer, we must
2514 ensure that it cannot cause transient stack deallocation (for example
2515 by first incrementing SP and then decrementing when adjusting by a
2516 large immediate). */
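/* As an illustrative example (added commentary): an OFFSET of 0x123456
   is below 1 << 24 and is not a single MOV immediate, so it is added as
   "add ..., #0x456" followed by "add ..., #0x123, lsl #12", whereas an
   OFFSET of 0x1234567 is first moved into a temporary register and then
   added (or subtracted when negative) with a register-register ADD.  */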
2519 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
2520 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
2521 bool frame_related_p
, bool emit_move_imm
)
2523 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
2524 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
2526 HOST_WIDE_INT moffset
= abs_hwi (offset
);
2531 if (!rtx_equal_p (dest
, src
))
2533 insn
= emit_insn (gen_rtx_SET (dest
, src
));
2534 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2539 /* Single instruction adjustment. */
2540 if (aarch64_uimm12_shift (moffset
))
2542 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
2543 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2547 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2550 a) the offset cannot be loaded by a 16-bit move or
2551 b) there is no spare register into which we can move it. */
2552 if (moffset
< 0x1000000
2553 && ((!temp1
&& !can_create_pseudo_p ())
2554 || !aarch64_move_imm (moffset
, mode
)))
2556 HOST_WIDE_INT low_off
= moffset
& 0xfff;
2558 low_off
= offset
< 0 ? -low_off
: low_off
;
2559 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
2560 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2561 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
2562 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2566 /* Emit a move immediate if required and an addition/subtraction. */
2569 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
2570 temp1
= aarch64_force_temporary (mode
, temp1
, GEN_INT (moffset
));
2572 insn
= emit_insn (offset
< 0
2573 ? gen_sub3_insn (dest
, src
, temp1
)
2574 : gen_add3_insn (dest
, src
, temp1
));
2575 if (frame_related_p
)
2577 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2578 rtx adj
= plus_constant (mode
, src
, offset
);
2579 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
2583 /* Return the number of temporary registers that aarch64_add_offset
2584 would need to move OFFSET into a register or add OFFSET to a register;
2585 ADD_P is true if we want the latter rather than the former. */
2588 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
2590 /* This follows the same structure as aarch64_add_offset. */
2591 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
2594 unsigned int count
= 0;
2595 HOST_WIDE_INT factor
= offset
.coeffs
[1];
2596 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
2597 poly_int64
poly_offset (factor
, factor
);
2598 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
2599 /* Need one register for the ADDVL/ADDPL result. */
2601 else if (factor
!= 0)
2603 factor
= abs (factor
);
2604 if (factor
> 16 * (factor
& -factor
))
2605 /* Need one register for the CNT result and one for the multiplication
2606 factor. If necessary, the second temporary can be reused for the
2607 constant part of the offset. */
2609 /* Need one register for the CNT result (which might then
2613 return count
+ aarch64_add_offset_1_temporaries (constant
);
/* If X can be represented as a poly_int64, return the number
   of temporaries that are required to add it to a register.
   Return -1 otherwise.  */

aarch64_add_offset_temporaries (rtx x)
{
  poly_int64 offset;
  if (!poly_int_rtx_p (x, &offset))
    return -1;
  return aarch64_offset_temporaries (true, offset);
}
2629 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2630 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2631 be set and CFA adjustments added to the generated instructions.
2633 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2634 temporary if register allocation is already complete. This temporary
2635 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2636 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2637 false to avoid emitting the immediate again.
2639 TEMP2, if nonnull, is a second temporary register that doesn't
2640 overlap either DEST or REG.
2642 Since this function may be used to adjust the stack pointer, we must
2643 ensure that it cannot cause transient stack deallocation (for example
2644 by first incrementing SP and then decrementing when adjusting by a
2645 large immediate). */
2648 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
2649 poly_int64 offset
, rtx temp1
, rtx temp2
,
2650 bool frame_related_p
, bool emit_move_imm
= true)
2652 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
2653 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
2654 gcc_assert (temp1
== NULL_RTX
2656 || !reg_overlap_mentioned_p (temp1
, dest
));
2657 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
2659 /* Try using ADDVL or ADDPL to add the whole value. */
2660 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
2662 rtx offset_rtx
= gen_int_mode (offset
, mode
);
2663 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2664 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2668 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2669 SVE vector register, over and above the minimum size of 128 bits.
2670 This is equivalent to half the value returned by CNTD with a
2671 vector shape of ALL. */
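  /* For instance (illustrative): with 512-bit vectors there are four
     128-bit blocks, so the runtime value of poly_int64 (16, 16) is
     16 + 16 * (4 - 1) == 64, the vector length in bytes.  */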
2672 HOST_WIDE_INT factor
= offset
.coeffs
[1];
2673 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
2675 /* Try using ADDVL or ADDPL to add the VG-based part. */
2676 poly_int64
poly_offset (factor
, factor
);
2677 if (src
!= const0_rtx
2678 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
2680 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
2681 if (frame_related_p
)
2683 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2684 RTX_FRAME_RELATED_P (insn
) = true;
2689 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
2690 src
= aarch64_force_temporary (mode
, temp1
, addr
);
2695 /* Otherwise use a CNT-based sequence. */
2696 else if (factor
!= 0)
2698 /* Use a subtraction if we have a negative factor. */
2699 rtx_code code
= PLUS
;
2706 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2707 into the multiplication. */
2711 /* Use a right shift by 1. */
2715 HOST_WIDE_INT low_bit
= factor
& -factor
;
2716 if (factor
<= 16 * low_bit
)
2718 if (factor
> 16 * 8)
2720 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2721 the value with the minimum multiplier and shift it into
2723 int extra_shift
= exact_log2 (low_bit
);
2724 shift
+= extra_shift
;
2725 factor
>>= extra_shift
;
2727 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
2731 /* Use CNTD, then multiply it by FACTOR. */
2732 val
= gen_int_mode (poly_int64 (2, 2), mode
);
2733 val
= aarch64_force_temporary (mode
, temp1
, val
);
2735 /* Go back to using a negative multiplication factor if we have
2736 no register from which to subtract. */
2737 if (code
== MINUS
&& src
== const0_rtx
)
2742 rtx coeff1
= gen_int_mode (factor
, mode
);
2743 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
2744 val
= gen_rtx_MULT (mode
, val
, coeff1
);
2749 /* Multiply by 1 << SHIFT. */
2750 val
= aarch64_force_temporary (mode
, temp1
, val
);
2751 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
2753 else if (shift
== -1)
2756 val
= aarch64_force_temporary (mode
, temp1
, val
);
2757 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
2760 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2761 if (src
!= const0_rtx
)
2763 val
= aarch64_force_temporary (mode
, temp1
, val
);
2764 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
2766 else if (code
== MINUS
)
2768 val
= aarch64_force_temporary (mode
, temp1
, val
);
2769 val
= gen_rtx_NEG (mode
, val
);
2772 if (constant
== 0 || frame_related_p
)
2774 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
2775 if (frame_related_p
)
2777 RTX_FRAME_RELATED_P (insn
) = true;
2778 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
2779 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
2788 src
= aarch64_force_temporary (mode
, temp1
, val
);
2793 emit_move_imm
= true;
2796 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
2797 frame_related_p
, emit_move_imm
);
/* Like aarch64_add_offset, but the offset is given as an rtx rather
   than a poly_int64.  */

aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
			  rtx offset_rtx, rtx temp1, rtx temp2)
{
  aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
		      temp1, temp2, false);
}
/* Add DELTA to the stack pointer, marking the instructions frame-related.
   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
   if TEMP1 already contains abs (DELTA).  */

aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
		      temp1, temp2, true, emit_move_imm);
}
/* Subtract DELTA from the stack pointer, marking the instructions
   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
   if nonnull.  */

aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
		      temp1, temp2, frame_related_p);
}
/* Set DEST to (vec_series BASE STEP).  */

aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
{
  machine_mode mode = GET_MODE (dest);
  scalar_mode inner = GET_MODE_INNER (mode);

  /* Each operand can be a register or an immediate in the range [-16, 15].  */
  if (!aarch64_sve_index_immediate_p (base))
    base = force_reg (inner, base);
  if (!aarch64_sve_index_immediate_p (step))
    step = force_reg (inner, step);

  emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
}
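/* For example (illustrative): a series starting at 0 with step 1 in
   VNx4SImode can use "index z0.s, #0, #1" directly, whereas a step of
   20 lies outside [-16, 15] and is first forced into a scalar
   register.  */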
2850 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2851 integer of mode INT_MODE. Return true on success. */
2854 aarch64_expand_sve_widened_duplicate (rtx dest
, scalar_int_mode src_mode
,
2857 /* If the constant is smaller than 128 bits, we can do the move
2858 using a vector of SRC_MODEs. */
2859 if (src_mode
!= TImode
)
2861 poly_uint64 count
= exact_div (GET_MODE_SIZE (GET_MODE (dest
)),
2862 GET_MODE_SIZE (src_mode
));
2863 machine_mode dup_mode
= mode_for_vector (src_mode
, count
).require ();
2864 emit_move_insn (gen_lowpart (dup_mode
, dest
),
2865 gen_const_vec_duplicate (dup_mode
, src
));
2869 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2870 src
= force_const_mem (src_mode
, src
);
2874 /* Make sure that the address is legitimate. */
2875 if (!aarch64_sve_ld1r_operand_p (src
))
2877 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
2878 src
= replace_equiv_address (src
, addr
);
2881 machine_mode mode
= GET_MODE (dest
);
2882 unsigned int elem_bytes
= GET_MODE_UNIT_SIZE (mode
);
2883 machine_mode pred_mode
= aarch64_sve_pred_mode (elem_bytes
).require ();
2884 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
2885 src
= gen_rtx_UNSPEC (mode
, gen_rtvec (2, ptrue
, src
), UNSPEC_LD1RQ
);
2886 emit_insn (gen_rtx_SET (dest
, src
));
2890 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2891 isn't a simple duplicate or series. */
2894 aarch64_expand_sve_const_vector (rtx dest
, rtx src
)
2896 machine_mode mode
= GET_MODE (src
);
2897 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
2898 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
2899 gcc_assert (npatterns
> 1);
2901 if (nelts_per_pattern
== 1)
      /* The constant is a repeating sequence of at least two elements,
	 where the repeating elements occupy no more than 128 bits.
	 Get an integer representation of the replicated value.  */
2906 scalar_int_mode int_mode
;
2907 if (BYTES_BIG_ENDIAN
)
2908 /* For now, always use LD1RQ to load the value on big-endian
2909 targets, since the handling of smaller integers includes a
2910 subreg that is semantically an element reverse. */
2914 unsigned int int_bits
= GET_MODE_UNIT_BITSIZE (mode
) * npatterns
;
2915 gcc_assert (int_bits
<= 128);
2916 int_mode
= int_mode_for_size (int_bits
, 0).require ();
2918 rtx int_value
= simplify_gen_subreg (int_mode
, src
, mode
, 0);
2920 && aarch64_expand_sve_widened_duplicate (dest
, int_mode
, int_value
))
2924 /* Expand each pattern individually. */
2925 rtx_vector_builder builder
;
2926 auto_vec
<rtx
, 16> vectors (npatterns
);
2927 for (unsigned int i
= 0; i
< npatterns
; ++i
)
2929 builder
.new_vector (mode
, 1, nelts_per_pattern
);
2930 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
2931 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
2932 vectors
.quick_push (force_reg (mode
, builder
.build ()));
2935 /* Use permutes to interleave the separate vectors. */
2936 while (npatterns
> 1)
2939 for (unsigned int i
= 0; i
< npatterns
; ++i
)
2941 rtx tmp
= (npatterns
== 1 ? dest
: gen_reg_rtx (mode
));
2942 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
2943 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
2947 gcc_assert (vectors
[0] == dest
);
/* Set DEST to immediate IMM.  For SVE vector modes, GEN_VEC_DUPLICATE
   is a pattern that can be used to set DEST to a replicated scalar
   element.  */
2955 aarch64_expand_mov_immediate (rtx dest
, rtx imm
,
2956 rtx (*gen_vec_duplicate
) (rtx
, rtx
))
2958 machine_mode mode
= GET_MODE (dest
);
2960 /* Check on what type of symbol it is. */
2961 scalar_int_mode int_mode
;
2962 if ((GET_CODE (imm
) == SYMBOL_REF
2963 || GET_CODE (imm
) == LABEL_REF
2964 || GET_CODE (imm
) == CONST
2965 || GET_CODE (imm
) == CONST_POLY_INT
)
2966 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
2970 HOST_WIDE_INT const_offset
;
2971 enum aarch64_symbol_type sty
;
2973 /* If we have (const (plus symbol offset)), separate out the offset
2974 before we start classifying the symbol. */
2975 rtx base
= strip_offset (imm
, &offset
);
2977 /* We must always add an offset involving VL separately, rather than
2978 folding it into the relocation. */
2979 if (!offset
.is_constant (&const_offset
))
2981 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
2982 emit_insn (gen_rtx_SET (dest
, imm
));
2985 /* Do arithmetic on 32-bit values if the result is smaller
2987 if (partial_subreg_p (int_mode
, SImode
))
2989 /* It is invalid to do symbol calculations in modes
2990 narrower than SImode. */
2991 gcc_assert (base
== const0_rtx
);
2992 dest
= gen_lowpart (SImode
, dest
);
2995 if (base
!= const0_rtx
)
2997 base
= aarch64_force_temporary (int_mode
, dest
, base
);
2998 aarch64_add_offset (int_mode
, dest
, base
, offset
,
2999 NULL_RTX
, NULL_RTX
, false);
3002 aarch64_add_offset (int_mode
, dest
, base
, offset
,
3003 dest
, NULL_RTX
, false);
3008 sty
= aarch64_classify_symbol (base
, const_offset
);
3011 case SYMBOL_FORCE_TO_MEM
:
3012 if (const_offset
!= 0
3013 && targetm
.cannot_force_const_mem (int_mode
, imm
))
3015 gcc_assert (can_create_pseudo_p ());
3016 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3017 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
3018 NULL_RTX
, NULL_RTX
, false);
3022 mem
= force_const_mem (ptr_mode
, imm
);
3025 /* If we aren't generating PC relative literals, then
3026 we need to expand the literal pool access carefully.
3027 This is something that needs to be done in a number
3028 of places, so could well live as a separate function. */
3029 if (!aarch64_pcrelative_literal_loads
)
3031 gcc_assert (can_create_pseudo_p ());
3032 base
= gen_reg_rtx (ptr_mode
);
3033 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
3034 if (ptr_mode
!= Pmode
)
3035 base
= convert_memory_address (Pmode
, base
);
3036 mem
= gen_rtx_MEM (ptr_mode
, base
);
3039 if (int_mode
!= ptr_mode
)
3040 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
3042 emit_insn (gen_rtx_SET (dest
, mem
));
3046 case SYMBOL_SMALL_TLSGD
:
3047 case SYMBOL_SMALL_TLSDESC
:
3048 case SYMBOL_SMALL_TLSIE
:
3049 case SYMBOL_SMALL_GOT_28K
:
3050 case SYMBOL_SMALL_GOT_4G
:
3051 case SYMBOL_TINY_GOT
:
3052 case SYMBOL_TINY_TLSIE
:
3053 if (const_offset
!= 0)
3055 gcc_assert(can_create_pseudo_p ());
3056 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3057 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
3058 NULL_RTX
, NULL_RTX
, false);
3063 case SYMBOL_SMALL_ABSOLUTE
:
3064 case SYMBOL_TINY_ABSOLUTE
:
3065 case SYMBOL_TLSLE12
:
3066 case SYMBOL_TLSLE24
:
3067 case SYMBOL_TLSLE32
:
3068 case SYMBOL_TLSLE48
:
3069 aarch64_load_symref_appropriately (dest
, imm
, sty
);
3077 if (!CONST_INT_P (imm
))
3079 rtx base
, step
, value
;
3080 if (GET_CODE (imm
) == HIGH
3081 || aarch64_simd_valid_immediate (imm
, NULL
))
3082 emit_insn (gen_rtx_SET (dest
, imm
));
3083 else if (const_vec_series_p (imm
, &base
, &step
))
3084 aarch64_expand_vec_series (dest
, base
, step
);
3085 else if (const_vec_duplicate_p (imm
, &value
))
3087 /* If the constant is out of range of an SVE vector move,
3088 load it from memory if we can, otherwise move it into
3089 a register and use a DUP. */
3090 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
3091 rtx op
= force_const_mem (inner_mode
, value
);
3093 op
= force_reg (inner_mode
, value
);
3094 else if (!aarch64_sve_ld1r_operand_p (op
))
3096 rtx addr
= force_reg (Pmode
, XEXP (op
, 0));
3097 op
= replace_equiv_address (op
, addr
);
3099 emit_insn (gen_vec_duplicate (dest
, op
));
3101 else if (GET_CODE (imm
) == CONST_VECTOR
3102 && !GET_MODE_NUNITS (GET_MODE (imm
)).is_constant ())
3103 aarch64_expand_sve_const_vector (dest
, imm
);
3106 rtx mem
= force_const_mem (mode
, imm
);
3108 emit_move_insn (dest
, mem
);
3114 aarch64_internal_mov_immediate (dest
, imm
, true,
3115 as_a
<scalar_int_mode
> (mode
));
3118 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3119 that is known to contain PTRUE. */
3122 aarch64_emit_sve_pred_move (rtx dest
, rtx pred
, rtx src
)
3124 emit_insn (gen_rtx_SET (dest
, gen_rtx_UNSPEC (GET_MODE (dest
),
3125 gen_rtvec (2, pred
, src
),
3126 UNSPEC_MERGE_PTRUE
)));
3129 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3130 operand is in memory. In this case we need to use the predicated LD1
3131 and ST1 instead of LDR and STR, both for correctness on big-endian
3132 targets and because LD1 and ST1 support a wider range of addressing modes.
3133 PRED_MODE is the mode of the predicate.
3135 See the comment at the head of aarch64-sve.md for details about the
3136 big-endian handling. */
3139 aarch64_expand_sve_mem_move (rtx dest
, rtx src
, machine_mode pred_mode
)
3141 machine_mode mode
= GET_MODE (dest
);
3142 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
3143 if (!register_operand (src
, mode
)
3144 && !register_operand (dest
, mode
))
3146 rtx tmp
= gen_reg_rtx (mode
);
3148 aarch64_emit_sve_pred_move (tmp
, ptrue
, src
);
3150 emit_move_insn (tmp
, src
);
3153 aarch64_emit_sve_pred_move (dest
, ptrue
, src
);
3156 /* Called only on big-endian targets. See whether an SVE vector move
3157 from SRC to DEST is effectively a REV[BHW] instruction, because at
3158 least one operand is a subreg of an SVE vector that has wider or
3159 narrower elements. Return true and emit the instruction if so.
3163 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3165 represents a VIEW_CONVERT between the following vectors, viewed
3168 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3169 R1: { [0], [1], [2], [3], ... }
3171 The high part of lane X in R2 should therefore correspond to lane X*2
3172 of R1, but the register representations are:
3175 R2: ...... [1].high [1].low [0].high [0].low
3176 R1: ...... [3] [2] [1] [0]
3178 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3179 We therefore need a reverse operation to swap the high and low values
3182 This is purely an optimization. Without it we would spill the
3183 subreg operand to the stack in one mode and reload it in the
3184 other mode, which has the same effect as the REV. */
3187 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
3189 gcc_assert (BYTES_BIG_ENDIAN
);
3190 if (GET_CODE (dest
) == SUBREG
)
3191 dest
= SUBREG_REG (dest
);
3192 if (GET_CODE (src
) == SUBREG
)
3193 src
= SUBREG_REG (src
);
3195 /* The optimization handles two single SVE REGs with different element
3199 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
3200 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
3201 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
3202 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
3205 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3206 rtx ptrue
= force_reg (VNx16BImode
, CONSTM1_RTX (VNx16BImode
));
3207 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
3209 emit_insn (gen_rtx_SET (dest
, unspec
));
3213 /* Return a copy of X with mode MODE, without changing its other
3214 attributes. Unlike gen_lowpart, this doesn't care whether the
3215 mode change is valid. */
3218 aarch64_replace_reg_mode (rtx x
, machine_mode mode
)
3220 if (GET_MODE (x
) == mode
)
3223 x
= shallow_copy_rtx (x
);
3224 set_mode_and_regno (x
, mode
, REGNO (x
));
3228 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3232 aarch64_split_sve_subreg_move (rtx dest
, rtx ptrue
, rtx src
)
3234 /* Decide which REV operation we need. The mode with narrower elements
3235 determines the mode of the operands and the mode with the wider
3236 elements determines the reverse width. */
3237 machine_mode mode_with_wider_elts
= GET_MODE (dest
);
3238 machine_mode mode_with_narrower_elts
= GET_MODE (src
);
3239 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts
)
3240 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts
))
3241 std::swap (mode_with_wider_elts
, mode_with_narrower_elts
);
3243 unsigned int wider_bytes
= GET_MODE_UNIT_SIZE (mode_with_wider_elts
);
3244 unsigned int unspec
;
3245 if (wider_bytes
== 8)
3246 unspec
= UNSPEC_REV64
;
3247 else if (wider_bytes
== 4)
3248 unspec
= UNSPEC_REV32
;
3249 else if (wider_bytes
== 2)
3250 unspec
= UNSPEC_REV16
;
3253 machine_mode pred_mode
= aarch64_sve_pred_mode (wider_bytes
).require ();
3257 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3258 UNSPEC_MERGE_PTRUE))
3260 with the appropriate modes. */
3261 ptrue
= gen_lowpart (pred_mode
, ptrue
);
3262 dest
= aarch64_replace_reg_mode (dest
, mode_with_narrower_elts
);
3263 src
= aarch64_replace_reg_mode (src
, mode_with_narrower_elts
);
3264 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (1, src
), unspec
);
3265 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (2, ptrue
, src
),
3266 UNSPEC_MERGE_PTRUE
);
3267 emit_insn (gen_rtx_SET (dest
, src
));
3271 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
3272 tree exp ATTRIBUTE_UNUSED
)
3274 /* Currently, always true. */
3278 /* Implement TARGET_PASS_BY_REFERENCE. */
3281 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
3284 bool named ATTRIBUTE_UNUSED
)
3287 machine_mode dummymode
;
3290 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3291 if (mode
== BLKmode
&& type
)
3292 size
= int_size_in_bytes (type
);
3294 /* No frontends can create types with variable-sized modes, so we
3295 shouldn't be asked to pass or return them. */
3296 size
= GET_MODE_SIZE (mode
).to_constant ();
3298 /* Aggregates are passed by reference based on their size. */
3299 if (type
&& AGGREGATE_TYPE_P (type
))
3301 size
= int_size_in_bytes (type
);
3304 /* Variable sized arguments are always returned by reference. */
3308 /* Can this be a candidate to be passed in fp/simd register(s)? */
3309 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
  /* Arguments which are variable sized or larger than 2 registers are
     passed by reference unless they are a homogeneous floating point
     aggregate.  */
3317 return size
> 2 * UNITS_PER_WORD
;
3320 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3322 aarch64_return_in_msb (const_tree valtype
)
3324 machine_mode dummy_mode
;
3327 /* Never happens in little-endian mode. */
3328 if (!BYTES_BIG_ENDIAN
)
3331 /* Only composite types smaller than or equal to 16 bytes can
3332 be potentially returned in registers. */
3333 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
3334 || int_size_in_bytes (valtype
) <= 0
3335 || int_size_in_bytes (valtype
) > 16)
3338 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3339 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3340 is always passed/returned in the least significant bits of fp/simd
3342 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
3343 &dummy_mode
, &dummy_int
, NULL
))
3349 /* Implement TARGET_FUNCTION_VALUE.
3350 Define how to find the value returned by a function. */
3353 aarch64_function_value (const_tree type
, const_tree func
,
3354 bool outgoing ATTRIBUTE_UNUSED
)
3359 machine_mode ag_mode
;
3361 mode
= TYPE_MODE (type
);
3362 if (INTEGRAL_TYPE_P (type
))
3363 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
3365 if (aarch64_return_in_msb (type
))
3367 HOST_WIDE_INT size
= int_size_in_bytes (type
);
3369 if (size
% UNITS_PER_WORD
!= 0)
3371 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
3372 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
3376 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
3377 &ag_mode
, &count
, NULL
))
3379 if (!aarch64_composite_type_p (type
, mode
))
3381 gcc_assert (count
== 1 && mode
== ag_mode
);
3382 return gen_rtx_REG (mode
, V0_REGNUM
);
3389 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
3390 for (i
= 0; i
< count
; i
++)
3392 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
3393 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
3394 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3395 XVECEXP (par
, 0, i
) = tmp
;
3401 return gen_rtx_REG (mode
, R0_REGNUM
);
3404 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3405 Return true if REGNO is the number of a hard register in which the values
3406 of called function may come back. */
3409 aarch64_function_value_regno_p (const unsigned int regno
)
3411 /* Maximum of 16 bytes can be returned in the general registers. Examples
3412 of 16-byte return values are: 128-bit integers and 16-byte small
3413 structures (excluding homogeneous floating-point aggregates). */
3414 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
3417 /* Up to four fp/simd registers can return a function value, e.g. a
3418 homogeneous floating-point aggregate having four members. */
3419 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
3420 return TARGET_FLOAT
;
3425 /* Implement TARGET_RETURN_IN_MEMORY.
3427 If the type T of the result of a function is such that
3429 would require that arg be passed as a value in a register (or set of
3430 registers) according to the parameter passing rules, then the result
3431 is returned in the same registers as would be used for such an
3435 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
3438 machine_mode ag_mode
;
3441 if (!AGGREGATE_TYPE_P (type
)
3442 && TREE_CODE (type
) != COMPLEX_TYPE
3443 && TREE_CODE (type
) != VECTOR_TYPE
)
    /* Simple scalar types are always returned in registers.  */
3447 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
  /* Types larger than 2 registers are returned in memory.  */
3455 size
= int_size_in_bytes (type
);
3456 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
3460 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
3461 const_tree type
, int *nregs
)
3463 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3464 return aarch64_vfp_is_call_or_return_candidate (mode
,
3466 &pcum
->aapcs_vfp_rmode
,
3471 /* Given MODE and TYPE of a function argument, return the alignment in
3472 bits. The idea is to suppress any stronger alignment requested by
3473 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3474 This is a helper function for local use only. */
3477 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
)
3480 return GET_MODE_ALIGNMENT (mode
);
3482 if (integer_zerop (TYPE_SIZE (type
)))
3485 gcc_assert (TYPE_MODE (type
) == mode
);
3487 if (!AGGREGATE_TYPE_P (type
))
3488 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
3490 if (TREE_CODE (type
) == ARRAY_TYPE
)
3491 return TYPE_ALIGN (TREE_TYPE (type
));
3493 unsigned int alignment
= 0;
3494 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
3495 if (TREE_CODE (field
) == FIELD_DECL
)
3496 alignment
= std::max (alignment
, DECL_ALIGN (field
));
3501 /* Layout a function argument according to the AAPCS64 rules. The rule
3502 numbers refer to the rule numbers in the AAPCS64. */
3505 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
3507 bool named ATTRIBUTE_UNUSED
)
3509 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3510 int ncrn
, nvrn
, nregs
;
3511 bool allocate_ncrn
, allocate_nvrn
;
3514 /* We need to do this once per argument. */
3515 if (pcum
->aapcs_arg_processed
)
3518 pcum
->aapcs_arg_processed
= true;
3520 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3522 size
= int_size_in_bytes (type
);
3524 /* No frontends can create types with variable-sized modes, so we
3525 shouldn't be asked to pass or return them. */
3526 size
= GET_MODE_SIZE (mode
).to_constant ();
3527 size
= ROUND_UP (size
, UNITS_PER_WORD
);
3529 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
3530 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
3535 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3536 The following code thus handles passing by SIMD/FP registers first. */
3538 nvrn
= pcum
->aapcs_nvrn
;
  /* C1 - C5 for floating point, homogeneous floating point aggregates (HFA)
     and homogeneous short-vector aggregates (HVA).  */
3545 aarch64_err_no_fpadvsimd (mode
);
3547 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
3549 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
3550 if (!aarch64_composite_type_p (type
, mode
))
3552 gcc_assert (nregs
== 1);
3553 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
3559 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3560 for (i
= 0; i
< nregs
; i
++)
3562 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
3563 V0_REGNUM
+ nvrn
+ i
);
3564 rtx offset
= gen_int_mode
3565 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
3566 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3567 XVECEXP (par
, 0, i
) = tmp
;
3569 pcum
->aapcs_reg
= par
;
3575 /* C.3 NSRN is set to 8. */
3576 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
3581 ncrn
= pcum
->aapcs_ncrn
;
3582 nregs
= size
/ UNITS_PER_WORD
;
  /* C6 - C9, though the sign and zero extension semantics are
     handled elsewhere.  This is the case where the argument fits
     entirely in general registers.  */
3587 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
3590 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
3592 /* C.8 if the argument has an alignment of 16 then the NGRN is
3593 rounded up to the next even number. */
3596 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3597 comparison is there because for > 16 * BITS_PER_UNIT
3598 alignment nregs should be > 2 and therefore it should be
3599 passed by reference rather than value. */
3600 && aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
3603 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
3606 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3607 A reg is still generated for it, but the caller should be smart
3608 enough not to use it. */
3609 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
3610 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
3616 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3617 for (i
= 0; i
< nregs
; i
++)
3619 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
3620 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
3621 GEN_INT (i
* UNITS_PER_WORD
));
3622 XVECEXP (par
, 0, i
) = tmp
;
3624 pcum
->aapcs_reg
= par
;
3627 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
3632 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
3634 /* The argument is passed on stack; record the needed number of words for
3635 this argument and align the total size if necessary. */
3637 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
3639 if (aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
3640 pcum
->aapcs_stack_size
= ROUND_UP (pcum
->aapcs_stack_size
,
3641 16 / UNITS_PER_WORD
);
3645 /* Implement TARGET_FUNCTION_ARG. */
3648 aarch64_function_arg (cumulative_args_t pcum_v
, machine_mode mode
,
3649 const_tree type
, bool named
)
3651 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3652 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
3654 if (mode
== VOIDmode
)
3657 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
3658 return pcum
->aapcs_reg
;
3662 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
3663 const_tree fntype ATTRIBUTE_UNUSED
,
3664 rtx libname ATTRIBUTE_UNUSED
,
3665 const_tree fndecl ATTRIBUTE_UNUSED
,
3666 unsigned n_named ATTRIBUTE_UNUSED
)
3668 pcum
->aapcs_ncrn
= 0;
3669 pcum
->aapcs_nvrn
= 0;
3670 pcum
->aapcs_nextncrn
= 0;
3671 pcum
->aapcs_nextnvrn
= 0;
3672 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
3673 pcum
->aapcs_reg
= NULL_RTX
;
3674 pcum
->aapcs_arg_processed
= false;
3675 pcum
->aapcs_stack_words
= 0;
3676 pcum
->aapcs_stack_size
= 0;
3679 && fndecl
&& TREE_PUBLIC (fndecl
)
3680 && fntype
&& fntype
!= error_mark_node
)
3682 const_tree type
= TREE_TYPE (fntype
);
3683 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
3684 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
3685 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
3686 &mode
, &nregs
, NULL
))
3687 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
3693 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
3698 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3699 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
3701 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
3702 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
3703 != (pcum
->aapcs_stack_words
!= 0));
3704 pcum
->aapcs_arg_processed
= false;
3705 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
3706 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
3707 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
3708 pcum
->aapcs_stack_words
= 0;
3709 pcum
->aapcs_reg
= NULL_RTX
;
aarch64_function_arg_regno_p (unsigned regno)
{
  return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
	  || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
}
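/* Put differently (illustrative note): the argument registers are x0-x7
   and v0-v7, assuming the usual AAPCS64 values of NUM_ARG_REGS and
   NUM_FP_ARG_REGS (both 8).  */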
3720 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3721 PARM_BOUNDARY bits of alignment, but will be given anything up
3722 to STACK_BOUNDARY bits if the type requires it. This makes sure
3723 that both before and after the layout of each argument, the Next
3724 Stacked Argument Address (NSAA) will have a minimum alignment of
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int alignment = aarch64_function_arg_alignment (mode, type);
  return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
}
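/* For example (illustrative): a parameter of a 16-byte-aligned type such
   as __int128 yields 128 here, which keeps the NSAA 16-byte aligned and,
   via rule C.8 in aarch64_layout_arg, also rounds the NGRN up to an even
   register number.  */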
3734 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3736 static fixed_size_mode
3737 aarch64_get_reg_raw_mode (int regno
)
3739 if (TARGET_SVE
&& FP_REGNUM_P (regno
))
3740 /* Don't use the SVE part of the register for __builtin_apply and
3741 __builtin_return. The SVE registers aren't used by the normal PCS,
3742 so using them there would be a waste of time. The PCS extensions
3743 for SVE types are fundamentally incompatible with the
3744 __builtin_return/__builtin_apply interface. */
3745 return as_a
<fixed_size_mode
> (V16QImode
);
3746 return default_get_reg_raw_mode (regno
);
3749 /* Implement TARGET_FUNCTION_ARG_PADDING.
3751 Small aggregate types are placed in the lowest memory address.
3753 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3755 static pad_direction
3756 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
3758 /* On little-endian targets, the least significant byte of every stack
3759 argument is passed at the lowest byte address of the stack slot. */
3760 if (!BYTES_BIG_ENDIAN
)
3763 /* Otherwise, integral, floating-point and pointer types are padded downward:
3764 the least significant byte of a stack argument is passed at the highest
3765 byte address of the stack slot. */
3767 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
3768 || POINTER_TYPE_P (type
))
3769 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
3770 return PAD_DOWNWARD
;
3772 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3776 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3778 It specifies padding for the last (may also be the only)
3779 element of a block move between registers and memory. If
3780 assuming the block is in the memory, padding upward means that
3781 the last element is padded after its highest significant byte,
   while in downward padding, the last element is padded at its least
   significant byte side.
   Small aggregates and small complex types are always padded upwards.
3788 We don't need to worry about homogeneous floating-point or
3789 short-vector aggregates; their move is not affected by the
3790 padding direction determined here. Regardless of endianness,
3791 each element of such an aggregate is put in the least
3792 significant bits of a fp/simd register.
3794 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3795 register has useful data, and return the opposite if the most
3796 significant byte does. */
3799 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
3800 bool first ATTRIBUTE_UNUSED
)
3803 /* Small composite types are always padded upward. */
3804 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
3808 size
= int_size_in_bytes (type
);
3810 /* No frontends can create types with variable-sized modes, so we
3811 shouldn't be asked to pass or return them. */
3812 size
= GET_MODE_SIZE (mode
).to_constant ();
3813 if (size
< 2 * UNITS_PER_WORD
)
3817 /* Otherwise, use the default padding. */
3818 return !BYTES_BIG_ENDIAN
;
3821 static scalar_int_mode
3822 aarch64_libgcc_cmp_return_mode (void)
3827 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3829 /* We use the 12-bit shifted immediate arithmetic instructions so values
3830 must be multiple of (1 << 12), i.e. 4096. */
3831 #define ARITH_FACTOR 4096
3833 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3834 #error Cannot use simple address calculation for stack probing
3837 /* The pair of scratch registers used for stack probing. */
3838 #define PROBE_STACK_FIRST_REG 9
3839 #define PROBE_STACK_SECOND_REG 10
3841 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3842 inclusive. These are offsets from the current stack pointer. */
3845 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
3848 if (!poly_size
.is_constant (&size
))
3850 sorry ("stack probes for SVE frames");
3854 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
3856 /* See the same assertion on PROBE_INTERVAL above. */
3857 gcc_assert ((first
% ARITH_FACTOR
) == 0);
3859 /* See if we have a constant small number of probes to generate. If so,
3860 that's the easy case. */
3861 if (size
<= PROBE_INTERVAL
)
3863 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
3865 emit_set_insn (reg1
,
3866 plus_constant (Pmode
,
3867 stack_pointer_rtx
, -(first
+ base
)));
3868 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
3871 /* The run-time loop is made up of 8 insns in the generic case while the
3872 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
3873 else if (size
<= 4 * PROBE_INTERVAL
)
3875 HOST_WIDE_INT i
, rem
;
3877 emit_set_insn (reg1
,
3878 plus_constant (Pmode
,
3880 -(first
+ PROBE_INTERVAL
)));
3881 emit_stack_probe (reg1
);
3883 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3884 it exceeds SIZE. If only two probes are needed, this will not
3885 generate any code. Then probe at FIRST + SIZE. */
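      /* Illustrative example (assuming PROBE_INTERVAL is 4096): for a
	 SIZE of 10240 the probes land at FIRST + 4096, FIRST + 8192 and
	 finally FIRST + 10240, without needing the run-time loop used
	 for larger sizes.  */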
3886 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
3888 emit_set_insn (reg1
,
3889 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
3890 emit_stack_probe (reg1
);
3893 rem
= size
- (i
- PROBE_INTERVAL
);
3896 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
3898 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
3899 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
3902 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
3905 /* Otherwise, do the same as above, but in a loop. Note that we must be
3906 extra careful with variables wrapping around because we might be at
3907 the very top (or the very bottom) of the address space and we have
3908 to be able to handle this case properly; in particular, we use an
3909 equality test for the loop condition. */
3912 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
3914 /* Step 1: round SIZE to the previous multiple of the interval. */
3916 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
3919 /* Step 2: compute initial and final value of the loop counter. */
3921 /* TEST_ADDR = SP + FIRST. */
3922 emit_set_insn (reg1
,
3923 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
3925 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3926 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
3927 if (! aarch64_uimm12_shift (adjustment
))
3929 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
3931 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
3934 emit_set_insn (reg2
,
3935 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
3941 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3944 while (TEST_ADDR != LAST_ADDR)
3946 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3947 until it is equal to ROUNDED_SIZE. */
3949 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
3952 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3953 that SIZE is equal to ROUNDED_SIZE. */
3955 if (size
!= rounded_size
)
3957 HOST_WIDE_INT rem
= size
- rounded_size
;
3961 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
3963 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
3964 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
3967 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
3971 /* Make sure nothing is scheduled before we are done. */
3972 emit_insn (gen_blockage ());
3975 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3976 absolute addresses. */
3979 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
3981 static int labelno
= 0;
3985 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
3988 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
3990 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3992 xops
[1] = GEN_INT (PROBE_INTERVAL
);
3993 output_asm_insn ("sub\t%0, %0, %1", xops
);
3995 /* Probe at TEST_ADDR. */
3996 output_asm_insn ("str\txzr, [%0]", xops
);
3998 /* Test if TEST_ADDR == LAST_ADDR. */
4000 output_asm_insn ("cmp\t%0, %1", xops
);
4003 fputs ("\tb.ne\t", asm_out_file
);
4004 assemble_name_raw (asm_out_file
, loop_lab
);
4005 fputc ('\n', asm_out_file
);
4010 /* Determine whether a frame chain needs to be generated. */
4012 aarch64_needs_frame_chain (void)
4014 /* Force a frame chain for EH returns so the return address is at FP+8. */
4015 if (frame_pointer_needed
|| crtl
->calls_eh_return
)
4018 /* A leaf function cannot have calls or write LR. */
4019 bool is_leaf
= crtl
->is_leaf
&& !df_regs_ever_live_p (LR_REGNUM
);
  /* Don't use a frame chain in leaf functions if leaf frame pointers
     are disabled.  */
4023 if (flag_omit_leaf_frame_pointer
&& is_leaf
)
4026 return aarch64_use_frame_pointer
;
4029 /* Mark the registers that need to be saved by the callee and calculate
4030 the size of the callee-saved registers area and frame record (both FP
4031 and LR may be omitted). */
4033 aarch64_layout_frame (void)
4035 HOST_WIDE_INT offset
= 0;
4036 int regno
, last_fp_reg
= INVALID_REGNUM
;
4038 if (reload_completed
&& cfun
->machine
->frame
.laid_out
)
4041 cfun
->machine
->frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
4043 #define SLOT_NOT_REQUIRED (-2)
4044 #define SLOT_REQUIRED (-1)
  cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
  cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;

  /* First mark all the registers that really need to be saved...  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;

  /* ... that includes the eh data registers (if needed)...  */
  if (crtl->calls_eh_return)
    for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
      cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
        = SLOT_REQUIRED;

  /* ... and any callee saved register that dataflow says is live.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
        && (regno == R30_REGNUM
            || !call_used_regs[regno]))
      cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
        && !call_used_regs[regno])
      {
        cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
        last_fp_reg = regno;
      }

  if (cfun->machine->frame.emit_frame_chain)
    {
      /* FP and LR are placed in the linkage record.  */
      cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
      cfun->machine->frame.wb_candidate1 = R29_REGNUM;
      cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
      cfun->machine->frame.wb_candidate2 = R30_REGNUM;
      offset = 2 * UNITS_PER_WORD;
    }

  /* Now assign stack slots for them.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
      {
        cfun->machine->frame.reg_offset[regno] = offset;
        if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
          cfun->machine->frame.wb_candidate1 = regno;
        else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
          cfun->machine->frame.wb_candidate2 = regno;
        offset += UNITS_PER_WORD;
      }

  HOST_WIDE_INT max_int_offset = offset;
  offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
  bool has_align_gap = offset != max_int_offset;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
      {
        /* If there is an alignment gap between integer and fp callee-saves,
           allocate the last fp register to it if possible.  */
        if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
          {
            cfun->machine->frame.reg_offset[regno] = max_int_offset;
            break;
          }

        cfun->machine->frame.reg_offset[regno] = offset;
        if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
          cfun->machine->frame.wb_candidate1 = regno;
        else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
                 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
          cfun->machine->frame.wb_candidate2 = regno;
        offset += UNITS_PER_WORD;
      }

  offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);

  cfun->machine->frame.saved_regs_size = offset;

  HOST_WIDE_INT varargs_and_saved_regs_size
    = offset + cfun->machine->frame.saved_varargs_size;

  cfun->machine->frame.hard_fp_offset
    = aligned_upper_bound (varargs_and_saved_regs_size
                           + get_frame_size (),
                           STACK_BOUNDARY / BITS_PER_UNIT);

  /* Both these values are already aligned.  */
  gcc_assert (multiple_p (crtl->outgoing_args_size,
                          STACK_BOUNDARY / BITS_PER_UNIT));
  cfun->machine->frame.frame_size
    = (cfun->machine->frame.hard_fp_offset
       + crtl->outgoing_args_size);

  cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;

  cfun->machine->frame.initial_adjust = 0;
  cfun->machine->frame.final_adjust = 0;
  cfun->machine->frame.callee_adjust = 0;
  cfun->machine->frame.callee_offset = 0;

  HOST_WIDE_INT max_push_offset = 0;
  if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
    max_push_offset = 512;
  else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
    max_push_offset = 256;

  HOST_WIDE_INT const_size, const_fp_offset;
  if (cfun->machine->frame.frame_size.is_constant (&const_size)
      && const_size < max_push_offset
      && known_eq (crtl->outgoing_args_size, 0))
    {
      /* Simple, small frame with no outgoing arguments:
         stp reg1, reg2, [sp, -frame_size]!
         stp reg3, reg4, [sp, 16]  */
      cfun->machine->frame.callee_adjust = const_size;
    }
  else if (known_lt (crtl->outgoing_args_size
                     + cfun->machine->frame.saved_regs_size, 512)
           && !(cfun->calls_alloca
                && known_lt (cfun->machine->frame.hard_fp_offset,
                             max_push_offset)))
    {
      /* Frame with small outgoing arguments:
         sub sp, sp, frame_size
         stp reg1, reg2, [sp, outgoing_args_size]
         stp reg3, reg4, [sp, outgoing_args_size + 16]  */
      cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
      cfun->machine->frame.callee_offset
        = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
    }
  else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
           && const_fp_offset < max_push_offset)
    {
      /* Frame with large outgoing arguments but a small local area:
         stp reg1, reg2, [sp, -hard_fp_offset]!
         stp reg3, reg4, [sp, 16]
         sub sp, sp, outgoing_args_size  */
      cfun->machine->frame.callee_adjust = const_fp_offset;
      cfun->machine->frame.final_adjust
        = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
    }
  else
    {
      /* Frame with large local area and outgoing arguments using frame pointer:
         sub sp, sp, hard_fp_offset
         stp x29, x30, [sp, 0]
         add x29, sp, 0
         stp reg3, reg4, [sp, 16]
         sub sp, sp, outgoing_args_size  */
      cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
      cfun->machine->frame.final_adjust
        = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
    }

  cfun->machine->frame.laid_out = true;
}
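
/* Worked example (illustrative only, with assumed numbers): a function that
   saves just x19 and x20 and has 16 bytes of locals ends up with
   saved_regs_size == 16, hard_fp_offset == 32 and frame_size == 32.
   With no outgoing arguments and frame_size below max_push_offset, the
   first case above applies: callee_adjust == 32, and the whole frame is
   allocated by the single write-back push
	stp x19, x20, [sp, -32]!
   with initial_adjust == final_adjust == 0.  */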
/* Return true if the register REGNO is saved on entry to
   the current function.  */

static bool
aarch64_register_saved_on_entry (int regno)
{
  return cfun->machine->frame.reg_offset[regno] >= 0;
}

/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  */

static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}

/* Push the register number REGNO of mode MODE to the stack with write-back
   adjusting the stack by ADJUSTMENT.  */

static void
aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
                           HOST_WIDE_INT adjustment)
{
  rtx base_rtx = stack_pointer_rtx;
  rtx insn, reg, mem;

  reg = gen_rtx_REG (mode, regno);
  mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
                            plus_constant (Pmode, base_rtx, -adjustment));
  mem = gen_frame_mem (mode, mem);

  insn = emit_move_insn (mem, reg);
  RTX_FRAME_RELATED_P (insn) = 1;
}

/* Generate and return an instruction to store the pair of registers
   REG and REG2 of mode MODE to location BASE with write-back adjusting
   the stack location BASE by ADJUSTMENT.  */

static rtx
aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
                          HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case E_DImode:
      return gen_storewb_pairdi_di (base, base, reg, reg2,
                                    GEN_INT (-adjustment),
                                    GEN_INT (UNITS_PER_WORD - adjustment));
    case E_DFmode:
      return gen_storewb_pairdf_di (base, base, reg, reg2,
                                    GEN_INT (-adjustment),
                                    GEN_INT (UNITS_PER_WORD - adjustment));
    default:
      gcc_unreachable ();
    }
}

/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
   stack pointer by ADJUSTMENT.  */

static void
aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
{
  rtx_insn *insn;
  machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;

  if (regno2 == INVALID_REGNUM)
    return aarch64_pushwb_single_reg (mode, regno1, adjustment);

  rtx reg1 = gen_rtx_REG (mode, regno1);
  rtx reg2 = gen_rtx_REG (mode, regno2);

  insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
                                              reg2, adjustment));
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
  RTX_FRAME_RELATED_P (insn) = 1;
}

/* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
   adjusting it by ADJUSTMENT afterwards.  */

static rtx
aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
                         HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case E_DImode:
      return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
                                   GEN_INT (UNITS_PER_WORD));
    case E_DFmode:
      return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
                                   GEN_INT (UNITS_PER_WORD));
    default:
      gcc_unreachable ();
    }
}

/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
   afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
   into CFI_OPS.  */

static void
aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
                  rtx *cfi_ops)
{
  machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
  rtx reg1 = gen_rtx_REG (mode, regno1);

  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);

  if (regno2 == INVALID_REGNUM)
    {
      rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
      mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
      emit_move_insn (reg1, gen_frame_mem (mode, mem));
    }
  else
    {
      rtx reg2 = gen_rtx_REG (mode, regno2);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
      emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
                                          reg2, adjustment));
    }
}

/* Generate and return a store pair instruction of mode MODE to store
   register REG1 to MEM1 and register REG2 to MEM2.  */

static rtx
aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
                        rtx reg2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);

    case E_DFmode:
      return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);

    default:
      gcc_unreachable ();
    }
}

/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */

static rtx
aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
                       rtx mem2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);

    case E_DFmode:
      return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);

    default:
      gcc_unreachable ();
    }
}

/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */

bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after frame laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
     if its LR is pushed onto stack.  */
  return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
          || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
              && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
}
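
/* Illustrative note: with -msign-return-address=non-leaf the scope is
   AARCH64_FUNCTION_NON_LEAF, so the PAC/AUT hints emitted by the prologue
   and epilogue code below are only added once the frame layout shows that
   LR is actually saved; -msign-return-address=all signs every function
   regardless of whether LR is pushed.  */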
/* Emit code to save the callee-saved registers from register number START
   to LIMIT to the stack at the location starting at offset START_OFFSET,
   skipping any write-back candidates if SKIP_WB is true.  */

static void
aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
                           unsigned start, unsigned limit, bool skip_wb)
{
  rtx_insn *insn;
  unsigned regno;
  unsigned regno2;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      rtx reg, mem;
      poly_int64 offset;

      if (skip_wb
          && (regno == cfun->machine->frame.wb_candidate1
              || regno == cfun->machine->frame.wb_candidate2))
        continue;

      if (cfun->machine->reg_is_wrapped_separately[regno])
        continue;

      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
                                                offset));

      regno2 = aarch64_next_callee_save (regno + 1, limit);

      if (regno2 <= limit
          && !cfun->machine->reg_is_wrapped_separately[regno2]
          && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
              == cfun->machine->frame.reg_offset[regno2]))
        {
          rtx reg2 = gen_rtx_REG (mode, regno2);
          rtx mem2;

          offset = start_offset + cfun->machine->frame.reg_offset[regno2];
          mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
                                                     offset));
          insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
                                                    reg2));

          /* The first part of a frame-related parallel insn is
             always assumed to be relevant to the frame
             calculations; subsequent parts, are only
             frame-related if explicitly marked.  */
          RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
          regno = regno2;
        }
      else
        insn = emit_move_insn (mem, reg);

      RTX_FRAME_RELATED_P (insn) = 1;
    }
}

/* Emit code to restore the callee registers of mode MODE from register
   number START up to and including LIMIT.  Restore from the stack offset
   START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
   Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */

static void
aarch64_restore_callee_saves (machine_mode mode,
                              poly_int64 start_offset, unsigned start,
                              unsigned limit, bool skip_wb, rtx *cfi_ops)
{
  rtx base_rtx = stack_pointer_rtx;
  unsigned regno;
  unsigned regno2;
  poly_int64 offset;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      if (cfun->machine->reg_is_wrapped_separately[regno])
        continue;

      rtx reg, mem;

      if (skip_wb
          && (regno == cfun->machine->frame.wb_candidate1
              || regno == cfun->machine->frame.wb_candidate2))
        continue;

      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));

      regno2 = aarch64_next_callee_save (regno + 1, limit);

      if (regno2 <= limit
          && !cfun->machine->reg_is_wrapped_separately[regno2]
          && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
              == cfun->machine->frame.reg_offset[regno2]))
        {
          rtx reg2 = gen_rtx_REG (mode, regno2);
          rtx mem2;

          offset = start_offset + cfun->machine->frame.reg_offset[regno2];
          mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
          emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

          *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
          regno = regno2;
        }
      else
        emit_move_insn (reg, mem);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
    }
}
/* Return true if OFFSET is a signed 4-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, -8, 7));
}

/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, 0, 63));
}

/* Return true if OFFSET is a signed 7-bit value multiplied by the size
   of MODE.  */

bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, -64, 63));
}

/* Return true if OFFSET is a signed 9-bit value.  */

static inline bool
offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
                               poly_int64 offset)
{
  HOST_WIDE_INT const_offset;
  return (offset.is_constant (&const_offset)
          && IN_RANGE (const_offset, -256, 255));
}

/* Return true if OFFSET is a signed 9-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, -256, 255));
}

/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, 0, 4095));
}
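
/* Illustrative examples of the ranges above, assuming DImode (8-byte
   elements):
     aarch64_offset_7bit_signed_scaled_p (DImode, 504)  -> true  (63 * 8)
     aarch64_offset_7bit_signed_scaled_p (DImode, 512)  -> false (64 * 8)
     offset_9bit_signed_unscaled_p (DImode, -256)       -> true
     offset_12bit_unsigned_scaled_p (DImode, 32760)     -> true  (4095 * 8)
   These correspond to the LDP/STP, unscaled LDUR/STUR and scaled LDR/STR
   immediate ranges respectively.  */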
/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */

static sbitmap
aarch64_get_separate_components (void)
{
  aarch64_layout_frame ();

  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* The registers we need saved to the frame.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (aarch64_register_saved_on_entry (regno))
      {
        poly_int64 offset = cfun->machine->frame.reg_offset[regno];
        if (!frame_pointer_needed)
          offset += cfun->machine->frame.frame_size
                    - cfun->machine->frame.hard_fp_offset;
        /* Check that we can access the stack slot of the register with one
           direct load with no adjustments needed.  */
        if (offset_12bit_unsigned_scaled_p (DImode, offset))
          bitmap_set_bit (components, regno);
      }

  /* Don't mess with the hard frame pointer.  */
  if (frame_pointer_needed)
    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);

  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  /* If aarch64_layout_frame has chosen registers to store/restore with
     writeback don't interfere with them to avoid having to output explicit
     stack adjustment instructions.  */
  if (reg2 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg2);
  if (reg1 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg1);

  bitmap_clear_bit (components, LR_REGNUM);
  bitmap_clear_bit (components, SP_REGNUM);

  return components;
}

/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */

static sbitmap
aarch64_components_for_bb (basic_block bb)
{
  bitmap in = DF_LIVE_IN (bb);
  bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
  bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;

  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if ((!call_used_regs[regno])
        && (bitmap_bit_p (in, regno)
            || bitmap_bit_p (gen, regno)
            || bitmap_bit_p (kill, regno)))
      {
        unsigned regno2, offset, offset2;
        bitmap_set_bit (components, regno);

        /* If there is a callee-save at an adjacent offset, add it too
           to increase the use of LDP/STP.  */
        offset = cfun->machine->frame.reg_offset[regno];
        regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;

        if (regno2 <= LAST_SAVED_REGNUM)
          {
            offset2 = cfun->machine->frame.reg_offset[regno2];
            if ((offset & ~8) == (offset2 & ~8))
              bitmap_set_bit (components, regno2);
          }
      }

  return components;
}

/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
   Nothing to do for aarch64.  */

static void
aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
{
}

/* Return the next set bit in BMP from START onwards.  Return the total number
   of bits in BMP if no set bit is found at or after START.  */

static unsigned int
aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
{
  unsigned int nbits = SBITMAP_SIZE (bmp);
  if (start == nbits)
    return start;

  gcc_assert (start < nbits);
  for (unsigned int i = start; i < nbits; i++)
    if (bitmap_bit_p (bmp, i))
      return i;

  return nbits;
}

/* Do the work for aarch64_emit_prologue_components and
   aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
   to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
   for these components or the epilogue sequence.  That is, it determines
   whether we should emit stores or loads and what kind of CFA notes to attach
   to the insns.  Otherwise the logic for the two sequences is very
   similar.  */

static void
aarch64_process_components (sbitmap components, bool prologue_p)
{
  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
                             ? HARD_FRAME_POINTER_REGNUM
                             : STACK_POINTER_REGNUM);

  unsigned last_regno = SBITMAP_SIZE (components);
  unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
  rtx_insn *insn = NULL;

  while (regno != last_regno)
    {
      /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
         so DFmode for the vector registers is enough.  */
      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
      rtx reg = gen_rtx_REG (mode, regno);
      poly_int64 offset = cfun->machine->frame.reg_offset[regno];
      if (!frame_pointer_needed)
        offset += cfun->machine->frame.frame_size
                  - cfun->machine->frame.hard_fp_offset;
      rtx addr = plus_constant (Pmode, ptr_reg, offset);
      rtx mem = gen_frame_mem (mode, addr);

      rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
      unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
      /* No more registers to handle after REGNO.
         Emit a single save/restore and exit.  */
      if (regno2 == last_regno)
        {
          insn = emit_insn (set);
          RTX_FRAME_RELATED_P (insn) = 1;
          if (prologue_p)
            add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
          else
            add_reg_note (insn, REG_CFA_RESTORE, reg);
          break;
        }

      poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
      /* The next register is not of the same class or its offset is not
         mergeable with the current one into a pair.  */
      if (!satisfies_constraint_Ump (mem)
          || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
          || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
                       GET_MODE_SIZE (mode)))
        {
          insn = emit_insn (set);
          RTX_FRAME_RELATED_P (insn) = 1;
          if (prologue_p)
            add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
          else
            add_reg_note (insn, REG_CFA_RESTORE, reg);

          regno = regno2;
          continue;
        }

      /* REGNO2 can be saved/restored in a pair with REGNO.  */
      rtx reg2 = gen_rtx_REG (mode, regno2);
      if (!frame_pointer_needed)
        offset2 += cfun->machine->frame.frame_size
                   - cfun->machine->frame.hard_fp_offset;
      rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
      rtx mem2 = gen_frame_mem (mode, addr2);
      rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
                            : gen_rtx_SET (reg2, mem2);

      if (prologue_p)
        insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
      else
        insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

      RTX_FRAME_RELATED_P (insn) = 1;
      if (prologue_p)
        {
          add_reg_note (insn, REG_CFA_OFFSET, set);
          add_reg_note (insn, REG_CFA_OFFSET, set2);
        }
      else
        {
          add_reg_note (insn, REG_CFA_RESTORE, reg);
          add_reg_note (insn, REG_CFA_RESTORE, reg2);
        }

      regno = aarch64_get_next_set_bit (components, regno2 + 1);
    }
}

/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */

static void
aarch64_emit_prologue_components (sbitmap components)
{
  aarch64_process_components (components, true);
}

/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */

static void
aarch64_emit_epilogue_components (sbitmap components)
{
  aarch64_process_components (components, false);
}

/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */

static void
aarch64_set_handled_components (sbitmap components)
{
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (bitmap_bit_p (components, regno))
      cfun->machine->reg_is_wrapped_separately[regno] = true;
}

/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
   is saved at BASE + OFFSET.  */

static void
aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
                            rtx base, poly_int64 offset)
{
  rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
  add_reg_note (insn, REG_CFA_EXPRESSION,
                gen_rtx_SET (mem, regno_reg_rtx[reg]));
}

/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|                               |
	|  incoming stack arguments     |
	|                               |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	|                               |
	+-------------------------------+
	|  local variables              | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+
	|  padding0                     | \
	+-------------------------------+  |
	|  callee-saved registers       |  | frame.saved_regs_size
	+-------------------------------+  |
	|  LR'                          |  |
	+-------------------------------+  |
	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  padding                      |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	|                               |
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx
   unchanged.  */
/* Generate the prologue instructions for entry into a function.
   Establish the stack frame by decreasing the stack pointer with a
   properly calculated size and, if necessary, create a frame record
   filled with the values of LR and previous frame pointer.  The
   current FP is also set up if it is in use.  */

void
aarch64_expand_prologue (void)
{
  aarch64_layout_frame ();

  poly_int64 frame_size = cfun->machine->frame.frame_size;
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
  rtx_insn *insn;

  /* Sign return address for functions.  */
  if (aarch64_return_address_signing_enabled ())
    {
      insn = emit_insn (gen_pacisp ());
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  if (flag_stack_usage_info)
    current_function_static_stack_size = constant_lower_bound (frame_size);

  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
    {
      if (crtl->is_leaf && !cfun->calls_alloca)
        {
          if (maybe_gt (frame_size, PROBE_INTERVAL)
              && maybe_gt (frame_size, get_stack_check_protect ()))
            aarch64_emit_probe_stack_range (get_stack_check_protect (),
                                            (frame_size
                                             - get_stack_check_protect ()));
        }
      else if (maybe_gt (frame_size, 0))
        aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
    }

  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);

  aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);

  if (callee_adjust != 0)
    aarch64_push_regs (reg1, reg2, callee_adjust);

  if (emit_frame_chain)
    {
      poly_int64 reg_offset = callee_adjust;
      if (callee_adjust == 0)
        {
          reg1 = R29_REGNUM;
          reg2 = R30_REGNUM;
          reg_offset = callee_offset;
          aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
        }
      aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
                          stack_pointer_rtx, callee_offset,
                          ip1_rtx, ip0_rtx, frame_pointer_needed);
      if (frame_pointer_needed && !frame_size.is_constant ())
        {
          /* Variable-sized frames need to describe the save slot
             address using DW_CFA_expression rather than DW_CFA_offset.
             This means that, without taking further action, the
             locations of the registers that we've already saved would
             remain based on the stack pointer even after we redefine
             the CFA based on the frame pointer.  We therefore need new
             DW_CFA_expressions to re-express the save slots with addresses
             based on the frame pointer.  */
          rtx_insn *insn = get_last_insn ();
          gcc_assert (RTX_FRAME_RELATED_P (insn));

          /* Add an explicit CFA definition if this was previously
             implicit.  */
          if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
            {
              rtx src = plus_constant (Pmode, stack_pointer_rtx,
                                       callee_offset);
              add_reg_note (insn, REG_CFA_ADJUST_CFA,
                            gen_rtx_SET (hard_frame_pointer_rtx, src));
            }

          /* Change the save slot expressions for the registers that
             we've already saved.  */
          reg_offset -= callee_offset;
          aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
                                      reg_offset + UNITS_PER_WORD);
          aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
                                      reg_offset);
        }
      emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
    }

  aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
                             callee_adjust != 0 || emit_frame_chain);
  aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
                             callee_adjust != 0 || emit_frame_chain);
  aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
}
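
/* Illustrative sketch (assumed numbers, not generated verbatim by this
   code): for a frame that needs a frame chain, fits entirely in
   callee_adjust and has no outgoing arguments, the sequence built above
   is roughly
	stp	x29, x30, [sp, -32]!	// aarch64_push_regs
	mov	x29, sp			// aarch64_add_offset for the chain
	stp	x19, x20, [sp, 16]	// aarch64_save_callee_saves
   assuming x19/x20 are the only other callee-saves; larger or
   variable-sized frames instead go through initial_adjust and
   final_adjust as described in aarch64_layout_frame.  */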
/* Return TRUE if we can use a simple_return insn.

   This function checks whether the callee saved stack is empty, which
   means no restore actions are need.  The pro_and_epilogue will use
   this to check whether shrink-wrapping opt is feasible.  */

bool
aarch64_use_return_insn_p (void)
{
  if (!reload_completed)
    return false;

  if (crtl->profile)
    return false;

  aarch64_layout_frame ();

  return known_eq (cfun->machine->frame.frame_size, 0);
}

/* Generate the epilogue instructions for returning from a function.
   This is almost exactly the reverse of the prolog sequence, except
   that we need to insert barriers to avoid scheduling loads that read
   from a deallocated stack, and we optimize the unwind records by
   emitting them all together if possible.  */
void
aarch64_expand_epilogue (bool for_sibcall)
{
  aarch64_layout_frame ();

  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  rtx cfi_ops = NULL;
  rtx_insn *insn;
  /* A stack clash protection prologue may not have left IP0_REGNUM or
     IP1_REGNUM in a usable state.  The same is true for allocations
     with an SVE component, since we then need both temporary registers
     for each allocation.  */
  bool can_inherit_p = (initial_adjust.is_constant ()
                        && final_adjust.is_constant ()
                        && !flag_stack_clash_protection);

  /* We need to add memory barrier to prevent read from deallocated stack.  */
  bool need_barrier_p
    = maybe_ne (get_frame_size ()
                + cfun->machine->frame.saved_varargs_size, 0);

  /* Emit a barrier to prevent loads from a deallocated stack.  */
  if (maybe_gt (final_adjust, crtl->outgoing_args_size)
      || cfun->calls_alloca
      || crtl->calls_eh_return)
    {
      emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
      need_barrier_p = false;
    }

  /* Restore the stack pointer from the frame pointer if it may not
     be the same as the stack pointer.  */
  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
  if (frame_pointer_needed
      && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
    /* If writeback is used when restoring callee-saves, the CFA
       is restored on the instruction doing the writeback.  */
    aarch64_add_offset (Pmode, stack_pointer_rtx,
                        hard_frame_pointer_rtx, -callee_offset,
                        ip1_rtx, ip0_rtx, callee_adjust == 0);
  else
    aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
                    !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));

  aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
                                callee_adjust != 0, &cfi_ops);
  aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
                                callee_adjust != 0, &cfi_ops);

  if (need_barrier_p)
    emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));

  if (callee_adjust != 0)
    aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);

  if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
    {
      /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
      insn = get_last_insn ();
      rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
      REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
      RTX_FRAME_RELATED_P (insn) = 1;
      cfi_ops = NULL;
    }

  aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
                  !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));

  if (cfi_ops)
    {
      /* Emit delayed restores and reset the CFA to be SP.  */
      insn = get_last_insn ();
      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
      REG_NOTES (insn) = cfi_ops;
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* We prefer to emit the combined return/authenticate instruction RETAA,
     however there are three cases in which we must instead emit an explicit
     authentication instruction.

	1) Sibcalls don't return in a normal way, so if we're about to call one
	   we must authenticate.

	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
	   generating code for !TARGET_ARMV8_3 we can't use it and must
	   explicitly authenticate.

	3) On an eh_return path we make extra stack adjustments to update the
	   canonical frame address to be the exception handler's CFA.  We want
	   to authenticate using the CFA of the function which calls eh_return.
	   */
  if (aarch64_return_address_signing_enabled ()
      && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
    {
      insn = emit_insn (gen_autisp ());
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Stack adjustment for exception handler.  */
  if (crtl->calls_eh_return)
    {
      /* We need to unwind the stack by the offset computed by
         EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
         to be SP; letting the CFA move during this adjustment
         is just as correct as retaining the CFA from the body
         of the function.  Therefore, do nothing special.  */
      emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
    }

  emit_use (gen_rtx_REG (DImode, LR_REGNUM));
  if (!for_sibcall)
    emit_jump_insn (ret_rtx);
}
/* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
   normally or return to a previous frame after unwinding.

   An EH return uses a single shared return sequence.  The epilogue is
   exactly like a normal epilogue except that it has an extra input
   register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
   that must be applied after the frame has been destroyed.  An extra label
   is inserted before the epilogue which initializes this register to zero,
   and this is the entry point for a normal return.

   An actual EH return updates the return address, initializes the stack
   adjustment and jumps directly into the epilogue (bypassing the zeroing
   of the adjustment).  Since the return address is typically saved on the
   stack when a function makes a call, the saved LR must be updated outside
   the epilogue.

   This poses problems as the store is generated well before the epilogue,
   so the offset of LR is not known yet.  Also optimizations will remove the
   store as it appears dead, even after the epilogue is generated (as the
   base or offset for loading LR is different in many cases).

   To avoid these problems this implementation forces the frame pointer
   in eh_return functions so that the location of LR is fixed and known early.
   It also marks the store volatile, so no optimization is permitted to
   remove the store.  */
rtx
aarch64_eh_return_handler_rtx (void)
{
  rtx tmp = gen_frame_mem (Pmode,
    plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));

  /* Mark the store volatile, so no optimization is permitted to remove it.  */
  MEM_VOLATILE_P (tmp) = true;
  return tmp;
}

/* Output code to add DELTA to the first argument, and then jump
   to FUNCTION.  Used for C++ multiple inheritance.  */
static void
aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
                         HOST_WIDE_INT delta,
                         HOST_WIDE_INT vcall_offset,
                         tree function)
{
  /* The this pointer is always in x0.  Note that this differs from
     Arm where the this pointer may be bumped to r1 if r0 is required
     to return a pointer to an aggregate.  On AArch64 a result value
     pointer will be in x8.  */
  int this_regno = R0_REGNUM;
  rtx this_rtx, temp0, temp1, addr, funexp;
  rtx_insn *insn;

  reload_completed = 1;
  emit_note (NOTE_INSN_PROLOGUE_END);

  this_rtx = gen_rtx_REG (Pmode, this_regno);
  temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
  temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);

  if (vcall_offset == 0)
    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
  else
    {
      gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);

      addr = this_rtx;
      if (delta != 0)
        {
          if (delta >= -256 && delta < 256)
            addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
                                       plus_constant (Pmode, this_rtx, delta));
          else
            aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
                                temp1, temp0, false);
        }

      if (Pmode == ptr_mode)
        aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
      else
        aarch64_emit_move (temp0,
                           gen_rtx_ZERO_EXTEND (Pmode,
                                                gen_rtx_MEM (ptr_mode, addr)));

      if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
        addr = plus_constant (Pmode, temp0, vcall_offset);
      else
        {
          aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
                                          Pmode);
          addr = gen_rtx_PLUS (Pmode, temp0, temp1);
        }

      if (Pmode == ptr_mode)
        aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
      else
        aarch64_emit_move (temp1,
                           gen_rtx_SIGN_EXTEND (Pmode,
                                                gen_rtx_MEM (ptr_mode, addr)));

      emit_insn (gen_add2_insn (this_rtx, temp1));
    }

  /* Generate a tail call to the target function.  */
  if (!TREE_USED (function))
    {
      assemble_external (function);
      TREE_USED (function) = 1;
    }
  funexp = XEXP (DECL_RTL (function), 0);
  funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
  insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
  SIBLING_CALL_P (insn) = 1;

  insn = get_insns ();
  shorten_branches (insn);
  final_start_function (insn, file, 1);
  final (insn, file, 1);
  final_end_function ();

  /* Stop pretending to be a post-reload pass.  */
  reload_completed = 0;
}
static bool
aarch64_tls_referenced_p (rtx x)
{
  if (!TARGET_HAVE_TLS)
    return false;
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    {
      const_rtx x = *iter;
      if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
        return true;
      /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
         TLS offsets, not real symbol references.  */
      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
        iter.skip_subrtxes ();
    }
  return false;
}

/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
bool
aarch64_uimm12_shift (HOST_WIDE_INT val)
{
  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
          || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
          );
}

/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction.  */
static bool
aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
          || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
        return 1;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
          || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
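
/* Illustrative examples for the two immediate forms above:
     aarch64_uimm12_shift (0xfff)     -> true  (add/sub #imm)
     aarch64_uimm12_shift (0xfff000)  -> true  (add/sub #imm, lsl #12)
     aarch64_uimm12_shift (0x1001)    -> false
     aarch64_movw_imm (0xffff0000, ...)  -> true  (MOVZ ..., lsl #16)
     aarch64_movw_imm (0x12345678, ...)  -> false (needs MOVZ plus MOVK)  */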
/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
   64-bit (DImode) integer.  */

static unsigned HOST_WIDE_INT
aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
{
  unsigned int size = GET_MODE_UNIT_PRECISION (mode);
  while (size < 64)
    {
      val &= (HOST_WIDE_INT_1U << size) - 1;
      val |= val << size;
      size *= 2;
    }
  return val;
}
/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,
    0x0001000100010001ull,
    0x0101010101010101ull,
    0x1111111111111111ull,
    0x5555555555555555ull,
  };


/* Return true if val is a valid bitmask immediate.  */

bool
aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
{
  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
  int bits;

  /* Check for a single sequence of one bits and return quickly if so.
     The special cases of all ones and all zeroes returns false.  */
  val = aarch64_replicate_bitmask_imm (val_in, mode);
  tmp = val + (val & -val);

  if (tmp == (tmp & -tmp))
    return (val + 1) > 1;

  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
  if (mode == SImode)
    val = (val << 32) | (val & 0xffffffff);

  /* Invert if the immediate doesn't start with a zero bit - this means we
     only need to search for sequences of one bits.  */
  if (val & 1)
    val = ~val;

  /* Find the first set bit and set tmp to val with the first sequence of one
     bits removed.  Return success if there is a single sequence of ones.  */
  first_one = val & -val;
  tmp = val & (val + first_one);
  if (tmp == 0)
    return true;

  /* Find the next set bit and compute the difference in bit position.  */
  next_one = tmp & -tmp;
  bits = clz_hwi (first_one) - clz_hwi (next_one);
  mask = val ^ tmp;

  /* Check the bit position difference is a power of 2, and that the first
     sequence of one bits fits within 'bits' bits.  */
  if ((mask >> bits) != 0 || bits != (bits & -bits))
    return false;

  /* Check the sequence of one bits is repeated 64/bits times.  */
  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
}
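
/* Illustrative examples for the bitmask check above:
     0x00ff00ff00ff00ff  -> true: the 16-bit element 0x00ff (a single run
			    of 8 ones) is repeated 4 times across 64 bits.
     0x0000ffff0000ffff  -> true: 32-bit element 0x0000ffff repeated twice.
     0x1234123412341234  -> false: the repeated element is not a single
			    contiguous (possibly rotated) run of ones.
     0 and ~0            -> false: rejected by the quick check above.  */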
/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
   Assumed precondition: VAL_IN is not zero.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
{
  int lowest_bit_set = ctz_hwi (val_in);
  int highest_bit_set = floor_log2 (val_in);
  gcc_assert (val_in != 0);

  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
          (HOST_WIDE_INT_1U << lowest_bit_set));
}

/* Create constant where bits outside of lowest bit set to highest bit set
   are set to 1.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
{
  return val_in | ~aarch64_and_split_imm1 (val_in);
}

/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */

bool
aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_bitmask_imm (val_in, int_mode))
    return false;

  if (aarch64_move_imm (val_in, int_mode))
    return false;

  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);

  return aarch64_bitmask_imm (imm2, int_mode);
}
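
/* Illustrative example for the split above: 0xff000ff0 is neither a valid
   bitmask immediate (two separate runs of ones) nor a MOVZ/MOVN immediate.
   aarch64_and_split_imm1 (0xff000ff0) == 0xfffffff0 (one run covering bits
   4-31) and aarch64_and_split_imm2 (0xff000ff0) == 0xffffffffff000fff (all
   ones except bits 12-23); both are valid bitmask immediates and their AND
   is 0xff000ff0 again, so the original AND can be expanded into two
   AND-immediate instructions.  */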
/* Return true if val is an immediate that can be loaded into a
   register in a single instruction.  */
bool
aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
    return 1;
  return aarch64_bitmask_imm (val, int_mode);
}

static bool
aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
  rtx base, offset;

  if (GET_CODE (x) == HIGH)
    return true;

  /* There's no way to calculate VL-based values using relocations.  */
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    if (GET_CODE (*iter) == CONST_POLY_INT)
      return true;

  split_const (x, &base, &offset);
  if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
    {
      if (aarch64_classify_symbol (base, INTVAL (offset))
          != SYMBOL_FORCE_TO_MEM)
        return true;
      else
        /* Avoid generating a 64-bit relocation in ILP32; leave
           to aarch64_expand_mov_immediate to handle it properly.  */
        return mode != ptr_mode;
    }

  return aarch64_tls_referenced_p (x);
}

/* Implement TARGET_CASE_VALUES_THRESHOLD.
   The expansion for a table switch is quite expensive due to the number
   of instructions, the table lookup and hard to predict indirect jump.
   When optimizing for speed, and -O3 enabled, use the per-core tuning if
   set, otherwise use tables for > 16 cases as a tradeoff between size and
   performance.  When optimizing for size, use the default setting.  */

static unsigned int
aarch64_case_values_threshold (void)
{
  /* Use the specified limit for the number of cases before using jump
     tables at higher optimization levels.  */
  if (optimize > 2
      && selected_cpu->tune->max_case_values != 0)
    return selected_cpu->tune->max_case_values;
  else
    return optimize_size ? default_case_values_threshold () : 17;
}

/* Return true if register REGNO is a valid index register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_index_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
        return true;

      if (!reg_renumber)
        return false;

      regno = reg_renumber[regno];
    }
  return GP_REGNUM_P (regno);
}

/* Return true if register REGNO is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_base_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
        return true;

      if (!reg_renumber)
        return false;

      regno = reg_renumber[regno];
    }

  /* The fake registers will be eliminated to either the stack or
     hard frame pointer, both of which are usually valid base registers.
     Reload deals with the cases where the eliminated form isn't valid.  */
  return (GP_REGNUM_P (regno)
          || regno == SP_REGNUM
          || regno == FRAME_POINTER_REGNUM
          || regno == ARG_POINTER_REGNUM);
}

/* Return true if X is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_base_register_rtx_p (rtx x, bool strict_p)
{
  if (!strict_p
      && GET_CODE (x) == SUBREG
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
    x = SUBREG_REG (x);

  return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
}
/* Return true if address offset is a valid index.  If it is, fill in INFO
   appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_classify_index (struct aarch64_address_info *info, rtx x,
                        machine_mode mode, bool strict_p)
{
  enum aarch64_address_type type;
  rtx index;
  int shift;

  /* (reg:P) */
  if ((REG_P (x) || GET_CODE (x) == SUBREG)
      && GET_MODE (x) == Pmode)
    {
      type = ADDRESS_REG_REG;
      index = x;
      shift = 0;
    }
  /* (sign_extend:DI (reg:SI)) */
  else if ((GET_CODE (x) == SIGN_EXTEND
            || GET_CODE (x) == ZERO_EXTEND)
           && GET_MODE (x) == DImode
           && GET_MODE (XEXP (x, 0)) == SImode)
    {
      type = (GET_CODE (x) == SIGN_EXTEND)
        ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (x, 0);
      shift = 0;
    }
  /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
  else if (GET_CODE (x) == MULT
           && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
               || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
           && GET_MODE (XEXP (x, 0)) == DImode
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
        ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
           && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
               || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
           && GET_MODE (XEXP (x, 0)) == DImode
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
        ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (x, 1));
    }
  /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
            || GET_CODE (x) == ZERO_EXTRACT)
           && GET_MODE (x) == DImode
           && GET_CODE (XEXP (x, 0)) == MULT
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
           && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
        ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      if (INTVAL (XEXP (x, 1)) != 32 + shift
          || INTVAL (XEXP (x, 2)) != 0)
        shift = -1;
    }
  /* (and:DI (mult:DI (reg:DI) (const_int scale))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
           && GET_MODE (x) == DImode
           && GET_CODE (XEXP (x, 0)) == MULT
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
        shift = -1;
    }
  /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
            || GET_CODE (x) == ZERO_EXTRACT)
           && GET_MODE (x) == DImode
           && GET_CODE (XEXP (x, 0)) == ASHIFT
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
           && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
        ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != 32 + shift
          || INTVAL (XEXP (x, 2)) != 0)
        shift = -1;
    }
  /* (and:DI (ashift:DI (reg:DI) (const_int shift))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
           && GET_MODE (x) == DImode
           && GET_CODE (XEXP (x, 0)) == ASHIFT
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
        shift = -1;
    }
  /* (mult:P (reg:P) (const_int scale)) */
  else if (GET_CODE (x) == MULT
           && GET_MODE (x) == Pmode
           && GET_MODE (XEXP (x, 0)) == Pmode
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:P (reg:P) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
           && GET_MODE (x) == Pmode
           && GET_MODE (XEXP (x, 0)) == Pmode
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = INTVAL (XEXP (x, 1));
    }
  else
    return false;

  if (!strict_p
      && GET_CODE (index) == SUBREG
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
    index = SUBREG_REG (index);

  if (aarch64_sve_data_mode_p (mode))
    {
      if (type != ADDRESS_REG_REG
          || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
        return false;
    }
  else
    {
      if (shift != 0
          && !(IN_RANGE (shift, 1, 3)
               && known_eq (1 << shift, GET_MODE_SIZE (mode))))
        return false;
    }

  if (REG_P (index)
      && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
    {
      info->type = type;
      info->offset = index;
      info->shift = shift;
      return true;
    }

  return false;
}
/* Return true if MODE is one of the modes for which we
   support LDP/STP operations.  */

static bool
aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
{
  return mode == SImode || mode == DImode
         || mode == SFmode || mode == DFmode
         || (aarch64_vector_mode_supported_p (mode)
             && (known_eq (GET_MODE_SIZE (mode), 8)
                 || (known_eq (GET_MODE_SIZE (mode), 16)
                     && (aarch64_tune_params.extra_tuning_flags
                         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
}

/* Return true if REGNO is a virtual pointer register, or an eliminable
   "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
   include stack_pointer or hard_frame_pointer.  */
static bool
virt_or_elim_regno_p (unsigned regno)
{
  return ((regno >= FIRST_VIRTUAL_REGISTER
           && regno <= LAST_VIRTUAL_POINTER_REGISTER)
          || regno == FRAME_POINTER_REGNUM
          || regno == ARG_POINTER_REGNUM);
}
/* Return true if X is a valid address of type TYPE for machine mode MODE.
   If it is, fill in INFO appropriately.  STRICT_P is true if
   REG_OK_STRICT is in effect.  */

static bool
aarch64_classify_address (struct aarch64_address_info *info,
                          rtx x, machine_mode mode, bool strict_p,
                          aarch64_addr_query_type type = ADDR_QUERY_M)
{
  enum rtx_code code = GET_CODE (x);
  rtx op0, op1;
  poly_int64 offset;

  HOST_WIDE_INT const_size;

  /* On BE, we use load/store pair for all large int mode load/stores.
     TI/TFmode may also use a load/store pair.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
  bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
                            || mode == TImode
                            || mode == TFmode
                            || (BYTES_BIG_ENDIAN && advsimd_struct_p));

  bool allow_reg_index_p = (!load_store_pair_p
                            && (known_lt (GET_MODE_SIZE (mode), 16)
                                || vec_flags == VEC_ADVSIMD
                                || vec_flags == VEC_SVE_DATA));

  /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
     [Rn, #offset, MUL VL].  */
  if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
      && (code != REG && code != PLUS))
    return false;

  /* On LE, for AdvSIMD, don't support anything other than POST_INC or
     REG addressing.  */
  if (advsimd_struct_p
      && !BYTES_BIG_ENDIAN
      && (code != POST_INC && code != REG))
    return false;

  gcc_checking_assert (GET_MODE (x) == VOIDmode
                       || SCALAR_INT_MODE_P (GET_MODE (x)));

  switch (code)
    {
    case REG:
    case SUBREG:
      info->type = ADDRESS_REG_IMM;
      info->base = x;
      info->offset = const0_rtx;
      info->const_offset = 0;
      return aarch64_base_register_rtx_p (x, strict_p);

    case PLUS:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (! strict_p
          && REG_P (op0)
          && virt_or_elim_regno_p (REGNO (op0))
          && poly_int_rtx_p (op1, &offset))
        {
          info->type = ADDRESS_REG_IMM;
          info->base = op0;
          info->offset = op1;
          info->const_offset = offset;

          return true;
        }

      if (maybe_ne (GET_MODE_SIZE (mode), 0)
          && aarch64_base_register_rtx_p (op0, strict_p)
          && poly_int_rtx_p (op1, &offset))
        {
          info->type = ADDRESS_REG_IMM;
          info->base = op0;
          info->offset = op1;
          info->const_offset = offset;

          /* TImode and TFmode values are allowed in both pairs of X
             registers and individual Q registers.  The available
             address modes are:
             X,X: 7-bit signed scaled offset
             Q:   9-bit signed offset
             We conservatively require an offset representable in either mode.
             When performing the check for pairs of X registers i.e. LDP/STP
             pass down DImode since that is the natural size of the LDP/STP
             instruction memory accesses.  */
          if (mode == TImode || mode == TFmode)
            return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
                    && (offset_9bit_signed_unscaled_p (mode, offset)
                        || offset_12bit_unsigned_scaled_p (mode, offset)));

          /* A 7bit offset check because OImode will emit a ldp/stp
             instruction (only big endian will get here).
             For ldp/stp instructions, the offset is scaled for the size of a
             single element of the pair.  */
          if (mode == OImode)
            return aarch64_offset_7bit_signed_scaled_p (TImode, offset);

          /* Three 9/12 bit offsets checks because CImode will emit three
             ldr/str instructions (only big endian will get here).  */
          if (mode == CImode)
            return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
                    && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
                        || offset_12bit_unsigned_scaled_p (V16QImode,
                                                           offset + 32)));

          /* Two 7bit offsets checks because XImode will emit two ldp/stp
             instructions (only big endian will get here).  */
          if (mode == XImode)
            return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
                    && aarch64_offset_7bit_signed_scaled_p (TImode,
                                                            offset + 32));

          /* Make "m" use the LD1 offset range for SVE data modes, so
             that pre-RTL optimizers like ivopts will work to that
             instead of the wider LDR/STR range.  */
          if (vec_flags == VEC_SVE_DATA)
            return (type == ADDR_QUERY_M
                    ? offset_4bit_signed_scaled_p (mode, offset)
                    : offset_9bit_signed_scaled_p (mode, offset));

          if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
            {
              poly_int64 end_offset = (offset
                                       + GET_MODE_SIZE (mode)
                                       - BYTES_PER_SVE_VECTOR);
              return (type == ADDR_QUERY_M
                      ? offset_4bit_signed_scaled_p (mode, offset)
                      : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
                         && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
                                                         end_offset)));
            }

          if (vec_flags == VEC_SVE_PRED)
            return offset_9bit_signed_scaled_p (mode, offset);

          if (load_store_pair_p)
            return ((known_eq (GET_MODE_SIZE (mode), 4)
                     || known_eq (GET_MODE_SIZE (mode), 8)
                     || known_eq (GET_MODE_SIZE (mode), 16))
                    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
          else
            return (offset_9bit_signed_unscaled_p (mode, offset)
                    || offset_12bit_unsigned_scaled_p (mode, offset));
        }

      if (allow_reg_index_p)
        {
          /* Look for base + (scaled/extended) index register.  */
          if (aarch64_base_register_rtx_p (op0, strict_p)
              && aarch64_classify_index (info, op1, mode, strict_p))
            {
              info->base = op0;
              return true;
            }
          if (aarch64_base_register_rtx_p (op1, strict_p)
              && aarch64_classify_index (info, op0, mode, strict_p))
            {
              info->base = op1;
              return true;
            }
        }

      return false;

    case POST_INC:
    case POST_DEC:
    case PRE_INC:
    case PRE_DEC:
      info->type = ADDRESS_REG_WB;
      info->base = XEXP (x, 0);
      info->offset = NULL_RTX;
      return aarch64_base_register_rtx_p (info->base, strict_p);

    case POST_MODIFY:
    case PRE_MODIFY:
      info->type = ADDRESS_REG_WB;
      info->base = XEXP (x, 0);
      if (GET_CODE (XEXP (x, 1)) == PLUS
          && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
          && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
          && aarch64_base_register_rtx_p (info->base, strict_p))
        {
          info->offset = XEXP (XEXP (x, 1), 1);
          info->const_offset = offset;

          /* TImode and TFmode values are allowed in both pairs of X
             registers and individual Q registers.  The available
             address modes are:
             X,X: 7-bit signed scaled offset
             Q:   9-bit signed offset
             We conservatively require an offset representable in either mode.
           */
          if (mode == TImode || mode == TFmode)
            return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
                    && offset_9bit_signed_unscaled_p (mode, offset));

          if (load_store_pair_p)
            return ((known_eq (GET_MODE_SIZE (mode), 4)
                     || known_eq (GET_MODE_SIZE (mode), 8)
                     || known_eq (GET_MODE_SIZE (mode), 16))
                    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
          else
            return offset_9bit_signed_unscaled_p (mode, offset);
        }
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      /* load literal: pc-relative constant pool entry.  Only supported
         for SI mode or larger.  */
      info->type = ADDRESS_SYMBOLIC;

      if (!load_store_pair_p
          && GET_MODE_SIZE (mode).is_constant (&const_size)
          && const_size >= 4)
        {
          rtx sym, addend;

          split_const (x, &sym, &addend);
          return ((GET_CODE (sym) == LABEL_REF
                   || (GET_CODE (sym) == SYMBOL_REF
                       && CONSTANT_POOL_ADDRESS_P (sym)
                       && aarch64_pcrelative_literal_loads)));
        }
      return false;

    case LO_SUM:
      info->type = ADDRESS_LO_SUM;
      info->base = XEXP (x, 0);
      info->offset = XEXP (x, 1);
      if (allow_reg_index_p
          && aarch64_base_register_rtx_p (info->base, strict_p))
        {
          rtx sym, offs;
          split_const (info->offset, &sym, &offs);
          if (GET_CODE (sym) == SYMBOL_REF
              && (aarch64_classify_symbol (sym, INTVAL (offs))
                  == SYMBOL_SMALL_ABSOLUTE))
            {
              /* The symbol and offset must be aligned to the access size.  */
              unsigned int align;

              if (CONSTANT_POOL_ADDRESS_P (sym))
                align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
              else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
                {
                  tree exp = SYMBOL_REF_DECL (sym);
                  align = TYPE_ALIGN (TREE_TYPE (exp));
                  align = aarch64_constant_alignment (exp, align);
                }
              else if (SYMBOL_REF_DECL (sym))
                align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
              else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
                       && SYMBOL_REF_BLOCK (sym) != NULL)
                align = SYMBOL_REF_BLOCK (sym)->alignment;
              else
                align = BITS_PER_UNIT;

              poly_int64 ref_size = GET_MODE_SIZE (mode);
              if (known_eq (ref_size, 0))
                ref_size = GET_MODE_SIZE (DImode);

              return (multiple_p (INTVAL (offs), ref_size)
                      && multiple_p (align / BITS_PER_UNIT, ref_size));
            }
        }
      return false;

    default:
      return false;
    }
}
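
/* Illustrative examples of the classification above (assuming DImode
   accesses and suitable alignment):
     (reg:DI ...)                              -> ADDRESS_REG_IMM, offset 0
     (plus:DI (reg:DI ...) (const_int 16))     -> ADDRESS_REG_IMM (within the
						  12-bit scaled LDR/STR range)
     (post_inc:DI (reg:DI ...))                -> ADDRESS_REG_WB
     (lo_sum:DI (reg:DI ...) (symbol_ref ...)) -> ADDRESS_LO_SUM, used together
						  with a preceding ADRP for
						  small-model absolute symbols.  */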
6004 /* Return true if the address X is valid for a PRFM instruction.
6005 STRICT_P is true if we should do strict checking with
6006 aarch64_classify_address. */
6009 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
6011 struct aarch64_address_info addr
;
6013 /* PRFM accepts the same addresses as DImode... */
6014 bool res
= aarch64_classify_address (&addr
, x
, DImode
, strict_p
);
6018 /* ... except writeback forms. */
6019 return addr
.type
!= ADDRESS_REG_WB
;
bool
aarch64_symbolic_address_p (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
}

/* Classify the base of symbolic expression X.  */

enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return aarch64_classify_symbol (x, INTVAL (offset));
}
/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  */

static bool
aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p);
}

/* Return TRUE if X is a legitimate address of type TYPE for accessing
   memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
			      aarch64_addr_query_type type)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p, type);
}
6064 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6067 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
6068 poly_int64 orig_offset
,
6072 if (GET_MODE_SIZE (mode
).is_constant (&size
))
6074 HOST_WIDE_INT const_offset
, second_offset
;
6076 /* A general SVE offset is A * VQ + B. Remove the A component from
6077 coefficient 0 in order to get the constant B. */
6078 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
6080 /* Split an out-of-range address displacement into a base and
6081 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6082 range otherwise to increase opportunities for sharing the base
6083 address of different sizes. Unaligned accesses use the signed
6084 9-bit range, TImode/TFmode use the intersection of signed
6085 scaled 7-bit and signed 9-bit offset. */
6086 if (mode
== TImode
|| mode
== TFmode
)
6087 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
6088 else if ((const_offset
& (size
- 1)) != 0)
6089 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
6091 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
6093 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
6096 /* Split the offset into second_offset and the rest. */
6097 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
6098 *offset2
= gen_int_mode (second_offset
, Pmode
);
6103 /* Get the mode we should use as the basis of the range. For structure
6104 modes this is the mode of one vector. */
6105 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
6106 machine_mode step_mode
6107 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
6109 /* Get the "mul vl" multiplier we'd like to use. */
6110 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
6111 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
6112 if (vec_flags
& VEC_SVE_DATA
)
    /* LDR supports a 9-bit range, but the move patterns for
       structure modes require all vectors to be in range of the
       same base.  The simplest way of accommodating that while still
       promoting reuse of anchor points between different modes is
       to use an 8-bit range unconditionally.  */
6118 vnum
= ((vnum
+ 128) & 255) - 128;
6120 /* Predicates are only handled singly, so we might as well use
6122 vnum
= ((vnum
+ 256) & 511) - 256;
6126 /* Convert the "mul vl" multiplier into a byte offset. */
6127 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
6128 if (known_eq (second_offset
, orig_offset
))
6131 /* Split the offset into second_offset and the rest. */
6132 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
6133 *offset2
= gen_int_mode (second_offset
, Pmode
);
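/* Worked example (illustrative only, not used by the port): the
   TImode/TFmode case above keeps a low part that is a multiple of 8 in
   [-0x100, 0xf8], so it satisfies both the LDP/STP scaled 7-bit
   constraint (scaled by 8) and the signed 9-bit LDR/STR constraint.
   For instance a displacement of 0x2a4 splits into a base adjustment of
   0x204 and a residual offset of 0xa0.  */

static inline long long
example_ti_tf_second_offset (long long const_offset)
{
  return ((const_offset + 0x100) & 0x1f8) - 0x100;	/* 0x2a4 -> 0xa0.  */
}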
6138 /* Return the binary representation of floating point constant VALUE in INTVAL.
6139 If the value cannot be converted, return false without setting INTVAL.
6140 The conversion is done in the given MODE. */
6142 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
6145 /* We make a general exception for 0. */
6146 if (aarch64_float_const_zero_rtx_p (value
))
6152 scalar_float_mode mode
;
6153 if (GET_CODE (value
) != CONST_DOUBLE
6154 || !is_a
<scalar_float_mode
> (GET_MODE (value
), &mode
)
6155 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
6156 /* Only support up to DF mode. */
6157 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
6160 unsigned HOST_WIDE_INT ival
= 0;
6163 real_to_target (res
,
6164 CONST_DOUBLE_REAL_VALUE (value
),
6165 REAL_MODE_FORMAT (mode
));
6169 int order
= BYTES_BIG_ENDIAN
? 1 : 0;
6170 ival
= zext_hwi (res
[order
], 32);
6171 ival
|= (zext_hwi (res
[1 - order
], 32) << 32);
6174 ival
= zext_hwi (res
[0], 32);
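/* For reference, outside the compiler the same bit-level reinterpretation
   of a floating-point value can be written with a memcpy-based type pun;
   this stand-alone sketch is illustrative only and plays no part in the
   port.  */

static inline unsigned long long
example_double_to_bits (double d)
{
  unsigned long long bits = 0;
  __builtin_memcpy (&bits, &d, sizeof (bits));	/* Well-defined type pun.  */
  return bits;
}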
6180 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6181 single MOV(+MOVK) followed by an FMOV. */
6183 aarch64_float_const_rtx_p (rtx x
)
6185 machine_mode mode
= GET_MODE (x
);
6186 if (mode
== VOIDmode
)
6189 /* Determine whether it's cheaper to write float constants as
6190 mov/movk pairs over ldr/adrp pairs. */
6191 unsigned HOST_WIDE_INT ival
;
6193 if (GET_CODE (x
) == CONST_DOUBLE
6194 && SCALAR_FLOAT_MODE_P (mode
)
6195 && aarch64_reinterpret_float_as_int (x
, &ival
))
6197 scalar_int_mode imode
= (mode
== HFmode
6199 : int_mode_for_mode (mode
).require ());
6200 int num_instr
= aarch64_internal_mov_immediate
6201 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
6202 return num_instr
< 3;
6208 /* Return TRUE if rtx X is immediate constant 0.0 */
6210 aarch64_float_const_zero_rtx_p (rtx x
)
6212 if (GET_MODE (x
) == VOIDmode
)
6215 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
6216 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
6217 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
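/* Illustrative note: -0.0 compares equal to 0.0 under the usual FP
   comparison even though its bit pattern differs, which is why the check
   above treats the minus-zero case separately when signed zeros are
   honoured.  A plain-C sketch of a bit-level test, assuming IEEE
   binary64, might look like this (not used by the port).  */

static inline bool
example_is_negative_zero (double d)
{
  unsigned long long bits = 0;
  __builtin_memcpy (&bits, &d, sizeof (bits));
  return bits == 0x8000000000000000ULL;		/* Sign bit only.  */
}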
6220 /* Return TRUE if rtx X is immediate constant that fits in a single
6221 MOVI immediate operation. */
6223 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
6229 scalar_int_mode imode
;
6230 unsigned HOST_WIDE_INT ival
;
6232 if (GET_CODE (x
) == CONST_DOUBLE
6233 && SCALAR_FLOAT_MODE_P (mode
))
6235 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
6238 /* We make a general exception for 0. */
6239 if (aarch64_float_const_zero_rtx_p (x
))
6242 imode
= int_mode_for_mode (mode
).require ();
6244 else if (GET_CODE (x
) == CONST_INT
6245 && is_a
<scalar_int_mode
> (mode
, &imode
))
6250 /* use a 64 bit mode for everything except for DI/DF mode, where we use
6251 a 128 bit vector mode. */
6252 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
6254 vmode
= aarch64_simd_container_mode (imode
, width
);
6255 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
6257 return aarch64_simd_valid_immediate (v_op
, NULL
);
6261 /* Return the fixed registers used for condition codes. */
6264 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
6267 *p2
= INVALID_REGNUM
;
6271 /* This function is used by the call expanders of the machine description.
6272 RESULT is the register in which the result is returned. It's NULL for
6273 "call" and "sibcall".
6274 MEM is the location of the function call.
6275 SIBCALL indicates whether this function call is normal call or sibling call.
6276 It will generate different pattern accordingly. */
6279 aarch64_expand_call (rtx result
, rtx mem
, bool sibcall
)
6281 rtx call
, callee
, tmp
;
6285 gcc_assert (MEM_P (mem
));
6286 callee
= XEXP (mem
, 0);
6287 mode
= GET_MODE (callee
);
6288 gcc_assert (mode
== Pmode
);
6290 /* Decide if we should generate indirect calls by loading the
6291 address of the callee into a register before performing
6292 the branch-and-link. */
6293 if (SYMBOL_REF_P (callee
)
6294 ? (aarch64_is_long_call_p (callee
)
6295 || aarch64_is_noplt_call_p (callee
))
6297 XEXP (mem
, 0) = force_reg (mode
, callee
);
6299 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
6301 if (result
!= NULL_RTX
)
6302 call
= gen_rtx_SET (result
, call
);
6307 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
6309 vec
= gen_rtvec (2, call
, tmp
);
6310 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
6312 aarch64_emit_call_insn (call
);
6315 /* Emit call insn with PAT and do aarch64-specific handling. */
6318 aarch64_emit_call_insn (rtx pat
)
6320 rtx insn
= emit_call_insn (pat
);
6322 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
6323 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
6324 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
6328 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
6330 /* All floating point compares return CCFP if it is an equality
6331 comparison, and CCFPE otherwise. */
6332 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
6359 /* Equality comparisons of short modes against zero can be performed
6360 using the TST instruction with the appropriate bitmask. */
6361 if (y
== const0_rtx
&& REG_P (x
)
6362 && (code
== EQ
|| code
== NE
)
6363 && (GET_MODE (x
) == HImode
|| GET_MODE (x
) == QImode
))
6366 /* Similarly, comparisons of zero_extends from shorter modes can
6367 be performed using an ANDS with an immediate mask. */
6368 if (y
== const0_rtx
&& GET_CODE (x
) == ZERO_EXTEND
6369 && (GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6370 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
6371 && (code
== EQ
|| code
== NE
))
6374 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6376 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
6377 && (GET_CODE (x
) == PLUS
|| GET_CODE (x
) == MINUS
|| GET_CODE (x
) == AND
6378 || GET_CODE (x
) == NEG
6379 || (GET_CODE (x
) == ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
6380 && CONST_INT_P (XEXP (x
, 2)))))
6383 /* A compare with a shifted operand. Because of canonicalization,
6384 the comparison will have to be swapped when we emit the assembly
6386 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6387 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
6388 && (GET_CODE (x
) == ASHIFT
|| GET_CODE (x
) == ASHIFTRT
6389 || GET_CODE (x
) == LSHIFTRT
6390 || GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
))
6393 /* Similarly for a negated operand, but we can only do this for
6395 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6396 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
6397 && (code
== EQ
|| code
== NE
)
6398 && GET_CODE (x
) == NEG
)
6401 /* A test for unsigned overflow. */
6402 if ((GET_MODE (x
) == DImode
|| GET_MODE (x
) == TImode
)
6404 && GET_CODE (x
) == PLUS
6405 && GET_CODE (y
) == ZERO_EXTEND
)
6408 /* For everything else, return CCmode. */
6413 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
6416 aarch64_get_condition_code (rtx x
)
6418 machine_mode mode
= GET_MODE (XEXP (x
, 0));
6419 enum rtx_code comp_code
= GET_CODE (x
);
6421 if (GET_MODE_CLASS (mode
) != MODE_CC
)
6422 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
6423 return aarch64_get_condition_code_1 (mode
, comp_code
);
6427 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
6435 case GE
: return AARCH64_GE
;
6436 case GT
: return AARCH64_GT
;
6437 case LE
: return AARCH64_LS
;
6438 case LT
: return AARCH64_MI
;
6439 case NE
: return AARCH64_NE
;
6440 case EQ
: return AARCH64_EQ
;
6441 case ORDERED
: return AARCH64_VC
;
6442 case UNORDERED
: return AARCH64_VS
;
6443 case UNLT
: return AARCH64_LT
;
6444 case UNLE
: return AARCH64_LE
;
6445 case UNGT
: return AARCH64_HI
;
6446 case UNGE
: return AARCH64_PL
;
6454 case NE
: return AARCH64_NE
;
6455 case EQ
: return AARCH64_EQ
;
6456 case GE
: return AARCH64_GE
;
6457 case GT
: return AARCH64_GT
;
6458 case LE
: return AARCH64_LE
;
6459 case LT
: return AARCH64_LT
;
6460 case GEU
: return AARCH64_CS
;
6461 case GTU
: return AARCH64_HI
;
6462 case LEU
: return AARCH64_LS
;
6463 case LTU
: return AARCH64_CC
;
6471 case NE
: return AARCH64_NE
;
6472 case EQ
: return AARCH64_EQ
;
6473 case GE
: return AARCH64_LE
;
6474 case GT
: return AARCH64_LT
;
6475 case LE
: return AARCH64_GE
;
6476 case LT
: return AARCH64_GT
;
6477 case GEU
: return AARCH64_LS
;
6478 case GTU
: return AARCH64_CC
;
6479 case LEU
: return AARCH64_CS
;
6480 case LTU
: return AARCH64_HI
;
6488 case NE
: return AARCH64_NE
;
6489 case EQ
: return AARCH64_EQ
;
6490 case GE
: return AARCH64_PL
;
6491 case LT
: return AARCH64_MI
;
6499 case NE
: return AARCH64_NE
;
6500 case EQ
: return AARCH64_EQ
;
6508 case NE
: return AARCH64_CS
;
6509 case EQ
: return AARCH64_CC
;
6522 aarch64_const_vec_all_same_in_range_p (rtx x
,
6523 HOST_WIDE_INT minval
,
6524 HOST_WIDE_INT maxval
)
6527 return (const_vec_duplicate_p (x
, &elt
)
6528 && CONST_INT_P (elt
)
6529 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
6533 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
6535 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
6538 /* Return true if VEC is a constant in which every element is in the range
6539 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6542 aarch64_const_vec_all_in_range_p (rtx vec
,
6543 HOST_WIDE_INT minval
,
6544 HOST_WIDE_INT maxval
)
6546 if (GET_CODE (vec
) != CONST_VECTOR
6547 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
6551 if (!CONST_VECTOR_STEPPED_P (vec
))
6552 nunits
= const_vector_encoded_nelts (vec
);
6553 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
6556 for (int i
= 0; i
< nunits
; i
++)
6558 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
6559 if (!CONST_INT_P (vec_elem
)
6560 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N,	/* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V,	/* VC, V == 0.  */
  0,		/* HI, C == 1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z,	/* GT, Z == 0 && N == V.  */
  0,		/* LE, !(Z == 0 && N == V).  */
  0,		/* AL, any.  */
  0		/* NV, any.  */
};
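/* Minimal sketch (illustrative only) of how a 4-bit NZCV immediate from
   the table above decomposes into individual flags, using the
   AARCH64_CC_* bit assignments defined above.  */

static inline void
example_decode_nzcv (int nzcv, bool *n, bool *z, bool *c, bool *v)
{
  *n = (nzcv & AARCH64_CC_N) != 0;
  *z = (nzcv & AARCH64_CC_Z) != 0;
  *c = (nzcv & AARCH64_CC_C) != 0;
  *v = (nzcv & AARCH64_CC_V) != 0;
}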
6593 /* Print floating-point vector immediate operand X to F, negating it
6594 first if NEGATE is true. Return true on success, false if it isn't
6595 a constant we can handle. */
6598 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
6602 if (!const_vec_duplicate_p (x
, &elt
))
6605 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
6607 r
= real_value_negate (&r
);
6609 /* We only handle the SVE single-bit immediates here. */
6610 if (real_equal (&r
, &dconst0
))
6611 asm_fprintf (f
, "0.0");
6612 else if (real_equal (&r
, &dconst1
))
6613 asm_fprintf (f
, "1.0");
6614 else if (real_equal (&r
, &dconsthalf
))
6615 asm_fprintf (f
, "0.5");
/* Return the equivalent letter for size.  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
6636 /* Print operand X to file F in a target specific manner according to CODE.
6637 The acceptable formatting commands given by CODE are:
6638 'c': An integer or symbol address without a preceding #
6640 'C': Take the duplicated element in a vector constant
6641 and print it in hex.
6642 'D': Take the duplicated element in a vector constant
6643 and print it as an unsigned integer, in decimal.
6644 'e': Print the sign/zero-extend size as a character 8->b,
6646 'p': Prints N such that 2^N == X (X must be power of 2 and
6648 'P': Print the number of non-zero bits in X (a const_int).
6649 'H': Print the higher numbered register of a pair (TImode)
6651 'm': Print a condition (eq, ne, etc).
6652 'M': Same as 'm', but invert condition.
6653 'N': Take the duplicated element in a vector constant
6654 and print the negative of it in decimal.
6655 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6656 'S/T/U/V': Print a FP/SIMD register name for a register list.
6657 The register printed is the FP/SIMD register name
6658 of X + 0/1/2/3 for S/T/U/V.
6659 'R': Print a scalar FP/SIMD register name + 1.
6660 'X': Print bottom 16 bits of integer constant in hex.
6661 'w/x': Print a general register name or the zero register
6663 '0': Print a normal operand, if it's a general register,
6664 then we assume DImode.
6665 'k': Print NZCV for conditional compare instructions.
6666 'A': Output address constant representing the first
6667 argument of X, specifying a relocation offset
6669 'L': Output constant address specified by X
6670 with a relocation offset if appropriate.
6671 'G': Prints address of X, specifying a PC relative
6672 relocation mode if appropriate.
6673 'y': Output address of LDP or STP - this is used for
6674 some LDP/STPs which don't use a PARALLEL in their
6675 pattern (so the mode needs to be adjusted).
6676 'z': Output address of a typical LDP or STP. */
6679 aarch64_print_operand (FILE *f
, rtx x
, int code
)
6685 switch (GET_CODE (x
))
6688 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
6692 output_addr_const (f
, x
);
6696 if (GET_CODE (XEXP (x
, 0)) == PLUS
6697 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
6699 output_addr_const (f
, x
);
6705 output_operand_lossage ("unsupported operand for code '%c'", code
);
6713 if (!CONST_INT_P (x
)
6714 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
6716 output_operand_lossage ("invalid operand for '%%%c'", code
);
6732 output_operand_lossage ("invalid operand for '%%%c'", code
);
6742 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
6744 output_operand_lossage ("invalid operand for '%%%c'", code
);
6748 asm_fprintf (f
, "%d", n
);
6753 if (!CONST_INT_P (x
))
6755 output_operand_lossage ("invalid operand for '%%%c'", code
);
6759 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
6763 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
6765 output_operand_lossage ("invalid operand for '%%%c'", code
);
6769 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
6776 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6777 if (x
== const_true_rtx
)
6784 if (!COMPARISON_P (x
))
6786 output_operand_lossage ("invalid operand for '%%%c'", code
);
6790 cond_code
= aarch64_get_condition_code (x
);
6791 gcc_assert (cond_code
>= 0);
6793 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
6794 fputs (aarch64_condition_codes
[cond_code
], f
);
6799 if (!const_vec_duplicate_p (x
, &elt
))
6801 output_operand_lossage ("invalid vector constant");
6805 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
6806 asm_fprintf (f
, "%wd", -INTVAL (elt
));
6807 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
6808 && aarch64_print_vector_float_operand (f
, x
, true))
6812 output_operand_lossage ("invalid vector constant");
6822 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6824 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6827 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
6834 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6836 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6839 asm_fprintf (f
, "%c%d",
6840 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
6841 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
6845 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6847 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6850 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
6854 if (!CONST_INT_P (x
))
6856 output_operand_lossage ("invalid operand for '%%%c'", code
);
6859 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
6864 /* Print a replicated constant in hex. */
6865 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
6867 output_operand_lossage ("invalid operand for '%%%c'", code
);
6870 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
6871 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
6877 /* Print a replicated constant in decimal, treating it as
6879 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
6881 output_operand_lossage ("invalid operand for '%%%c'", code
);
6884 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
6885 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
6892 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
6894 asm_fprintf (f
, "%czr", code
);
6898 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
6900 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
6904 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
6906 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
6915 output_operand_lossage ("missing operand");
6919 switch (GET_CODE (x
))
6922 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
6924 if (REG_NREGS (x
) == 1)
6925 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
6929 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
6930 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
6931 REGNO (x
) - V0_REGNUM
, suffix
,
6932 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
6936 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
6940 output_address (GET_MODE (x
), XEXP (x
, 0));
6945 output_addr_const (asm_out_file
, x
);
6949 asm_fprintf (f
, "%wd", INTVAL (x
));
6953 if (!VECTOR_MODE_P (GET_MODE (x
)))
6955 output_addr_const (asm_out_file
, x
);
6961 if (!const_vec_duplicate_p (x
, &elt
))
6963 output_operand_lossage ("invalid vector constant");
6967 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
6968 asm_fprintf (f
, "%wd", INTVAL (elt
));
6969 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
6970 && aarch64_print_vector_float_operand (f
, x
, false))
6974 output_operand_lossage ("invalid vector constant");
6980 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6981 be getting CONST_DOUBLEs holding integers. */
6982 gcc_assert (GET_MODE (x
) != VOIDmode
);
6983 if (aarch64_float_const_zero_rtx_p (x
))
6988 else if (aarch64_float_const_representable_p (x
))
6991 char float_buf
[buf_size
] = {'\0'};
6992 real_to_decimal_for_mode (float_buf
,
6993 CONST_DOUBLE_REAL_VALUE (x
),
6996 asm_fprintf (asm_out_file
, "%s", float_buf
);
7000 output_operand_lossage ("invalid constant");
7003 output_operand_lossage ("invalid operand");
7009 if (GET_CODE (x
) == HIGH
)
7012 switch (aarch64_classify_symbolic_expression (x
))
7014 case SYMBOL_SMALL_GOT_4G
:
7015 asm_fprintf (asm_out_file
, ":got:");
7018 case SYMBOL_SMALL_TLSGD
:
7019 asm_fprintf (asm_out_file
, ":tlsgd:");
7022 case SYMBOL_SMALL_TLSDESC
:
7023 asm_fprintf (asm_out_file
, ":tlsdesc:");
7026 case SYMBOL_SMALL_TLSIE
:
7027 asm_fprintf (asm_out_file
, ":gottprel:");
7030 case SYMBOL_TLSLE24
:
7031 asm_fprintf (asm_out_file
, ":tprel:");
7034 case SYMBOL_TINY_GOT
:
7041 output_addr_const (asm_out_file
, x
);
7045 switch (aarch64_classify_symbolic_expression (x
))
7047 case SYMBOL_SMALL_GOT_4G
:
7048 asm_fprintf (asm_out_file
, ":lo12:");
7051 case SYMBOL_SMALL_TLSGD
:
7052 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
7055 case SYMBOL_SMALL_TLSDESC
:
7056 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
7059 case SYMBOL_SMALL_TLSIE
:
7060 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
7063 case SYMBOL_TLSLE12
:
7064 asm_fprintf (asm_out_file
, ":tprel_lo12:");
7067 case SYMBOL_TLSLE24
:
7068 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
7071 case SYMBOL_TINY_GOT
:
7072 asm_fprintf (asm_out_file
, ":got:");
7075 case SYMBOL_TINY_TLSIE
:
7076 asm_fprintf (asm_out_file
, ":gottprel:");
7082 output_addr_const (asm_out_file
, x
);
7086 switch (aarch64_classify_symbolic_expression (x
))
7088 case SYMBOL_TLSLE24
:
7089 asm_fprintf (asm_out_file
, ":tprel_hi12:");
7094 output_addr_const (asm_out_file
, x
);
7099 HOST_WIDE_INT cond_code
;
7101 if (!CONST_INT_P (x
))
7103 output_operand_lossage ("invalid operand for '%%%c'", code
);
7107 cond_code
= INTVAL (x
);
7108 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
7109 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
7116 machine_mode mode
= GET_MODE (x
);
7118 if (GET_CODE (x
) != MEM
7119 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
7121 output_operand_lossage ("invalid operand for '%%%c'", code
);
7126 /* LDP/STP which uses a single double-width memory operand.
7127 Adjust the mode to appear like a typical LDP/STP.
7128 Currently this is supported for 16-byte accesses only. */
7131 if (!aarch64_print_ldpstp_address (f
, mode
, XEXP (x
, 0)))
7132 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
7137 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
7142 /* Print address 'x' of a memory access with mode 'mode'.
7143 'op' is the context required by aarch64_classify_address. It can either be
7144 MEM for a normal memory access or PARALLEL for LDP/STP. */
7146 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
7147 aarch64_addr_query_type type
)
7149 struct aarch64_address_info addr
;
7152 /* Check all addresses are Pmode - including ILP32. */
7153 if (GET_MODE (x
) != Pmode
)
7154 output_operand_lossage ("invalid address mode");
7156 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
7159 case ADDRESS_REG_IMM
:
7160 if (known_eq (addr
.const_offset
, 0))
7161 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
7162 else if (aarch64_sve_data_mode_p (mode
))
7165 = exact_div (addr
.const_offset
,
7166 BYTES_PER_SVE_VECTOR
).to_constant ();
7167 asm_fprintf (f
, "[%s, #%wd, mul vl]",
7168 reg_names
[REGNO (addr
.base
)], vnum
);
7170 else if (aarch64_sve_pred_mode_p (mode
))
7173 = exact_div (addr
.const_offset
,
7174 BYTES_PER_SVE_PRED
).to_constant ();
7175 asm_fprintf (f
, "[%s, #%wd, mul vl]",
7176 reg_names
[REGNO (addr
.base
)], vnum
);
7179 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
7180 INTVAL (addr
.offset
));
7183 case ADDRESS_REG_REG
:
7184 if (addr
.shift
== 0)
7185 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
7186 reg_names
[REGNO (addr
.offset
)]);
7188 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
7189 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
7192 case ADDRESS_REG_UXTW
:
7193 if (addr
.shift
== 0)
7194 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
7195 REGNO (addr
.offset
) - R0_REGNUM
);
7197 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
7198 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
7201 case ADDRESS_REG_SXTW
:
7202 if (addr
.shift
== 0)
7203 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
7204 REGNO (addr
.offset
) - R0_REGNUM
);
7206 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
7207 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
7210 case ADDRESS_REG_WB
:
7211 /* Writeback is only supported for fixed-width modes. */
7212 size
= GET_MODE_SIZE (mode
).to_constant ();
7213 switch (GET_CODE (x
))
7216 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
7219 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
7222 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
7225 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
7228 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
7229 INTVAL (addr
.offset
));
7232 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
7233 INTVAL (addr
.offset
));
7240 case ADDRESS_LO_SUM
:
7241 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
7242 output_addr_const (f
, addr
.offset
);
7243 asm_fprintf (f
, "]");
7246 case ADDRESS_SYMBOLIC
:
7247 output_addr_const (f
, x
);
/* Print address 'x' of a LDP/STP with mode 'mode'.  */
static bool
aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
{
  return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
}

/* Print address 'x' of a memory access with mode 'mode'.  */
static void
aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
{
  if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
    output_addr_const (f, x);
}
7270 aarch64_label_mentioned_p (rtx x
)
7275 if (GET_CODE (x
) == LABEL_REF
)
7278 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7279 referencing instruction, but they are constant offsets, not
7281 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
7284 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
7285 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
7291 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
7292 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
7295 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
/* Implement REGNO_REG_CLASS.  */

enum reg_class
aarch64_regno_regclass (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return GENERAL_REGS;

  if (regno == SP_REGNUM)
    return STACK_REG;

  if (regno == FRAME_POINTER_REGNUM
      || regno == ARG_POINTER_REGNUM)
    return POINTER_REGS;

  if (FP_REGNUM_P (regno))
    return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;

  if (PR_REGNUM_P (regno))
    return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;

  return NO_REGS;
}
/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
   If OFFSET is out of range, return an offset of an anchor point
   that is in range.  Return 0 otherwise.  */

static HOST_WIDE_INT
aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
		       machine_mode mode)
{
  /* Does it look like we'll need a 16-byte load/store-pair operation?  */
  if (size > 16)
    return (offset + 0x400) & ~0x7f0;

  /* For offsets that aren't a multiple of the access size, the limit is
     -256...255.  */
  if (offset & (size - 1))
    {
      /* BLKmode typically uses LDP of X-registers.  */
      if (mode == BLKmode)
	return (offset + 512) & ~0x3ff;
      return (offset + 0x100) & ~0x1ff;
    }

  /* Small negative offsets are supported.  */
  if (IN_RANGE (offset, -256, 0))
    return 0;

  if (mode == TImode || mode == TFmode)
    return (offset + 0x100) & ~0x1ff;

  /* Use 12-bit offset by access size.  */
  return offset & (~0xfff * size);
}
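/* Worked example (illustrative, not used by the port): for a 4-byte
   access at offset 0x12345 the final 12-bit scaled case above yields an
   anchor of 0x10000, leaving a residual of 0x2345 for the LDR/STR
   immediate, which must be at most 0xfff * 4 == 0x3ffc.  */

static inline long long
example_anchor_12bit_scaled (long long offset, long long size)
{
  return offset & (~0xfffLL * size);	/* e.g. (0x12345, 4) -> 0x10000.  */
}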
7360 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
7362 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7363 where mask is selected by alignment and size of the offset.
7364 We try to pick as large a range for the offset as possible to
7365 maximize the chance of a CSE. However, for aligned addresses
7366 we limit the range to 4k so that structures with different sized
7367 elements are likely to use the same base. We need to be careful
7368 not to split a CONST for some forms of address expression, otherwise
7369 it will generate sub-optimal code. */
7371 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
7373 rtx base
= XEXP (x
, 0);
7374 rtx offset_rtx
= XEXP (x
, 1);
7375 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
7377 if (GET_CODE (base
) == PLUS
)
7379 rtx op0
= XEXP (base
, 0);
7380 rtx op1
= XEXP (base
, 1);
7382 /* Force any scaling into a temp for CSE. */
7383 op0
= force_reg (Pmode
, op0
);
7384 op1
= force_reg (Pmode
, op1
);
7386 /* Let the pointer register be in op0. */
7387 if (REG_POINTER (op1
))
7388 std::swap (op0
, op1
);
7390 /* If the pointer is virtual or frame related, then we know that
7391 virtual register instantiation or register elimination is going
7392 to apply a second constant. We want the two constants folded
7393 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7394 if (virt_or_elim_regno_p (REGNO (op0
)))
7396 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
7397 NULL_RTX
, true, OPTAB_DIRECT
);
7398 return gen_rtx_PLUS (Pmode
, base
, op1
);
7401 /* Otherwise, in order to encourage CSE (and thence loop strength
7402 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7403 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
7404 NULL_RTX
, true, OPTAB_DIRECT
);
7405 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
7409 if (GET_MODE_SIZE (mode
).is_constant (&size
))
7411 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
7413 if (base_offset
!= 0)
7415 base
= plus_constant (Pmode
, base
, base_offset
);
7416 base
= force_operand (base
, NULL_RTX
);
7417 return plus_constant (Pmode
, base
, offset
- base_offset
);
7425 /* Return the reload icode required for a constant pool in mode. */
7426 static enum insn_code
7427 aarch64_constant_pool_reload_icode (machine_mode mode
)
7432 return CODE_FOR_aarch64_reload_movcpsfdi
;
7435 return CODE_FOR_aarch64_reload_movcpdfdi
;
7438 return CODE_FOR_aarch64_reload_movcptfdi
;
7441 return CODE_FOR_aarch64_reload_movcpv8qidi
;
7444 return CODE_FOR_aarch64_reload_movcpv16qidi
;
7447 return CODE_FOR_aarch64_reload_movcpv4hidi
;
7450 return CODE_FOR_aarch64_reload_movcpv8hidi
;
7453 return CODE_FOR_aarch64_reload_movcpv2sidi
;
7456 return CODE_FOR_aarch64_reload_movcpv4sidi
;
7459 return CODE_FOR_aarch64_reload_movcpv2didi
;
7462 return CODE_FOR_aarch64_reload_movcpv2dfdi
;
7471 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
7474 secondary_reload_info
*sri
)
7476 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7477 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7478 comment at the head of aarch64-sve.md for more details about the
7479 big-endian handling. */
7480 if (BYTES_BIG_ENDIAN
7481 && reg_class_subset_p (rclass
, FP_REGS
)
7482 && !((REG_P (x
) && HARD_REGISTER_P (x
))
7483 || aarch64_simd_valid_immediate (x
, NULL
))
7484 && aarch64_sve_data_mode_p (mode
))
7486 sri
->icode
= CODE_FOR_aarch64_sve_reload_be
;
7490 /* If we have to disable direct literal pool loads and stores because the
7491 function is too big, then we need a scratch register. */
7492 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
7493 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
7494 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
7495 && !aarch64_pcrelative_literal_loads
)
7497 sri
->icode
= aarch64_constant_pool_reload_icode (mode
);
7501 /* Without the TARGET_SIMD instructions we cannot move a Q register
7502 to a Q register directly. We need a scratch. */
7503 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
7504 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
7505 && reg_class_subset_p (rclass
, FP_REGS
))
7508 sri
->icode
= CODE_FOR_aarch64_reload_movtf
;
7509 else if (mode
== TImode
)
7510 sri
->icode
= CODE_FOR_aarch64_reload_movti
;
7514 /* A TFmode or TImode memory access should be handled via an FP_REGS
7515 because AArch64 has richer addressing modes for LDR/STR instructions
7516 than LDP/STP instructions. */
7517 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
7518 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
7521 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
7522 return GENERAL_REGS
;
7528 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED
, const int to
)
7530 gcc_assert (from
== ARG_POINTER_REGNUM
|| from
== FRAME_POINTER_REGNUM
);
7532 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7533 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7534 if (frame_pointer_needed
)
7535 return to
== HARD_FRAME_POINTER_REGNUM
;
7540 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
7542 aarch64_layout_frame ();
7544 if (to
== HARD_FRAME_POINTER_REGNUM
)
7546 if (from
== ARG_POINTER_REGNUM
)
7547 return cfun
->machine
->frame
.hard_fp_offset
;
7549 if (from
== FRAME_POINTER_REGNUM
)
7550 return cfun
->machine
->frame
.hard_fp_offset
7551 - cfun
->machine
->frame
.locals_offset
;
7554 if (to
== STACK_POINTER_REGNUM
)
7556 if (from
== FRAME_POINTER_REGNUM
)
7557 return cfun
->machine
->frame
.frame_size
7558 - cfun
->machine
->frame
.locals_offset
;
7561 return cfun
->machine
->frame
.frame_size
;
7564 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7568 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
7572 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
7577 aarch64_asm_trampoline_template (FILE *f
)
7581 asm_fprintf (f
, "\tldr\tw%d, .+16\n", IP1_REGNUM
- R0_REGNUM
);
7582 asm_fprintf (f
, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
);
7586 asm_fprintf (f
, "\tldr\t%s, .+16\n", reg_names
[IP1_REGNUM
]);
7587 asm_fprintf (f
, "\tldr\t%s, .+20\n", reg_names
[STATIC_CHAIN_REGNUM
]);
7589 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
7590 assemble_aligned_integer (4, const0_rtx
);
7591 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
7592 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
7596 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
7598 rtx fnaddr
, mem
, a_tramp
;
7599 const int tramp_code_sz
= 16;
7601 /* Don't need to copy the trailing D-words, we fill those in below. */
7602 emit_block_move (m_tramp
, assemble_trampoline_template (),
7603 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
7604 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
7605 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
7606 if (GET_MODE (fnaddr
) != ptr_mode
)
7607 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
7608 emit_move_insn (mem
, fnaddr
);
7610 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
7611 emit_move_insn (mem
, chain_value
);
7613 /* XXX We should really define a "clear_cache" pattern and use
7614 gen_clear_cache(). */
7615 a_tramp
= XEXP (m_tramp
, 0);
7616 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
7617 LCT_NORMAL
, VOIDmode
, a_tramp
, ptr_mode
,
7618 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
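/* Layout sketch (illustrative only, assuming LP64): after the 16 bytes of
   code copied from the trampoline template, two pointer-sized words hold
   the target function address and the static chain value, which the
   template code loads at run time.  The struct below merely pictures that
   layout; its name is made up and it is not used anywhere.  */

struct example_aarch64_trampoline
{
  unsigned int code[4];		/* 16 bytes of template instructions.  */
  void *function;		/* Address of the nested function.  */
  void *static_chain;		/* Static chain value.  */
};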
7622 static unsigned char
7623 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
7625 /* ??? Logically we should only need to provide a value when
7626 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7627 can hold MODE, but at the moment we need to handle all modes.
7628 Just ignore any runtime parts for registers that can't store them. */
7629 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
7633 case TAILCALL_ADDR_REGS
:
7637 case POINTER_AND_FP_REGS
:
7640 if (aarch64_sve_data_mode_p (mode
)
7641 && constant_multiple_p (GET_MODE_SIZE (mode
),
7642 BYTES_PER_SVE_VECTOR
, &nregs
))
7644 return (aarch64_vector_data_mode_p (mode
)
7645 ? CEIL (lowest_size
, UNITS_PER_VREG
)
7646 : CEIL (lowest_size
, UNITS_PER_WORD
));
7663 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
7665 if (regclass
== POINTER_REGS
)
7666 return GENERAL_REGS
;
7668 if (regclass
== STACK_REG
)
7671 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
  /* Register elimination can result in a request for
     SP+constant->FP_REGS.  We cannot support such operations which
     use SP as source and an FP_REG as destination, so reject out
     of hand.  */
7681 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
7683 rtx lhs
= XEXP (x
, 0);
7685 /* Look through a possible SUBREG introduced by ILP32. */
7686 if (GET_CODE (lhs
) == SUBREG
)
7687 lhs
= SUBREG_REG (lhs
);
7689 gcc_assert (REG_P (lhs
));
7690 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
7699 aarch64_asm_output_labelref (FILE* f
, const char *name
)
7701 asm_fprintf (f
, "%U%s", name
);
7705 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
7707 if (priority
== DEFAULT_INIT_PRIORITY
)
7708 default_ctor_section_asm_out_constructor (symbol
, priority
);
7712 /* While priority is known to be in range [0, 65535], so 18 bytes
7713 would be enough, the compiler might not know that. To avoid
7714 -Wformat-truncation false positive, use a larger size. */
7716 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
7717 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
7718 switch_to_section (s
);
7719 assemble_align (POINTER_SIZE
);
7720 assemble_aligned_integer (POINTER_BYTES
, symbol
);
7725 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
7727 if (priority
== DEFAULT_INIT_PRIORITY
)
7728 default_dtor_section_asm_out_destructor (symbol
, priority
);
7732 /* While priority is known to be in range [0, 65535], so 18 bytes
7733 would be enough, the compiler might not know that. To avoid
7734 -Wformat-truncation false positive, use a larger size. */
7736 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
7737 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
7738 switch_to_section (s
);
7739 assemble_align (POINTER_SIZE
);
7740 assemble_aligned_integer (POINTER_BYTES
, symbol
);
7745 aarch64_output_casesi (rtx
*operands
)
7749 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
7751 static const char *const patterns
[4][2] =
7754 "ldrb\t%w3, [%0,%w1,uxtw]",
7755 "add\t%3, %4, %w3, sxtb #2"
7758 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7759 "add\t%3, %4, %w3, sxth #2"
7762 "ldr\t%w3, [%0,%w1,uxtw #2]",
7763 "add\t%3, %4, %w3, sxtw #2"
7765 /* We assume that DImode is only generated when not optimizing and
7766 that we don't really need 64-bit address offsets. That would
7767 imply an object file with 8GB of code in a single function! */
7769 "ldr\t%w3, [%0,%w1,uxtw #2]",
7770 "add\t%3, %4, %w3, sxtw #2"
7774 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
7776 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
7777 index
= exact_log2 (GET_MODE_SIZE (mode
));
7779 gcc_assert (index
>= 0 && index
<= 3);
  /* Need to implement table size reduction, by changing the code below.  */
7782 output_asm_insn (patterns
[index
][0], operands
);
7783 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
7784 snprintf (buf
, sizeof (buf
),
7785 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
7786 output_asm_insn (buf
, operands
);
7787 output_asm_insn (patterns
[index
][1], operands
);
7788 output_asm_insn ("br\t%3", operands
);
7789 assemble_label (asm_out_file
, label
);
/* Return size in bits of an arithmetic operand which is shifted/scaled and
   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
   operand.  */

int
aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
{
  if (shift >= 0 && shift <= 3)
    {
      int size;
      for (size = 8; size <= 32; size *= 2)
	{
	  HOST_WIDE_INT bits = ((HOST_WIDE_INT) 1U << size) - 1;
	  if (mask == bits << shift)
	    return size;
	}
    }
  return 0;
}
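/* Usage sketch (illustrative only): the same check on plain integers.
   For example a shift of 2 with mask 0x3fc (0xff << 2) corresponds to a
   UXTB of a value scaled by 4, so a size of 8 bits is reported.  */

static inline int
example_uxt_size (int shift, unsigned long long mask)
{
  for (int size = 8; size <= 32; size *= 2)
    if (mask == (((1ULL << size) - 1) << shift))
      return size;
  return 0;
}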
/* Constant pools are per function only when PC relative
   literal loads are true or we are in the large memory
   model.  */

static inline bool
aarch64_can_use_per_function_literal_pools_p (void)
{
  return (aarch64_pcrelative_literal_loads
	  || aarch64_cmodel == AARCH64_CMODEL_LARGE);
}
7826 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
7828 /* We can't use blocks for constants when we're using a per-function
7830 return !aarch64_can_use_per_function_literal_pools_p ();
7833 /* Select appropriate section for constants depending
7834 on where we place literal pools. */
7837 aarch64_select_rtx_section (machine_mode mode
,
7839 unsigned HOST_WIDE_INT align
)
7841 if (aarch64_can_use_per_function_literal_pools_p ())
7842 return function_section (current_function_decl
);
7844 return default_elf_select_rtx_section (mode
, x
, align
);
7847 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7849 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
7850 HOST_WIDE_INT offset
)
7852 /* When using per-function literal pools, we must ensure that any code
7853 section is aligned to the minimal instruction length, lest we get
7854 errors from the assembler re "unaligned instructions". */
7855 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
7856 ASM_OUTPUT_ALIGN (f
, 2);
/* Helper function for rtx cost calculation.  Strip a shift expression
   from X.  Returns the inner operand if successful, or the original
   expression on failure.  */
static rtx
aarch64_strip_shift (rtx x)
{
  rtx op = x;

  /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
     we can convert both to ROR during final output.  */
  if ((GET_CODE (op) == ASHIFT
       || GET_CODE (op) == ASHIFTRT
       || GET_CODE (op) == LSHIFTRT
       || GET_CODE (op) == ROTATERT
       || GET_CODE (op) == ROTATE)
      && CONST_INT_P (XEXP (op, 1)))
    return XEXP (op, 0);

  if (GET_CODE (op) == MULT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
    return XEXP (op, 0);

  return x;
}
7887 /* Helper function for rtx cost calculation. Strip an extend
7888 expression from X. Returns the inner operand if successful, or the
7889 original expression on failure. We deal with a number of possible
7890 canonicalization variations here. If STRIP_SHIFT is true, then
7891 we can strip off a shift also. */
7893 aarch64_strip_extend (rtx x
, bool strip_shift
)
7895 scalar_int_mode mode
;
7898 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
7901 /* Zero and sign extraction of a widened value. */
7902 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
7903 && XEXP (op
, 2) == const0_rtx
7904 && GET_CODE (XEXP (op
, 0)) == MULT
7905 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
7907 return XEXP (XEXP (op
, 0), 0);
7909 /* It can also be represented (for zero-extend) as an AND with an
7911 if (GET_CODE (op
) == AND
7912 && GET_CODE (XEXP (op
, 0)) == MULT
7913 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
7914 && CONST_INT_P (XEXP (op
, 1))
7915 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
7916 INTVAL (XEXP (op
, 1))) != 0)
7917 return XEXP (XEXP (op
, 0), 0);
7919 /* Now handle extended register, as this may also have an optional
7920 left shift by 1..4. */
7922 && GET_CODE (op
) == ASHIFT
7923 && CONST_INT_P (XEXP (op
, 1))
7924 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
7927 if (GET_CODE (op
) == ZERO_EXTEND
7928 || GET_CODE (op
) == SIGN_EXTEND
)
7937 /* Return true iff CODE is a shift supported in combination
7938 with arithmetic instructions. */
7941 aarch64_shift_p (enum rtx_code code
)
7943 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
7947 /* Return true iff X is a cheap shift without a sign extend. */
7950 aarch64_cheap_mult_shift_p (rtx x
)
7957 if (!(aarch64_tune_params
.extra_tuning_flags
7958 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
7961 if (GET_CODE (op0
) == SIGN_EXTEND
)
7964 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
7965 && UINTVAL (op1
) <= 4)
7968 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
7971 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
7973 if (l2
> 0 && l2
<= 4)
7979 /* Helper function for rtx cost calculation. Calculate the cost of
7980 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7981 Return the calculated cost of the expression, recursing manually in to
7982 operands where needed. */
7985 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
7988 const struct cpu_cost_table
*extra_cost
7989 = aarch64_tune_params
.insn_extra_cost
;
7991 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
7992 machine_mode mode
= GET_MODE (x
);
7994 gcc_checking_assert (code
== MULT
);
7999 if (VECTOR_MODE_P (mode
))
8000 mode
= GET_MODE_INNER (mode
);
8002 /* Integer multiply/fma. */
8003 if (GET_MODE_CLASS (mode
) == MODE_INT
)
8005 /* The multiply will be canonicalized as a shift, cost it as such. */
8006 if (aarch64_shift_p (GET_CODE (x
))
8007 || (CONST_INT_P (op1
)
8008 && exact_log2 (INTVAL (op1
)) > 0))
8010 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
8011 || GET_CODE (op0
) == SIGN_EXTEND
;
8016 /* If the shift is considered cheap,
8017 then don't add any cost. */
8018 if (aarch64_cheap_mult_shift_p (x
))
8020 else if (REG_P (op1
))
8021 /* ARITH + shift-by-register. */
8022 cost
+= extra_cost
->alu
.arith_shift_reg
;
8024 /* ARITH + extended register. We don't have a cost field
8025 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8026 cost
+= extra_cost
->alu
.extend_arith
;
8028 /* ARITH + shift-by-immediate. */
8029 cost
+= extra_cost
->alu
.arith_shift
;
8032 /* LSL (immediate). */
8033 cost
+= extra_cost
->alu
.shift
;
8036 /* Strip extends as we will have costed them in the case above. */
8038 op0
= aarch64_strip_extend (op0
, true);
8040 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
8045 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8046 compound and let the below cases handle it. After all, MNEG is a
8047 special-case alias of MSUB. */
8048 if (GET_CODE (op0
) == NEG
)
8050 op0
= XEXP (op0
, 0);
8054 /* Integer multiplies or FMAs have zero/sign extending variants. */
8055 if ((GET_CODE (op0
) == ZERO_EXTEND
8056 && GET_CODE (op1
) == ZERO_EXTEND
)
8057 || (GET_CODE (op0
) == SIGN_EXTEND
8058 && GET_CODE (op1
) == SIGN_EXTEND
))
8060 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
8061 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
8066 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8067 cost
+= extra_cost
->mult
[0].extend_add
;
8069 /* MUL/SMULL/UMULL. */
8070 cost
+= extra_cost
->mult
[0].extend
;
8076 /* This is either an integer multiply or a MADD. In both cases
8077 we want to recurse and cost the operands. */
8078 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
8079 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
8085 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
8088 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
8097 /* Floating-point FMA/FMUL can also support negations of the
8098 operands, unless the rounding mode is upward or downward in
8099 which case FNMUL is different than FMUL with operand negation. */
8100 bool neg0
= GET_CODE (op0
) == NEG
;
8101 bool neg1
= GET_CODE (op1
) == NEG
;
8102 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
8105 op0
= XEXP (op0
, 0);
8107 op1
= XEXP (op1
, 0);
8111 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8112 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
8115 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
8118 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
8119 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
8125 aarch64_address_cost (rtx x
,
8127 addr_space_t as ATTRIBUTE_UNUSED
,
8130 enum rtx_code c
= GET_CODE (x
);
8131 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
8132 struct aarch64_address_info info
;
8136 if (!aarch64_classify_address (&info
, x
, mode
, false))
8138 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
8140 /* This is a CONST or SYMBOL ref which will be split
8141 in a different way depending on the code model in use.
8142 Cost it through the generic infrastructure. */
8143 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
8144 /* Divide through by the cost of one instruction to
8145 bring it to the same units as the address costs. */
8146 cost_symbol_ref
/= COSTS_N_INSNS (1);
8147 /* The cost is then the cost of preparing the address,
8148 followed by an immediate (possibly 0) offset. */
8149 return cost_symbol_ref
+ addr_cost
->imm_offset
;
8153 /* This is most likely a jump table from a case
8155 return addr_cost
->register_offset
;
8161 case ADDRESS_LO_SUM
:
8162 case ADDRESS_SYMBOLIC
:
8163 case ADDRESS_REG_IMM
:
8164 cost
+= addr_cost
->imm_offset
;
8167 case ADDRESS_REG_WB
:
8168 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
8169 cost
+= addr_cost
->pre_modify
;
8170 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
8171 cost
+= addr_cost
->post_modify
;
8177 case ADDRESS_REG_REG
:
8178 cost
+= addr_cost
->register_offset
;
8181 case ADDRESS_REG_SXTW
:
8182 cost
+= addr_cost
->register_sextend
;
8185 case ADDRESS_REG_UXTW
:
8186 cost
+= addr_cost
->register_zextend
;
8196 /* For the sake of calculating the cost of the shifted register
8197 component, we can treat same sized modes in the same way. */
8198 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
8199 cost
+= addr_cost
->addr_scale_costs
.hi
;
8200 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
8201 cost
+= addr_cost
->addr_scale_costs
.si
;
8202 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
8203 cost
+= addr_cost
->addr_scale_costs
.di
;
8205 /* We can't tell, or this is a 128-bit vector. */
8206 cost
+= addr_cost
->addr_scale_costs
.ti
;
8212 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8213 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8217 aarch64_branch_cost (bool speed_p
, bool predictable_p
)
8219 /* When optimizing for speed, use the cost of unpredictable branches. */
8220 const struct cpu_branch_cost
*branch_costs
=
8221 aarch64_tune_params
.branch_costs
;
8223 if (!speed_p
|| predictable_p
)
8224 return branch_costs
->predictable
;
8226 return branch_costs
->unpredictable
;
8229 /* Return true if the RTX X in mode MODE is a zero or sign extract
8230 usable in an ADD or SUB (extended register) instruction. */
8232 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
8234 /* Catch add with a sign extract.
8235 This is add_<optab><mode>_multp2. */
8236 if (GET_CODE (x
) == SIGN_EXTRACT
8237 || GET_CODE (x
) == ZERO_EXTRACT
)
8239 rtx op0
= XEXP (x
, 0);
8240 rtx op1
= XEXP (x
, 1);
8241 rtx op2
= XEXP (x
, 2);
8243 if (GET_CODE (op0
) == MULT
8244 && CONST_INT_P (op1
)
8245 && op2
== const0_rtx
8246 && CONST_INT_P (XEXP (op0
, 1))
8247 && aarch64_is_extend_from_extract (mode
,
8254 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8256 else if (GET_CODE (x
) == SIGN_EXTEND
8257 || GET_CODE (x
) == ZERO_EXTEND
)
8258 return REG_P (XEXP (x
, 0));
8264 aarch64_frint_unspec_p (unsigned int u
)
8282 /* Return true iff X is an rtx that will match an extr instruction
8283 i.e. as described in the *extr<mode>5_insn family of patterns.
8284 OP0 and OP1 will be set to the operands of the shifts involved
8285 on success and will be NULL_RTX otherwise. */
8288 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
8291 scalar_int_mode mode
;
8292 if (!is_a
<scalar_int_mode
> (GET_MODE (x
), &mode
))
8295 *res_op0
= NULL_RTX
;
8296 *res_op1
= NULL_RTX
;
8298 if (GET_CODE (x
) != IOR
)
8304 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
8305 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
8307 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8308 if (GET_CODE (op1
) == ASHIFT
)
8309 std::swap (op0
, op1
);
8311 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
8314 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
8315 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
8317 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
8318 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
8320 *res_op0
= XEXP (op0
, 0);
8321 *res_op1
= XEXP (op1
, 0);
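/* The shape matched above is a funnel shift; as plain C for the 64-bit
   case (illustrative only, assuming 0 < amt < 64 so neither shift is
   undefined):  */

static inline unsigned long long
example_extr64 (unsigned long long hi, unsigned long long lo, unsigned amt)
{
  return (hi << (64 - amt)) | (lo >> amt);
}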
8329 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8330 storing it in *COST. Result is true if the total cost of the operation
8331 has now been calculated. */
8333 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
8337 enum rtx_code cmpcode
;
8339 if (COMPARISON_P (op0
))
8341 inner
= XEXP (op0
, 0);
8342 comparator
= XEXP (op0
, 1);
8343 cmpcode
= GET_CODE (op0
);
8348 comparator
= const0_rtx
;
8352 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
8354 /* Conditional branch. */
8355 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
8359 if (cmpcode
== NE
|| cmpcode
== EQ
)
8361 if (comparator
== const0_rtx
)
8363 /* TBZ/TBNZ/CBZ/CBNZ. */
8364 if (GET_CODE (inner
) == ZERO_EXTRACT
)
8366 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
8367 ZERO_EXTRACT
, 0, speed
);
8370 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
8375 else if (cmpcode
== LT
|| cmpcode
== GE
)
8378 if (comparator
== const0_rtx
)
8383 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
8386 if (GET_CODE (op1
) == COMPARE
)
8388 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8389 if (XEXP (op1
, 1) == const0_rtx
)
8393 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
8394 const struct cpu_cost_table
*extra_cost
8395 = aarch64_tune_params
.insn_extra_cost
;
8397 if (GET_MODE_CLASS (mode
) == MODE_INT
)
8398 *cost
+= extra_cost
->alu
.arith
;
8400 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
8405 /* It's a conditional operation based on the status flags,
8406 so it must be some flavor of CSEL. */
8408 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8409 if (GET_CODE (op1
) == NEG
8410 || GET_CODE (op1
) == NOT
8411 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
8412 op1
= XEXP (op1
, 0);
8413 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
8415 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8416 op1
= XEXP (op1
, 0);
8417 op2
= XEXP (op2
, 0);
8420 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
8421 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
8425 /* We don't know what this is, cost all operands. */
/* Check whether X is a bitfield operation of the form shift + extend that
   maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
   operand to which the bitfield operation is applied.  Otherwise return

aarch64_extend_bitfield_pattern_p (rtx x)

  rtx_code outer_code = GET_CODE (x);
  machine_mode outer_mode = GET_MODE (x);

  if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
      && outer_mode != SImode && outer_mode != DImode)

  rtx inner = XEXP (x, 0);
  rtx_code inner_code = GET_CODE (inner);
  machine_mode inner_mode = GET_MODE (inner);

      if (CONST_INT_P (XEXP (inner, 1))
          && (inner_mode == QImode || inner_mode == HImode))
        op = XEXP (inner, 0);

      if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
          && (inner_mode == QImode || inner_mode == HImode))
        op = XEXP (inner, 0);

      if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
          && (inner_mode == QImode || inner_mode == HImode))
        op = XEXP (inner, 0);
/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */

aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,

  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
         && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
         && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
         && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
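/* Illustrative sketch (not part of GCC): the same test written over plain
   integers.  A mask/shift pair maps onto UBFIZ when the mask, shifted right
   by the shift amount, is a contiguous block of low-order ones and no mask
   bit lies below the shift amount.  The helper name and types below are
   illustrative only.  */

static bool
ubfiz_mask_and_shift_ok (unsigned bitsize, unsigned long long mask,
                         unsigned shift)
{
  if (shift >= bitsize)
    return false;
  unsigned long long shifted = mask >> shift;
  /* shifted + 1 must be a power of two, i.e. shifted is 0...011...1.  */
  bool contiguous = ((shifted + 1) & shifted) == 0;
  /* No mask bit may fall below the shift amount.  */
  bool aligned = (mask & ((1ULL << shift) - 1)) == 0;
  return contiguous && aligned;
}

/* For example, mask = 0x0000ff00 with shift = 8 passes (an 8-bit field
   inserted at bit 8), while mask = 0x0000ff80 with shift = 8 fails because
   bit 7 lies below the shift amount.  */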
/* Calculate the cost of calculating X, storing it in *COST.  Result
   is true if the total cost of the operation has now been calculated.  */

aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
                   int param ATTRIBUTE_UNUSED, int *cost, bool speed)

  const struct cpu_cost_table *extra_cost
    = aarch64_tune_params.insn_extra_cost;
  int code = GET_CODE (x);
  scalar_int_mode int_mode;

  /* By default, assume that everything has equivalent cost to the
     cheapest instruction.  Any additional costs are applied as a delta
     above this default.  */
  *cost = COSTS_N_INSNS (1);
8507 /* The cost depends entirely on the operands to SET. */
8512 switch (GET_CODE (op0
))
8517 rtx address
= XEXP (op0
, 0);
8518 if (VECTOR_MODE_P (mode
))
8519 *cost
+= extra_cost
->ldst
.storev
;
8520 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8521 *cost
+= extra_cost
->ldst
.store
;
8522 else if (mode
== SFmode
)
8523 *cost
+= extra_cost
->ldst
.storef
;
8524 else if (mode
== DFmode
)
8525 *cost
+= extra_cost
->ldst
.stored
;
8528 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
8532 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
8536 if (! REG_P (SUBREG_REG (op0
)))
8537 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
8541 /* The cost is one per vector-register copied. */
8542 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
8544 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
8545 *cost
= COSTS_N_INSNS (nregs
);
8547 /* const0_rtx is in general free, but we will use an
8548 instruction to set a register to 0. */
8549 else if (REG_P (op1
) || op1
== const0_rtx
)
8551 /* The cost is 1 per register copied. */
8552 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
8553 *cost
= COSTS_N_INSNS (nregs
);
8556 /* Cost is just the cost of the RHS of the set. */
8557 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
8562 /* Bit-field insertion. Strip any redundant widening of
8563 the RHS to meet the width of the target. */
8564 if (GET_CODE (op1
) == SUBREG
)
8565 op1
= SUBREG_REG (op1
);
8566 if ((GET_CODE (op1
) == ZERO_EXTEND
8567 || GET_CODE (op1
) == SIGN_EXTEND
)
8568 && CONST_INT_P (XEXP (op0
, 1))
8569 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
8570 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
8571 op1
= XEXP (op1
, 0);
8573 if (CONST_INT_P (op1
))
8575 /* MOV immediate is assumed to always be cheap. */
8576 *cost
= COSTS_N_INSNS (1);
8582 *cost
+= extra_cost
->alu
.bfi
;
8583 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
8589 /* We can't make sense of this, assume default cost. */
8590 *cost
= COSTS_N_INSNS (1);
8596 /* If an instruction can incorporate a constant within the
8597 instruction, the instruction's expression avoids calling
8598 rtx_cost() on the constant. If rtx_cost() is called on a
8599 constant, then it is usually because the constant must be
8600 moved into a register by one or more instructions.
8602 The exception is constant 0, which can be expressed
8603 as XZR/WZR and is therefore free. The exception to this is
8604 if we have (set (reg) (const0_rtx)) in which case we must cost
8605 the move. However, we can catch that when we cost the SET, so
8606 we don't need to consider that here. */
8607 if (x
== const0_rtx
)
8611 /* To an approximation, building any other constant is
8612 proportionally expensive to the number of instructions
8613 required to build that constant. This is true whether we
8614 are compiling for SPEED or otherwise. */
8615 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
8616 int_mode
= word_mode
;
8617 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
8618 (NULL_RTX
, x
, false, int_mode
));
8624 /* First determine number of instructions to do the move
8625 as an integer constant. */
8626 if (!aarch64_float_const_representable_p (x
)
8627 && !aarch64_can_const_movi_rtx_p (x
, mode
)
8628 && aarch64_float_const_rtx_p (x
))
8630 unsigned HOST_WIDE_INT ival
;
8631 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
8632 gcc_assert (succeed
);
8634 scalar_int_mode imode
= (mode
== HFmode
8636 : int_mode_for_mode (mode
).require ());
8637 int ncost
= aarch64_internal_mov_immediate
8638 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
8639 *cost
+= COSTS_N_INSNS (ncost
);
8645 /* mov[df,sf]_aarch64. */
8646 if (aarch64_float_const_representable_p (x
))
8647 /* FMOV (scalar immediate). */
8648 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
8649 else if (!aarch64_float_const_zero_rtx_p (x
))
8651 /* This will be a load from memory. */
8653 *cost
+= extra_cost
->ldst
.loadd
;
8655 *cost
+= extra_cost
->ldst
.loadf
;
8658 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8659 or MOV v0.s[0], wzr - neither of which are modeled by the
8660 cost tables. Just use the default cost. */
8670 /* For loads we want the base cost of a load, plus an
8671 approximation for the additional cost of the addressing
8673 rtx address
= XEXP (x
, 0);
8674 if (VECTOR_MODE_P (mode
))
8675 *cost
+= extra_cost
->ldst
.loadv
;
8676 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8677 *cost
+= extra_cost
->ldst
.load
;
8678 else if (mode
== SFmode
)
8679 *cost
+= extra_cost
->ldst
.loadf
;
8680 else if (mode
== DFmode
)
8681 *cost
+= extra_cost
->ldst
.loadd
;
8684 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
8693 if (VECTOR_MODE_P (mode
))
8698 *cost
+= extra_cost
->vect
.alu
;
8703 if (GET_MODE_CLASS (mode
) == MODE_INT
)
8705 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
8706 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
8709 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
8713 /* Cost this as SUB wzr, X. */
8714 op0
= CONST0_RTX (mode
);
8719 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
8721 /* Support (neg(fma...)) as a single instruction only if
8722 sign of zeros is unimportant. This matches the decision
8723 making in aarch64.md. */
8724 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
8727 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
8730 if (GET_CODE (op0
) == MULT
)
8733 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
8738 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
8748 if (VECTOR_MODE_P (mode
))
8749 *cost
+= extra_cost
->vect
.alu
;
8751 *cost
+= extra_cost
->alu
.clz
;
8760 if (op1
== const0_rtx
8761 && GET_CODE (op0
) == AND
)
8764 mode
= GET_MODE (op0
);
8768 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
8770 /* TODO: A write to the CC flags possibly costs extra, this
8771 needs encoding in the cost tables. */
8773 mode
= GET_MODE (op0
);
8775 if (GET_CODE (op0
) == AND
)
8781 if (GET_CODE (op0
) == PLUS
)
8783 /* ADDS (and CMN alias). */
8788 if (GET_CODE (op0
) == MINUS
)
8795 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
8796 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
8797 && CONST_INT_P (XEXP (op0
, 2)))
8799 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8800 Handle it here directly rather than going to cost_logic
8801 since we know the immediate generated for the TST is valid
8802 so we can avoid creating an intermediate rtx for it only
8803 for costing purposes. */
8805 *cost
+= extra_cost
->alu
.logical
;
8807 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
8808 ZERO_EXTRACT
, 0, speed
);
8812 if (GET_CODE (op1
) == NEG
)
8816 *cost
+= extra_cost
->alu
.arith
;
8818 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
8819 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
8825 Compare can freely swap the order of operands, and
8826 canonicalization puts the more complex operation first.
8827 But the integer MINUS logic expects the shift/extend
8828 operation in op1. */
8830 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
8838 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
8842 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
8844 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
8846 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
8847 /* FCMP supports constant 0.0 for no extra cost. */
8853 if (VECTOR_MODE_P (mode
))
8855 /* Vector compare. */
8857 *cost
+= extra_cost
->vect
.alu
;
8859 if (aarch64_float_const_zero_rtx_p (op1
))
8861 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8875 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
8877 /* Detect valid immediates. */
8878 if ((GET_MODE_CLASS (mode
) == MODE_INT
8879 || (GET_MODE_CLASS (mode
) == MODE_CC
8880 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
8881 && CONST_INT_P (op1
)
8882 && aarch64_uimm12_shift (INTVAL (op1
)))
8885 /* SUB(S) (immediate). */
8886 *cost
+= extra_cost
->alu
.arith
;
8890 /* Look for SUB (extended register). */
8891 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
8892 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
8895 *cost
+= extra_cost
->alu
.extend_arith
;
8897 op1
= aarch64_strip_extend (op1
, true);
8898 *cost
+= rtx_cost (op1
, VOIDmode
,
8899 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
8903 rtx new_op1
= aarch64_strip_extend (op1
, false);
8905 /* Cost this as an FMA-alike operation. */
8906 if ((GET_CODE (new_op1
) == MULT
8907 || aarch64_shift_p (GET_CODE (new_op1
)))
8910 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
8911 (enum rtx_code
) code
,
8916 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
8920 if (VECTOR_MODE_P (mode
))
8923 *cost
+= extra_cost
->vect
.alu
;
8925 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8928 *cost
+= extra_cost
->alu
.arith
;
8930 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
8933 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
8947 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
8948 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
8951 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
8952 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
8956 if (GET_MODE_CLASS (mode
) == MODE_INT
8957 && ((CONST_INT_P (op1
) && aarch64_uimm12_shift (INTVAL (op1
)))
8958 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
8960 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
8963 /* ADD (immediate). */
8964 *cost
+= extra_cost
->alu
.arith
;
8968 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
8970 /* Look for ADD (extended register). */
8971 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
8972 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
8975 *cost
+= extra_cost
->alu
.extend_arith
;
8977 op0
= aarch64_strip_extend (op0
, true);
8978 *cost
+= rtx_cost (op0
, VOIDmode
,
8979 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
8983 /* Strip any extend, leave shifts behind as we will
8984 cost them through mult_cost. */
8985 new_op0
= aarch64_strip_extend (op0
, false);
8987 if (GET_CODE (new_op0
) == MULT
8988 || aarch64_shift_p (GET_CODE (new_op0
)))
8990 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
8995 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
8999 if (VECTOR_MODE_P (mode
))
9002 *cost
+= extra_cost
->vect
.alu
;
9004 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9007 *cost
+= extra_cost
->alu
.arith
;
9009 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
9012 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9019 *cost
= COSTS_N_INSNS (1);
9023 if (VECTOR_MODE_P (mode
))
9024 *cost
+= extra_cost
->vect
.alu
;
9026 *cost
+= extra_cost
->alu
.rev
;
9031 if (aarch_rev16_p (x
))
9033 *cost
= COSTS_N_INSNS (1);
9037 if (VECTOR_MODE_P (mode
))
9038 *cost
+= extra_cost
->vect
.alu
;
9040 *cost
+= extra_cost
->alu
.rev
;
9045 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
9047 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
9048 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
9050 *cost
+= extra_cost
->alu
.shift
;
9061 if (VECTOR_MODE_P (mode
))
9064 *cost
+= extra_cost
->vect
.alu
;
9069 && GET_CODE (op0
) == MULT
9070 && CONST_INT_P (XEXP (op0
, 1))
9071 && CONST_INT_P (op1
)
9072 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
9075 /* This is a UBFM/SBFM. */
9076 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
9078 *cost
+= extra_cost
->alu
.bfx
;
9082 if (is_int_mode (mode
, &int_mode
))
9084 if (CONST_INT_P (op1
))
9086 /* We have a mask + shift version of a UBFIZ
9087 i.e. the *andim_ashift<mode>_bfiz pattern. */
9088 if (GET_CODE (op0
) == ASHIFT
9089 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
9092 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
9093 (enum rtx_code
) code
, 0, speed
);
9095 *cost
+= extra_cost
->alu
.bfx
;
9099 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
9101 /* We possibly get the immediate for free, this is not
9103 *cost
+= rtx_cost (op0
, int_mode
,
9104 (enum rtx_code
) code
, 0, speed
);
9106 *cost
+= extra_cost
->alu
.logical
;
9115 /* Handle ORN, EON, or BIC. */
9116 if (GET_CODE (op0
) == NOT
)
9117 op0
= XEXP (op0
, 0);
9119 new_op0
= aarch64_strip_shift (op0
);
9121 /* If we had a shift on op0 then this is a logical-shift-
9122 by-register/immediate operation. Otherwise, this is just
9123 a logical operation. */
9128 /* Shift by immediate. */
9129 if (CONST_INT_P (XEXP (op0
, 1)))
9130 *cost
+= extra_cost
->alu
.log_shift
;
9132 *cost
+= extra_cost
->alu
.log_shift_reg
;
9135 *cost
+= extra_cost
->alu
.logical
;
9138 /* In both cases we want to cost both operands. */
9139 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
9141 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
9151 op0
= aarch64_strip_shift (x
);
9153 if (VECTOR_MODE_P (mode
))
9156 *cost
+= extra_cost
->vect
.alu
;
9160 /* MVN-shifted-reg. */
9163 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
9166 *cost
+= extra_cost
->alu
.log_shift
;
9170 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9171 Handle the second form here taking care that 'a' in the above can
9173 else if (GET_CODE (op0
) == XOR
)
9175 rtx newop0
= XEXP (op0
, 0);
9176 rtx newop1
= XEXP (op0
, 1);
9177 rtx op0_stripped
= aarch64_strip_shift (newop0
);
9179 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
9180 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
9184 if (op0_stripped
!= newop0
)
9185 *cost
+= extra_cost
->alu
.log_shift
;
9187 *cost
+= extra_cost
->alu
.logical
;
9194 *cost
+= extra_cost
->alu
.logical
;
9201 /* If a value is written in SI mode, then zero extended to DI
9202 mode, the operation will in general be free as a write to
9203 a 'w' register implicitly zeroes the upper bits of an 'x'
9204 register. However, if this is
9206 (set (reg) (zero_extend (reg)))
9208 we must cost the explicit register move. */
9210 && GET_MODE (op0
) == SImode
9213 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
9215 /* If OP_COST is non-zero, then the cost of the zero extend
9216 is effectively the cost of the inner operation. Otherwise
9217 we have a MOV instruction and we take the cost from the MOV
9218 itself. This is true independently of whether we are
9219 optimizing for space or time. */
9225 else if (MEM_P (op0
))
9227 /* All loads can zero extend to any size for free. */
9228 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
9232 op0
= aarch64_extend_bitfield_pattern_p (x
);
9235 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
9237 *cost
+= extra_cost
->alu
.bfx
;
9243 if (VECTOR_MODE_P (mode
))
9246 *cost
+= extra_cost
->vect
.alu
;
9250 /* We generate an AND instead of UXTB/UXTH. */
9251 *cost
+= extra_cost
->alu
.logical
;
9257 if (MEM_P (XEXP (x
, 0)))
9262 rtx address
= XEXP (XEXP (x
, 0), 0);
9263 *cost
+= extra_cost
->ldst
.load_sign_extend
;
9266 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
9272 op0
= aarch64_extend_bitfield_pattern_p (x
);
9275 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
9277 *cost
+= extra_cost
->alu
.bfx
;
9283 if (VECTOR_MODE_P (mode
))
9284 *cost
+= extra_cost
->vect
.alu
;
9286 *cost
+= extra_cost
->alu
.extend
;
9294 if (CONST_INT_P (op1
))
9298 if (VECTOR_MODE_P (mode
))
9300 /* Vector shift (immediate). */
9301 *cost
+= extra_cost
->vect
.alu
;
9305 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
9307 *cost
+= extra_cost
->alu
.shift
;
9311 /* We can incorporate zero/sign extend for free. */
9312 if (GET_CODE (op0
) == ZERO_EXTEND
9313 || GET_CODE (op0
) == SIGN_EXTEND
)
9314 op0
= XEXP (op0
, 0);
9316 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
9321 if (VECTOR_MODE_P (mode
))
9324 /* Vector shift (register). */
9325 *cost
+= extra_cost
->vect
.alu
;
9331 *cost
+= extra_cost
->alu
.shift_reg
;
          if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
              && CONST_INT_P (XEXP (op1, 1))
              && known_eq (INTVAL (XEXP (op1, 1)),
                           GET_MODE_BITSIZE (mode) - 1))

              *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
              /* We already demanded XEXP (op1, 0) to be REG_P, so
                 don't recurse into it.  */

      return false;  /* All arguments need to be in registers.  */
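/* Illustrative note (not part of GCC): AArch64 register-shift instructions
   such as LSLV only use the shift amount modulo the register width, so an
   explicit mask of the amount with (bitsize - 1) in the source adds nothing
   and the AND above is treated as free.  Plain C stand-in; the function name
   is illustrative and unsigned int is assumed 32 bits.  */

static unsigned int
shift_by_masked_amount (unsigned int x, unsigned int n)
{
  /* Compiles to a single LSL (register); the & 31 mirrors what the
     hardware does to the shift amount anyway.  */
  return x << (n & 31);
}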
9354 if (CONST_INT_P (op1
))
9356 /* ASR (immediate) and friends. */
9359 if (VECTOR_MODE_P (mode
))
9360 *cost
+= extra_cost
->vect
.alu
;
9362 *cost
+= extra_cost
->alu
.shift
;
9365 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
9370 if (VECTOR_MODE_P (mode
))
9373 /* Vector shift (register). */
9374 *cost
+= extra_cost
->vect
.alu
;
9379 /* ASR (register) and friends. */
9380 *cost
+= extra_cost
->alu
.shift_reg
;
9382 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
9383 && CONST_INT_P (XEXP (op1
, 1))
9384 && known_eq (INTVAL (XEXP (op1
, 1)),
9385 GET_MODE_BITSIZE (mode
) - 1))
9387 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
9388 /* We already demanded XEXP (op1, 0) to be REG_P, so
9389 don't recurse into it. */
9393 return false; /* All arguments need to be in registers. */
9398 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
9399 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
9403 *cost
+= extra_cost
->ldst
.load
;
9405 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
9406 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
9408 /* ADRP, followed by ADD. */
9409 *cost
+= COSTS_N_INSNS (1);
9411 *cost
+= 2 * extra_cost
->alu
.arith
;
9413 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
9414 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
9418 *cost
+= extra_cost
->alu
.arith
;
9423 /* One extra load instruction, after accessing the GOT. */
9424 *cost
+= COSTS_N_INSNS (1);
9426 *cost
+= extra_cost
->ldst
.load
;
9432 /* ADRP/ADD (immediate). */
9434 *cost
+= extra_cost
->alu
.arith
;
9442 if (VECTOR_MODE_P (mode
))
9443 *cost
+= extra_cost
->vect
.alu
;
9445 *cost
+= extra_cost
->alu
.bfx
;
9448 /* We can trust that the immediates used will be correct (there
9449 are no by-register forms), so we need only cost op0. */
9450 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
9454 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
9455 /* aarch64_rtx_mult_cost always handles recursion to its
      /* We can expand signed mod by power of 2 using a NEGS, two parallel
         ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
         an unconditional negate.  This case should only ever be reached through
         the set_smod_pow2_cheap check in expmed.c.  */
      if (CONST_INT_P (XEXP (x, 1))
          && exact_log2 (INTVAL (XEXP (x, 1))) > 0
          && (mode == SImode || mode == DImode))

          /* We expand to 4 instructions.  Reset the baseline.  */
          *cost = COSTS_N_INSNS (4);

            *cost += 2 * extra_cost->alu.logical
                     + 2 * extra_cost->alu.arith;
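/* Illustrative sketch (not part of GCC): the four-instruction expansion
   costed above computes a signed remainder by a power of two without a
   branch.  Masking both x and -x and then conditionally negating gives a
   result with the sign of the dividend, as C's % requires.  Plain C stand-in
   with an illustrative name (ignores the x == LLONG_MIN corner case):  */

static long long
smod_pow2 (long long x, unsigned k)
{
  unsigned long long mask = (1ULL << k) - 1;
  unsigned long long pos = (unsigned long long) x & mask;   /* AND        */
  unsigned long long neg = (unsigned long long) -x & mask;  /* NEGS + AND */
  /* CSNEG: keep the masked value or negate the masked negation,
     depending on the sign of x.  */
  return x >= 0 ? (long long) pos : -(long long) neg;
}

/* For example, smod_pow2 (-7, 2) is -3 and smod_pow2 (7, 2) is 3,
   matching -7 % 4 and 7 % 4.  */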
      /* Slightly prefer UMOD over SMOD.  */
9483 if (VECTOR_MODE_P (mode
))
9484 *cost
+= extra_cost
->vect
.alu
;
9485 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9486 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
9487 + extra_cost
->mult
[mode
== DImode
].idiv
9488 + (code
== MOD
? 1 : 0));
9490 return false; /* All arguments need to be in registers. */
9497 if (VECTOR_MODE_P (mode
))
9498 *cost
+= extra_cost
->vect
.alu
;
9499 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9500 /* There is no integer SQRT, so only DIV and UDIV can get
9502 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
            /* Slightly prefer UDIV over SDIV.  */
9504 + (code
== DIV
? 1 : 0));
9506 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
9508 return false; /* All arguments need to be in registers. */
9511 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
9512 XEXP (x
, 2), cost
, speed
);
9525 return false; /* All arguments must be in registers. */
9534 if (VECTOR_MODE_P (mode
))
9535 *cost
+= extra_cost
->vect
.alu
;
9537 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
9540 /* FMSUB, FNMADD, and FNMSUB are free. */
9541 if (GET_CODE (op0
) == NEG
)
9542 op0
= XEXP (op0
, 0);
9544 if (GET_CODE (op2
) == NEG
)
9545 op2
= XEXP (op2
, 0);
9547 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9548 and the by-element operand as operand 0. */
9549 if (GET_CODE (op1
) == NEG
)
9550 op1
= XEXP (op1
, 0);
9552 /* Catch vector-by-element operations. The by-element operand can
9553 either be (vec_duplicate (vec_select (x))) or just
9554 (vec_select (x)), depending on whether we are multiplying by
9555 a vector or a scalar.
9557 Canonicalization is not very good in these cases, FMA4 will put the
9558 by-element operand as operand 0, FNMA4 will have it as operand 1. */
9559 if (GET_CODE (op0
) == VEC_DUPLICATE
)
9560 op0
= XEXP (op0
, 0);
9561 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
9562 op1
= XEXP (op1
, 0);
9564 if (GET_CODE (op0
) == VEC_SELECT
)
9565 op0
= XEXP (op0
, 0);
9566 else if (GET_CODE (op1
) == VEC_SELECT
)
9567 op1
= XEXP (op1
, 0);
9569 /* If the remaining parameters are not registers,
9570 get the cost to put them into registers. */
9571 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
9572 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
9573 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
9577 case UNSIGNED_FLOAT
:
9579 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
9585 if (VECTOR_MODE_P (mode
))
9587 /*Vector truncate. */
9588 *cost
+= extra_cost
->vect
.alu
;
9591 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
9595 case FLOAT_TRUNCATE
:
9598 if (VECTOR_MODE_P (mode
))
9600 /*Vector conversion. */
9601 *cost
+= extra_cost
->vect
.alu
;
9604 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
9611 /* Strip the rounding part. They will all be implemented
9612 by the fcvt* family of instructions anyway. */
9613 if (GET_CODE (x
) == UNSPEC
)
9615 unsigned int uns_code
= XINT (x
, 1);
9617 if (uns_code
== UNSPEC_FRINTA
9618 || uns_code
== UNSPEC_FRINTM
9619 || uns_code
== UNSPEC_FRINTN
9620 || uns_code
== UNSPEC_FRINTP
9621 || uns_code
== UNSPEC_FRINTZ
)
9622 x
= XVECEXP (x
, 0, 0);
9627 if (VECTOR_MODE_P (mode
))
9628 *cost
+= extra_cost
->vect
.alu
;
9630 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
9633 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9634 fixed-point fcvt. */
9635 if (GET_CODE (x
) == MULT
9636 && ((VECTOR_MODE_P (mode
)
9637 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
9638 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
9640 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
9645 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
9649 if (VECTOR_MODE_P (mode
))
9653 *cost
+= extra_cost
->vect
.alu
;
9655 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
9659 /* FABD, which is analogous to FADD. */
9660 if (GET_CODE (op0
) == MINUS
)
9662 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
9663 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
9665 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9669 /* Simple FABS is analogous to FNEG. */
9671 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
9675 /* Integer ABS will either be split to
9676 two arithmetic instructions, or will be an ABS
9677 (scalar), which we don't model. */
9678 *cost
= COSTS_N_INSNS (2);
9680 *cost
+= 2 * extra_cost
->alu
.arith
;
9688 if (VECTOR_MODE_P (mode
))
9689 *cost
+= extra_cost
->vect
.alu
;
9692 /* FMAXNM/FMINNM/FMAX/FMIN.
9693 TODO: This may not be accurate for all implementations, but
9694 we do not model this in the cost tables. */
9695 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9701 /* The floating point round to integer frint* instructions. */
9702 if (aarch64_frint_unspec_p (XINT (x
, 1)))
9705 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
9710 if (XINT (x
, 1) == UNSPEC_RBIT
)
9713 *cost
+= extra_cost
->alu
.rev
;
      /* Decompose <su>muldi3_highpart.  */
      if (/* (truncate:DI
          && GET_MODE (XEXP (x, 0)) == TImode
          && GET_CODE (XEXP (x, 0)) == LSHIFTRT
          && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
          /* (ANY_EXTEND:TI (reg:DI))
             (ANY_EXTEND:TI (reg:DI))) */
          && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
               && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
              || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
                  && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
          && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
          && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
          /* (const_int 64) */
          && CONST_INT_P (XEXP (XEXP (x, 0), 1))
          && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)

          *cost += extra_cost->mult[mode == DImode].extend;
          *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
                             mode, MULT, 0, speed);
          *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
                             mode, MULT, 1, speed);
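/* Illustrative sketch (not part of GCC): the rtx shape matched above is what
   the middle end builds for the high half of a 64x64->128-bit multiply,
   which AArch64 implements with UMULH/SMULH.  Plain C equivalent of the
   unsigned form, assuming the compiler provides unsigned __int128:  */

static unsigned long long
umul_highpart_64 (unsigned long long a, unsigned long long b)
{
  /* (truncate:DI (lshiftrt:TI (mult:TI (zero_extend:TI a)
                                        (zero_extend:TI b))
                               (const_int 64)))  */
  return (unsigned long long) (((unsigned __int128) a
                                * (unsigned __int128) b) >> 64);
}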
      && flag_aarch64_verbose_cost)
             "\nFailed to cost RTX. Assuming default cost.\n");

/* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
   calculated for X.  This cost is stored in *COST.  Returns true
   if the total cost of X was calculated.  */

aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
                           int param, int *cost, bool speed)

  bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);

      && flag_aarch64_verbose_cost)

      print_rtl_single (dump_file, x);
      fprintf (dump_file, "\n%s cost: %d (%s)\n",
               speed ? "Hot" : "Cold",
               *cost, result ? "final" : "partial");
aarch64_register_move_cost (machine_mode mode,
                            reg_class_t from_i, reg_class_t to_i)

  enum reg_class from = (enum reg_class) from_i;
  enum reg_class to = (enum reg_class) to_i;
  const struct cpu_regmove_cost *regmove_cost
    = aarch64_tune_params.regmove_cost;

  /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
  if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)

  if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
    from = GENERAL_REGS;

  /* Moving between GPR and stack cost is the same as GP2GP.  */
  if ((from == GENERAL_REGS && to == STACK_REG)
      || (to == GENERAL_REGS && from == STACK_REG))
    return regmove_cost->GP2GP;

  /* To/From the stack register, we move via the gprs.  */
  if (to == STACK_REG || from == STACK_REG)
    return aarch64_register_move_cost (mode, from, GENERAL_REGS)
           + aarch64_register_move_cost (mode, GENERAL_REGS, to);

  if (known_eq (GET_MODE_SIZE (mode), 16))

      /* 128-bit operations on general registers require 2 instructions.  */
      if (from == GENERAL_REGS && to == GENERAL_REGS)
        return regmove_cost->GP2GP * 2;
      else if (from == GENERAL_REGS)
        return regmove_cost->GP2FP * 2;
      else if (to == GENERAL_REGS)
        return regmove_cost->FP2GP * 2;

      /* When AdvSIMD instructions are disabled it is not possible to move
         a 128-bit value directly between Q registers.  This is handled in
         secondary reload.  A general register is used as a scratch to move
         the upper DI value and the lower DI value is moved directly,
         hence the cost is the sum of three moves.  */

        return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;

      return regmove_cost->FP2FP;

  if (from == GENERAL_REGS && to == GENERAL_REGS)
    return regmove_cost->GP2GP;
  else if (from == GENERAL_REGS)
    return regmove_cost->GP2FP;
  else if (to == GENERAL_REGS)
    return regmove_cost->FP2GP;

  return regmove_cost->FP2FP;
aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
                          reg_class_t rclass ATTRIBUTE_UNUSED,
                          bool in ATTRIBUTE_UNUSED)

  return aarch64_tune_params.memmov_cost;

/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
   to optimize 1.0/sqrt.  */

use_rsqrt_p (machine_mode mode)

  return (!flag_trapping_math
          && flag_unsafe_math_optimizations
          && ((aarch64_tune_params.approx_modes->recip_sqrt
               & AARCH64_APPROX_MODE (mode))
              || flag_mrecip_low_precision_sqrt));

/* Function to decide when to use the approximate reciprocal square root

aarch64_builtin_reciprocal (tree fndecl)

  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));

  if (!use_rsqrt_p (mode))

  return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));

typedef rtx (*rsqrte_type) (rtx, rtx);

/* Select reciprocal square root initial estimate insn depending on machine

get_rsqrte_type (machine_mode mode)

    case E_DFmode: return gen_aarch64_rsqrtedf;
    case E_SFmode: return gen_aarch64_rsqrtesf;
    case E_V2DFmode: return gen_aarch64_rsqrtev2df;
    case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
    case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
    default: gcc_unreachable ();

typedef rtx (*rsqrts_type) (rtx, rtx, rtx);

/* Select reciprocal square root series step insn depending on machine mode.  */

get_rsqrts_type (machine_mode mode)

    case E_DFmode: return gen_aarch64_rsqrtsdf;
    case E_SFmode: return gen_aarch64_rsqrtssf;
    case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
    case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
    case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
    default: gcc_unreachable ();
/* Emit instruction sequence to compute either the approximate square root
   or its approximate reciprocal, depending on the flag RECP, and return
   whether the sequence was emitted or not.  */

aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)

  machine_mode mode = GET_MODE (dst);

  if (GET_MODE_INNER (mode) == HFmode)

  if (!(flag_mlow_precision_sqrt
        || (aarch64_tune_params.approx_modes->sqrt
            & AARCH64_APPROX_MODE (mode))))

  if (flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun))

  /* Caller assumes we cannot fail.  */
  gcc_assert (use_rsqrt_p (mode));

  machine_mode mmsk = mode_for_int_vector (mode).require ();
  rtx xmsk = gen_reg_rtx (mmsk);

      /* When calculating the approximate square root, compare the
         argument with 0.0 and create a mask.  */
      emit_insn (gen_rtx_SET (xmsk,
                              gen_rtx_EQ (mmsk, src,
                                          CONST0_RTX (mode)))));

  /* Estimate the approximate reciprocal square root.  */
  rtx xdst = gen_reg_rtx (mode);
  emit_insn ((*get_rsqrte_type (mode)) (xdst, src));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance
     while sacrificing the accuracy.  */
  if ((recp && flag_mrecip_low_precision_sqrt)
      || (!recp && flag_mlow_precision_sqrt))

  /* Iterate over the series to calculate the approximate reciprocal square
     root.  */
  rtx x1 = gen_reg_rtx (mode);
  while (iterations--)

      rtx x2 = gen_reg_rtx (mode);
      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));

      emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));

        emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));

      /* Qualify the approximate reciprocal square root when the argument is
         0.0 by squashing the intermediary result to 0.0.  */
      rtx xtmp = gen_reg_rtx (mmsk);
      emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
                                        gen_rtx_SUBREG (mmsk, xdst, 0)));
      emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));

      /* Calculate the approximate square root.  */
      emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));

  /* Finalize the approximation.  */
  emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
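/* Illustrative sketch (not part of GCC): the sequence above is the standard
   Newton-Raphson refinement of FRSQRTE's initial estimate, where each
   FRSQRTS step computes (3 - a*b) / 2.  A self-contained scalar float
   stand-in of the reciprocal square root (two iterations, as used for
   SFmode); the bit-trick initial guess stands in for FRSQRTE and the
   function name is illustrative.  */

static float
approx_rsqrtf (float x)
{
  /* Crude software substitute for FRSQRTE's initial estimate.  */
  union { float f; unsigned int u; } v = { x };
  v.u = 0x5f3759df - (v.u >> 1);
  float y = v.f;

  /* Each pass corresponds to FMUL (y*y), FRSQRTS ((3 - x*y*y)/2), FMUL.  */
  for (int i = 0; i < 2; i++)
    y = y * (3.0f - x * y * y) * 0.5f;
  return y;
}

/* For the square root itself the sequence above additionally multiplies the
   refined estimate by the argument and squashes the result to 0.0 when the
   argument is 0.0, using the mask computed at the start.  */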
typedef rtx (*recpe_type) (rtx, rtx);

/* Select reciprocal initial estimate insn depending on machine mode.  */

get_recpe_type (machine_mode mode)

    case E_SFmode: return (gen_aarch64_frecpesf);
    case E_V2SFmode: return (gen_aarch64_frecpev2sf);
    case E_V4SFmode: return (gen_aarch64_frecpev4sf);
    case E_DFmode: return (gen_aarch64_frecpedf);
    case E_V2DFmode: return (gen_aarch64_frecpev2df);
    default: gcc_unreachable ();

typedef rtx (*recps_type) (rtx, rtx, rtx);

/* Select reciprocal series step insn depending on machine mode.  */

get_recps_type (machine_mode mode)

    case E_SFmode: return (gen_aarch64_frecpssf);
    case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
    case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
    case E_DFmode: return (gen_aarch64_frecpsdf);
    case E_V2DFmode: return (gen_aarch64_frecpsv2df);
    default: gcc_unreachable ();
/* Emit the instruction sequence to compute the approximation for the division
   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */

aarch64_emit_approx_div (rtx quo, rtx num, rtx den)

  machine_mode mode = GET_MODE (quo);

  if (GET_MODE_INNER (mode) == HFmode)

  bool use_approx_division_p = (flag_mlow_precision_div
                                || (aarch64_tune_params.approx_modes->division
                                    & AARCH64_APPROX_MODE (mode)));

  if (!flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun)
      || !use_approx_division_p)

  if (!TARGET_SIMD && VECTOR_MODE_P (mode))

  /* Estimate the approximate reciprocal.  */
  rtx xrcp = gen_reg_rtx (mode);
  emit_insn ((*get_recpe_type (mode)) (xrcp, den));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance,
     while sacrificing the accuracy.  */
  if (flag_mlow_precision_div)

  /* Iterate over the series to calculate the approximate reciprocal.  */
  rtx xtmp = gen_reg_rtx (mode);
  while (iterations--)

      emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));

      if (iterations > 0)
        emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));

  if (num != CONST1_RTX (mode))

      /* As the approximate reciprocal of DEN is already calculated, only
         calculate the approximate division when NUM is not 1.0.  */
      rtx xnum = force_reg (mode, num);
      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));

  /* Finalize the approximation.  */
  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
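/* Illustrative sketch (not part of GCC): FRECPE produces an initial estimate
   r of 1/den and each FRECPS step computes (2 - den * r), so the loop above
   is the Newton-Raphson recurrence r <- r * (2 - den * r), followed by a
   multiply with the numerator.  Self-contained scalar float stand-in of the
   refinement step; r0 is assumed to be an FRECPE-style rough estimate and
   the function name is illustrative.  */

static float
refine_reciprocal (float den, float r0, int iterations)
{
  float r = r0;
  while (iterations-- > 0)
    r = r * (2.0f - den * r);   /* one FRECPS plus one FMUL per pass */
  return r;
}

/* A caller would then compute the quotient as
   num * refine_reciprocal (den, r0, 2) for SFmode, mirroring the final
   multiply emitted above.  */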
10097 /* Return the number of instructions that can be issued per cycle. */
10099 aarch64_sched_issue_rate (void)
10101 return aarch64_tune_params
.issue_rate
;
10105 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10107 int issue_rate
= aarch64_sched_issue_rate ();
10109 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
10113 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10114 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10115 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10118 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
10121 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
10125 /* Vectorizer cost model target hooks. */
10127 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10129 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
10131 int misalign ATTRIBUTE_UNUSED
)
10134 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
10137 if (vectype
!= NULL
)
10138 fp
= FLOAT_TYPE_P (vectype
);
10140 switch (type_of_cost
)
10143 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
10146 return costs
->scalar_load_cost
;
10149 return costs
->scalar_store_cost
;
10152 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
10155 return costs
->vec_align_load_cost
;
10158 return costs
->vec_store_cost
;
10160 case vec_to_scalar
:
10161 return costs
->vec_to_scalar_cost
;
10163 case scalar_to_vec
:
10164 return costs
->scalar_to_vec_cost
;
10166 case unaligned_load
:
10167 case vector_gather_load
:
10168 return costs
->vec_unalign_load_cost
;
10170 case unaligned_store
:
10171 case vector_scatter_store
:
10172 return costs
->vec_unalign_store_cost
;
10174 case cond_branch_taken
:
10175 return costs
->cond_taken_branch_cost
;
10177 case cond_branch_not_taken
:
10178 return costs
->cond_not_taken_branch_cost
;
10181 return costs
->vec_permute_cost
;
10183 case vec_promote_demote
:
10184 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
10186 case vec_construct
:
10187 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
10188 return elements
/ 2 + 1;
10191 gcc_unreachable ();
/* Implement targetm.vectorize.add_stmt_cost.  */

aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
                       struct _stmt_vec_info *stmt_info, int misalign,
                       enum vect_cost_model_location where)

  unsigned *cost = (unsigned *) data;
  unsigned retval = 0;

  if (flag_vect_cost_model)

      tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
        aarch64_builtin_vectorization_cost (kind, vectype, misalign);

      /* Statements in an inner loop relative to the loop being
         vectorized are weighted more heavily.  The value here is
         arbitrary and could potentially be improved with analysis.  */
      if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
        count *= 50; /* FIXME */

      retval = (unsigned) (count * stmt_cost);
      cost[where] += retval;
static void initialize_aarch64_code_model (struct gcc_options *);
10225 /* Parse the TO_PARSE string and put the architecture struct that it
10226 selects into RES and the architectural features into ISA_FLAGS.
10227 Return an aarch64_parse_opt_result describing the parse result.
10228 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10230 static enum aarch64_parse_opt_result
10231 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
10232 unsigned long *isa_flags
)
10235 const struct processor
*arch
;
10236 char *str
= (char *) alloca (strlen (to_parse
) + 1);
10239 strcpy (str
, to_parse
);
10241 ext
= strchr (str
, '+');
10246 len
= strlen (str
);
10249 return AARCH64_PARSE_MISSING_ARG
;
10252 /* Loop through the list of supported ARCHes to find a match. */
10253 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
10255 if (strlen (arch
->name
) == len
&& strncmp (arch
->name
, str
, len
) == 0)
10257 unsigned long isa_temp
= arch
->flags
;
10261 /* TO_PARSE string contains at least one extension. */
10262 enum aarch64_parse_opt_result ext_res
10263 = aarch64_parse_extension (ext
, &isa_temp
);
10265 if (ext_res
!= AARCH64_PARSE_OK
)
10268 /* Extension parsing was successful. Confirm the result
10269 arch and ISA flags. */
10271 *isa_flags
= isa_temp
;
10272 return AARCH64_PARSE_OK
;
10276 /* ARCH name not found in list. */
10277 return AARCH64_PARSE_INVALID_ARG
;
10280 /* Parse the TO_PARSE string and put the result tuning in RES and the
10281 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10282 describing the parse result. If there is an error parsing, RES and
10283 ISA_FLAGS are left unchanged. */
10285 static enum aarch64_parse_opt_result
10286 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
10287 unsigned long *isa_flags
)
10290 const struct processor
*cpu
;
10291 char *str
= (char *) alloca (strlen (to_parse
) + 1);
10294 strcpy (str
, to_parse
);
10296 ext
= strchr (str
, '+');
10301 len
= strlen (str
);
10304 return AARCH64_PARSE_MISSING_ARG
;
10307 /* Loop through the list of supported CPUs to find a match. */
10308 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
10310 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, str
, len
) == 0)
10312 unsigned long isa_temp
= cpu
->flags
;
10317 /* TO_PARSE string contains at least one extension. */
10318 enum aarch64_parse_opt_result ext_res
10319 = aarch64_parse_extension (ext
, &isa_temp
);
10321 if (ext_res
!= AARCH64_PARSE_OK
)
          /* Extension parsing was successful.  Confirm the result
             cpu and ISA flags.  */
10327 *isa_flags
= isa_temp
;
10328 return AARCH64_PARSE_OK
;
10332 /* CPU name not found in list. */
10333 return AARCH64_PARSE_INVALID_ARG
;
10336 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10337 Return an aarch64_parse_opt_result describing the parse result.
10338 If the parsing fails the RES does not change. */
10340 static enum aarch64_parse_opt_result
10341 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
10343 const struct processor
*cpu
;
10344 char *str
= (char *) alloca (strlen (to_parse
) + 1);
10346 strcpy (str
, to_parse
);
10348 /* Loop through the list of supported CPUs to find a match. */
10349 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
10351 if (strcmp (cpu
->name
, str
) == 0)
10354 return AARCH64_PARSE_OK
;
10358 /* CPU name not found in list. */
10359 return AARCH64_PARSE_INVALID_ARG
;
10362 /* Parse TOKEN, which has length LENGTH to see if it is an option
10363 described in FLAG. If it is, return the index bit for that fusion type.
10364 If not, error (printing OPTION_NAME) and return zero. */
10366 static unsigned int
10367 aarch64_parse_one_option_token (const char *token
,
10369 const struct aarch64_flag_desc
*flag
,
10370 const char *option_name
)
10372 for (; flag
->name
!= NULL
; flag
++)
10374 if (length
== strlen (flag
->name
)
10375 && !strncmp (flag
->name
, token
, length
))
10379 error ("unknown flag passed in -moverride=%s (%s)", option_name
, token
);
/* Parse OPTION, which is a '.'-separated list of flags to enable.
   FLAGS gives the list of flags we understand, INITIAL_STATE gives any
   default state we inherit from the CPU tuning structures.  OPTION_NAME
   gives the top-level option we are parsing in the -moverride string,
   for use in error messages.  */

static unsigned int
aarch64_parse_boolean_options (const char *option,
                               const struct aarch64_flag_desc *flags,
                               unsigned int initial_state,
                               const char *option_name)

  const char separator = '.';
  const char* specs = option;
  const char* ntoken = option;
  unsigned int found_flags = initial_state;

  while ((ntoken = strchr (specs, separator)))

      size_t token_length = ntoken - specs;
      unsigned token_ops = aarch64_parse_one_option_token (specs,

      /* If we find "none" (or, for simplicity's sake, an error) anywhere
         in the token stream, reset the supported operations.  So:

           adrp+add.cmp+branch.none.adrp+add

         would have the result of turning on only adrp+add fusion.  */

      found_flags |= token_ops;

  /* We ended with a trailing separator, print something.  */
      error ("%s string ill-formed\n", option_name);

  /* We still have one more token to parse.  */
  size_t token_length = strlen (specs);
  unsigned token_ops = aarch64_parse_one_option_token (specs,

  found_flags |= token_ops;
  return found_flags;
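/* Illustrative usage (not part of GCC): assuming "fuse" is the key
   registered for aarch64_parse_fuse_string in
   aarch64_tuning_override_functions, a command-line option such as

     -moverride=fuse=adrp+add.cmp+branch.none.adrp+add

   is split at each '.'; the "none" token resets everything accepted so far,
   so only adrp+add fusion remains enabled at the end, exactly as the comment
   in the loop above describes.  */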
10440 /* Support for overriding instruction fusion. */
10443 aarch64_parse_fuse_string (const char *fuse_string
,
10444 struct tune_params
*tune
)
10446 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
10447 aarch64_fusible_pairs
,
10452 /* Support for overriding other tuning flags. */
10455 aarch64_parse_tune_string (const char *tune_string
,
10456 struct tune_params
*tune
)
10458 tune
->extra_tuning_flags
10459 = aarch64_parse_boolean_options (tune_string
,
10460 aarch64_tuning_flags
,
10461 tune
->extra_tuning_flags
,
10465 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
10466 we understand. If it is, extract the option string and handoff to
10467 the appropriate function. */
10470 aarch64_parse_one_override_token (const char* token
,
10472 struct tune_params
*tune
)
10474 const struct aarch64_tuning_override_function
*fn
10475 = aarch64_tuning_override_functions
;
10477 const char *option_part
= strchr (token
, '=');
10480 error ("tuning string missing in option (%s)", token
);
10484 /* Get the length of the option name. */
10485 length
= option_part
- token
;
10486 /* Skip the '=' to get to the option string. */
10489 for (; fn
->name
!= NULL
; fn
++)
10491 if (!strncmp (fn
->name
, token
, length
))
10493 fn
->parse_override (option_part
, tune
);
10498 error ("unknown tuning option (%s)",token
);
10502 /* A checking mechanism for the implementation of the tls size. */
10505 initialize_aarch64_tls_size (struct gcc_options
*opts
)
10507 if (aarch64_tls_size
== 0)
10508 aarch64_tls_size
= 24;
10510 switch (opts
->x_aarch64_cmodel_var
)
10512 case AARCH64_CMODEL_TINY
:
10513 /* Both the default and maximum TLS size allowed under tiny is 1M which
10514 needs two instructions to address, so we clamp the size to 24. */
10515 if (aarch64_tls_size
> 24)
10516 aarch64_tls_size
= 24;
10518 case AARCH64_CMODEL_SMALL
:
10519 /* The maximum TLS size allowed under small is 4G. */
10520 if (aarch64_tls_size
> 32)
10521 aarch64_tls_size
= 32;
10523 case AARCH64_CMODEL_LARGE
:
10524 /* The maximum TLS size allowed under large is 16E.
10525 FIXME: 16E should be 64bit, we only support 48bit offset now. */
10526 if (aarch64_tls_size
> 48)
10527 aarch64_tls_size
= 48;
10530 gcc_unreachable ();
/* Parse STRING looking for options in the format:
     string    :: option:string
     option    :: name=substring
     substring :: defined by option.  */

aarch64_parse_override_string (const char* input_string,
                               struct tune_params* tune)

  const char separator = ':';
  size_t string_length = strlen (input_string) + 1;
  char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
  char *string = string_root;
  strncpy (string, input_string, string_length);
  string[string_length - 1] = '\0';

  char* ntoken = string;

  while ((ntoken = strchr (string, separator)))

      size_t token_length = ntoken - string;
      /* Make this substring look like a string.  */
      aarch64_parse_one_override_token (string, token_length, tune);

  /* One last option to parse.  */
  aarch64_parse_one_override_token (string, strlen (string), tune);
  free (string_root);
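/* Illustrative worked example (not part of GCC): an override string is a
   ':'-separated list of name=value pairs, each handed to
   aarch64_parse_one_override_token.  Schematically,

     -moverride=<name1>=<value1>:<name2>=<value2>

   is first split at ':' into "<name1>=<value1>" and "<name2>=<value2>", and
   each token is then split at '=' so that the named handler (for example the
   fuse or tune string parsers above) receives its value string.  */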
10571 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
10573 /* PR 70044: We have to be careful about being called multiple times for the
10574 same function. This means all changes should be repeatable. */
10576 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10577 Disable the frame pointer flag so the mid-end will not use a frame
10578 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10579 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10580 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10581 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
10582 if (opts
->x_flag_omit_frame_pointer
== 0)
10583 opts
->x_flag_omit_frame_pointer
= 2;
10585 /* If not optimizing for size, set the default
10586 alignment to what the target wants. */
10587 if (!opts
->x_optimize_size
)
10589 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
10590 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
10591 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
10592 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
10593 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
10594 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
10597 /* We default to no pc-relative literal loads. */
10599 aarch64_pcrelative_literal_loads
= false;
10601 /* If -mpc-relative-literal-loads is set on the command line, this
10602 implies that the user asked for PC relative literal loads. */
10603 if (opts
->x_pcrelative_literal_loads
== 1)
10604 aarch64_pcrelative_literal_loads
= true;
10606 /* In the tiny memory model it makes no sense to disallow PC relative
10607 literal pool loads. */
10608 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
10609 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
10610 aarch64_pcrelative_literal_loads
= true;
10612 /* When enabling the lower precision Newton series for the square root, also
10613 enable it for the reciprocal square root, since the latter is an
10614 intermediary step for the former. */
10615 if (flag_mlow_precision_sqrt
)
10616 flag_mrecip_low_precision_sqrt
= true;
10619 /* 'Unpack' up the internal tuning structs and update the options
10620 in OPTS. The caller must have set up selected_tune and selected_arch
10621 as all the other target-specific codegen decisions are
10622 derived from them. */
10625 aarch64_override_options_internal (struct gcc_options
*opts
)
10627 aarch64_tune_flags
= selected_tune
->flags
;
10628 aarch64_tune
= selected_tune
->sched_core
;
10629 /* Make a copy of the tuning parameters attached to the core, which
10630 we may later overwrite. */
10631 aarch64_tune_params
= *(selected_tune
->tune
);
10632 aarch64_architecture_version
= selected_arch
->architecture_version
;
10634 if (opts
->x_aarch64_override_tune_string
)
10635 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
10636 &aarch64_tune_params
);
10638 /* This target defaults to strict volatile bitfields. */
10639 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
10640 opts
->x_flag_strict_volatile_bitfields
= 1;
10642 initialize_aarch64_code_model (opts
);
10643 initialize_aarch64_tls_size (opts
);
10645 int queue_depth
= 0;
10646 switch (aarch64_tune_params
.autoprefetcher_model
)
10648 case tune_params::AUTOPREFETCHER_OFF
:
10651 case tune_params::AUTOPREFETCHER_WEAK
:
10654 case tune_params::AUTOPREFETCHER_STRONG
:
10655 queue_depth
= max_insn_queue_index
+ 1;
10658 gcc_unreachable ();
10661 /* We don't mind passing in global_options_set here as we don't use
10662 the *options_set structs anyway. */
10663 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
,
10665 opts
->x_param_values
,
10666 global_options_set
.x_param_values
);
10668 /* Set up parameters to be used in prefetching algorithm. Do not
10669 override the defaults unless we are tuning for a core we have
10670 researched values for. */
10671 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
10672 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES
,
10673 aarch64_tune_params
.prefetch
->num_slots
,
10674 opts
->x_param_values
,
10675 global_options_set
.x_param_values
);
10676 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
10677 maybe_set_param_value (PARAM_L1_CACHE_SIZE
,
10678 aarch64_tune_params
.prefetch
->l1_cache_size
,
10679 opts
->x_param_values
,
10680 global_options_set
.x_param_values
);
10681 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
10682 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE
,
10683 aarch64_tune_params
.prefetch
->l1_cache_line_size
,
10684 opts
->x_param_values
,
10685 global_options_set
.x_param_values
);
10686 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
10687 maybe_set_param_value (PARAM_L2_CACHE_SIZE
,
10688 aarch64_tune_params
.prefetch
->l2_cache_size
,
10689 opts
->x_param_values
,
10690 global_options_set
.x_param_values
);
10691 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
10692 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES
,
10694 opts
->x_param_values
,
10695 global_options_set
.x_param_values
);
10696 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
10697 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE
,
10698 aarch64_tune_params
.prefetch
->minimum_stride
,
10699 opts
->x_param_values
,
10700 global_options_set
.x_param_values
);
10702 /* Use the alternative scheduling-pressure algorithm by default. */
10703 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM
, SCHED_PRESSURE_MODEL
,
10704 opts
->x_param_values
,
10705 global_options_set
.x_param_values
);
10707 /* Enable sw prefetching at specified optimization level for
10708 CPUS that have prefetch. Lower optimization level threshold by 1
10709 when profiling is enabled. */
10710 if (opts
->x_flag_prefetch_loop_arrays
< 0
10711 && !opts
->x_optimize_size
10712 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
10713 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
10714 opts
->x_flag_prefetch_loop_arrays
= 1;
10716 aarch64_override_options_after_change_1 (opts
);
10719 /* Print a hint with a suggestion for a core or architecture name that
10720 most closely resembles what the user passed in STR. ARCH is true if
10721 the user is asking for an architecture name. ARCH is false if the user
10722 is asking for a core name. */
10725 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
10727 auto_vec
<const char *> candidates
;
10728 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
10729 for (; entry
->name
!= NULL
; entry
++)
10730 candidates
.safe_push (entry
->name
);
10732 #ifdef HAVE_LOCAL_CPU_DETECT
10733 /* Add also "native" as possible value. */
10735 candidates
.safe_push ("native");
10739 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
10741 inform (input_location
, "valid arguments are: %s;"
10742 " did you mean %qs?", s
, hint
);
10744 inform (input_location
, "valid arguments are: %s", s
);
10749 /* Print a hint with a suggestion for a core name that most closely resembles
10750 what the user passed in STR. */
10753 aarch64_print_hint_for_core (const char *str
)
10755 aarch64_print_hint_for_core_or_arch (str
, false);
10758 /* Print a hint with a suggestion for an architecture name that most closely
10759 resembles what the user passed in STR. */
10762 aarch64_print_hint_for_arch (const char *str
)
10764 aarch64_print_hint_for_core_or_arch (str
, true);
/* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
   specified in STR and throw errors if appropriate.  Put the results if
   they are valid in RES and ISA_FLAGS.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mcpu (const char *str, const struct processor **res,
		       unsigned long *isa_flags)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, res, isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing cpu name in %<-mcpu=%s%>", str);
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for -mcpu", str);
	aarch64_print_hint_for_core (str);
	break;
      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid feature modifier in %<-mcpu=%s%>", str);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}
/* Validate a command-line -march option.  Parse the arch and extensions
   (if any) specified in STR and throw errors if appropriate.  Put the
   results, if they are valid, in RES and ISA_FLAGS.  Return whether the
   option is valid.  */

static bool
aarch64_validate_march (const char *str, const struct processor **res,
			unsigned long *isa_flags)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, res, isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing arch name in %<-march=%s%>", str);
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for -march", str);
	aarch64_print_hint_for_arch (str);
	break;
      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid feature modifier in %<-march=%s%>", str);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}
/* Validate a command-line -mtune option.  Parse the cpu
   specified in STR and throw errors if appropriate.  Put the
   result, if it is valid, in RES.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mtune (const char *str, const struct processor **res)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, res);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing cpu name in %<-mtune=%s%>", str);
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for -mtune", str);
	aarch64_print_hint_for_core (str);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}
/* Return the CPU corresponding to the enum CPU.
   If it doesn't specify a cpu, return the default.  */

static const struct processor *
aarch64_get_tune_cpu (enum aarch64_processor cpu)
{
  if (cpu != aarch64_none)
    return &all_cores[cpu];

  /* The & 0x3f is to extract the bottom 6 bits that encode the
     default cpu as selected by the --with-cpu GCC configure option.

     ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
     flags mechanism should be reworked to make it more sane.  */
  return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
}

/* Return the architecture corresponding to the enum ARCH.
   If it doesn't specify a valid architecture, return the default.  */

static const struct processor *
aarch64_get_arch (enum aarch64_arch arch)
{
  if (arch != aarch64_no_arch)
    return &all_architectures[arch];

  const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];

  return &all_architectures[cpu->arch];
}
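/* Illustrative sketch, not part of the original source: TARGET_CPU_DEFAULT
   packs the configure-time default CPU in its low 6 bits with the default
   ISA flags in the bits above, so the two accessors used in this file are

     default_cpu_index = TARGET_CPU_DEFAULT & 0x3f;
     default_isa_flags = TARGET_CPU_DEFAULT >> 6;

   matching the "& 0x3f" above and the ">> 6" in aarch64_override_options.  */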
/* Return the VG value associated with -msve-vector-bits= value VALUE.  */

static poly_uint16
aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
{
  /* For now generate vector-length agnostic code for -msve-vector-bits=128.
     This ensures we can clearly distinguish SVE and Advanced SIMD modes when
     deciding which .md file patterns to use and when deciding whether
     something is a legitimate address or constant.  */
  if (value == SVE_SCALABLE || value == SVE_128)
    return poly_uint16 (2, 2);

  return (int) value / 64;
}
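/* Worked example, illustrative only: the VG value counts 64-bit granules in
   an SVE vector, so -msve-vector-bits=256 yields 256 / 64 = 4 and
   -msve-vector-bits=512 yields 8, while both "scalable" and 128 take the
   early return above and produce the indeterminate poly_uint16 (2, 2).  */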
10910 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10911 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10912 tuning structs. In particular it must set selected_tune and
10913 aarch64_isa_flags that define the available ISA features and tuning
10914 decisions. It must also set selected_arch as this will be used to
10915 output the .arch asm tags for each function. */
10918 aarch64_override_options (void)
10920 unsigned long cpu_isa
= 0;
10921 unsigned long arch_isa
= 0;
10922 aarch64_isa_flags
= 0;
10924 bool valid_cpu
= true;
10925 bool valid_tune
= true;
10926 bool valid_arch
= true;
10928 selected_cpu
= NULL
;
10929 selected_arch
= NULL
;
10930 selected_tune
= NULL
;
10932 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10933 If either of -march or -mtune is given, they override their
10934 respective component of -mcpu. */
10935 if (aarch64_cpu_string
)
10936 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
10939 if (aarch64_arch_string
)
10940 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
10943 if (aarch64_tune_string
)
10944 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
10946 /* If the user did not specify a processor, choose the default
10947 one for them. This will be the CPU set during configuration using
10948 --with-cpu, otherwise it is "generic". */
10953 selected_cpu
= &all_cores
[selected_arch
->ident
];
10954 aarch64_isa_flags
= arch_isa
;
10955 explicit_arch
= selected_arch
->arch
;
10959 /* Get default configure-time CPU. */
10960 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
10961 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
10965 explicit_tune_core
= selected_tune
->ident
;
10967 /* If both -mcpu and -march are specified check that they are architecturally
10968 compatible, warn if they're not and prefer the -march ISA flags. */
10969 else if (selected_arch
)
10971 if (selected_arch
->arch
!= selected_cpu
->arch
)
10973 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10974 all_architectures
[selected_cpu
->arch
].name
,
10975 selected_arch
->name
);
10977 aarch64_isa_flags
= arch_isa
;
10978 explicit_arch
= selected_arch
->arch
;
10979 explicit_tune_core
= selected_tune
? selected_tune
->ident
10980 : selected_cpu
->ident
;
10984 /* -mcpu but no -march. */
10985 aarch64_isa_flags
= cpu_isa
;
10986 explicit_tune_core
= selected_tune
? selected_tune
->ident
10987 : selected_cpu
->ident
;
10988 gcc_assert (selected_cpu
);
10989 selected_arch
= &all_architectures
[selected_cpu
->arch
];
10990 explicit_arch
= selected_arch
->arch
;
10993 /* Set the arch as well as we will need it when outputing
10994 the .arch directive in assembly. */
10995 if (!selected_arch
)
10997 gcc_assert (selected_cpu
);
10998 selected_arch
= &all_architectures
[selected_cpu
->arch
];
11001 if (!selected_tune
)
11002 selected_tune
= selected_cpu
;
11004 #ifndef HAVE_AS_MABI_OPTION
11005 /* The compiler may have been configured with 2.23.* binutils, which does
11006 not have support for ILP32. */
11008 error ("assembler does not support -mabi=ilp32");
11011 /* Convert -msve-vector-bits to a VG count. */
11012 aarch64_sve_vg
= aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits
);
11014 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
11015 sorry ("return address signing is only supported for -mabi=lp64");
11017 /* Make sure we properly set up the explicit options. */
11018 if ((aarch64_cpu_string
&& valid_cpu
)
11019 || (aarch64_tune_string
&& valid_tune
))
11020 gcc_assert (explicit_tune_core
!= aarch64_none
);
11022 if ((aarch64_cpu_string
&& valid_cpu
)
11023 || (aarch64_arch_string
&& valid_arch
))
11024 gcc_assert (explicit_arch
!= aarch64_no_arch
);
11026 aarch64_override_options_internal (&global_options
);
11028 /* Save these options as the default ones in case we push and pop them later
11029 while processing functions with potential target attributes. */
11030 target_option_default_node
= target_option_current_node
11031 = build_target_option_node (&global_options
);
11034 /* Implement targetm.override_options_after_change. */
11037 aarch64_override_options_after_change (void)
11039 aarch64_override_options_after_change_1 (&global_options
);
11042 static struct machine_function
*
11043 aarch64_init_machine_status (void)
11045 struct machine_function
*machine
;
11046 machine
= ggc_cleared_alloc
<machine_function
> ();
11051 aarch64_init_expanders (void)
11053 init_machine_status
= aarch64_init_machine_status
;
11056 /* A checking mechanism for the implementation of the various code models. */
11058 initialize_aarch64_code_model (struct gcc_options
*opts
)
11060 if (opts
->x_flag_pic
)
11062 switch (opts
->x_aarch64_cmodel_var
)
11064 case AARCH64_CMODEL_TINY
:
11065 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
11067 case AARCH64_CMODEL_SMALL
:
11068 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11069 aarch64_cmodel
= (flag_pic
== 2
11070 ? AARCH64_CMODEL_SMALL_PIC
11071 : AARCH64_CMODEL_SMALL_SPIC
);
11073 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
11076 case AARCH64_CMODEL_LARGE
:
11077 sorry ("code model %qs with -f%s", "large",
11078 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
11081 gcc_unreachable ();
11085 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
/* Implement TARGET_OPTION_SAVE.  */

static void
aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
{
  ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
}

/* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
   using the information saved in PTR.  */

static void
aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
{
  opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
  selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  opts->x_explicit_arch = ptr->x_explicit_arch;
  selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
  opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;

  aarch64_override_options_internal (opts);
}
11111 /* Implement TARGET_OPTION_PRINT. */
11114 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
11116 const struct processor
*cpu
11117 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
11118 unsigned long isa_flags
= ptr
->x_aarch64_isa_flags
;
11119 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
11120 std::string extension
11121 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
11123 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
11124 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
11125 arch
->name
, extension
.c_str ());
11128 static GTY(()) tree aarch64_previous_fndecl
;
11131 aarch64_reset_previous_fndecl (void)
11133 aarch64_previous_fndecl
= NULL
;
11136 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11137 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11138 make sure optab availability predicates are recomputed when necessary. */
11141 aarch64_save_restore_target_globals (tree new_tree
)
11143 if (TREE_TARGET_GLOBALS (new_tree
))
11144 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
11145 else if (new_tree
== target_option_default_node
)
11146 restore_target_globals (&default_target_globals
);
11148 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
11151 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11152 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11153 of the function, if such exists. This function may be called multiple
11154 times on a single function so use aarch64_previous_fndecl to avoid
11155 setting up identical state. */
11158 aarch64_set_current_function (tree fndecl
)
11160 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
11163 tree old_tree
= (aarch64_previous_fndecl
11164 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
11167 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
11169 /* If current function has no attributes but the previous one did,
11170 use the default node. */
11171 if (!new_tree
&& old_tree
)
11172 new_tree
= target_option_default_node
;
11174 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11175 the default have been handled by aarch64_save_restore_target_globals from
11176 aarch64_pragma_target_parse. */
11177 if (old_tree
== new_tree
)
11180 aarch64_previous_fndecl
= fndecl
;
11182 /* First set the target options. */
11183 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
11185 aarch64_save_restore_target_globals (new_tree
);
11188 /* Enum describing the various ways we can handle attributes.
11189 In many cases we can reuse the generic option handling machinery. */
11191 enum aarch64_attr_opt_type
11193 aarch64_attr_mask
, /* Attribute should set a bit in target_flags. */
11194 aarch64_attr_bool
, /* Attribute sets or unsets a boolean variable. */
11195 aarch64_attr_enum
, /* Attribute sets an enum variable. */
11196 aarch64_attr_custom
/* Attribute requires a custom handling function. */
11199 /* All the information needed to handle a target attribute.
11200 NAME is the name of the attribute.
11201 ATTR_TYPE specifies the type of behavior of the attribute as described
11202 in the definition of enum aarch64_attr_opt_type.
11203 ALLOW_NEG is true if the attribute supports a "no-" form.
11204 HANDLER is the function that takes the attribute string as an argument
11205 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11206 OPT_NUM is the enum specifying the option that the attribute modifies.
11207 This is needed for attributes that mirror the behavior of a command-line
11208 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11209 aarch64_attr_enum. */
11211 struct aarch64_attribute_info
11214 enum aarch64_attr_opt_type attr_type
;
11216 bool (*handler
) (const char *);
11217 enum opt_code opt_num
;
11220 /* Handle the ARCH_STR argument to the arch= target attribute. */
11223 aarch64_handle_attr_arch (const char *str
)
11225 const struct processor
*tmp_arch
= NULL
;
11226 enum aarch64_parse_opt_result parse_res
11227 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
);
11229 if (parse_res
== AARCH64_PARSE_OK
)
11231 gcc_assert (tmp_arch
);
11232 selected_arch
= tmp_arch
;
11233 explicit_arch
= selected_arch
->arch
;
11239 case AARCH64_PARSE_MISSING_ARG
:
11240 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11242 case AARCH64_PARSE_INVALID_ARG
:
11243 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str
);
11244 aarch64_print_hint_for_arch (str
);
11246 case AARCH64_PARSE_INVALID_FEATURE
:
11247 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str
);
11250 gcc_unreachable ();
11256 /* Handle the argument CPU_STR to the cpu= target attribute. */
11259 aarch64_handle_attr_cpu (const char *str
)
11261 const struct processor
*tmp_cpu
= NULL
;
11262 enum aarch64_parse_opt_result parse_res
11263 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
);
11265 if (parse_res
== AARCH64_PARSE_OK
)
11267 gcc_assert (tmp_cpu
);
11268 selected_tune
= tmp_cpu
;
11269 explicit_tune_core
= selected_tune
->ident
;
11271 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
11272 explicit_arch
= selected_arch
->arch
;
11278 case AARCH64_PARSE_MISSING_ARG
:
11279 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11281 case AARCH64_PARSE_INVALID_ARG
:
11282 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str
);
11283 aarch64_print_hint_for_core (str
);
11285 case AARCH64_PARSE_INVALID_FEATURE
:
11286 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str
);
11289 gcc_unreachable ();
11295 /* Handle the argument STR to the tune= target attribute. */
11298 aarch64_handle_attr_tune (const char *str
)
11300 const struct processor
*tmp_tune
= NULL
;
11301 enum aarch64_parse_opt_result parse_res
11302 = aarch64_parse_tune (str
, &tmp_tune
);
11304 if (parse_res
== AARCH64_PARSE_OK
)
11306 gcc_assert (tmp_tune
);
11307 selected_tune
= tmp_tune
;
11308 explicit_tune_core
= selected_tune
->ident
;
11314 case AARCH64_PARSE_INVALID_ARG
:
11315 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str
);
11316 aarch64_print_hint_for_core (str
);
11319 gcc_unreachable ();
11325 /* Parse an architecture extensions target attribute string specified in STR.
11326 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11327 if successful. Update aarch64_isa_flags to reflect the ISA features
11331 aarch64_handle_attr_isa_flags (char *str
)
11333 enum aarch64_parse_opt_result parse_res
;
11334 unsigned long isa_flags
= aarch64_isa_flags
;
11336 /* We allow "+nothing" in the beginning to clear out all architectural
11337 features if the user wants to handpick specific features. */
11338 if (strncmp ("+nothing", str
, 8) == 0)
11344 parse_res
= aarch64_parse_extension (str
, &isa_flags
);
11346 if (parse_res
== AARCH64_PARSE_OK
)
11348 aarch64_isa_flags
= isa_flags
;
11354 case AARCH64_PARSE_MISSING_ARG
:
11355 error ("missing value in %<target()%> pragma or attribute");
11358 case AARCH64_PARSE_INVALID_FEATURE
:
11359 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str
);
11363 gcc_unreachable ();
11369 /* The target attributes that we support. On top of these we also support just
11370 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11371 handled explicitly in aarch64_process_one_target_attr. */
11373 static const struct aarch64_attribute_info aarch64_attributes
[] =
11375 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
11376 OPT_mgeneral_regs_only
},
11377 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
11378 OPT_mfix_cortex_a53_835769
},
11379 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
11380 OPT_mfix_cortex_a53_843419
},
11381 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
11382 { "strict-align", aarch64_attr_mask
, true, NULL
, OPT_mstrict_align
},
11383 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
11384 OPT_momit_leaf_frame_pointer
},
11385 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
11386 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
11388 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
11389 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
11391 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
11392 OPT_msign_return_address_
},
11393 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
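/* Usage sketch, illustrative only (the attribute names are the table entries
   above; the CPU and architecture strings are examples, not an exhaustive
   list):

     __attribute__ ((target ("arch=armv8.1-a+crc")))
     __attribute__ ((target ("cpu=cortex-a57,no-omit-leaf-frame-pointer")))
     __attribute__ ((target ("+simd+crypto")))

   A string starting with '+' is handled directly by
   aarch64_handle_attr_isa_flags, and a "no-" prefix is only accepted for
   entries whose allow_neg field is true.  */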
11396 /* Parse ARG_STR which contains the definition of one target attribute.
11397 Show appropriate errors if any or return true if the attribute is valid. */
11400 aarch64_process_one_target_attr (char *arg_str
)
11402 bool invert
= false;
11404 size_t len
= strlen (arg_str
);
11408 error ("malformed %<target()%> pragma or attribute");
11412 char *str_to_check
= (char *) alloca (len
+ 1);
11413 strcpy (str_to_check
, arg_str
);
11415 /* Skip leading whitespace. */
11416 while (*str_to_check
== ' ' || *str_to_check
== '\t')
11419 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11420 It is easier to detect and handle it explicitly here rather than going
11421 through the machinery for the rest of the target attributes in this
11423 if (*str_to_check
== '+')
11424 return aarch64_handle_attr_isa_flags (str_to_check
);
11426 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
11431 char *arg
= strchr (str_to_check
, '=');
11433 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11434 and point ARG to "foo". */
11440 const struct aarch64_attribute_info
*p_attr
;
11441 bool found
= false;
11442 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
11444 /* If the names don't match up, or the user has given an argument
11445 to an attribute that doesn't accept one, or didn't give an argument
11446 to an attribute that expects one, fail to match. */
11447 if (strcmp (str_to_check
, p_attr
->name
) != 0)
11451 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
11452 || p_attr
->attr_type
== aarch64_attr_enum
;
11454 if (attr_need_arg_p
^ (arg
!= NULL
))
11456 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
11460 /* If the name matches but the attribute does not allow "no-" versions
11461 then we can't match. */
11462 if (invert
&& !p_attr
->allow_neg
)
11464 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
11468 switch (p_attr
->attr_type
)
11470 /* Has a custom handler registered.
11471 For example, cpu=, arch=, tune=. */
11472 case aarch64_attr_custom
:
11473 gcc_assert (p_attr
->handler
);
11474 if (!p_attr
->handler (arg
))
11478 /* Either set or unset a boolean option. */
11479 case aarch64_attr_bool
:
11481 struct cl_decoded_option decoded
;
11483 generate_option (p_attr
->opt_num
, NULL
, !invert
,
11484 CL_TARGET
, &decoded
);
11485 aarch64_handle_option (&global_options
, &global_options_set
,
11486 &decoded
, input_location
);
11489 /* Set or unset a bit in the target_flags. aarch64_handle_option
11490 should know what mask to apply given the option number. */
11491 case aarch64_attr_mask
:
11493 struct cl_decoded_option decoded
;
11494 /* We only need to specify the option number.
11495 aarch64_handle_option will know which mask to apply. */
11496 decoded
.opt_index
= p_attr
->opt_num
;
11497 decoded
.value
= !invert
;
11498 aarch64_handle_option (&global_options
, &global_options_set
,
11499 &decoded
, input_location
);
11502 /* Use the option setting machinery to set an option to an enum. */
11503 case aarch64_attr_enum
:
11508 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
11509 &value
, CL_TARGET
);
11512 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
11513 NULL
, DK_UNSPECIFIED
, input_location
,
11518 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
11523 gcc_unreachable ();
11527 /* If we reached here we either have found an attribute and validated
11528 it or didn't match any. If we matched an attribute but its arguments
11529 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
11551 /* Parse the tree in ARGS that contains the target attribute information
11552 and update the global target options space. */
11555 aarch64_process_target_attr (tree args
)
11557 if (TREE_CODE (args
) == TREE_LIST
)
11561 tree head
= TREE_VALUE (args
);
11564 if (!aarch64_process_target_attr (head
))
11567 args
= TREE_CHAIN (args
);
11573 if (TREE_CODE (args
) != STRING_CST
)
11575 error ("attribute %<target%> argument not a string");
11579 size_t len
= strlen (TREE_STRING_POINTER (args
));
11580 char *str_to_check
= (char *) alloca (len
+ 1);
11581 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
11585 error ("malformed %<target()%> pragma or attribute");
11589 /* Used to catch empty spaces between commas i.e.
11590 attribute ((target ("attr1,,attr2"))). */
11591 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
11593 /* Handle multiple target attributes separated by ','. */
11594 char *token
= strtok (str_to_check
, ",");
11596 unsigned int num_attrs
= 0;
11600 if (!aarch64_process_one_target_attr (token
))
11602 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
11606 token
= strtok (NULL
, ",");
11609 if (num_attrs
!= num_commas
+ 1)
11611 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
11618 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11619 process attribute ((target ("..."))). */
11622 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
11624 struct cl_target_option cur_target
;
11627 tree new_target
, new_optimize
;
11628 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
11630 /* If what we're processing is the current pragma string then the
11631 target option node is already stored in target_option_current_node
11632 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11633 having to re-parse the string. This is especially useful to keep
11634 arm_neon.h compile times down since that header contains a lot
11635 of intrinsics enclosed in pragmas. */
11636 if (!existing_target
&& args
== current_target_pragma
)
11638 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
11641 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
11643 old_optimize
= build_optimization_node (&global_options
);
11644 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
11646 /* If the function changed the optimization levels as well as setting
11647 target options, start with the optimizations specified. */
11648 if (func_optimize
&& func_optimize
!= old_optimize
)
11649 cl_optimization_restore (&global_options
,
11650 TREE_OPTIMIZATION (func_optimize
));
11652 /* Save the current target options to restore at the end. */
11653 cl_target_option_save (&cur_target
, &global_options
);
11655 /* If fndecl already has some target attributes applied to it, unpack
11656 them so that we add this attribute on top of them, rather than
11657 overwriting them. */
11658 if (existing_target
)
11660 struct cl_target_option
*existing_options
11661 = TREE_TARGET_OPTION (existing_target
);
11663 if (existing_options
)
11664 cl_target_option_restore (&global_options
, existing_options
);
11667 cl_target_option_restore (&global_options
,
11668 TREE_TARGET_OPTION (target_option_current_node
));
11670 ret
= aarch64_process_target_attr (args
);
11672 /* Set up any additional state. */
11675 aarch64_override_options_internal (&global_options
);
11676 /* Initialize SIMD builtins if we haven't already.
11677 Set current_target_pragma to NULL for the duration so that
11678 the builtin initialization code doesn't try to tag the functions
11679 being built with the attributes specified by any current pragma, thus
11680 going into an infinite recursion. */
11683 tree saved_current_target_pragma
= current_target_pragma
;
11684 current_target_pragma
= NULL
;
11685 aarch64_init_simd_builtins ();
11686 current_target_pragma
= saved_current_target_pragma
;
11688 new_target
= build_target_option_node (&global_options
);
11693 new_optimize
= build_optimization_node (&global_options
);
11697 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
11699 if (old_optimize
!= new_optimize
)
11700 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
11703 cl_target_option_restore (&global_options
, &cur_target
);
11705 if (old_optimize
!= new_optimize
)
11706 cl_optimization_restore (&global_options
,
11707 TREE_OPTIMIZATION (old_optimize
));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
11732 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11733 to inline CALLEE into CALLER based on target-specific info.
11734 Make sure that the caller and callee have compatible architectural
11735 features. Then go through the other possible target attributes
11736 and see if they can block inlining. Try not to reject always_inline
11737 callees unless they are incompatible architecturally. */
11740 aarch64_can_inline_p (tree caller
, tree callee
)
11742 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
11743 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
11745 struct cl_target_option
*caller_opts
11746 = TREE_TARGET_OPTION (caller_tree
? caller_tree
11747 : target_option_default_node
);
11749 struct cl_target_option
*callee_opts
11750 = TREE_TARGET_OPTION (callee_tree
? callee_tree
11751 : target_option_default_node
);
11753 /* Callee's ISA flags should be a subset of the caller's. */
11754 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
11755 != callee_opts
->x_aarch64_isa_flags
)
11758 /* Allow non-strict aligned functions inlining into strict
11760 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
11761 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
11762 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
11763 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
11766 bool always_inline
= lookup_attribute ("always_inline",
11767 DECL_ATTRIBUTES (callee
));
11769 /* If the architectural features match up and the callee is always_inline
11770 then the other attributes don't matter. */
11774 if (caller_opts
->x_aarch64_cmodel_var
11775 != callee_opts
->x_aarch64_cmodel_var
)
11778 if (caller_opts
->x_aarch64_tls_dialect
11779 != callee_opts
->x_aarch64_tls_dialect
)
11782 /* Honour explicit requests to workaround errata. */
11783 if (!aarch64_tribools_ok_for_inlining_p (
11784 caller_opts
->x_aarch64_fix_a53_err835769
,
11785 callee_opts
->x_aarch64_fix_a53_err835769
,
11786 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
11789 if (!aarch64_tribools_ok_for_inlining_p (
11790 caller_opts
->x_aarch64_fix_a53_err843419
,
11791 callee_opts
->x_aarch64_fix_a53_err843419
,
11792 2, TARGET_FIX_ERR_A53_843419
))
  /* If the user explicitly specified -momit-leaf-frame-pointer for the
     caller and callee and they don't match up, reject inlining.  */
11797 if (!aarch64_tribools_ok_for_inlining_p (
11798 caller_opts
->x_flag_omit_leaf_frame_pointer
,
11799 callee_opts
->x_flag_omit_leaf_frame_pointer
,
11803 /* If the callee has specific tuning overrides, respect them. */
11804 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
11805 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
11808 /* If the user specified tuning override strings for the
11809 caller and callee and they don't match up, reject inlining.
11810 We just do a string compare here, we don't analyze the meaning
11811 of the string, as it would be too costly for little gain. */
11812 if (callee_opts
->x_aarch64_override_tune_string
11813 && caller_opts
->x_aarch64_override_tune_string
11814 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
11815 caller_opts
->x_aarch64_override_tune_string
) != 0))
/* Return true if SYMBOL_REF X binds locally.  */

static bool
aarch64_symbol_binds_local_p (const_rtx x)
{
  return (SYMBOL_REF_DECL (x)
	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
	  : SYMBOL_REF_LOCAL_P (x));
}

/* Return true if SYMBOL_REF X is thread local.  */

static bool
aarch64_tls_symbol_p (rtx x)
{
  if (! TARGET_HAVE_TLS)
    return false;

  if (GET_CODE (x) != SYMBOL_REF)
    return false;

  return SYMBOL_REF_TLS_MODEL (x) != 0;
}
11844 /* Classify a TLS symbol into one of the TLS kinds. */
11845 enum aarch64_symbol_type
11846 aarch64_classify_tls_symbol (rtx x
)
11848 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
11852 case TLS_MODEL_GLOBAL_DYNAMIC
:
11853 case TLS_MODEL_LOCAL_DYNAMIC
:
11854 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
11856 case TLS_MODEL_INITIAL_EXEC
:
11857 switch (aarch64_cmodel
)
11859 case AARCH64_CMODEL_TINY
:
11860 case AARCH64_CMODEL_TINY_PIC
:
11861 return SYMBOL_TINY_TLSIE
;
11863 return SYMBOL_SMALL_TLSIE
;
11866 case TLS_MODEL_LOCAL_EXEC
:
11867 if (aarch64_tls_size
== 12)
11868 return SYMBOL_TLSLE12
;
11869 else if (aarch64_tls_size
== 24)
11870 return SYMBOL_TLSLE24
;
11871 else if (aarch64_tls_size
== 32)
11872 return SYMBOL_TLSLE32
;
11873 else if (aarch64_tls_size
== 48)
11874 return SYMBOL_TLSLE48
;
11876 gcc_unreachable ();
11878 case TLS_MODEL_EMULATED
:
11879 case TLS_MODEL_NONE
:
11880 return SYMBOL_FORCE_TO_MEM
;
11883 gcc_unreachable ();
11887 /* Return the correct method for accessing X + OFFSET, where X is either
11888 a SYMBOL_REF or LABEL_REF. */
11890 enum aarch64_symbol_type
11891 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
11893 if (GET_CODE (x
) == LABEL_REF
)
11895 switch (aarch64_cmodel
)
11897 case AARCH64_CMODEL_LARGE
:
11898 return SYMBOL_FORCE_TO_MEM
;
11900 case AARCH64_CMODEL_TINY_PIC
:
11901 case AARCH64_CMODEL_TINY
:
11902 return SYMBOL_TINY_ABSOLUTE
;
11904 case AARCH64_CMODEL_SMALL_SPIC
:
11905 case AARCH64_CMODEL_SMALL_PIC
:
11906 case AARCH64_CMODEL_SMALL
:
11907 return SYMBOL_SMALL_ABSOLUTE
;
11910 gcc_unreachable ();
11914 if (GET_CODE (x
) == SYMBOL_REF
)
11916 if (aarch64_tls_symbol_p (x
))
11917 return aarch64_classify_tls_symbol (x
);
11919 switch (aarch64_cmodel
)
11921 case AARCH64_CMODEL_TINY
:
11922 /* When we retrieve symbol + offset address, we have to make sure
11923 the offset does not cause overflow of the final address. But
11924 we have no way of knowing the address of symbol at compile time
11925 so we can't accurately say if the distance between the PC and
11926 symbol + offset is outside the addressible range of +/-1M in the
11927 TINY code model. So we rely on images not being greater than
11928 1M and cap the offset at 1M and anything beyond 1M will have to
11929 be loaded using an alternative mechanism. Furthermore if the
11930 symbol is a weak reference to something that isn't known to
11931 resolve to a symbol in this module, then force to memory. */
11932 if ((SYMBOL_REF_WEAK (x
)
11933 && !aarch64_symbol_binds_local_p (x
))
11934 || !IN_RANGE (offset
, -1048575, 1048575))
11935 return SYMBOL_FORCE_TO_MEM
;
11936 return SYMBOL_TINY_ABSOLUTE
;
11938 case AARCH64_CMODEL_SMALL
:
11939 /* Same reasoning as the tiny code model, but the offset cap here is
11941 if ((SYMBOL_REF_WEAK (x
)
11942 && !aarch64_symbol_binds_local_p (x
))
11943 || !IN_RANGE (offset
, HOST_WIDE_INT_C (-4294967263),
11944 HOST_WIDE_INT_C (4294967264)))
11945 return SYMBOL_FORCE_TO_MEM
;
11946 return SYMBOL_SMALL_ABSOLUTE
;
11948 case AARCH64_CMODEL_TINY_PIC
:
11949 if (!aarch64_symbol_binds_local_p (x
))
11950 return SYMBOL_TINY_GOT
;
11951 return SYMBOL_TINY_ABSOLUTE
;
11953 case AARCH64_CMODEL_SMALL_SPIC
:
11954 case AARCH64_CMODEL_SMALL_PIC
:
11955 if (!aarch64_symbol_binds_local_p (x
))
11956 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
11957 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
11958 return SYMBOL_SMALL_ABSOLUTE
;
11960 case AARCH64_CMODEL_LARGE
:
11961 /* This is alright even in PIC code as the constant
11962 pool reference is always PC relative and within
11963 the same translation unit. */
11964 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
11965 return SYMBOL_SMALL_ABSOLUTE
;
11967 return SYMBOL_FORCE_TO_MEM
;
11970 gcc_unreachable ();
11974 /* By default push everything into the constant pool. */
11975 return SYMBOL_FORCE_TO_MEM
;
11979 aarch64_constant_address_p (rtx x
)
11981 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
11985 aarch64_legitimate_pic_operand_p (rtx x
)
11987 if (GET_CODE (x
) == SYMBOL_REF
11988 || (GET_CODE (x
) == CONST
11989 && GET_CODE (XEXP (x
, 0)) == PLUS
11990 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
11996 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11997 that should be rematerialized rather than spilled. */
12000 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
12002 /* Support CSE and rematerialization of common constants. */
12003 if (CONST_INT_P (x
)
12004 || (CONST_DOUBLE_P (x
) && GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12005 || GET_CODE (x
) == CONST_VECTOR
)
12008 /* Do not allow vector struct mode constants for Advanced SIMD.
12009 We could support 0 and -1 easily, but they need support in
12010 aarch64-simd.md. */
12011 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
12012 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
12015 /* Only accept variable-length vector constants if they can be
12018 ??? It would be possible to handle rematerialization of other
12019 constants via secondary reloads. */
12020 if (vec_flags
& VEC_ANY_SVE
)
12021 return aarch64_simd_valid_immediate (x
, NULL
);
12023 if (GET_CODE (x
) == HIGH
)
12026 /* Accept polynomial constants that can be calculated by using the
12027 destination of a move as the sole temporary. Constants that
12028 require a second temporary cannot be rematerialized (they can't be
12029 forced to memory and also aren't legitimate constants). */
12031 if (poly_int_rtx_p (x
, &offset
))
12032 return aarch64_offset_temporaries (false, offset
) <= 1;
12034 /* If an offset is being added to something else, we need to allow the
12035 base to be moved into the destination register, meaning that there
12036 are no free temporaries for the offset. */
12037 x
= strip_offset (x
, &offset
);
12038 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
12041 /* Do not allow const (plus (anchor_symbol, const_int)). */
12042 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
12045 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
12046 so spilling them is better than rematerialization. */
12047 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
12050 /* Label references are always constant. */
12051 if (GET_CODE (x
) == LABEL_REF
)
12058 aarch64_load_tp (rtx target
)
12061 || GET_MODE (target
) != Pmode
12062 || !register_operand (target
, Pmode
))
12063 target
= gen_reg_rtx (Pmode
);
12065 /* Can return in any reg. */
12066 emit_insn (gen_aarch64_load_tp_hard (target
));
12070 /* On AAPCS systems, this is the "struct __va_list". */
12071 static GTY(()) tree va_list_type
;
/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
   Return the type to use as __builtin_va_list.

   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:

     struct __va_list
     {
       void *__stack;
       void *__gr_top;
       void *__vr_top;
       int   __gr_offs;
       int   __vr_offs;
     };  */
12088 aarch64_build_builtin_va_list (void)
12091 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
12093 /* Create the type. */
12094 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
12095 /* Give it the required name. */
12096 va_list_name
= build_decl (BUILTINS_LOCATION
,
12098 get_identifier ("__va_list"),
12100 DECL_ARTIFICIAL (va_list_name
) = 1;
12101 TYPE_NAME (va_list_type
) = va_list_name
;
12102 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
12104 /* Create the fields. */
12105 f_stack
= build_decl (BUILTINS_LOCATION
,
12106 FIELD_DECL
, get_identifier ("__stack"),
12108 f_grtop
= build_decl (BUILTINS_LOCATION
,
12109 FIELD_DECL
, get_identifier ("__gr_top"),
12111 f_vrtop
= build_decl (BUILTINS_LOCATION
,
12112 FIELD_DECL
, get_identifier ("__vr_top"),
12114 f_groff
= build_decl (BUILTINS_LOCATION
,
12115 FIELD_DECL
, get_identifier ("__gr_offs"),
12116 integer_type_node
);
12117 f_vroff
= build_decl (BUILTINS_LOCATION
,
12118 FIELD_DECL
, get_identifier ("__vr_offs"),
12119 integer_type_node
);
12121 /* Tell tree-stdarg pass about our internal offset fields.
12122 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
12123 purpose to identify whether the code is updating va_list internal
12124 offset fields through irregular way. */
12125 va_list_gpr_counter_field
= f_groff
;
12126 va_list_fpr_counter_field
= f_vroff
;
12128 DECL_ARTIFICIAL (f_stack
) = 1;
12129 DECL_ARTIFICIAL (f_grtop
) = 1;
12130 DECL_ARTIFICIAL (f_vrtop
) = 1;
12131 DECL_ARTIFICIAL (f_groff
) = 1;
12132 DECL_ARTIFICIAL (f_vroff
) = 1;
12134 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
12135 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
12136 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
12137 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
12138 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
12140 TYPE_FIELDS (va_list_type
) = f_stack
;
12141 DECL_CHAIN (f_stack
) = f_grtop
;
12142 DECL_CHAIN (f_grtop
) = f_vrtop
;
12143 DECL_CHAIN (f_vrtop
) = f_groff
;
12144 DECL_CHAIN (f_groff
) = f_vroff
;
12146 /* Compute its layout. */
12147 layout_type (va_list_type
);
12149 return va_list_type
;
12152 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12154 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
12156 const CUMULATIVE_ARGS
*cum
;
12157 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
12158 tree stack
, grtop
, vrtop
, groff
, vroff
;
12160 int gr_save_area_size
= cfun
->va_list_gpr_size
;
12161 int vr_save_area_size
= cfun
->va_list_fpr_size
;
12164 cum
= &crtl
->args
.info
;
12165 if (cfun
->va_list_gpr_size
)
12166 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
12167 cfun
->va_list_gpr_size
);
12168 if (cfun
->va_list_fpr_size
)
12169 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
12170 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
12174 gcc_assert (cum
->aapcs_nvrn
== 0);
12175 vr_save_area_size
= 0;
12178 f_stack
= TYPE_FIELDS (va_list_type_node
);
12179 f_grtop
= DECL_CHAIN (f_stack
);
12180 f_vrtop
= DECL_CHAIN (f_grtop
);
12181 f_groff
= DECL_CHAIN (f_vrtop
);
12182 f_vroff
= DECL_CHAIN (f_groff
);
12184 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
12186 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
12188 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
12190 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
12192 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
12195 /* Emit code to initialize STACK, which points to the next varargs stack
12196 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12197 by named arguments. STACK is 8-byte aligned. */
12198 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
12199 if (cum
->aapcs_stack_size
> 0)
12200 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
12201 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
12202 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
12204 /* Emit code to initialize GRTOP, the top of the GR save area.
12205 virtual_incoming_args_rtx should have been 16 byte aligned. */
12206 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
12207 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
12208 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
12210 /* Emit code to initialize VRTOP, the top of the VR save area.
12211 This address is gr_save_area_bytes below GRTOP, rounded
12212 down to the next 16-byte boundary. */
12213 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
12214 vr_offset
= ROUND_UP (gr_save_area_size
,
12215 STACK_BOUNDARY
/ BITS_PER_UNIT
);
12218 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
12219 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
12220 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
12222 /* Emit code to initialize GROFF, the offset from GRTOP of the
12223 next GPR argument. */
12224 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
12225 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
12226 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
12228 /* Likewise emit code to initialize VROFF, the offset from FTOP
12229 of the next VR argument. */
12230 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
12231 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
12232 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
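  /* Worked example, illustrative only and assuming the eight AAPCS64 general
     argument registers x0-x7: if the named arguments consumed three of them,
     gr_save_area_size is at most (8 - 3) * 8 = 40 bytes, so __gr_offs starts
     at -40 and each integer va_arg advances it by 8 until it reaches 0, after
     which further arguments are taken from __stack.  */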
12235 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12238 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
12239 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
12243 bool is_ha
; /* is HFA or HVA. */
12244 bool dw_align
; /* double-word align. */
12245 machine_mode ag_mode
= VOIDmode
;
12249 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
12250 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
12251 HOST_WIDE_INT size
, rsize
, adjust
, align
;
12252 tree t
, u
, cond1
, cond2
;
12254 indirect_p
= pass_by_reference (NULL
, TYPE_MODE (type
), type
, false);
12256 type
= build_pointer_type (type
);
12258 mode
= TYPE_MODE (type
);
12260 f_stack
= TYPE_FIELDS (va_list_type_node
);
12261 f_grtop
= DECL_CHAIN (f_stack
);
12262 f_vrtop
= DECL_CHAIN (f_grtop
);
12263 f_groff
= DECL_CHAIN (f_vrtop
);
12264 f_vroff
= DECL_CHAIN (f_groff
);
12266 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
12267 f_stack
, NULL_TREE
);
12268 size
= int_size_in_bytes (type
);
12269 align
= aarch64_function_arg_alignment (mode
, type
) / BITS_PER_UNIT
;
12273 if (aarch64_vfp_is_call_or_return_candidate (mode
,
12279 /* No frontends can create types with variable-sized modes, so we
12280 shouldn't be asked to pass or return them. */
12281 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
12283 /* TYPE passed in fp/simd registers. */
12285 aarch64_err_no_fpadvsimd (mode
);
12287 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
12288 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
12289 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
12290 unshare_expr (valist
), f_vroff
, NULL_TREE
);
12292 rsize
= nregs
* UNITS_PER_VREG
;
12296 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
12297 adjust
= UNITS_PER_VREG
- ag_size
;
12299 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
12300 && size
< UNITS_PER_VREG
)
12302 adjust
= UNITS_PER_VREG
- size
;
12307 /* TYPE passed in general registers. */
12308 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
12309 unshare_expr (valist
), f_grtop
, NULL_TREE
);
12310 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
12311 unshare_expr (valist
), f_groff
, NULL_TREE
);
12312 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
12313 nregs
= rsize
/ UNITS_PER_WORD
;
12318 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
12319 && size
< UNITS_PER_WORD
)
12321 adjust
= UNITS_PER_WORD
- size
;
12325 /* Get a local temporary for the field value. */
12326 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
12328 /* Emit code to branch if off >= 0. */
12329 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
12330 build_int_cst (TREE_TYPE (off
), 0));
12331 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
12335 /* Emit: offs = (offs + 15) & -16. */
12336 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
12337 build_int_cst (TREE_TYPE (off
), 15));
12338 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
12339 build_int_cst (TREE_TYPE (off
), -16));
12340 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
12345 /* Update ap.__[g|v]r_offs */
12346 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
12347 build_int_cst (TREE_TYPE (off
), rsize
));
12348 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
12352 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
12354 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12355 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
12356 build_int_cst (TREE_TYPE (f_off
), 0));
12357 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
12359 /* String up: make sure the assignment happens before the use. */
12360 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
12361 COND_EXPR_ELSE (cond1
) = t
;
12363 /* Prepare the trees handling the argument that is passed on the stack;
12364 the top level node will store in ON_STACK. */
12365 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
12368 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12369 t
= fold_build_pointer_plus_hwi (arg
, 15);
12370 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
12371 build_int_cst (TREE_TYPE (t
), -16));
12372 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
12376 /* Advance ap.__stack */
12377 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
12378 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
12379 build_int_cst (TREE_TYPE (t
), -8));
12380 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
12381 /* String up roundup and advance. */
12383 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
12384 /* String up with arg */
12385 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
12386 /* Big-endianness related address adjustment. */
12387 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
12388 && size
< UNITS_PER_WORD
)
12390 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
12391 size_int (UNITS_PER_WORD
- size
));
12392 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
12395 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
12396 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
12398 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12401 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
12402 build_int_cst (TREE_TYPE (off
), adjust
));
12404 t
= fold_convert (sizetype
, t
);
12405 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
12409 /* type ha; // treat as "struct {ftype field[n];}"
12410 ... [computing offs]
12411 for (i = 0; i <nregs; ++i, offs += 16)
12412 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12415 tree tmp_ha
, field_t
, field_ptr_t
;
12417 /* Declare a local variable. */
12418 tmp_ha
= create_tmp_var_raw (type
, "ha");
12419 gimple_add_tmp_var (tmp_ha
);
12421 /* Establish the base type. */
12425 field_t
= float_type_node
;
12426 field_ptr_t
= float_ptr_type_node
;
12429 field_t
= double_type_node
;
12430 field_ptr_t
= double_ptr_type_node
;
12433 field_t
= long_double_type_node
;
12434 field_ptr_t
= long_double_ptr_type_node
;
12437 field_t
= aarch64_fp16_type_node
;
12438 field_ptr_t
= aarch64_fp16_ptr_type_node
;
12443 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
12444 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
12445 field_ptr_t
= build_pointer_type (field_t
);
12452 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
12453 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
12455 t
= fold_convert (field_ptr_t
, addr
);
12456 t
= build2 (MODIFY_EXPR
, field_t
,
12457 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
12458 build1 (INDIRECT_REF
, field_t
, t
));
12460 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12461 for (i
= 1; i
< nregs
; ++i
)
12463 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
12464 u
= fold_convert (field_ptr_t
, addr
);
12465 u
= build2 (MODIFY_EXPR
, field_t
,
12466 build2 (MEM_REF
, field_t
, tmp_ha
,
12467 build_int_cst (field_ptr_t
,
12469 int_size_in_bytes (field_t
)))),
12470 build1 (INDIRECT_REF
, field_t
, u
));
12471 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
12474 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
12475 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
12478 COND_EXPR_ELSE (cond2
) = t
;
12479 addr
= fold_convert (build_pointer_type (type
), cond1
);
12480 addr
= build_va_arg_indirect_ref (addr
);
12483 addr
= build_va_arg_indirect_ref (addr
);
12488 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12491 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
, machine_mode mode
,
12492 tree type
, int *pretend_size ATTRIBUTE_UNUSED
,
12495 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
12496 CUMULATIVE_ARGS local_cum
;
12497 int gr_saved
= cfun
->va_list_gpr_size
;
12498 int vr_saved
= cfun
->va_list_fpr_size
;
12500 /* The caller has advanced CUM up to, but not beyond, the last named
12501 argument. Advance a local copy of CUM past the last "real" named
12502 argument, to find out how many registers are left over. */
12504 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), mode
, type
, true);
12506 /* Found out how many registers we need to save.
12507 Honor tree-stdvar analysis results. */
12508 if (cfun
->va_list_gpr_size
)
12509 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
12510 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
12511 if (cfun
->va_list_fpr_size
)
12512 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
12513 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
12517 gcc_assert (local_cum
.aapcs_nvrn
== 0);
12527 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12528 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
12529 - gr_saved
* UNITS_PER_WORD
);
12530 mem
= gen_frame_mem (BLKmode
, ptr
);
12531 set_mem_alias_set (mem
, get_varargs_alias_set ());
12533 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
12538 /* We can't use move_block_from_reg, because it will use
12539 the wrong mode, storing D regs only. */
12540 machine_mode mode
= TImode
;
12541 int off
, i
, vr_start
;
12543 /* Set OFF to the offset from virtual_incoming_args_rtx of
12544 the first vector register. The VR save area lies below
12545 the GR one, and is aligned to 16 bytes. */
12546 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
12547 STACK_BOUNDARY
/ BITS_PER_UNIT
);
12548 off
-= vr_saved
* UNITS_PER_VREG
;
12550 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
12551 for (i
= 0; i
< vr_saved
; ++i
)
12555 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
12556 mem
= gen_frame_mem (mode
, ptr
);
12557 set_mem_alias_set (mem
, get_varargs_alias_set ());
12558 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
12559 off
+= UNITS_PER_VREG
;
12564 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12565 any complication of having crtl->args.pretend_args_size changed. */
12566 cfun
->machine
->frame
.saved_varargs_size
12567 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
12568 STACK_BOUNDARY
/ BITS_PER_UNIT
)
12569 + vr_saved
* UNITS_PER_VREG
);
12573 aarch64_conditional_register_usage (void)
12578 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
12581 call_used_regs
[i
] = 1;
12585 for (i
= P0_REGNUM
; i
<= P15_REGNUM
; i
++)
12588 call_used_regs
[i
] = 1;
/* Walk down the type tree of TYPE counting consecutive base elements.
   If *MODEP is VOIDmode, then set it to the first valid floating point
   type.  If a non-floating point type is found, or if a floating point
   type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
   otherwise return the count in the sub-tree.  */
12598 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
12601 HOST_WIDE_INT size
;
12603 switch (TREE_CODE (type
))
12606 mode
= TYPE_MODE (type
);
12607 if (mode
!= DFmode
&& mode
!= SFmode
12608 && mode
!= TFmode
&& mode
!= HFmode
)
12611 if (*modep
== VOIDmode
)
12614 if (*modep
== mode
)
12620 mode
= TYPE_MODE (TREE_TYPE (type
));
12621 if (mode
!= DFmode
&& mode
!= SFmode
12622 && mode
!= TFmode
&& mode
!= HFmode
)
12625 if (*modep
== VOIDmode
)
12628 if (*modep
== mode
)
12634 /* Use V2SImode and V4SImode as representatives of all 64-bit
12635 and 128-bit vector types. */
12636 size
= int_size_in_bytes (type
);
12649 if (*modep
== VOIDmode
)
12652 /* Vector modes are considered to be opaque: two vectors are
12653 equivalent for the purposes of being homogeneous aggregates
12654 if they are the same size. */
12655 if (*modep
== mode
)
12663 tree index
= TYPE_DOMAIN (type
);
12665 /* Can't handle incomplete types nor sizes that are not
12667 if (!COMPLETE_TYPE_P (type
)
12668 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
12671 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
12674 || !TYPE_MAX_VALUE (index
)
12675 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
12676 || !TYPE_MIN_VALUE (index
)
12677 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
12681 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
12682 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
12684 /* There must be no padding. */
12685 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
12686 count
* GET_MODE_BITSIZE (*modep
)))
12698 /* Can't handle incomplete types nor sizes that are not
12700 if (!COMPLETE_TYPE_P (type
)
12701 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
12704 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
12706 if (TREE_CODE (field
) != FIELD_DECL
)
12709 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
12712 count
+= sub_count
;
12715 /* There must be no padding. */
12716 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
12717 count
* GET_MODE_BITSIZE (*modep
)))
12724 case QUAL_UNION_TYPE
:
12726 /* These aren't very interesting except in a degenerate case. */
12731 /* Can't handle incomplete types nor sizes that are not
12733 if (!COMPLETE_TYPE_P (type
)
12734 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
12737 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
12739 if (TREE_CODE (field
) != FIELD_DECL
)
12742 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
12745 count
= count
> sub_count
? count
: sub_count
;
12748 /* There must be no padding. */
12749 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
12750 count
* GET_MODE_BITSIZE (*modep
)))
/* Return TRUE if the type, as described by TYPE and MODE, is a short vector
   type as described in AAPCS64 \S 4.1.2.

   See the comment above aarch64_composite_type_p for the notes on MODE.  */

static bool
aarch64_short_vector_p (const_tree type,
			machine_mode mode)
{
  poly_int64 size = -1;

  if (type && TREE_CODE (type) == VECTOR_TYPE)
    size = int_size_in_bytes (type);
  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
	   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
    size = GET_MODE_SIZE (mode);

  return known_eq (size, 8) || known_eq (size, 16);
}
12783 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12784 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12785 array types. The C99 floating-point complex types are also considered
12786 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12787 types, which are GCC extensions and out of the scope of AAPCS64, are
12788 treated as composite types here as well.
12790 Note that MODE itself is not sufficient in determining whether a type
12791 is such a composite type or not. This is because
12792 stor-layout.c:compute_record_mode may have already changed the MODE
12793 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12794 structure with only one field may have its MODE set to the mode of the
12795 field. Also an integer mode whose size matches the size of the
12796 RECORD_TYPE type may be used to substitute the original mode
12797 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12798 solely relied on. */
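/* Example (editor's illustration, not from the original source): a struct
   such as

     struct wrapper { float f; };

   may be given SFmode by stor-layout, yet it is still AGGREGATE_TYPE_P and
   is therefore classified as a composite type by the check below.  */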
12801 aarch64_composite_type_p (const_tree type
,
12804 if (aarch64_short_vector_p (type
, mode
))
12807 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
12810 if (mode
== BLKmode
12811 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
12812 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
12818 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12819 shall be passed or returned in simd/fp register(s) (providing these
12820 parameter passing registers are available).
12822 Upon successful return, *COUNT returns the number of needed registers,
12823 *BASE_MODE returns the mode of the individual register and when IS_HA
12824 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12825 floating-point aggregate or a homogeneous short-vector aggregate. */
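/* Worked example (added for exposition, not in the original source): for

     struct hfa4 { float a, b, c, d; };

   aapcs_vfp_sub_candidate returns 4 with the base mode set to SFmode, so
   this function would set *COUNT to 4, *BASE_MODE to SFmode and, when
   requested, *IS_HA to true; a fifth float member would exceed
   HA_MAX_NUM_FLDS and the aggregate would no longer qualify.  */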
12828 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
12830 machine_mode
*base_mode
,
12834 machine_mode new_mode
= VOIDmode
;
12835 bool composite_p
= aarch64_composite_type_p (type
, mode
);
12837 if (is_ha
!= NULL
) *is_ha
= false;
12839 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12840 || aarch64_short_vector_p (type
, mode
))
12845 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
12847 if (is_ha
!= NULL
) *is_ha
= true;
12849 new_mode
= GET_MODE_INNER (mode
);
12851 else if (type
&& composite_p
)
12853 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
12855 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
12857 if (is_ha
!= NULL
) *is_ha
= true;
12866 *base_mode
= new_mode
;
12870 /* Implement TARGET_STRUCT_VALUE_RTX. */
12873 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12874 			  int incoming ATTRIBUTE_UNUSED)
12876   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12879 /* Implements target hook vector_mode_supported_p. */
12881 aarch64_vector_mode_supported_p (machine_mode mode)
12883   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12884   return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12887 /* Return appropriate SIMD container
12888 for MODE within a vector of WIDTH bits. */
12889 static machine_mode
12890 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12892   if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12908     return VNx16QImode;
12913   gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12916   if (known_eq (width, 128))
12956 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12957 static machine_mode
12958 aarch64_preferred_simd_mode (scalar_mode mode)
12960   poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12961   return aarch64_simd_container_mode (mode, bits);
12964 /* Return a list of possible vector sizes for the vectorizer
12965 to iterate over. */
12967 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12970   sizes->safe_push (BYTES_PER_SVE_VECTOR);
12971   sizes->safe_push (16);
12972   sizes->safe_push (8);
12975 /* Implement TARGET_MANGLE_TYPE. */
12977 static const char *
12978 aarch64_mangle_type (const_tree type
)
12980 /* The AArch64 ABI documents say that "__va_list" has to be
12981 mangled as if it is in the "std" namespace. */
12982 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
12983 return "St9__va_list";
12985 /* Half-precision float. */
12986 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
12989 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for builtin types.  */
12991 if (TYPE_NAME (type
) != NULL
)
12992 return aarch64_mangle_builtin_type (type
);
12994 /* Use the default mangling. */
12998 /* Find the first rtx_insn before insn that will generate an assembly instruction.  */
13002 aarch64_prev_real_insn (rtx_insn
*insn
)
13009 insn
= prev_real_insn (insn
);
13011 while (insn
&& recog_memoized (insn
) < 0);
13017 is_madd_op (enum attr_type t1
)
13020 /* A number of these may be AArch32 only. */
13021 enum attr_type mlatypes
[] = {
13022 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
13023 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
13024 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
13027 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
13029 if (t1
== mlatypes
[i
])
13036 /* Check if there is a register dependency between a load and the insn
13037 for which we hold recog_data. */
13040 dep_between_memop_and_curr (rtx memop
)
13045 gcc_assert (GET_CODE (memop
) == SET
);
13047 if (!REG_P (SET_DEST (memop
)))
13050 load_reg
= SET_DEST (memop
);
13051 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
13053 rtx operand
= recog_data
.operand
[opno
];
13054 if (REG_P (operand
)
13055 && reg_overlap_mentioned_p (load_reg
, operand
))
13063 /* When working around the Cortex-A53 erratum 835769,
13064 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13065 instruction and has a preceding memory instruction such that a NOP
13066 should be inserted between them. */
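/* Illustrative only (added, not from the original source): with
   -mfix-cortex-a53-835769, a sequence along the lines of

	ldr	x1, [x2]
	madd	x0, x3, x4, x0

   is the situation this check looks for; aarch64_final_prescan_insn below
   then emits "nop // between mem op and mult-accumulate" between the two
   instructions.  */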
13069 aarch64_madd_needs_nop (rtx_insn
* insn
)
13071 enum attr_type attr_type
;
13075 if (!TARGET_FIX_ERR_A53_835769
)
13078 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
13081 attr_type
= get_attr_type (insn
);
13082 if (!is_madd_op (attr_type
))
13085 prev
= aarch64_prev_real_insn (insn
);
13086 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13087 Restore recog state to INSN to avoid state corruption. */
13088 extract_constrain_insn_cached (insn
);
13090 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
13093 body
= single_set (prev
);
13095 /* If the previous insn is a memory op and there is no dependency between
13096 it and the DImode madd, emit a NOP between them. If body is NULL then we
13097 have a complex memory operation, probably a load/store pair.
13098 Be conservative for now and emit a NOP. */
13099 if (GET_MODE (recog_data
.operand
[0]) == DImode
13100 && (!body
|| !dep_between_memop_and_curr (body
)))
13108 /* Implement FINAL_PRESCAN_INSN. */
13111 aarch64_final_prescan_insn (rtx_insn
*insn
)
13113 if (aarch64_madd_needs_nop (insn
))
13114 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
13118 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX instruction.  */
13122 aarch64_sve_index_immediate_p (rtx base_or_step)
13124   return (CONST_INT_P (base_or_step)
13125 	  && IN_RANGE (INTVAL (base_or_step), -16, 15));
13128 /* Return true if X is a valid immediate for the SVE ADD and SUB
13129 instructions. Negate X first if NEGATE_P is true. */
13132 aarch64_sve_arith_immediate_p (rtx x
, bool negate_p
)
13136 if (!const_vec_duplicate_p (x
, &elt
)
13137 || !CONST_INT_P (elt
))
13140 HOST_WIDE_INT val
= INTVAL (elt
);
13143 val
&= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x
)));
13146 return IN_RANGE (val
, 0, 0xff);
13147 return IN_RANGE (val
, 0, 0xff00);
13150 /* Return true if X is a valid immediate operand for an SVE logical
13151 instruction such as AND. */
13154 aarch64_sve_bitmask_immediate_p (rtx x
)
13158 return (const_vec_duplicate_p (x
, &elt
)
13159 && CONST_INT_P (elt
)
13160 && aarch64_bitmask_imm (INTVAL (elt
),
13161 GET_MODE_INNER (GET_MODE (x
))));
13164 /* Return true if X is a valid immediate for the SVE DUP and CPY instructions.  */
13168 aarch64_sve_dup_immediate_p (rtx x
)
13172 if (!const_vec_duplicate_p (x
, &elt
)
13173 || !CONST_INT_P (elt
))
13176 HOST_WIDE_INT val
= INTVAL (elt
);
13178 return IN_RANGE (val
, -0x80, 0x7f);
13179 return IN_RANGE (val
, -0x8000, 0x7f00);
13182 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13183 SIGNED_P says whether the operand is signed rather than unsigned. */
13186 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
13190 return (const_vec_duplicate_p (x
, &elt
)
13191 && CONST_INT_P (elt
)
13193 ? IN_RANGE (INTVAL (elt
), -16, 15)
13194 : IN_RANGE (INTVAL (elt
), 0, 127)));
13197 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13198 instruction. Negate X first if NEGATE_P is true. */
13201 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
13206 if (!const_vec_duplicate_p (x
, &elt
)
13207 || GET_CODE (elt
) != CONST_DOUBLE
)
13210 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
13213 r
= real_value_negate (&r
);
13215 if (real_equal (&r
, &dconst1
))
13217 if (real_equal (&r
, &dconsthalf
))
13222 /* Return true if X is a valid immediate operand for an SVE FMUL instruction.  */
13226 aarch64_sve_float_mul_immediate_p (rtx x
)
13230 /* GCC will never generate a multiply with an immediate of 2, so there is no
13231 point testing for it (even though it is a valid constant). */
13232 return (const_vec_duplicate_p (x
, &elt
)
13233 && GET_CODE (elt
) == CONST_DOUBLE
13234 && real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
));
13237 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13238 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13239 is nonnull, use it to describe valid immediates. */
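/* Editor's example (not part of the original source): replicating
   0x0000ab00 is accepted by the 4-byte loop below as value 0xab with
   LSL #8, and 0xab00ab00 is accepted by the 2-byte loop as value 0xab
   with LSL #8.  */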
13241 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
13242 simd_immediate_info
*info
,
13243 enum simd_immediate_check which
,
13244 simd_immediate_info::insn_type insn
)
13246 /* Try a 4-byte immediate with LSL. */
13247 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
13248 if ((val32
& (0xff << shift
)) == val32
)
13251 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
13252 simd_immediate_info::LSL
, shift
);
13256 /* Try a 2-byte immediate with LSL. */
13257 unsigned int imm16
= val32
& 0xffff;
13258 if (imm16
== (val32
>> 16))
13259 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
13260 if ((imm16
& (0xff << shift
)) == imm16
)
13263 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
13264 simd_immediate_info::LSL
, shift
);
13268 /* Try a 4-byte immediate with MSL, except for cases that MVN
13270 if (which
== AARCH64_CHECK_MOV
)
13271 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
13273 unsigned int low
= (1 << shift
) - 1;
13274 if (((val32
& (0xff << shift
)) | low
) == val32
)
13277 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
13278 simd_immediate_info::MSL
, shift
);
13286 /* Return true if replicating VAL64 is a valid immediate for the
13287 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13288 use it to describe valid immediates. */
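/* Example (added for exposition, not in the original source): a value such
   as 0x2a2a2a2a2a2a2a2a is accepted as a replicated byte (QImode 0x2a),
   while 0xff0000ffff0000ff is accepted by the "bit-to-bytemask" case below
   because every one of its bytes is either 0x00 or 0xff.  */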
13290 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
13291 simd_immediate_info
*info
,
13292 enum simd_immediate_check which
)
13294 unsigned int val32
= val64
& 0xffffffff;
13295 unsigned int val16
= val64
& 0xffff;
13296 unsigned int val8
= val64
& 0xff;
13298 if (val32
== (val64
>> 32))
13300 if ((which
& AARCH64_CHECK_ORR
) != 0
13301 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
13302 simd_immediate_info::MOV
))
13305 if ((which
& AARCH64_CHECK_BIC
) != 0
13306 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
13307 simd_immediate_info::MVN
))
13310 /* Try using a replicated byte. */
13311 if (which
== AARCH64_CHECK_MOV
13312 && val16
== (val32
>> 16)
13313 && val8
== (val16
>> 8))
13316 *info
= simd_immediate_info (QImode
, val8
);
13321 /* Try using a bit-to-bytemask. */
13322 if (which
== AARCH64_CHECK_MOV
)
13325 for (i
= 0; i
< 64; i
+= 8)
13327 unsigned char byte
= (val64
>> i
) & 0xff;
13328 if (byte
!= 0 && byte
!= 0xff)
13334 *info
= simd_immediate_info (DImode
, val64
);
13341 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13342 instruction. If INFO is nonnull, use it to describe valid immediates. */
13345 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
13346 simd_immediate_info
*info
)
13348 scalar_int_mode mode
= DImode
;
13349 unsigned int val32
= val64
& 0xffffffff;
13350 if (val32
== (val64
>> 32))
13353 unsigned int val16
= val32
& 0xffff;
13354 if (val16
== (val32
>> 16))
13357 unsigned int val8
= val16
& 0xff;
13358 if (val8
== (val16
>> 8))
13362 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
13363 if (IN_RANGE (val
, -0x80, 0x7f))
13365 /* DUP with no shift. */
13367 *info
= simd_immediate_info (mode
, val
);
13370 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
13372 /* DUP with LSL #8. */
13374 *info
= simd_immediate_info (mode
, val
);
13377 if (aarch64_bitmask_imm (val64
, mode
))
13381 *info
= simd_immediate_info (mode
, val
);
13387 /* Return true if OP is a valid SIMD immediate for the operation
13388 described by WHICH. If INFO is nonnull, use it to describe valid immediates.  */
13391 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
13392 enum simd_immediate_check which
)
13394 machine_mode mode
= GET_MODE (op
);
13395 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
13396 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
13399 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
13401 unsigned int n_elts
;
13402 if (GET_CODE (op
) == CONST_VECTOR
13403 && CONST_VECTOR_DUPLICATE_P (op
))
13404 n_elts
= CONST_VECTOR_NPATTERNS (op
);
13405 else if ((vec_flags
& VEC_SVE_DATA
)
13406 && const_vec_series_p (op
, &base
, &step
))
13408 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
13409 if (!aarch64_sve_index_immediate_p (base
)
13410 || !aarch64_sve_index_immediate_p (step
))
13414 *info
= simd_immediate_info (elt_mode
, base
, step
);
13417 else if (GET_CODE (op
) == CONST_VECTOR
13418 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
13419 /* N_ELTS set above. */;
13423 /* Handle PFALSE and PTRUE. */
13424 if (vec_flags
& VEC_SVE_PRED
)
13425 return (op
== CONST0_RTX (mode
)
13426 || op
== CONSTM1_RTX (mode
));
13428 scalar_float_mode elt_float_mode
;
13430 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
13432 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
13433 if (aarch64_float_const_zero_rtx_p (elt
)
13434 || aarch64_float_const_representable_p (elt
))
13437 *info
= simd_immediate_info (elt_float_mode
, elt
);
13442 unsigned int elt_size
= GET_MODE_SIZE (elt_mode
);
13446 scalar_int_mode elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
13448 /* Expand the vector constant out into a byte vector, with the least
13449 significant byte of the register first. */
13450 auto_vec
<unsigned char, 16> bytes
;
13451 bytes
.reserve (n_elts
* elt_size
);
13452 for (unsigned int i
= 0; i
< n_elts
; i
++)
13454 /* The vector is provided in gcc endian-neutral fashion.
13455 For aarch64_be Advanced SIMD, it must be laid out in the vector
13456 register in reverse order. */
13457 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
13458 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
13460 if (elt_mode
!= elt_int_mode
)
13461 elt
= gen_lowpart (elt_int_mode
, elt
);
13463 if (!CONST_INT_P (elt
))
13466 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
13467 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
13469 bytes
.quick_push (elt_val
& 0xff);
13470 elt_val
>>= BITS_PER_UNIT
;
13474 /* The immediate must repeat every eight bytes. */
13475 unsigned int nbytes
= bytes
.length ();
13476 for (unsigned i
= 8; i
< nbytes
; ++i
)
13477 if (bytes
[i
] != bytes
[i
- 8])
13480 /* Get the repeating 8-byte value as an integer. No endian correction
13481 is needed here because bytes is already in lsb-first order. */
13482 unsigned HOST_WIDE_INT val64
= 0;
13483 for (unsigned int i
= 0; i
< 8; i
++)
13484 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
13485 << (i
* BITS_PER_UNIT
));
13487 if (vec_flags
& VEC_SVE_DATA
)
13488 return aarch64_sve_valid_immediate (val64
, info
);
13490 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
13493 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13494 has a step in the range of INDEX. Return the index expression if so,
13495 otherwise return null. */
13497 aarch64_check_zero_based_sve_index_immediate (rtx x
)
13500 if (const_vec_series_p (x
, &base
, &step
)
13501 && base
== const0_rtx
13502 && aarch64_sve_index_immediate_p (step
))
13507 /* Check whether immediate shift constants are within range. */
13509 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13511   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13513     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13515     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13518 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13519 operation of width WIDTH at bit position POS. */
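/* Example (editor's illustration, not part of the original source): for
   WIDTH == 8 and POS == 4 the mask computed below is
   ((1 << 8) - 1) << 4 == 0xff0.  */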
13522 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13524   gcc_assert (CONST_INT_P (width));
13525   gcc_assert (CONST_INT_P (pos));
13527   unsigned HOST_WIDE_INT mask
13528     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13529   return GEN_INT (mask << UINTVAL (pos));
13533 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
13535 if (GET_CODE (x
) == HIGH
13536 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
13539 if (CONST_INT_P (x
))
13542 if (VECTOR_MODE_P (GET_MODE (x
)))
13543 return aarch64_simd_valid_immediate (x
, NULL
);
13545 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
13548 if (aarch64_sve_cnt_immediate_p (x
))
13551 return aarch64_classify_symbolic_expression (x
)
13552 == SYMBOL_TINY_ABSOLUTE
;
13555 /* Return a const_int vector of VAL. */
13557 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13559   rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13560   return gen_const_vec_duplicate (mode, c);
13563 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13566 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
13568 machine_mode vmode
;
13570 vmode
= aarch64_simd_container_mode (mode
, 64);
13571 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
13572 return aarch64_simd_valid_immediate (op_v
, NULL
);
13575 /* Construct and return a PARALLEL RTX vector with elements numbering the
13576 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13577 the vector - from the perspective of the architecture. This does not
13578 line up with GCC's perspective on lane numbers, so we end up with
13579 different masks depending on our target endian-ness. The diagram
13580 below may help. We must draw the distinction when building masks
13581 which select one half of the vector. An instruction selecting
13582 architectural low-lanes for a big-endian target must be described using
13583 a mask selecting GCC high-lanes.
13585 Big-Endian Little-Endian
13587 GCC 0 1 2 3 3 2 1 0
13588 | x | x | x | x | | x | x | x | x |
13589 Architecture 3 2 1 0 3 2 1 0
13591 Low Mask: { 2, 3 } { 0, 1 }
13592 High Mask: { 0, 1 } { 2, 3 }
13594 MODE is the mode of the vector and NUNITS is the number of units in it. */
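/* Concrete example (added, not from the original source): for V4SImode
   with NUNITS == 4, the routine below returns the PARALLEL { 0, 1 } for the
   low half and { 2, 3 } for the high half on little-endian, and { 2, 3 } /
   { 0, 1 } respectively on big-endian, matching the diagram above.  */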
13597 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
13599 rtvec v
= rtvec_alloc (nunits
/ 2);
13600 int high_base
= nunits
/ 2;
13606 if (BYTES_BIG_ENDIAN
)
13607 base
= high
? low_base
: high_base
;
13609 base
= high
? high_base
: low_base
;
13611 for (i
= 0; i
< nunits
/ 2; i
++)
13612 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
13614 t1
= gen_rtx_PARALLEL (mode
, v
);
13618 /* Check OP for validity as a PARALLEL RTX vector with elements
13619 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13620 from the perspective of the architecture. See the diagram above
13621 aarch64_simd_vect_par_cnst_half for more details. */
13624 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
13628 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
13631 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
13632 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
13633 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
13636 if (count_op
!= count_ideal
)
13639 for (i
= 0; i
< count_ideal
; i
++)
13641 rtx elt_op
= XVECEXP (op
, 0, i
);
13642 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
13644 if (!CONST_INT_P (elt_op
)
13645 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
13651 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13652 HIGH (exclusive). */
13654 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
13657 HOST_WIDE_INT lane
;
13658 gcc_assert (CONST_INT_P (operand
));
13659 lane
= INTVAL (operand
);
13661 if (lane
< low
|| lane
>= high
)
13664 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
13666 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
13670 /* Perform endian correction on lane number N, which indexes a vector
13671 of mode MODE, and return the result as an SImode rtx. */
13674 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13676   return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13679 /* Return TRUE if OP is a valid vector addressing mode. */
13682 aarch64_simd_mem_operand_p (rtx op)
13684   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13685 			|| REG_P (XEXP (op, 0)));
13688 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13691 aarch64_sve_ld1r_operand_p (rtx op
)
13693 struct aarch64_address_info addr
;
13697 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
13698 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
13699 && addr
.type
== ADDRESS_REG_IMM
13700 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
13703 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13704 The conditions for STR are the same. */
13706 aarch64_sve_ldr_operand_p (rtx op
)
13708 struct aarch64_address_info addr
;
13711 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
13712 false, ADDR_QUERY_ANY
)
13713 && addr
.type
== ADDRESS_REG_IMM
);
13716 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13717 We need to be able to access the individual pieces, so the range
13718 is different from LD[234] and ST[234]. */
13720 aarch64_sve_struct_memory_operand_p (rtx op
)
13725 machine_mode mode
= GET_MODE (op
);
13726 struct aarch64_address_info addr
;
13727 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
13729 || addr
.type
!= ADDRESS_REG_IMM
)
13732 poly_int64 first
= addr
.const_offset
;
13733 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
13734 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
13735 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
13738 /* Emit a register copy from operand to operand, taking care not to
13739 early-clobber source registers in the process.
13741 COUNT is the number of components into which the copy needs to be
13744 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
13745 unsigned int count
)
13748 int rdest
= REGNO (operands
[0]);
13749 int rsrc
= REGNO (operands
[1]);
13751 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
13753 for (i
= 0; i
< count
; i
++)
13754 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
13755 gen_rtx_REG (mode
, rsrc
+ i
));
13757 for (i
= 0; i
< count
; i
++)
13758 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
13759 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
13762 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13763 one of VSTRUCT modes: OI, CI, or XI. */
13765 aarch64_simd_attr_length_rglist (machine_mode mode)
13767   /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13768   return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13771 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13772 alignment of a vector to 128 bits. SVE predicates have an alignment of 16 bits.  */
13774 static HOST_WIDE_INT
13775 aarch64_simd_vector_alignment (const_tree type
)
13777 if (TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
13778 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13779 be set for non-predicate vectors of booleans. Modes are the most
13780 direct way we have of identifying real SVE predicate types. */
13781 return GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
? 16 : 128;
13782 HOST_WIDE_INT align
= tree_to_shwi (TYPE_SIZE (type
));
13783 return MIN (align
, 128);
13786 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13787 static HOST_WIDE_INT
13788 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
13790 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
13792 /* If the length of the vector is fixed, try to align to that length,
13793 otherwise don't try to align at all. */
13794 HOST_WIDE_INT result
;
13795 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
13796 result
= TYPE_ALIGN (TREE_TYPE (type
));
13799 return TYPE_ALIGN (type
);
13802 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13804 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
13809 /* For fixed-length vectors, check that the vectorizer will aim for
13810 full-vector alignment. This isn't true for generic GCC vectors
13811 that are wider than the ABI maximum of 128 bits. */
13812 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
13813 && (wi::to_widest (TYPE_SIZE (type
))
13814 != aarch64_vectorize_preferred_vector_alignment (type
)))
13817 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13821 /* Return true if the vector misalignment factor is supported by the target.  */
13824 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
13825 const_tree type
, int misalignment
,
13828 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
13830 /* Return if movmisalign pattern is not supported for this mode. */
13831 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
13834 /* Misalignment factor is unknown at compile time. */
13835 if (misalignment
== -1)
13838 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
13842 /* If VALS is a vector constant that can be loaded into a register
13843 using DUP, generate instructions to do so and return an RTX to
13844 assign to the register. Otherwise return NULL_RTX. */
13846 aarch64_simd_dup_constant (rtx vals
)
13848 machine_mode mode
= GET_MODE (vals
);
13849 machine_mode inner_mode
= GET_MODE_INNER (mode
);
13852 if (!const_vec_duplicate_p (vals
, &x
))
13855 /* We can load this constant by using DUP and a constant in a
13856 single ARM register. This will be cheaper than a vector
13858 x
= copy_to_mode_reg (inner_mode
, x
);
13859 return gen_vec_duplicate (mode
, x
);
13863 /* Generate code to load VALS, which is a PARALLEL containing only
13864 constants (for vec_init) or CONST_VECTOR, efficiently into a
13865 register. Returns an RTX to copy into the register, or NULL_RTX
13866 for a PARALLEL that can not be converted into a CONST_VECTOR. */
13868 aarch64_simd_make_constant (rtx vals
)
13870 machine_mode mode
= GET_MODE (vals
);
13872 rtx const_vec
= NULL_RTX
;
13876 if (GET_CODE (vals
) == CONST_VECTOR
)
13878 else if (GET_CODE (vals
) == PARALLEL
)
13880 /* A CONST_VECTOR must contain only CONST_INTs and
13881 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13882 Only store valid constants in a CONST_VECTOR. */
13883 int n_elts
= XVECLEN (vals
, 0);
13884 for (i
= 0; i
< n_elts
; ++i
)
13886 rtx x
= XVECEXP (vals
, 0, i
);
13887 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
13890 if (n_const
== n_elts
)
13891 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
13894 gcc_unreachable ();
13896 if (const_vec
!= NULL_RTX
13897 && aarch64_simd_valid_immediate (const_vec
, NULL
))
13898 /* Load using MOVI/MVNI. */
13900 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
13901 /* Loaded using DUP. */
13903 else if (const_vec
!= NULL_RTX
)
13904 /* Load from constant pool. We can not take advantage of single-cycle
13905 LD1 because we need a PC-relative addressing mode. */
13908 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13909 We can not construct an initializer. */
13913 /* Expand a vector initialisation sequence, such that TARGET is
13914 initialised to contain VALS. */
13917 aarch64_expand_vector_init (rtx target
, rtx vals
)
13919 machine_mode mode
= GET_MODE (target
);
13920 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
13921 /* The number of vector elements. */
13922 int n_elts
= XVECLEN (vals
, 0);
13923 /* The number of vector elements which are not constant. */
13925 rtx any_const
= NULL_RTX
;
13926 /* The first element of vals. */
13927 rtx v0
= XVECEXP (vals
, 0, 0);
13928 bool all_same
= true;
13930 /* Count the number of variable elements to initialise. */
13931 for (int i
= 0; i
< n_elts
; ++i
)
13933 rtx x
= XVECEXP (vals
, 0, i
);
13934 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
13939 all_same
&= rtx_equal_p (x
, v0
);
13942 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13943 how best to handle this. */
13946 rtx constant
= aarch64_simd_make_constant (vals
);
13947 if (constant
!= NULL_RTX
)
13949 emit_move_insn (target
, constant
);
13954 /* Splat a single non-constant element if we can. */
13957 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
13958 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
13962 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
13963 gcc_assert (icode
!= CODE_FOR_nothing
);
13965 /* If there are only variable elements, try to optimize
13966 the insertion using dup for the most common element
13967 followed by insertions. */
13969 /* The algorithm will fill matches[*][0] with the earliest matching element,
13970 and matches[X][1] with the count of duplicate elements (if X is the
13971 earliest element which has duplicates). */
13973 if (n_var
== n_elts
&& n_elts
<= 16)
13975 int matches
[16][2] = {0};
13976 for (int i
= 0; i
< n_elts
; i
++)
13978 for (int j
= 0; j
<= i
; j
++)
13980 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
13988 int maxelement
= 0;
13990 for (int i
= 0; i
< n_elts
; i
++)
13991 if (matches
[i
][1] > maxv
)
13994 maxv
= matches
[i
][1];
13997 /* Create a duplicate of the most common element, unless all elements
13998 are equally useless to us, in which case just immediately set the
13999 vector register using the first element. */
14003 /* For vectors of two 64-bit elements, we can do even better. */
14005 && (inner_mode
== E_DImode
14006 || inner_mode
== E_DFmode
))
14009 rtx x0
= XVECEXP (vals
, 0, 0);
14010 rtx x1
= XVECEXP (vals
, 0, 1);
14011 /* Combine can pick up this case, but handling it directly
14012 here leaves clearer RTL.
14014 This is load_pair_lanes<mode>, and also gives us a clean-up
14015 for store_pair_lanes<mode>. */
14016 if (memory_operand (x0
, inner_mode
)
14017 && memory_operand (x1
, inner_mode
)
14018 && !STRICT_ALIGNMENT
14019 && rtx_equal_p (XEXP (x1
, 0),
14020 plus_constant (Pmode
,
14022 GET_MODE_SIZE (inner_mode
))))
14025 if (inner_mode
== DFmode
)
14026 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
14028 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
14033 /* The subreg-move sequence below will move into lane zero of the
14034 vector register. For big-endian we want that position to hold
14035 the last element of VALS. */
14036 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
14037 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
14038 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
14042 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
14043 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
14046 /* Insert the rest. */
14047 for (int i
= 0; i
< n_elts
; i
++)
14049 rtx x
= XVECEXP (vals
, 0, i
);
14050 if (matches
[i
][0] == maxelement
)
14052 x
= copy_to_mode_reg (inner_mode
, x
);
14053 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
14058 /* Initialise a vector which is part-variable. We want to first try
14059 to build those lanes which are constant in the most efficient way we can.  */
14061 if (n_var
!= n_elts
)
14063 rtx copy
= copy_rtx (vals
);
14065 /* Load constant part of vector. We really don't care what goes into the
14066 parts we will overwrite, but we're more likely to be able to load the
14067 constant efficiently if it has fewer, larger, repeating parts
14068 (see aarch64_simd_valid_immediate). */
14069 for (int i
= 0; i
< n_elts
; i
++)
14071 rtx x
= XVECEXP (vals
, 0, i
);
14072 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
14074 rtx subst
= any_const
;
14075 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
14077 /* Look in the copied vector, as more elements are const. */
14078 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
14079 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
14085 XVECEXP (copy
, 0, i
) = subst
;
14087 aarch64_expand_vector_init (target
, copy
);
14090 /* Insert the variable lanes directly. */
14091 for (int i
= 0; i
< n_elts
; i
++)
14093 rtx x
= XVECEXP (vals
, 0, i
);
14094 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
14096 x
= copy_to_mode_reg (inner_mode
, x
);
14097 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
14101 static unsigned HOST_WIDE_INT
14102 aarch64_shift_truncation_mask (machine_mode mode)
14104   if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14106   return GET_MODE_UNIT_BITSIZE (mode) - 1;
14109 /* Select a format to encode pointers in exception handling data. */
14111 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
14114 switch (aarch64_cmodel
)
14116 case AARCH64_CMODEL_TINY
:
14117 case AARCH64_CMODEL_TINY_PIC
:
14118 case AARCH64_CMODEL_SMALL
:
14119 case AARCH64_CMODEL_SMALL_PIC
:
14120 case AARCH64_CMODEL_SMALL_SPIC
:
14121 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14123 type
= DW_EH_PE_sdata4
;
14126 /* No assumptions here. 8-byte relocs required. */
14127 type
= DW_EH_PE_sdata8
;
14130 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
14133 /* The last .arch and .tune assembly strings that we printed. */
14134 static std::string aarch64_last_printed_arch_string
;
14135 static std::string aarch64_last_printed_tune_string
;
14137 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14138 by the function fndecl. */
14141 aarch64_declare_function_name (FILE *stream
, const char* name
,
14144 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
14146 struct cl_target_option
*targ_options
;
14148 targ_options
= TREE_TARGET_OPTION (target_parts
);
14150 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
14151 gcc_assert (targ_options
);
14153 const struct processor
*this_arch
14154 = aarch64_get_arch (targ_options
->x_explicit_arch
);
14156 unsigned long isa_flags
= targ_options
->x_aarch64_isa_flags
;
14157 std::string extension
14158 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
14160 /* Only update the assembler .arch string if it is distinct from the last
14161 such string we printed. */
14162 std::string to_print
= this_arch
->name
+ extension
;
14163 if (to_print
!= aarch64_last_printed_arch_string
)
14165 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
14166 aarch64_last_printed_arch_string
= to_print
;
14169 /* Print the cpu name we're tuning for in the comments, might be
14170 useful to readers of the generated asm. Do it only when it changes
14171 from function to function and verbose assembly is requested. */
14172 const struct processor
*this_tune
14173 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
14175 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
14177 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
14179 aarch64_last_printed_tune_string
= this_tune
->name
;
14182 /* Don't forget the type directive for ELF. */
14183 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
14184 ASM_OUTPUT_LABEL (stream
, name
);
14187 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14190 aarch64_start_file (void)
14192 struct cl_target_option
*default_options
14193 = TREE_TARGET_OPTION (target_option_default_node
);
14195 const struct processor
*default_arch
14196 = aarch64_get_arch (default_options
->x_explicit_arch
);
14197 unsigned long default_isa_flags
= default_options
->x_aarch64_isa_flags
;
14198 std::string extension
14199 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
14200 default_arch
->flags
);
14202 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
14203 aarch64_last_printed_tune_string
= "";
14204 asm_fprintf (asm_out_file
, "\t.arch %s\n",
14205 aarch64_last_printed_arch_string
.c_str ());
14207 default_file_start ();
14210 /* Emit load exclusive. */
14213 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
14214 rtx mem
, rtx model_rtx
)
14216 rtx (*gen
) (rtx
, rtx
, rtx
);
14220 case E_QImode
: gen
= gen_aarch64_load_exclusiveqi
; break;
14221 case E_HImode
: gen
= gen_aarch64_load_exclusivehi
; break;
14222 case E_SImode
: gen
= gen_aarch64_load_exclusivesi
; break;
14223 case E_DImode
: gen
= gen_aarch64_load_exclusivedi
; break;
14225 gcc_unreachable ();
14228 emit_insn (gen (rval
, mem
, model_rtx
));
14231 /* Emit store exclusive. */
14234 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
14235 rtx rval
, rtx mem
, rtx model_rtx
)
14237 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
14241 case E_QImode
: gen
= gen_aarch64_store_exclusiveqi
; break;
14242 case E_HImode
: gen
= gen_aarch64_store_exclusivehi
; break;
14243 case E_SImode
: gen
= gen_aarch64_store_exclusivesi
; break;
14244 case E_DImode
: gen
= gen_aarch64_store_exclusivedi
; break;
14246 gcc_unreachable ();
14249 emit_insn (gen (bval
, rval
, mem
, model_rtx
));
14252 /* Mark the previous jump instruction as unlikely. */
14255 aarch64_emit_unlikely_jump (rtx insn)
14257   rtx_insn *jump = emit_jump_insn (insn);
14258   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14261 /* Expand a compare and swap pattern. */
14264 aarch64_expand_compare_and_swap (rtx operands
[])
14266 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
;
14267 machine_mode mode
, cmp_mode
;
14268 typedef rtx (*gen_cas_fn
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
14271 const gen_cas_fn split_cas
[] =
14273 gen_aarch64_compare_and_swapqi
,
14274 gen_aarch64_compare_and_swaphi
,
14275 gen_aarch64_compare_and_swapsi
,
14276 gen_aarch64_compare_and_swapdi
14278 const gen_cas_fn atomic_cas
[] =
14280 gen_aarch64_compare_and_swapqi_lse
,
14281 gen_aarch64_compare_and_swaphi_lse
,
14282 gen_aarch64_compare_and_swapsi_lse
,
14283 gen_aarch64_compare_and_swapdi_lse
14286 bval
= operands
[0];
14287 rval
= operands
[1];
14289 oldval
= operands
[3];
14290 newval
= operands
[4];
14291 is_weak
= operands
[5];
14292 mod_s
= operands
[6];
14293 mod_f
= operands
[7];
14294 mode
= GET_MODE (mem
);
14297 /* Normally the succ memory model must be stronger than fail, but in the
14298 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14299 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
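/* For instance (illustrative, not in the original source), a call such as

     __atomic_compare_exchange_n (ptr, &expected, desired, 0,
				  __ATOMIC_RELEASE, __ATOMIC_ACQUIRE);

   reaches this point with MOD_S == RELEASE and MOD_F == ACQUIRE, and the
   success model is promoted to MEMMODEL_ACQ_REL here.  */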
14301 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
14302 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
14303 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
14309 /* For short modes, we're going to perform the comparison in SImode,
14310 so do the zero-extension now. */
14312 rval
= gen_reg_rtx (SImode
);
14313 oldval
= convert_modes (SImode
, mode
, oldval
, true);
14314 /* Fall through. */
14318 /* Force the value into a register if needed. */
14319 if (!aarch64_plus_operand (oldval
, mode
))
14320 oldval
= force_reg (cmp_mode
, oldval
);
14324 gcc_unreachable ();
14329 case E_QImode
: idx
= 0; break;
14330 case E_HImode
: idx
= 1; break;
14331 case E_SImode
: idx
= 2; break;
14332 case E_DImode
: idx
= 3; break;
14334 gcc_unreachable ();
14337 gen
= atomic_cas
[idx
];
14339 gen
= split_cas
[idx
];
14341 emit_insn (gen (rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
));
14343 if (mode
== QImode
|| mode
== HImode
)
14344 emit_move_insn (operands
[1], gen_lowpart (mode
, rval
));
14346 x
= gen_rtx_REG (CCmode
, CC_REGNUM
);
14347 x
= gen_rtx_EQ (SImode
, x
, const0_rtx
);
14348 emit_insn (gen_rtx_SET (bval
, x
));
14351 /* Test whether the target supports using an atomic load-operate instruction.
14352 CODE is the operation and AFTER is TRUE if the data in memory after the
14353 operation should be returned and FALSE if the data before the operation
14354 should be returned. Returns FALSE if the operation isn't supported by the
14358 aarch64_atomic_ldop_supported_p (enum rtx_code code
)
14377 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14378 sequence implementing an atomic operation. */
14381 aarch64_emit_post_barrier (enum memmodel model
)
14383 const enum memmodel base_model
= memmodel_base (model
);
14385 if (is_mm_sync (model
)
14386 && (base_model
== MEMMODEL_ACQUIRE
14387 || base_model
== MEMMODEL_ACQ_REL
14388 || base_model
== MEMMODEL_SEQ_CST
))
14390 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
14394 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14395 for the data in memory. EXPECTED is the value expected to be in memory.
14396 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14397 is the memory ordering to use. */
14400 aarch64_gen_atomic_cas (rtx rval
, rtx mem
,
14401 rtx expected
, rtx desired
,
14404 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
14407 mode
= GET_MODE (mem
);
14411 case E_QImode
: gen
= gen_aarch64_atomic_casqi
; break;
14412 case E_HImode
: gen
= gen_aarch64_atomic_cashi
; break;
14413 case E_SImode
: gen
= gen_aarch64_atomic_cassi
; break;
14414 case E_DImode
: gen
= gen_aarch64_atomic_casdi
; break;
14416 gcc_unreachable ();
14419 /* Move the expected value into the CAS destination register. */
14420 emit_insn (gen_rtx_SET (rval
, expected
));
14422 /* Emit the CAS. */
14423 emit_insn (gen (rval
, mem
, desired
, model
));
14425 /* Compare the expected value with the value loaded by the CAS, to establish
14426 whether the swap was made. */
14427 aarch64_gen_compare_reg (EQ
, rval
, expected
);
14430 /* Split a compare and swap pattern. */
14433 aarch64_split_compare_and_swap (rtx operands
[])
14435 rtx rval
, mem
, oldval
, newval
, scratch
;
14438 rtx_code_label
*label1
, *label2
;
14440 enum memmodel model
;
14443 rval
= operands
[0];
14445 oldval
= operands
[2];
14446 newval
= operands
[3];
14447 is_weak
= (operands
[4] != const0_rtx
);
14448 model_rtx
= operands
[5];
14449 scratch
= operands
[7];
14450 mode
= GET_MODE (mem
);
14451 model
= memmodel_from_int (INTVAL (model_rtx
));
14453 /* When OLDVAL is zero and we want the strong version we can emit a tighter
	loop:
	.label1:
14456	LD[A]XR	rval, [mem]
	CBNZ	rval, .label2
14458	ST[L]XR	scratch, newval, [mem]
14459	CBNZ	scratch, .label1
	.label2:
	CMP	rval, 0.  */
14462 bool strong_zero_p
= !is_weak
&& oldval
== const0_rtx
;
14467 label1
= gen_label_rtx ();
14468 emit_label (label1
);
14470 label2
= gen_label_rtx ();
14472 /* The initial load can be relaxed for a __sync operation since a final
14473 barrier will be emitted to stop code hoisting. */
14474 if (is_mm_sync (model
))
14475 aarch64_emit_load_exclusive (mode
, rval
, mem
,
14476 GEN_INT (MEMMODEL_RELAXED
));
14478 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
14482 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
14483 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14484 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
14485 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14489 cond
= aarch64_gen_compare_reg (NE
, rval
, oldval
);
14490 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
14491 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14492 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
14493 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14496 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
14500 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
14501 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14502 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
14503 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14507 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
14508 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
14509 emit_insn (gen_rtx_SET (cond
, x
));
14512 emit_label (label2
);
14513 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14514 to set the condition flags. If this is not used it will be removed by later passes.  */
14518 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
14519 x
= gen_rtx_COMPARE (CCmode
, rval
, const0_rtx
);
14520 emit_insn (gen_rtx_SET (cond
, x
));
14522 /* Emit any final barrier needed for a __sync operation. */
14523 if (is_mm_sync (model
))
14524 aarch64_emit_post_barrier (model
);
14527 /* Emit a BIC instruction. */
14530 aarch64_emit_bic (machine_mode mode
, rtx dst
, rtx s1
, rtx s2
, int shift
)
14532 rtx shift_rtx
= GEN_INT (shift
);
14533 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
14537 case E_SImode
: gen
= gen_and_one_cmpl_lshrsi3
; break;
14538 case E_DImode
: gen
= gen_and_one_cmpl_lshrdi3
; break;
14540 gcc_unreachable ();
14543 emit_insn (gen (dst
, s2
, shift_rtx
, s1
));
14546 /* Emit an atomic swap. */
14549 aarch64_emit_atomic_swap (machine_mode mode
, rtx dst
, rtx value
,
14550 rtx mem
, rtx model
)
14552 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
14556 case E_QImode
: gen
= gen_aarch64_atomic_swpqi
; break;
14557 case E_HImode
: gen
= gen_aarch64_atomic_swphi
; break;
14558 case E_SImode
: gen
= gen_aarch64_atomic_swpsi
; break;
14559 case E_DImode
: gen
= gen_aarch64_atomic_swpdi
; break;
14561 gcc_unreachable ();
14564 emit_insn (gen (dst
, mem
, value
, model
));
14567 /* Operations supported by aarch64_emit_atomic_load_op. */
14569 enum aarch64_atomic_load_op_code
14571 AARCH64_LDOP_PLUS
, /* A + B */
14572 AARCH64_LDOP_XOR
, /* A ^ B */
14573 AARCH64_LDOP_OR
, /* A | B */
14574 AARCH64_LDOP_BIC
/* A & ~B */
14577 /* Emit an atomic load-operate. */
14580 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code
,
14581 machine_mode mode
, rtx dst
, rtx src
,
14582 rtx mem
, rtx model
)
14584 typedef rtx (*aarch64_atomic_load_op_fn
) (rtx
, rtx
, rtx
, rtx
);
14585 const aarch64_atomic_load_op_fn plus
[] =
14587 gen_aarch64_atomic_loadaddqi
,
14588 gen_aarch64_atomic_loadaddhi
,
14589 gen_aarch64_atomic_loadaddsi
,
14590 gen_aarch64_atomic_loadadddi
14592 const aarch64_atomic_load_op_fn eor
[] =
14594 gen_aarch64_atomic_loadeorqi
,
14595 gen_aarch64_atomic_loadeorhi
,
14596 gen_aarch64_atomic_loadeorsi
,
14597 gen_aarch64_atomic_loadeordi
14599 const aarch64_atomic_load_op_fn ior
[] =
14601 gen_aarch64_atomic_loadsetqi
,
14602 gen_aarch64_atomic_loadsethi
,
14603 gen_aarch64_atomic_loadsetsi
,
14604 gen_aarch64_atomic_loadsetdi
14606 const aarch64_atomic_load_op_fn bic
[] =
14608 gen_aarch64_atomic_loadclrqi
,
14609 gen_aarch64_atomic_loadclrhi
,
14610 gen_aarch64_atomic_loadclrsi
,
14611 gen_aarch64_atomic_loadclrdi
14613 aarch64_atomic_load_op_fn gen
;
14618 case E_QImode
: idx
= 0; break;
14619 case E_HImode
: idx
= 1; break;
14620 case E_SImode
: idx
= 2; break;
14621 case E_DImode
: idx
= 3; break;
14623 gcc_unreachable ();
14628 case AARCH64_LDOP_PLUS
: gen
= plus
[idx
]; break;
14629 case AARCH64_LDOP_XOR
: gen
= eor
[idx
]; break;
14630 case AARCH64_LDOP_OR
: gen
= ior
[idx
]; break;
14631 case AARCH64_LDOP_BIC
: gen
= bic
[idx
]; break;
14633 gcc_unreachable ();
14636 emit_insn (gen (dst
, mem
, src
, model
));
14639 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14640 location to store the data read from memory. OUT_RESULT is the location to
14641 store the result of the operation. MEM is the memory location to read and
14642 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14643 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can be NULL.  */
14647 aarch64_gen_atomic_ldop (enum rtx_code code
, rtx out_data
, rtx out_result
,
14648 rtx mem
, rtx value
, rtx model_rtx
)
14650 machine_mode mode
= GET_MODE (mem
);
14651 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
14652 const bool short_mode
= (mode
< SImode
);
14653 aarch64_atomic_load_op_code ldop_code
;
14658 out_data
= gen_lowpart (mode
, out_data
);
14661 out_result
= gen_lowpart (mode
, out_result
);
14663 /* Make sure the value is in a register, putting it into a destination
14664 register if it needs to be manipulated. */
14665 if (!register_operand (value
, mode
)
14666 || code
== AND
|| code
== MINUS
)
14668 src
= out_result
? out_result
: out_data
;
14669 emit_move_insn (src
, gen_lowpart (mode
, value
));
14673 gcc_assert (register_operand (src
, mode
));
14675 /* Preprocess the data for the operation as necessary. If the operation is
14676 a SET then emit a swap instruction and finish. */
14680 aarch64_emit_atomic_swap (mode
, out_data
, src
, mem
, model_rtx
);
14684 /* Negate the value and treat it as a PLUS. */
14688 /* Resize the value if necessary. */
14690 src
= gen_lowpart (wmode
, src
);
14692 neg_src
= gen_rtx_NEG (wmode
, src
);
14693 emit_insn (gen_rtx_SET (src
, neg_src
));
14696 src
= gen_lowpart (mode
, src
);
14698 /* Fall-through. */
14700 ldop_code
= AARCH64_LDOP_PLUS
;
14704 ldop_code
= AARCH64_LDOP_OR
;
14708 ldop_code
= AARCH64_LDOP_XOR
;
14715 /* Resize the value if necessary. */
14717 src
= gen_lowpart (wmode
, src
);
14719 not_src
= gen_rtx_NOT (wmode
, src
);
14720 emit_insn (gen_rtx_SET (src
, not_src
));
14723 src
= gen_lowpart (mode
, src
);
14725 ldop_code
= AARCH64_LDOP_BIC
;
14729 /* The operation can't be done with atomic instructions. */
14730 gcc_unreachable ();
14733 aarch64_emit_atomic_load_op (ldop_code
, mode
, out_data
, src
, mem
, model_rtx
);
14735 /* If necessary, calculate the data in memory after the update by redoing the
14736 operation from values in registers. */
14742 src
= gen_lowpart (wmode
, src
);
14743 out_data
= gen_lowpart (wmode
, out_data
);
14744 out_result
= gen_lowpart (wmode
, out_result
);
14753 x
= gen_rtx_PLUS (wmode
, out_data
, src
);
14756 x
= gen_rtx_IOR (wmode
, out_data
, src
);
14759 x
= gen_rtx_XOR (wmode
, out_data
, src
);
14762 aarch64_emit_bic (wmode
, out_result
, out_data
, src
, 0);
14765 gcc_unreachable ();
14768 emit_set_insn (out_result
, x
);
14773 /* Split an atomic operation. */
14776 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
14777 rtx value
, rtx model_rtx
, rtx cond
)
14779 machine_mode mode
= GET_MODE (mem
);
14780 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
14781 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
14782 const bool is_sync
= is_mm_sync (model
);
14783 rtx_code_label
*label
;
14786 /* Split the atomic operation into a sequence. */
14787 label
= gen_label_rtx ();
14788 emit_label (label
);
14791 new_out
= gen_lowpart (wmode
, new_out
);
14793 old_out
= gen_lowpart (wmode
, old_out
);
14796 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
14798 /* The initial load can be relaxed for a __sync operation since a final
14799 barrier will be emitted to stop code hoisting. */
14801 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
14802 GEN_INT (MEMMODEL_RELAXED
));
14804 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
14813 x
= gen_rtx_AND (wmode
, old_out
, value
);
14814 emit_insn (gen_rtx_SET (new_out
, x
));
14815 x
= gen_rtx_NOT (wmode
, new_out
);
14816 emit_insn (gen_rtx_SET (new_out
, x
));
14820 if (CONST_INT_P (value
))
14822 value
= GEN_INT (-INTVAL (value
));
14825 /* Fall through. */
14828 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
14829 emit_insn (gen_rtx_SET (new_out
, x
));
14833 aarch64_emit_store_exclusive (mode
, cond
, mem
,
14834 gen_lowpart (mode
, new_out
), model_rtx
);
14836 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
14837 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14838 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
14839 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14841 /* Emit any final barrier needed for a __sync operation. */
14843 aarch64_emit_post_barrier (model
);
14847 aarch64_init_libfuncs (void)
14849 /* Half-precision float operations. The compiler handles all operations
14850 with NULL libfuncs by converting to SFmode. */
14853 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
14854 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
14857 set_optab_libfunc (add_optab
, HFmode
, NULL
);
14858 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
14859 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
14860 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
14861 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
14864 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
14865 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
14866 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
14867 set_optab_libfunc (le_optab
, HFmode
, NULL
);
14868 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
14869 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
14870 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
14873 /* Target hook for c_mode_for_suffix. */
14874 static machine_mode
14875 aarch64_c_mode_for_suffix (char suffix)
14883 /* We can only represent floating point constants which will fit in
14884 "quarter-precision" values. These values are characterised by
14885 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
14888 (-1)^s * (n/16) * 2^r
14891 's' is the sign bit.
14892 'n' is an integer in the range 16 <= n <= 31.
14893 'r' is an integer in the range -3 <= r <= 4. */
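/* Worked examples (added for exposition, not in the original source):
   1.0 = (16/16) * 2^0 and 2.5 = (20/16) * 2^1 are representable, whereas
   0.1 has no exact (n/16) * 2^r form with 16 <= n <= 31 and -3 <= r <= 4,
   so aarch64_float_const_representable_p rejects it.  */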
14895 /* Return true iff X can be represented by a quarter-precision
14896 floating point immediate operand. Note, we cannot represent 0.0. */
14898 aarch64_float_const_representable_p (rtx x
)
14900 /* This represents our current view of how many bits
14901 make up the mantissa. */
14902 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
14904 unsigned HOST_WIDE_INT mantissa
, mask
;
14905 REAL_VALUE_TYPE r
, m
;
14908 if (!CONST_DOUBLE_P (x
))
14911 /* We don't support HFmode constants yet. */
14912 if (GET_MODE (x
) == VOIDmode
|| GET_MODE (x
) == HFmode
)
14915 r
= *CONST_DOUBLE_REAL_VALUE (x
);
14917 /* We cannot represent infinities, NaNs or +/-zero. We won't
14918 know if we have +zero until we analyse the mantissa, but we
14919 can reject the other invalid values. */
14920 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
14921 || REAL_VALUE_MINUS_ZERO (r
))
14924 /* Extract exponent. */
14925 r
= real_value_abs (&r
);
14926 exponent
= REAL_EXP (&r
);
14928 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14929 highest (sign) bit, with a fixed binary point at bit point_pos.
14930 m1 holds the low part of the mantissa, m2 the high part.
14931 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14932 bits for the mantissa, this can fail (low bits will be lost). */
14933 real_ldexp (&m
, &r
, point_pos
- exponent
);
14934 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
14936 /* If the low part of the mantissa has bits set we cannot represent the value.  */
14938 if (w
.ulow () != 0)
14940 /* We have rejected the lower HOST_WIDE_INT, so update our
14941 understanding of how many bits lie in the mantissa and
14942 look only at the high HOST_WIDE_INT. */
14943 mantissa
= w
.elt (1);
14944 point_pos
-= HOST_BITS_PER_WIDE_INT
;
14946 /* We can only represent values with a mantissa of the form 1.xxxx. */
14947 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
14948 if ((mantissa
& mask
) != 0)
14951 /* Having filtered unrepresentable values, we may now remove all
14952 but the highest 5 bits. */
14953 mantissa
>>= point_pos
- 5;
14955 /* We cannot represent the value 0.0, so reject it. This is handled
14960 /* Then, as bit 4 is always set, we can mask it off, leaving
14961 the mantissa in the range [0, 15]. */
14962 mantissa
&= ~(1 << 4);
14963 gcc_assert (mantissa
<= 15);
14965 /* GCC internally does not use IEEE754-like encoding (where normalized
14966 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14967 Our mantissa values are shifted 4 places to the left relative to
14968 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14969 by 5 places to correct for GCC's representation. */
14970 exponent
= 5 - exponent
;
14972 return (exponent
>= 0 && exponent
<= 7);
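/* A minimal illustrative sketch, not used anywhere in the backend: compute
   the value described by the quarter-precision encoding above, i.e.
   (-1)^s * (n/16) * 2^r with 16 <= n <= 31 and -3 <= r <= 4.  For example,
   s = 0, n = 20, r = 1 encodes (20/16) * 2 = 2.5.  The function name is
   hypothetical and exists only as a worked example of the format that
   aarch64_float_const_representable_p accepts.  */
static double ATTRIBUTE_UNUSED
aarch64_example_quarter_precision_value (int s, int n, int r)
{
  gcc_assert (n >= 16 && n <= 31 && r >= -3 && r <= 4);
  double scale = 1.0;
  for (int i = 0; i < r; i++)
    scale *= 2.0;		/* 2^r for positive r.  */
  for (int i = 0; i > r; i--)
    scale *= 0.5;		/* 2^r for negative r.  */
  double magnitude = ((double) n / 16.0) * scale;
  return s ? -magnitude : magnitude;
}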
/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
   immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
   output MOVI/MVNI, ORR or BIC immediate.  */
char*
aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
                                   enum simd_immediate_check which)
{
14983 static char templ
[40];
14984 const char *mnemonic
;
14985 const char *shift_op
;
14986 unsigned int lane_count
= 0;
14989 struct simd_immediate_info info
;
14991 /* This will return true to show const_vector is legal for use as either
14992 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14993 It will also update INFO to show how the immediate should be generated.
14994 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14995 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
14996 gcc_assert (is_valid
);
14998 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
14999 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
15001 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
15003 gcc_assert (info
.shift
== 0 && info
.insn
== simd_immediate_info::MOV
);
15004 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15005 move immediate path. */
15006 if (aarch64_float_const_zero_rtx_p (info
.value
))
15007 info
.value
= GEN_INT (0);
15010 const unsigned int buf_size
= 20;
15011 char float_buf
[buf_size
] = {'\0'};
15012 real_to_decimal_for_mode (float_buf
,
15013 CONST_DOUBLE_REAL_VALUE (info
.value
),
15014 buf_size
, buf_size
, 1, info
.elt_mode
);
15016 if (lane_count
== 1)
15017 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
15019 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
15020 lane_count
, element_char
, float_buf
);
15025 gcc_assert (CONST_INT_P (info
.value
));
15027 if (which
== AARCH64_CHECK_MOV
)
15029 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
15030 shift_op
= info
.modifier
== simd_immediate_info::MSL
? "msl" : "lsl";
15031 if (lane_count
== 1)
15032 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
15033 mnemonic
, UINTVAL (info
.value
));
15034 else if (info
.shift
)
15035 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
15036 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
15037 element_char
, UINTVAL (info
.value
), shift_op
, info
.shift
);
15039 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
15040 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
15041 element_char
, UINTVAL (info
.value
));
15045 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15046 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
15048 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
15049 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
15050 element_char
, UINTVAL (info
.value
), "lsl", info
.shift
);
15052 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
15053 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
15054 element_char
, UINTVAL (info
.value
));
char*
aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
{
  /* If a floating point number was passed and we desire to use it in an
     integer mode do the conversion to integer.  */
  if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
    {
      unsigned HOST_WIDE_INT ival;
      if (!aarch64_reinterpret_float_as_int (immediate, &ival))
        gcc_unreachable ();
      immediate = gen_int_mode (ival, mode);
    }

  machine_mode vmode;
  /* Use a 64-bit mode for everything except for DI/DF mode, where we use
     a 128-bit vector mode.  */
  int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;

  vmode = aarch64_simd_container_mode (mode, width);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
  return aarch64_output_simd_mov_immediate (v_op, width);
}
/* Return the output string to use for moving immediate CONST_VECTOR
   into an SVE register.  */
char *
aarch64_output_sve_mov_immediate (rtx const_vector)
{
  static char templ[40];
  struct simd_immediate_info info;
  char element_char;

  bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
  gcc_assert (is_valid);

  element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));

  if (info.step)
    {
      snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
                HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
                element_char, INTVAL (info.value), INTVAL (info.step));
      return templ;
    }

  if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
    {
      /* For FP zero change it to a CONST_INT 0 and use the integer SVE
         move immediate path.  */
      if (aarch64_float_const_zero_rtx_p (info.value))
        info.value = GEN_INT (0);
      else
        {
          const int buf_size = 20;
          char float_buf[buf_size] = {};
          real_to_decimal_for_mode (float_buf,
                                    CONST_DOUBLE_REAL_VALUE (info.value),
                                    buf_size, buf_size, 1, info.elt_mode);

          snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
                    element_char, float_buf);
          return templ;
        }
    }

  snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
            element_char, INTVAL (info.value));
  return templ;
}
/* Return the asm format for a PTRUE instruction whose destination has
   mode MODE.  SUFFIX is the element size suffix.  */
char *
aarch64_output_ptrue (machine_mode mode, char suffix)
{
  unsigned int nunits;
  static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
  if (GET_MODE_NUNITS (mode).is_constant (&nunits))
    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
  else
    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
  return buf;
}
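/* Illustrative note only: with byte elements and a fixed 128-bit vector
   length the format above becomes "ptrue\t%0.b, vl16", while a
   variable-length destination uses "ptrue\t%0.b, all".  */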
/* Split operands into moves from op[1] + op[2] into op[0].  */
void
aarch64_split_combinev16qi (rtx operands[3])
{
  unsigned int dest = REGNO (operands[0]);
  unsigned int src1 = REGNO (operands[1]);
  unsigned int src2 = REGNO (operands[2]);
  machine_mode halfmode = GET_MODE (operands[1]);
  unsigned int halfregs = REG_NREGS (operands[1]);
  rtx destlo, desthi;

  gcc_assert (halfmode == V16QImode);

  if (src1 == dest && src2 == dest + halfregs)
    {
      /* No-op move.  Can't split to nothing; emit something.  */
      emit_note (NOTE_INSN_DELETED);
      return;
    }

  /* Preserve register attributes for variable tracking.  */
  destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
  desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
                               GET_MODE_SIZE (halfmode));

  /* Special case of reversed high/low parts.  */
  if (reg_overlap_mentioned_p (operands[2], destlo)
      && reg_overlap_mentioned_p (operands[1], desthi))
    {
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
    }
  else if (!reg_overlap_mentioned_p (operands[2], destlo))
    {
      /* Try to avoid unnecessary moves if part of the result
         is in the right place already.  */
      if (src1 != dest)
        emit_move_insn (destlo, operands[1]);
      if (src2 != dest + halfregs)
        emit_move_insn (desthi, operands[2]);
    }
  else
    {
      if (src2 != dest + halfregs)
        emit_move_insn (desthi, operands[2]);
      if (src1 != dest)
        emit_move_insn (destlo, operands[1]);
    }
}
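/* A minimal illustrative sketch, not part of the backend: the reversed
   high/low case above relies on the classic three-XOR swap so the two
   halves can be exchanged without a scratch register.  The same identity
   on plain integers, for reference; the function name is hypothetical.  */
static void ATTRIBUTE_UNUSED
aarch64_example_xor_swap (unsigned int *a, unsigned int *b)
{
  /* After these three steps *a and *b have exchanged values, provided the
     two objects do not alias each other.  */
  *a = *a ^ *b;
  *b = *a ^ *b;
  *a = *a ^ *b;
}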
/* vec_perm support.  */

struct expand_vec_perm_d
{
  rtx target, op0, op1;
  vec_perm_indices perm;
  machine_mode vmode;
  unsigned int vec_flags;
  bool one_vector_p;
  bool testing_p;
};
15208 /* Generate a variable permutation. */
15211 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
15213 machine_mode vmode
= GET_MODE (target
);
15214 bool one_vector_p
= rtx_equal_p (op0
, op1
);
15216 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
15217 gcc_checking_assert (GET_MODE (op0
) == vmode
);
15218 gcc_checking_assert (GET_MODE (op1
) == vmode
);
15219 gcc_checking_assert (GET_MODE (sel
) == vmode
);
15220 gcc_checking_assert (TARGET_SIMD
);
15224 if (vmode
== V8QImode
)
15226 /* Expand the argument to a V16QI mode by duplicating it. */
15227 rtx pair
= gen_reg_rtx (V16QImode
);
15228 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
15229 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
15233 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
15240 if (vmode
== V8QImode
)
15242 pair
= gen_reg_rtx (V16QImode
);
15243 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
15244 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
15248 pair
= gen_reg_rtx (OImode
);
15249 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
15250 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
15255 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15256 NELT is the number of elements in the vector. */
15259 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
15262 machine_mode vmode
= GET_MODE (target
);
15263 bool one_vector_p
= rtx_equal_p (op0
, op1
);
15266 /* The TBL instruction does not use a modulo index, so we must take care
15267 of that ourselves. */
15268 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
15269 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
15270 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
15272 /* For big-endian, we also need to reverse the index within the vector
15273 (but not which vector). */
15274 if (BYTES_BIG_ENDIAN
)
15276 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15278 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
15279 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
15280 NULL
, 0, OPTAB_LIB_WIDEN
);
15282 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
/* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */
static void
emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
{
  emit_insn (gen_rtx_SET (target,
                          gen_rtx_UNSPEC (GET_MODE (target),
                                          gen_rtvec (2, op0, op1), code)));
}
15295 /* Expand an SVE vec_perm with the given operands. */
15298 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
15300 machine_mode data_mode
= GET_MODE (target
);
15301 machine_mode sel_mode
= GET_MODE (sel
);
15302 /* Enforced by the pattern condition. */
15303 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
15305 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15306 size of the two value vectors, i.e. the upper bits of the indices
15307 are effectively ignored. SVE TBL instead produces 0 for any
15308 out-of-range indices, so we need to modulo all the vec_perm indices
15309 to ensure they are all in range. */
15310 rtx sel_reg
= force_reg (sel_mode
, sel
);
15312 /* Check if the sel only references the first values vector. */
15313 if (GET_CODE (sel
) == CONST_VECTOR
15314 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
15316 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
15320 /* Check if the two values vectors are the same. */
15321 if (rtx_equal_p (op0
, op1
))
15323 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
15324 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
15325 NULL
, 0, OPTAB_DIRECT
);
15326 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
  /* Run a TBL on each value vector and combine the results.  */
15332 rtx res0
= gen_reg_rtx (data_mode
);
15333 rtx res1
= gen_reg_rtx (data_mode
);
15334 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
15335 if (GET_CODE (sel
) != CONST_VECTOR
15336 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
15338 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
15340 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
15341 NULL
, 0, OPTAB_DIRECT
);
15343 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
15344 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
15345 NULL
, 0, OPTAB_DIRECT
);
15346 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
15347 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
15348 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
15350 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
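/* A minimal illustrative sketch, not used by the backend: a scalar model of
   the two-TBL expansion above.  SVE TBL yields zero for out-of-range
   indices, so a two-input permute can be done by looking up OP0 with the
   wrapped indices, looking up OP1 with the indices rebased by -NELT, and
   OR-ing the results; for each lane the lookup that is out of range
   contributes zero.  All names here are hypothetical.  */
static void ATTRIBUTE_UNUSED
aarch64_example_two_table_permute (const unsigned char *op0,
                                   const unsigned char *op1,
                                   const unsigned char *sel,
                                   unsigned char *out, int nelt)
{
  for (int i = 0; i < nelt; i++)
    {
      int idx = sel[i] % (2 * nelt);	/* vec_perm indices wrap.  */
      /* Model the two TBLs: the out-of-range lookup produces zero.  */
      unsigned char from0 = idx < nelt ? op0[idx] : 0;
      unsigned char from1 = idx >= nelt ? op1[idx - nelt] : 0;
      out[i] = from0 | from1;
    }
}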
15353 /* Recognize patterns suitable for the TRN instructions. */
15355 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
15358 poly_uint64 nelt
= d
->perm
.length ();
15359 rtx out
, in0
, in1
, x
;
15360 machine_mode vmode
= d
->vmode
;
15362 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15365 /* Note that these are little-endian tests.
15366 We correct for big-endian later. */
15367 if (!d
->perm
[0].is_constant (&odd
)
15368 || (odd
!= 0 && odd
!= 1)
15369 || !d
->perm
.series_p (0, 2, odd
, 2)
15370 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
15379 /* We don't need a big-endian lane correction for SVE; see the comment
15380 at the head of aarch64-sve.md for details. */
15381 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
15383 x
= in0
, in0
= in1
, in1
= x
;
15388 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
15389 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
15393 /* Recognize patterns suitable for the UZP instructions. */
15395 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
15398 rtx out
, in0
, in1
, x
;
15399 machine_mode vmode
= d
->vmode
;
15401 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15404 /* Note that these are little-endian tests.
15405 We correct for big-endian later. */
15406 if (!d
->perm
[0].is_constant (&odd
)
15407 || (odd
!= 0 && odd
!= 1)
15408 || !d
->perm
.series_p (0, 1, odd
, 2))
15417 /* We don't need a big-endian lane correction for SVE; see the comment
15418 at the head of aarch64-sve.md for details. */
15419 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
15421 x
= in0
, in0
= in1
, in1
= x
;
15426 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
15427 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
15431 /* Recognize patterns suitable for the ZIP instructions. */
15433 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
15436 poly_uint64 nelt
= d
->perm
.length ();
15437 rtx out
, in0
, in1
, x
;
15438 machine_mode vmode
= d
->vmode
;
15440 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15443 /* Note that these are little-endian tests.
15444 We correct for big-endian later. */
15445 poly_uint64 first
= d
->perm
[0];
15446 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
15447 || !d
->perm
.series_p (0, 2, first
, 1)
15448 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
15450 high
= maybe_ne (first
, 0U);
15458 /* We don't need a big-endian lane correction for SVE; see the comment
15459 at the head of aarch64-sve.md for details. */
15460 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
15462 x
= in0
, in0
= in1
, in1
= x
;
15467 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
15468 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
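/* Illustrative note only: the index patterns the TRN/UZP/ZIP recognisers
   above look for, shown for a 4-element vector, where indices 0..3 select
   from the first input and 4..7 from the second:
     TRN1: { 0, 4, 2, 6 }   TRN2: { 1, 5, 3, 7 }
     UZP1: { 0, 2, 4, 6 }   UZP2: { 1, 3, 5, 7 }
     ZIP1: { 0, 4, 1, 5 }   ZIP2: { 2, 6, 3, 7 }  */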
15472 /* Recognize patterns for the EXT insn. */
15475 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
15477 HOST_WIDE_INT location
;
15480 /* The first element always refers to the first vector.
15481 Check if the extracted indices are increasing by one. */
15482 if (d
->vec_flags
== VEC_SVE_PRED
15483 || !d
->perm
[0].is_constant (&location
)
15484 || !d
->perm
.series_p (0, 1, location
, 1))
15491 /* The case where (location == 0) is a no-op for both big- and little-endian,
15492 and is removed by the mid-end at optimization levels -O1 and higher.
15494 We don't need a big-endian lane correction for SVE; see the comment
15495 at the head of aarch64-sve.md for details. */
15496 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
15498 /* After setup, we want the high elements of the first vector (stored
15499 at the LSB end of the register), and the low elements of the second
15500 vector (stored at the MSB end of the register). So swap. */
15501 std::swap (d
->op0
, d
->op1
);
15502 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15503 to_constant () is safe since this is restricted to Advanced SIMD
15505 location
= d
->perm
.length ().to_constant () - location
;
15508 offset
= GEN_INT (location
);
15509 emit_set_insn (d
->target
,
15510 gen_rtx_UNSPEC (d
->vmode
,
15511 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
15516 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15517 within each 64-bit, 32-bit or 16-bit granule. */
15520 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
15522 HOST_WIDE_INT diff
;
15523 unsigned int i
, size
, unspec
;
15524 machine_mode pred_mode
;
15526 if (d
->vec_flags
== VEC_SVE_PRED
15527 || !d
->one_vector_p
15528 || !d
->perm
[0].is_constant (&diff
))
15531 size
= (diff
+ 1) * GET_MODE_UNIT_SIZE (d
->vmode
);
15534 unspec
= UNSPEC_REV64
;
15535 pred_mode
= VNx2BImode
;
15537 else if (size
== 4)
15539 unspec
= UNSPEC_REV32
;
15540 pred_mode
= VNx4BImode
;
15542 else if (size
== 2)
15544 unspec
= UNSPEC_REV16
;
15545 pred_mode
= VNx8BImode
;
15550 unsigned int step
= diff
+ 1;
15551 for (i
= 0; i
< step
; ++i
)
15552 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
15559 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
15560 if (d
->vec_flags
== VEC_SVE_DATA
)
15562 rtx pred
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
15563 src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (2, pred
, src
),
15564 UNSPEC_MERGE_PTRUE
);
15566 emit_set_insn (d
->target
, src
);
15570 /* Recognize patterns for the REV insn, which reverses elements within
15574 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
15576 poly_uint64 nelt
= d
->perm
.length ();
15578 if (!d
->one_vector_p
|| d
->vec_flags
!= VEC_SVE_DATA
)
15581 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
15588 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
15589 emit_set_insn (d
->target
, src
);
15594 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
15596 rtx out
= d
->target
;
15599 machine_mode vmode
= d
->vmode
;
15602 if (d
->vec_flags
== VEC_SVE_PRED
15603 || d
->perm
.encoding ().encoded_nelts () != 1
15604 || !d
->perm
[0].is_constant (&elt
))
15607 if (d
->vec_flags
== VEC_SVE_DATA
&& elt
>= 64 * GET_MODE_UNIT_SIZE (vmode
))
15614 /* The generic preparation in aarch64_expand_vec_perm_const_1
15615 swaps the operand order and the permute indices if it finds
15616 d->perm[0] to be in the second operand. Thus, we can always
15617 use d->op0 and need not do any extra arithmetic to get the
15618 correct lane number. */
15620 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
15622 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
15623 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
15624 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
15629 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
15631 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
15632 machine_mode vmode
= d
->vmode
;
15634 /* Make sure that the indices are constant. */
15635 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
15636 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
15637 if (!d
->perm
[i
].is_constant ())
15643 /* Generic code will try constant permutation twice. Once with the
15644 original mode and again with the elements lowered to QImode.
15645 So wait and don't do the selector expansion ourselves. */
15646 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
15649 /* to_constant is safe since this routine is specific to Advanced SIMD
15651 unsigned int nelt
= d
->perm
.length ().to_constant ();
15652 for (unsigned int i
= 0; i
< nelt
; ++i
)
15653 /* If big-endian and two vectors we end up with a weird mixed-endian
15654 mode on NEON. Reverse the index within each word but not the word
15655 itself. to_constant is safe because we checked is_constant above. */
15656 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
15657 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
15658 : d
->perm
[i
].to_constant ());
15660 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
15661 sel
= force_reg (vmode
, sel
);
15663 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
15667 /* Try to implement D using an SVE TBL instruction. */
15670 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
15672 unsigned HOST_WIDE_INT nelt
;
15674 /* Permuting two variable-length vectors could overflow the
15676 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
15682 machine_mode sel_mode
= mode_for_int_vector (d
->vmode
).require ();
15683 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
15684 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
15689 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
15691 /* The pattern matching functions above are written to look for a small
15692 number to begin the sequence (0, 1, N/2). If we begin with an index
15693 from the second operand, we can swap the operands. */
15694 poly_int64 nelt
= d
->perm
.length ();
15695 if (known_ge (d
->perm
[0], nelt
))
15697 d
->perm
.rotate_inputs (1);
15698 std::swap (d
->op0
, d
->op1
);
15701 if ((d
->vec_flags
== VEC_ADVSIMD
15702 || d
->vec_flags
== VEC_SVE_DATA
15703 || d
->vec_flags
== VEC_SVE_PRED
)
15704 && known_gt (nelt
, 1))
15706 if (aarch64_evpc_rev_local (d
))
15708 else if (aarch64_evpc_rev_global (d
))
15710 else if (aarch64_evpc_ext (d
))
15712 else if (aarch64_evpc_dup (d
))
15714 else if (aarch64_evpc_zip (d
))
15716 else if (aarch64_evpc_uzp (d
))
15718 else if (aarch64_evpc_trn (d
))
15720 if (d
->vec_flags
== VEC_SVE_DATA
)
15721 return aarch64_evpc_sve_tbl (d
);
15722 else if (d
->vec_flags
== VEC_SVE_DATA
)
15723 return aarch64_evpc_tbl (d
);
15728 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15731 aarch64_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
15732 rtx op1
, const vec_perm_indices
&sel
)
15734 struct expand_vec_perm_d d
;
15736 /* Check whether the mask can be applied to a single vector. */
15737 if (op0
&& rtx_equal_p (op0
, op1
))
15738 d
.one_vector_p
= true;
15739 else if (sel
.all_from_input_p (0))
15741 d
.one_vector_p
= true;
15744 else if (sel
.all_from_input_p (1))
15746 d
.one_vector_p
= true;
15750 d
.one_vector_p
= false;
15752 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
15753 sel
.nelts_per_input ());
15755 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
15759 d
.testing_p
= !target
;
15762 return aarch64_expand_vec_perm_const_1 (&d
);
15764 rtx_insn
*last
= get_last_insn ();
15765 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
15766 gcc_assert (last
== get_last_insn ());
15771 /* Generate a byte permute mask for a register of mode MODE,
15772 which has NUNITS units. */
15775 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
  /* We have to reverse each vector because we don't have
     a permuted load that can reverse-load according to ABI rules.  */
15780 rtvec v
= rtvec_alloc (16);
15782 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
15784 gcc_assert (BYTES_BIG_ENDIAN
);
15785 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
15787 for (i
= 0; i
< nunits
; i
++)
15788 for (j
= 0; j
< usize
; j
++)
15789 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
15790 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
15791 return force_reg (V16QImode
, mask
);
15794 /* Return true if X is a valid second operand for the SVE instruction
15795 that implements integer comparison OP_CODE. */
15798 aarch64_sve_cmp_operand_p (rtx_code op_code
, rtx x
)
15800 if (register_operand (x
, VOIDmode
))
15809 return aarch64_sve_cmp_immediate_p (x
, false);
15816 return aarch64_sve_cmp_immediate_p (x
, true);
15818 gcc_unreachable ();
15822 /* Use predicated SVE instructions to implement the equivalent of:
15826 given that PTRUE is an all-true predicate of the appropriate mode. */
15829 aarch64_emit_sve_ptrue_op (rtx target
, rtx ptrue
, rtx op
)
15831 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
15832 gen_rtvec (2, ptrue
, op
),
15833 UNSPEC_MERGE_PTRUE
);
15834 rtx_insn
*insn
= emit_set_insn (target
, unspec
);
15835 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
15838 /* Likewise, but also clobber the condition codes. */
15841 aarch64_emit_sve_ptrue_op_cc (rtx target
, rtx ptrue
, rtx op
)
15843 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
15844 gen_rtvec (2, ptrue
, op
),
15845 UNSPEC_MERGE_PTRUE
);
15846 rtx_insn
*insn
= emit_insn (gen_set_clobber_cc (target
, unspec
));
15847 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
15850 /* Return the UNSPEC_COND_* code for comparison CODE. */
15852 static unsigned int
15853 aarch64_unspec_cond_code (rtx_code code
)
15858 return UNSPEC_COND_NE
;
15860 return UNSPEC_COND_EQ
;
15862 return UNSPEC_COND_LT
;
15864 return UNSPEC_COND_GT
;
15866 return UNSPEC_COND_LE
;
15868 return UNSPEC_COND_GE
;
15870 gcc_unreachable ();
15876 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15878 where <X> is the operation associated with comparison CODE. This form
15879 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15880 semantics, such as when PRED might not be all-true and when comparing
15881 inactive lanes could have side effects. */
15884 aarch64_emit_sve_predicated_cond (rtx target
, rtx_code code
,
15885 rtx pred
, rtx op0
, rtx op1
)
15887 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (pred
),
15888 gen_rtvec (3, pred
, op0
, op1
),
15889 aarch64_unspec_cond_code (code
));
15890 emit_set_insn (target
, unspec
);
15893 /* Expand an SVE integer comparison using the SVE equivalent of:
15895 (set TARGET (CODE OP0 OP1)). */
15898 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
15900 machine_mode pred_mode
= GET_MODE (target
);
15901 machine_mode data_mode
= GET_MODE (op0
);
15903 if (!aarch64_sve_cmp_operand_p (code
, op1
))
15904 op1
= force_reg (data_mode
, op1
);
15906 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
15907 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
15908 aarch64_emit_sve_ptrue_op_cc (target
, ptrue
, cond
);
15911 /* Emit the SVE equivalent of:
15913 (set TMP1 (CODE1 OP0 OP1))
15914 (set TMP2 (CODE2 OP0 OP1))
15915 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15917 PTRUE is an all-true predicate with the same mode as TARGET. */
15920 aarch64_emit_sve_or_conds (rtx target
, rtx_code code1
, rtx_code code2
,
15921 rtx ptrue
, rtx op0
, rtx op1
)
15923 machine_mode pred_mode
= GET_MODE (ptrue
);
15924 rtx tmp1
= gen_reg_rtx (pred_mode
);
15925 aarch64_emit_sve_ptrue_op (tmp1
, ptrue
,
15926 gen_rtx_fmt_ee (code1
, pred_mode
, op0
, op1
));
15927 rtx tmp2
= gen_reg_rtx (pred_mode
);
15928 aarch64_emit_sve_ptrue_op (tmp2
, ptrue
,
15929 gen_rtx_fmt_ee (code2
, pred_mode
, op0
, op1
));
15930 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
15933 /* Emit the SVE equivalent of:
15935 (set TMP (CODE OP0 OP1))
15936 (set TARGET (not TMP))
15938 PTRUE is an all-true predicate with the same mode as TARGET. */
15941 aarch64_emit_sve_inverted_cond (rtx target
, rtx ptrue
, rtx_code code
,
15944 machine_mode pred_mode
= GET_MODE (ptrue
);
15945 rtx tmp
= gen_reg_rtx (pred_mode
);
15946 aarch64_emit_sve_ptrue_op (tmp
, ptrue
,
15947 gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
));
15948 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
15951 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15953 (set TARGET (CODE OP0 OP1))
15955 If CAN_INVERT_P is true, the caller can also handle inverted results;
15956 return true if the result is in fact inverted. */
15959 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
15960 rtx op0
, rtx op1
, bool can_invert_p
)
15962 machine_mode pred_mode
= GET_MODE (target
);
15963 machine_mode data_mode
= GET_MODE (op0
);
15965 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
15969 /* UNORDERED has no immediate form. */
15970 op1
= force_reg (data_mode
, op1
);
15979 /* There is native support for the comparison. */
15980 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
15981 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
15986 /* This is a trapping operation (LT or GT). */
15987 aarch64_emit_sve_or_conds (target
, LT
, GT
, ptrue
, op0
, op1
);
15991 if (!flag_trapping_math
)
15993 /* This would trap for signaling NaNs. */
15994 op1
= force_reg (data_mode
, op1
);
15995 aarch64_emit_sve_or_conds (target
, UNORDERED
, EQ
, ptrue
, op0
, op1
);
16003 if (flag_trapping_math
)
16005 /* Work out which elements are ordered. */
16006 rtx ordered
= gen_reg_rtx (pred_mode
);
16007 op1
= force_reg (data_mode
, op1
);
16008 aarch64_emit_sve_inverted_cond (ordered
, ptrue
, UNORDERED
, op0
, op1
);
16010 /* Test the opposite condition for the ordered elements,
16011 then invert the result. */
16015 code
= reverse_condition_maybe_unordered (code
);
16018 aarch64_emit_sve_predicated_cond (target
, code
,
16019 ordered
, op0
, op1
);
16022 rtx tmp
= gen_reg_rtx (pred_mode
);
16023 aarch64_emit_sve_predicated_cond (tmp
, code
, ordered
, op0
, op1
);
16024 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
16030 /* ORDERED has no immediate form. */
16031 op1
= force_reg (data_mode
, op1
);
16035 gcc_unreachable ();
16038 /* There is native support for the inverse comparison. */
16039 code
= reverse_condition_maybe_unordered (code
);
16042 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
16043 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
16046 aarch64_emit_sve_inverted_cond (target
, ptrue
, code
, op0
, op1
);
16050 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16051 of the data being selected and CMP_MODE is the mode of the values being
16055 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
16058 machine_mode pred_mode
16059 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode
),
16060 GET_MODE_SIZE (cmp_mode
)).require ();
16061 rtx pred
= gen_reg_rtx (pred_mode
);
16062 if (FLOAT_MODE_P (cmp_mode
))
16064 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
16065 ops
[4], ops
[5], true))
16066 std::swap (ops
[1], ops
[2]);
16069 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
16071 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
16072 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
/* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
   true.  However due to issues with register allocation it is preferable
   to avoid tying integer scalar and FP scalar modes.  Executing integer
   operations in general registers is better than treating them as scalar
   vector operations.  This reduces latency and avoids redundant int<->FP
   moves.  So tie modes if they are either the same class, or vector modes
   with other vector modes, vector structs or any scalar mode.  */

static bool
aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
  if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
    return true;

  /* We specifically want to allow elements of "structure" modes to
     be tieable to the structure.  This more general condition allows
     other rarer situations too.  The reason we don't extend this to
     predicate modes is that there are no predicate structure modes
     nor any specific instructions for extracting part of a predicate
     register.  */
  if (aarch64_vector_data_mode_p (mode1)
      && aarch64_vector_data_mode_p (mode2))
    return true;

  /* Also allow any scalar modes with vectors.  */
  if (aarch64_vector_mode_supported_p (mode1)
      || aarch64_vector_mode_supported_p (mode2))
    return true;

  return false;
}
/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */

static rtx
aarch64_move_pointer (rtx pointer, poly_int64 amount)
{
  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);

  return adjust_automodify_address (pointer, GET_MODE (pointer),
                                    next, amount);
}

/* Return a new RTX holding the result of moving POINTER forward by the
   size of the mode it points to.  */

static rtx
aarch64_progress_pointer (rtx pointer)
{
  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
}
/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
   MODE bytes.  */

static void
aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
                                              machine_mode mode)
{
  rtx reg = gen_reg_rtx (mode);

  /* "Cast" the pointers to the correct mode.  */
  *src = adjust_address (*src, mode, 0);
  *dst = adjust_address (*dst, mode, 0);
  /* Emit the memcpy.  */
  emit_move_insn (reg, *src);
  emit_move_insn (*dst, reg);
  /* Move the pointers forward.  */
  *src = aarch64_progress_pointer (*src);
  *dst = aarch64_progress_pointer (*dst);
}
16148 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16149 we succeed, otherwise return false. */
16152 aarch64_expand_movmem (rtx
*operands
)
16155 rtx dst
= operands
[0];
16156 rtx src
= operands
[1];
16158 machine_mode cur_mode
= BLKmode
, next_mode
;
16159 bool speed_p
= !optimize_function_for_size_p (cfun
);
  /* When optimizing for size, give a better estimate of the length of a
     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
     will always require an even number of instructions to do now.  And each
     operation requires both a load+store, so divide the max number by 2.  */
16165 int max_num_moves
= (speed_p
? 16 : AARCH64_CALL_RATIO
) / 2;
16167 /* We can't do anything smart if the amount to copy is not constant. */
16168 if (!CONST_INT_P (operands
[2]))
16171 n
= INTVAL (operands
[2]);
16173 /* Try to keep the number of instructions low. For all cases we will do at
16174 most two moves for the residual amount, since we'll always overlap the
16176 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_num_moves
)
16179 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
16180 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
16182 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
16183 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
16185 /* Convert n to bits to make the rest of the code simpler. */
16186 n
= n
* BITS_PER_UNIT
;
16190 /* Find the largest mode in which to do the copy in without over reading
16192 opt_scalar_int_mode mode_iter
;
16193 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
16194 if (GET_MODE_BITSIZE (mode_iter
.require ()) <= n
)
16195 cur_mode
= mode_iter
.require ();
16197 gcc_assert (cur_mode
!= BLKmode
);
16199 mode_bits
= GET_MODE_BITSIZE (cur_mode
).to_constant ();
16200 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, cur_mode
);
16204 /* Do certain trailing copies as overlapping if it's going to be
16205 cheaper. i.e. less instructions to do so. For instance doing a 15
16206 byte copy it's more efficient to do two overlapping 8 byte copies than
16208 next_mode
= smallest_mode_for_size (n
, MODE_INT
);
16209 int n_bits
= GET_MODE_BITSIZE (next_mode
).to_constant ();
16210 if (n
> 0 && n_bits
> n
&& n_bits
<= 8 * BITS_PER_UNIT
)
16212 src
= aarch64_move_pointer (src
, (n
- n_bits
) / BITS_PER_UNIT
);
16213 dst
= aarch64_move_pointer (dst
, (n
- n_bits
) / BITS_PER_UNIT
);
16221 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16222 SImode stores. Handle the case when the constant has identical
16223 bottom and top halves. This is beneficial when the two stores can be
16224 merged into an STP and we avoid synthesising potentially expensive
16225 immediates twice. Return true if such a split is possible. */
16228 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
16230 rtx lo
= gen_lowpart (SImode
, src
);
16231 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
16233 bool size_p
= optimize_function_for_size_p (cfun
);
16235 if (!rtx_equal_p (lo
, hi
))
16238 unsigned int orig_cost
16239 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
16240 unsigned int lo_cost
16241 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
16243 /* We want to transform:
16245 MOVK x1, 0x140, lsl 16
16246 MOVK x1, 0xc0da, lsl 32
16247 MOVK x1, 0x140, lsl 48
16251 MOVK w1, 0x140, lsl 16
16253 So we want to perform this only when we save two instructions
16254 or more. When optimizing for size, however, accept any code size
16256 if (size_p
&& orig_cost
<= lo_cost
)
16260 && (orig_cost
<= lo_cost
+ 1))
16263 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
16264 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
16267 rtx tmp_reg
= gen_reg_rtx (SImode
);
16268 aarch64_expand_mov_immediate (tmp_reg
, lo
);
16269 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
16270 /* Don't emit an explicit store pair as this may not be always profitable.
16271 Let the sched-fusion logic decide whether to merge them. */
16272 emit_move_insn (mem_lo
, tmp_reg
);
16273 emit_move_insn (mem_hi
, tmp_reg
);
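/* A minimal illustrative sketch, not part of the backend: the "identical
   halves" test that gates the split above, on a plain 64-bit value.  For
   example 0x0140c0da0140c0da has lo == hi == 0x0140c0da and therefore
   qualifies.  The function name is hypothetical.  */
static bool ATTRIBUTE_UNUSED
aarch64_example_dimode_halves_equal (unsigned HOST_WIDE_INT val)
{
  unsigned HOST_WIDE_INT lo = val & 0xffffffff;
  unsigned HOST_WIDE_INT hi = val >> 32;
  return lo == hi;
}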
/* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */

static unsigned HOST_WIDE_INT
aarch64_asan_shadow_offset (void)
{
  return (HOST_WIDE_INT_1 << 36);
}
16287 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
16288 int code
, tree treeop0
, tree treeop1
)
16290 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
16292 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
16294 struct expand_operand ops
[4];
16297 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
16299 op_mode
= GET_MODE (op0
);
16300 if (op_mode
== VOIDmode
)
16301 op_mode
= GET_MODE (op1
);
16309 icode
= CODE_FOR_cmpsi
;
16314 icode
= CODE_FOR_cmpdi
;
16319 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
16320 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
16325 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
16326 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
16334 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
16335 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
16341 *prep_seq
= get_insns ();
16344 create_fixed_operand (&ops
[0], op0
);
16345 create_fixed_operand (&ops
[1], op1
);
16348 if (!maybe_expand_insn (icode
, 2, ops
))
16353 *gen_seq
= get_insns ();
16356 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
16357 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
16361 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
16362 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
16364 rtx op0
, op1
, target
;
16365 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
16366 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
16368 struct expand_operand ops
[6];
16371 push_to_sequence (*prep_seq
);
16372 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
16374 op_mode
= GET_MODE (op0
);
16375 if (op_mode
== VOIDmode
)
16376 op_mode
= GET_MODE (op1
);
16384 icode
= CODE_FOR_ccmpsi
;
16389 icode
= CODE_FOR_ccmpdi
;
16394 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
16395 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
16400 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
16401 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
16409 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
16410 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
16416 *prep_seq
= get_insns ();
16419 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
16420 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
16422 if (bit_code
!= AND
)
16424 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
16425 GET_MODE (XEXP (prev
, 0))),
16426 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
16427 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
16430 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
16431 create_fixed_operand (&ops
[1], target
);
16432 create_fixed_operand (&ops
[2], op0
);
16433 create_fixed_operand (&ops
[3], op1
);
16434 create_fixed_operand (&ops
[4], prev
);
16435 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
16437 push_to_sequence (*gen_seq
);
16438 if (!maybe_expand_insn (icode
, 6, ops
))
16444 *gen_seq
= get_insns ();
16447 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
#undef TARGET_GEN_CCMP_FIRST
#define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first

#undef TARGET_GEN_CCMP_NEXT
#define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next

/* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
   instruction fusion of some sort.  */

static bool
aarch64_macro_fusion_p (void)
{
  return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
}
16466 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16467 should be kept together during scheduling. */
16470 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
16473 rtx prev_set
= single_set (prev
);
16474 rtx curr_set
= single_set (curr
);
16475 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16476 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
16478 if (!aarch64_macro_fusion_p ())
16481 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
16483 /* We are trying to match:
16484 prev (mov) == (set (reg r0) (const_int imm16))
16485 curr (movk) == (set (zero_extract (reg r0)
16488 (const_int imm16_1)) */
16490 set_dest
= SET_DEST (curr_set
);
16492 if (GET_CODE (set_dest
) == ZERO_EXTRACT
16493 && CONST_INT_P (SET_SRC (curr_set
))
16494 && CONST_INT_P (SET_SRC (prev_set
))
16495 && CONST_INT_P (XEXP (set_dest
, 2))
16496 && INTVAL (XEXP (set_dest
, 2)) == 16
16497 && REG_P (XEXP (set_dest
, 0))
16498 && REG_P (SET_DEST (prev_set
))
16499 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
16505 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
16508 /* We're trying to match:
16509 prev (adrp) == (set (reg r1)
16510 (high (symbol_ref ("SYM"))))
16511 curr (add) == (set (reg r0)
16513 (symbol_ref ("SYM"))))
16514 Note that r0 need not necessarily be the same as r1, especially
16515 during pre-regalloc scheduling. */
16517 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
16518 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
16520 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
16521 && REG_P (XEXP (SET_SRC (curr_set
), 0))
16522 && REGNO (XEXP (SET_SRC (curr_set
), 0))
16523 == REGNO (SET_DEST (prev_set
))
16524 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
16525 XEXP (SET_SRC (curr_set
), 1)))
16530 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
16533 /* We're trying to match:
16534 prev (movk) == (set (zero_extract (reg r0)
16537 (const_int imm16_1))
16538 curr (movk) == (set (zero_extract (reg r0)
16541 (const_int imm16_2)) */
16543 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
16544 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
16545 && REG_P (XEXP (SET_DEST (prev_set
), 0))
16546 && REG_P (XEXP (SET_DEST (curr_set
), 0))
16547 && REGNO (XEXP (SET_DEST (prev_set
), 0))
16548 == REGNO (XEXP (SET_DEST (curr_set
), 0))
16549 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
16550 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
16551 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
16552 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
16553 && CONST_INT_P (SET_SRC (prev_set
))
16554 && CONST_INT_P (SET_SRC (curr_set
)))
16558 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
16560 /* We're trying to match:
16561 prev (adrp) == (set (reg r0)
16562 (high (symbol_ref ("SYM"))))
16563 curr (ldr) == (set (reg r1)
16564 (mem (lo_sum (reg r0)
16565 (symbol_ref ("SYM")))))
16567 curr (ldr) == (set (reg r1)
16570 (symbol_ref ("SYM")))))) */
16571 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
16572 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
16574 rtx curr_src
= SET_SRC (curr_set
);
16576 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
16577 curr_src
= XEXP (curr_src
, 0);
16579 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
16580 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
16581 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
16582 == REGNO (SET_DEST (prev_set
))
16583 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
16584 XEXP (SET_SRC (prev_set
), 0)))
16589 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC
)
16590 && aarch_crypto_can_dual_issue (prev
, curr
))
16593 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
16594 && any_condjump_p (curr
))
16596 enum attr_type prev_type
= get_attr_type (prev
);
16598 unsigned int condreg1
, condreg2
;
16600 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
16601 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
16603 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
16605 && modified_in_p (cc_reg_1
, prev
))
      /* FIXME: this misses some instructions which are considered simple
         arithmetic for ThunderX.  Simple shifts are missed here.  */
16609 if (prev_type
== TYPE_ALUS_SREG
16610 || prev_type
== TYPE_ALUS_IMM
16611 || prev_type
== TYPE_LOGICS_REG
16612 || prev_type
== TYPE_LOGICS_IMM
)
16619 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
16620 && any_condjump_p (curr
))
16622 /* We're trying to match:
16623 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16624 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16626 (label_ref ("SYM"))
16628 if (SET_DEST (curr_set
) == (pc_rtx
)
16629 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
16630 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
16631 && REG_P (SET_DEST (prev_set
))
16632 && REGNO (SET_DEST (prev_set
))
16633 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
16635 /* Fuse ALU operations followed by conditional branch instruction. */
16636 switch (get_attr_type (prev
))
16639 case TYPE_ALU_SREG
:
16642 case TYPE_ADCS_REG
:
16643 case TYPE_ADCS_IMM
:
16644 case TYPE_LOGIC_REG
:
16645 case TYPE_LOGIC_IMM
:
16649 case TYPE_SHIFT_REG
:
16650 case TYPE_SHIFT_IMM
:
/* Return true iff the instruction fusion described by OP is enabled.  */
bool
aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
{
  return (aarch64_tune_params.fusible_ops & op) != 0;
}
/* If MEM is in the form of [base+offset], extract the two parts
   of address and set to BASE and OFFSET, otherwise return false
   after clearing BASE and OFFSET.  */

static bool
extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
{
  rtx addr;

  gcc_assert (MEM_P (mem));

  addr = XEXP (mem, 0);

  if (REG_P (addr))
    {
      *base = addr;
      *offset = const0_rtx;
      return true;
    }

  if (GET_CODE (addr) == PLUS
      && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
    {
      *base = XEXP (addr, 0);
      *offset = XEXP (addr, 1);
      return true;
    }

  *base = NULL_RTX;
  *offset = NULL_RTX;

  return false;
}
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};
16718 /* If INSN is a load or store of address in the form of [base+offset],
16719 extract the two parts and set to BASE and OFFSET. Return scheduling
16720 fusion type this INSN is. */
16722 static enum sched_fusion_type
16723 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
16726 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
16728 gcc_assert (INSN_P (insn
));
16729 x
= PATTERN (insn
);
16730 if (GET_CODE (x
) != SET
)
16731 return SCHED_FUSION_NONE
;
16734 dest
= SET_DEST (x
);
16736 machine_mode dest_mode
= GET_MODE (dest
);
16738 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
16739 return SCHED_FUSION_NONE
;
16741 if (GET_CODE (src
) == SIGN_EXTEND
)
16743 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
16744 src
= XEXP (src
, 0);
16745 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
16746 return SCHED_FUSION_NONE
;
16748 else if (GET_CODE (src
) == ZERO_EXTEND
)
16750 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
16751 src
= XEXP (src
, 0);
16752 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
16753 return SCHED_FUSION_NONE
;
16756 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
16757 extract_base_offset_in_addr (src
, base
, offset
);
16758 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
16760 fusion
= SCHED_FUSION_ST
;
16761 extract_base_offset_in_addr (dest
, base
, offset
);
16764 return SCHED_FUSION_NONE
;
16766 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
16767 fusion
= SCHED_FUSION_NONE
;
/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.

   Currently we only support fusing ldr or str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, other kinds of instruction fusion can be added by returning
   different priorities.

   It's important that irrelevant instructions get the largest FUSION_PRI.  */
16782 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
16783 int *fusion_pri
, int *pri
)
16787 enum sched_fusion_type fusion
;
16789 gcc_assert (INSN_P (insn
));
16792 fusion
= fusion_load_store (insn
, &base
, &offset
);
16793 if (fusion
== SCHED_FUSION_NONE
)
16800 /* Set FUSION_PRI according to fusion type and base register. */
16801 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
16803 /* Calculate PRI. */
16806 /* INSN with smaller offset goes first. */
16807 off_val
= (int)(INTVAL (offset
));
16809 tmp
-= (off_val
& 0xfffff);
16811 tmp
+= ((- off_val
) & 0xfffff);
16817 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16818 Adjust priority of sha1h instructions so they are scheduled before
16819 other SHA1 instructions. */
16822 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
16824 rtx x
= PATTERN (insn
);
16826 if (GET_CODE (x
) == SET
)
16830 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
16831 return priority
+ 10;
16837 /* Given OPERANDS of consecutive load/store, check if we can merge
16838 them into ldp/stp. LOAD is true if they are load instructions.
16839 MODE is the mode of memory operands. */
16842 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
16845 HOST_WIDE_INT offval_1
, offval_2
, msize
;
16846 enum reg_class rclass_1
, rclass_2
;
16847 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
16851 mem_1
= operands
[1];
16852 mem_2
= operands
[3];
16853 reg_1
= operands
[0];
16854 reg_2
= operands
[2];
16855 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
16856 if (REGNO (reg_1
) == REGNO (reg_2
))
16861 mem_1
= operands
[0];
16862 mem_2
= operands
[2];
16863 reg_1
= operands
[1];
16864 reg_2
= operands
[3];
16867 /* The mems cannot be volatile. */
16868 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
16871 /* If we have SImode and slow unaligned ldp,
16872 check the alignment to be at least 8 byte. */
16874 && (aarch64_tune_params
.extra_tuning_flags
16875 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
16877 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
16880 /* Check if the addresses are in the form of [base+offset]. */
16881 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
16882 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
16884 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
16885 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
16888 /* Check if the bases are same. */
16889 if (!rtx_equal_p (base_1
, base_2
))
16892 /* The operands must be of the same size. */
16893 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
16894 GET_MODE_SIZE (GET_MODE (mem_2
))));
16896 offval_1
= INTVAL (offset_1
);
16897 offval_2
= INTVAL (offset_2
);
16898 /* We should only be trying this for fixed-sized modes. There is no
16899 SVE LDP/STP instruction. */
16900 msize
= GET_MODE_SIZE (mode
).to_constant ();
16901 /* Check if the offsets are consecutive. */
16902 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
16905 /* Check if the addresses are clobbered by load. */
16908 if (reg_mentioned_p (reg_1
, mem_1
))
16911 /* In increasing order, the last load can clobber the address. */
16912 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
16916 /* One of the memory accesses must be a mempair operand.
16917 If it is not the first one, they need to be swapped by the
16919 if (!aarch64_mem_pair_operand (mem_1
, GET_MODE (mem_1
))
16920 && !aarch64_mem_pair_operand (mem_2
, GET_MODE (mem_2
)))
16923 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
16924 rclass_1
= FP_REGS
;
16926 rclass_1
= GENERAL_REGS
;
16928 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
16929 rclass_2
= FP_REGS
;
16931 rclass_2
= GENERAL_REGS
;
16933 /* Check if the registers are of same class. */
16934 if (rclass_1
!= rclass_2
)
16940 /* Given OPERANDS of consecutive load/store that can be merged,
16941 swap them if they are not in ascending order. */
16943 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
16945 rtx mem_1
, mem_2
, base_1
, base_2
, offset_1
, offset_2
;
16946 HOST_WIDE_INT offval_1
, offval_2
;
16950 mem_1
= operands
[1];
16951 mem_2
= operands
[3];
16955 mem_1
= operands
[0];
16956 mem_2
= operands
[2];
16959 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
16960 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
16962 offval_1
= INTVAL (offset_1
);
16963 offval_2
= INTVAL (offset_2
);
16965 if (offval_1
> offval_2
)
16967 /* Irrespective of whether this is a load or a store,
16968 we do the same swap. */
16969 std::swap (operands
[0], operands
[2]);
16970 std::swap (operands
[1], operands
[3]);
/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
   comparison between the two.  */

static int
aarch64_host_wide_int_compare (const void *x, const void *y)
{
  return wi::cmps (* ((const HOST_WIDE_INT *) x),
                   * ((const HOST_WIDE_INT *) y));
}
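/* A minimal illustrative sketch, not used by the backend: how the comparator
   above is meant to be used.  aarch64_operands_adjust_ok_for_ldpstp below
   qsorts the four extracted offsets with it and then requires the sorted
   offsets to form two consecutive pairs.  For example, offsets
   0x104, 0x100, 0x10c, 0x108 with a 4-byte access size sort to
   0x100, 0x104, 0x108, 0x10c and satisfy the check.  The function name is
   hypothetical.  */
static bool ATTRIBUTE_UNUSED
aarch64_example_offsets_form_two_pairs (HOST_WIDE_INT offsets[4],
                                        HOST_WIDE_INT msize)
{
  qsort (offsets, 4, sizeof (HOST_WIDE_INT), aarch64_host_wide_int_compare);
  return (offsets[1] == offsets[0] + msize
          && offsets[3] == offsets[2] + msize);
}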
16983 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
16984 other pointing to a REG rtx containing an offset, compare the offsets
16989 1 iff offset (X) > offset (Y)
16990 0 iff offset (X) == offset (Y)
16991 -1 iff offset (X) < offset (Y) */
16993 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
16995 const rtx
* operands_1
= (const rtx
*) x
;
16996 const rtx
* operands_2
= (const rtx
*) y
;
16997 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
16999 if (MEM_P (operands_1
[0]))
17000 mem_1
= operands_1
[0];
17002 mem_1
= operands_1
[1];
17004 if (MEM_P (operands_2
[0]))
17005 mem_2
= operands_2
[0];
17007 mem_2
= operands_2
[1];
17009 /* Extract the offsets. */
17010 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
17011 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
17013 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
17015 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp by adjusting the offset.  LOAD is true if they
   are load instructions.  MODE is the mode of memory operands.

   Given below consecutive stores:

     str  w1, [xb, 0x100]
     str  w1, [xb, 0x104]
     str  w1, [xb, 0x108]
     str  w1, [xb, 0x10c]

   Though the offsets are out of the range supported by stp, we can
   still pair them after adjusting the offset, like:

     add  scratch, xb, 0x100
     stp  w1, w1, [scratch]
     stp  w1, w1, [scratch, 0x8]

   The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */
bool
aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
				       scalar_mode mode)
{
  enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
  HOST_WIDE_INT offvals[4], msize;
  rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
  rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;

  if (load)
    {
      reg_1 = operands[0];
      mem_1 = operands[1];
      reg_2 = operands[2];
      mem_2 = operands[3];
      reg_3 = operands[4];
      mem_3 = operands[5];
      reg_4 = operands[6];
      mem_4 = operands[7];
      gcc_assert (REG_P (reg_1) && REG_P (reg_2)
		  && REG_P (reg_3) && REG_P (reg_4));

      /* Do not attempt to merge the loads if the loads clobber each other.  */
      for (int i = 0; i < 8; i += 2)
	for (int j = i + 2; j < 8; j += 2)
	  if (reg_overlap_mentioned_p (operands[i], operands[j]))
	    return false;
    }
  else
    {
      mem_1 = operands[0];
      reg_1 = operands[1];
      mem_2 = operands[2];
      reg_2 = operands[3];
      mem_3 = operands[4];
      reg_3 = operands[5];
      mem_4 = operands[6];
      reg_4 = operands[7];
    }

  /* Skip if memory operand is by itself valid for ldp/stp.  */
  if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
    return false;

  /* The mems cannot be volatile.  */
  if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
      || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
    return false;

  /* Check if the addresses are in the form of [base+offset].  */
  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
  if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
  if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
  if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
  if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
    return false;

  /* Check if the bases are the same.  */
  if (!rtx_equal_p (base_1, base_2)
      || !rtx_equal_p (base_2, base_3)
      || !rtx_equal_p (base_3, base_4))
    return false;

  offvals[0] = INTVAL (offset_1);
  offvals[1] = INTVAL (offset_2);
  offvals[2] = INTVAL (offset_3);
  offvals[3] = INTVAL (offset_4);
  msize = GET_MODE_SIZE (mode);

  /* Check if the offsets can be put in the right order to do a ldp/stp.  */
  qsort (offvals, 4, sizeof (HOST_WIDE_INT), aarch64_host_wide_int_compare);

  if (!(offvals[1] == offvals[0] + msize
	&& offvals[3] == offvals[2] + msize))
    return false;

  /* Check that the offsets are within range of each other.  The ldp/stp
     instructions have 7 bit immediate offsets, so use 0x80.  */
  if (offvals[2] - offvals[0] >= msize * 0x80)
    return false;

  /* The offsets must be aligned with respect to each other.  */
  if (offvals[0] % msize != offvals[2] % msize)
    return false;

  /* Check if the addresses are clobbered by load.  */
  if (load && (reg_mentioned_p (reg_1, mem_1)
	       || reg_mentioned_p (reg_2, mem_2)
	       || reg_mentioned_p (reg_3, mem_3)
	       || reg_mentioned_p (reg_4, mem_4)))
    return false;

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 byte.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && !optimize_size
      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
    return false;

  if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
    rclass_1 = FP_REGS;
  else
    rclass_1 = GENERAL_REGS;

  if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
    rclass_2 = FP_REGS;
  else
    rclass_2 = GENERAL_REGS;

  if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
    rclass_3 = FP_REGS;
  else
    rclass_3 = GENERAL_REGS;

  if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
    rclass_4 = FP_REGS;
  else
    rclass_4 = GENERAL_REGS;

  /* Check if the registers are of same class.  */
  if (rclass_1 != rclass_2 || rclass_2 != rclass_3
      || rclass_3 != rclass_4)
    return false;

  return true;
}
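/* Added illustrative note (assumed SImode operands, not from the original
   source): for stores at xb+0x104, xb+0x100, xb+0x10c and xb+0x108, the
   qsort above yields {0x100, 0x104, 0x108, 0x10c}.  With msize == 4 the
   checks hold: 0x104 == 0x100 + 4 and 0x10c == 0x108 + 4, the spread
   0x108 - 0x100 = 8 is below 4 * 0x80 = 512, and 0x100 % 4 == 0x108 % 4,
   so the group is accepted for pairing.  */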
/* Given OPERANDS of consecutive load/store, this function pairs them
   into LDP/STP after adjusting the offset.  It depends on the fact
   that the operands can be sorted so the offsets are correct for STP.
   MODE is the mode of memory operands.  CODE is the rtl operator
   which should be applied to all memory operands, it's SIGN_EXTEND,
   ZERO_EXTEND or UNKNOWN.  */
bool
aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
			     scalar_mode mode, RTX_CODE code)
{
  rtx base, offset_1, offset_3, t1, t2;
  rtx mem_1, mem_2, mem_3, mem_4;
  rtx temp_operands[8];
  HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
		stp_off_upper_limit, stp_off_lower_limit, msize;

  /* We make changes on a copy as we may still bail out.  */
  for (int i = 0; i < 8; i++)
    temp_operands[i] = operands[i];

  /* Sort the operands.  */
  qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);

  if (load)
    {
      mem_1 = temp_operands[1];
      mem_2 = temp_operands[3];
      mem_3 = temp_operands[5];
      mem_4 = temp_operands[7];
    }
  else
    {
      mem_1 = temp_operands[0];
      mem_2 = temp_operands[2];
      mem_3 = temp_operands[4];
      mem_4 = temp_operands[6];
      gcc_assert (code == UNKNOWN);
    }

  extract_base_offset_in_addr (mem_1, &base, &offset_1);
  extract_base_offset_in_addr (mem_3, &base, &offset_3);
  gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
	      && offset_3 != NULL_RTX);

  /* Adjust offset so it can fit in LDP/STP instruction.  */
  msize = GET_MODE_SIZE (mode);
  stp_off_upper_limit = msize * (0x40 - 1);
  stp_off_lower_limit = - msize * 0x40;

  off_val_1 = INTVAL (offset_1);
  off_val_3 = INTVAL (offset_3);

  /* The base offset is optimally half way between the two STP/LDP offsets.  */
  if (msize <= 4)
    base_off = (off_val_1 + off_val_3) / 2;
  else
    /* However, due to issues with negative LDP/STP offset generation for
       larger modes, for DF, DI and vector modes, we must not use negative
       addresses smaller than 9 signed unadjusted bits can store.  This
       provides the most range in this case.  */
    base_off = off_val_1;

  /* Adjust the base so that it is aligned with the addresses but still
     optimal.  */
  if (base_off % msize != off_val_1 % msize)
    /* Fix the offset, bearing in mind we want to make it bigger not
       smaller.  */
    base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
  else if (msize <= 4)
    /* The negative range of LDP/STP is one larger than the positive range.  */
    base_off += msize;

  /* Check if base offset is too big or too small.  We can attempt to resolve
     this issue by setting it to the maximum value and seeing if the offsets
     still fit.  */
  if (base_off >= 0x1000)
    {
      base_off = 0x1000 - 1;
      /* We must still make sure that the base offset is aligned with respect
	 to the address.  But it may not be made any bigger.  */
      base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }

  /* Likewise for the case where the base is too small.  */
  if (base_off <= -0x1000)
    {
      base_off = -0x1000 + 1;
      base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }

  /* Offset of the first STP/LDP.  */
  new_off_1 = off_val_1 - base_off;

  /* Offset of the second STP/LDP.  */
  new_off_3 = off_val_3 - base_off;

  /* The offsets must be within the range of the LDP/STP instructions.  */
  if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
      || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
    return false;

  replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
						  new_off_1), true);
  replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
						  new_off_1 + msize), true);
  replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
						  new_off_3), true);
  replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
						  new_off_3 + msize), true);

  if (!aarch64_mem_pair_operand (mem_1, mode)
      || !aarch64_mem_pair_operand (mem_3, mode))
    return false;

  if (code == ZERO_EXTEND)
    {
      mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
    }
  else if (code == SIGN_EXTEND)
    {
      mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
    }

  if (load)
    {
      operands[0] = temp_operands[0];
      operands[1] = mem_1;
      operands[2] = temp_operands[2];
      operands[3] = mem_2;
      operands[4] = temp_operands[4];
      operands[5] = mem_3;
      operands[6] = temp_operands[6];
      operands[7] = mem_4;
    }
  else
    {
      operands[0] = mem_1;
      operands[1] = temp_operands[1];
      operands[2] = mem_2;
      operands[3] = temp_operands[3];
      operands[4] = mem_3;
      operands[5] = temp_operands[5];
      operands[6] = mem_4;
      operands[7] = temp_operands[7];
    }

  /* Emit adjusting instruction.  */
  emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
  /* Emit ldp/stp instructions.  */
  t1 = gen_rtx_SET (operands[0], operands[1]);
  t2 = gen_rtx_SET (operands[2], operands[3]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
  t1 = gen_rtx_SET (operands[4], operands[5]);
  t2 = gen_rtx_SET (operands[6], operands[7]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
  return true;
}
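/* Added worked example (assumed SImode group from the comment above
   aarch64_operands_adjust_ok_for_ldpstp; not part of the original source):
   for stores at xb+0x100 .. xb+0x10c, off_val_1 = 0x100, off_val_3 = 0x108
   and msize = 4, so base_off starts at (0x100 + 0x108) / 2 = 0x104 and,
   being already aligned, is bumped by msize to 0x108.  The new offsets are
   new_off_1 = -8 and new_off_3 = 0, both inside [-256, 252], so the code
   emits roughly:

     add  scratch, xb, 0x108
     stp  w1, w1, [scratch, -8]
     stp  w1, w1, [scratch]  */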
/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
   it isn't worth branching around empty masked ops (including masked
   stores).  */

static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}
/* Return 1 if pseudo register should be created and used to hold
   GOT address for PIC code.  */

static bool
aarch64_use_pseudo_pic_reg (void)
{
  return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
}
/* Implement TARGET_UNSPEC_MAY_TRAP_P.  */

static int
aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
{
  switch (XINT (x, 1))
    {
    case UNSPEC_GOTSMALLPIC:
    case UNSPEC_GOTSMALLPIC28K:
    case UNSPEC_GOTTINYPIC:
      return 0;
    default:
      break;
    }

  return default_unspec_may_trap_p (x, flags);
}
/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */

int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}
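/* Added illustrative note (not from the original source): with this
   definition a CONST_DOUBLE of 1.0 yields 0, 4.0 yields 2 and 8.0 yields 3,
   while 3.0 (not a power of 2), 0.5 (not an integer value) and any negative,
   NaN or infinite constant yield -1.  */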
/* If X is a vector of equal CONST_DOUBLE values and that value is
   Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */

int
aarch64_vec_fpconst_pow_of_2 (rtx x)
{
  int nelts;
  if (GET_CODE (x) != CONST_VECTOR
      || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
    return -1;

  if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
    return -1;

  int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
  if (firstval <= 0)
    return -1;

  for (int i = 1; i < nelts; i++)
    if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
      return -1;

  return firstval;
}
/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
   to float.

   __fp16 always promotes through this hook.
   _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
   through the generic excess precision logic rather than here.  */

static tree
aarch64_promoted_type (const_tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t)
      && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
    return float_type_node;

  return NULL_TREE;
}
/* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */

static bool
aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
			   optimization_type opt_type)
{
  switch (op)
    {
    case rsqrt_optab:
      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);

    default:
      return true;
    }
}
/* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */

static unsigned int
aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
					int *offset)
{
  /* Polynomial invariant 1 == (VG / 2) - 1.  */
  gcc_assert (i == 1);
  *factor = 2;
  *offset = 1;
  return AARCH64_DWARF_VG;
}
/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_libgcc_floating_mode_supported_p (mode));
}
/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_scalar_mode_supported_p (scalar_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_scalar_mode_supported_p (mode));
}
/* Set the value of FLT_EVAL_METHOD.
   ISO/IEC TS 18661-3 defines two values that we'd like to make use of:

    0: evaluate all operations and constants, whose semantic type has at
       most the range and precision of type float, to the range and
       precision of float; evaluate all other operations and constants to
       the range and precision of the semantic type;

    N, where _FloatN is a supported interchange floating type
       evaluate all operations and constants, whose semantic type has at
       most the range and precision of _FloatN type, to the range and
       precision of the _FloatN type; evaluate all other operations and
       constants to the range and precision of the semantic type;

   If we have the ARMv8.2-A extensions then we support _Float16 in native
   precision, so we should set this to 16.  Otherwise, we support the type,
   but want to evaluate expressions in float precision, so set this to
   0.  */

static enum flt_eval_method
aarch64_excess_precision (enum excess_precision_type type)
{
  switch (type)
    {
    case EXCESS_PRECISION_TYPE_FAST:
    case EXCESS_PRECISION_TYPE_STANDARD:
      /* We can calculate either in 16-bit range and precision or
	 32-bit range and precision.  Make that decision based on whether
	 we have native support for the ARMv8.2-A 16-bit floating-point
	 instructions or not.  */
      return (TARGET_FP_F16INST
	      ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
	      : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
    case EXCESS_PRECISION_TYPE_IMPLICIT:
      return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
    default:
      gcc_unreachable ();
    }
  return FLT_EVAL_METHOD_UNPREDICTABLE;
}
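/* Added illustrative note (not from the original source): in practice this
   means that on a target with the ARMv8.2-A half-precision instructions
   (TARGET_FP_F16INST) an expression such as a + b on two _Float16 values
   can be evaluated directly in 16-bit precision, whereas without them the
   operands are promoted and the arithmetic is carried out in float, with
   only the final result converted back to _Float16.  */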
/* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
   scheduled for speculative execution.  Reject the long-running division
   and square-root instructions.  */

static bool
aarch64_sched_can_speculate_insn (rtx_insn *insn)
{
  switch (get_attr_type (insn))
    {
    case TYPE_SDIV:
    case TYPE_UDIV:
    case TYPE_FDIVS:
    case TYPE_FDIVD:
    case TYPE_FSQRTS:
    case TYPE_FSQRTD:
    case TYPE_NEON_FP_SQRT_S:
    case TYPE_NEON_FP_SQRT_D:
    case TYPE_NEON_FP_SQRT_S_Q:
    case TYPE_NEON_FP_SQRT_D_Q:
    case TYPE_NEON_FP_DIV_S:
    case TYPE_NEON_FP_DIV_D:
    case TYPE_NEON_FP_DIV_S_Q:
    case TYPE_NEON_FP_DIV_D_Q:
      return false;
    default:
      return true;
    }
}
/* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */

static int
aarch64_compute_pressure_classes (reg_class *classes)
{
  int i = 0;
  classes[i++] = GENERAL_REGS;
  classes[i++] = FP_REGS;
  /* PR_REGS isn't a useful pressure class because many predicate pseudo
     registers need to go in PR_LO_REGS at some point during their
     lifetime.  Splitting it into two halves has the effect of making
     all predicates count against PR_LO_REGS, so that we try whenever
     possible to restrict the number of live predicates to 8.  This
     greatly reduces the amount of spilling in certain loops.  */
  classes[i++] = PR_LO_REGS;
  classes[i++] = PR_HI_REGS;
  return i;
}
/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */

static bool
aarch64_can_change_mode_class (machine_mode from,
			       machine_mode to, reg_class_t)
{
  if (BYTES_BIG_ENDIAN)
    {
      bool from_sve_p = aarch64_sve_data_mode_p (from);
      bool to_sve_p = aarch64_sve_data_mode_p (to);

      /* Don't allow changes between SVE data modes and non-SVE modes.
	 See the comment at the head of aarch64-sve.md for details.  */
      if (from_sve_p != to_sve_p)
	return false;

      /* Don't allow changes in element size: lane 0 of the new vector
	 would not then be lane 0 of the old vector.  See the comment
	 above aarch64_maybe_expand_sve_subreg_move for a more detailed
	 description.

	 In the worst case, this forces a register to be spilled in
	 one mode and reloaded in the other, which handles the
	 endianness correctly.  */
      if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
	return false;
    }
  return true;
}
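/* Added illustrative note (not from the original source): on a big-endian
   target this rejects, for example, a subreg-style change from an SVE mode
   with 32-bit elements (such as VNx4SI) to one with 16-bit elements (such
   as VNx8HI), because the element sizes differ; the value is instead
   spilled in one mode and reloaded in the other, which preserves the lane
   layout.  */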
/* Implement TARGET_EARLY_REMAT_MODES.  */

static void
aarch64_select_early_remat_modes (sbitmap modes)
{
  /* SVE values are not normally live across a call, so it should be
     worth doing early rematerialization even in VL-specific mode.  */
  for (int i = 0; i < NUM_MACHINE_MODES; ++i)
    {
      machine_mode mode = (machine_mode) i;
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags & VEC_ANY_SVE)
	bitmap_set_bit (modes, i);
    }
}
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost
/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif
#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"