/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2019 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "langhooks.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"
#include "function-abi.h"

/* This file should be included last.  */
#include "target-def.h"

/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
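/* For example, with 64-bit pointers POINTER_BYTES evaluates to
   64 / 8 == 8, and for ILP32 (32-bit pointers) it is 4.  */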
/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN, INDEX, PTRUE };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);
  simd_immediate_info (scalar_int_mode, aarch64_svpattern);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  union
  {
    /* For MOV and MVN.  */
    struct
    {
      /* The value of each element.  */
      rtx value;

      /* The kind of shift modifier to use, and the number of bits to shift.
	 This is (LSL, 0) if no shift is needed.  */
      modifier_type modifier;
      unsigned int shift;
    } mov;

    /* For INDEX.  */
    struct
    {
      /* The value of the first element and the step to be added for each
	 subsequent element.  */
      rtx base, step;
    } index;

    /* For PTRUE.  */
    aarch64_svpattern pattern;
  } u;
};
/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), insn (MOV)
{
  u.mov.value = value_in;
  u.mov.modifier = LSL;
}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), insn (insn_in)
{
  u.mov.value = gen_int_mode (value_in, elt_mode_in);
  u.mov.modifier = modifier_in;
  u.mov.shift = shift_in;
}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to BASE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
  : elt_mode (elt_mode_in), insn (INDEX)
{
  u.index.base = base_in;
  u.index.step = step_in;
}

/* Construct a predicate that controls elements of mode ELT_MODE_IN
   and has PTRUE pattern PATTERN_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       aarch64_svpattern pattern_in)
  : elt_mode (elt_mode_in), insn (PTRUE)
{
  u.pattern = pattern_in;
}
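/* Illustrative example (names follow the definitions above): a vector
   whose 16-bit elements all equal 0x1200 can be described as the value
   0x12 combined with an LSL shift of 8,

     simd_immediate_info info (HImode, 0x12, simd_immediate_info::MOV,
			       simd_immediate_info::LSL, 8);

   while a linear series such as 0, 2, 4, ... would use the INDEX form,
   giving the base and step as rtx constants.  */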
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
					    aarch64_addr_query_type);
static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
uint64_t aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;

#define BRANCH_PROTECT_STR_MAX 255
char *accepted_branch_protection_string = NULL;
static enum aarch64_parse_opt_result
aarch64_parse_branch_protection (const char*, char**);

/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};
#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};
/* Tuning parameters.  */

static const struct cpu_addrcost_table generic_addrcost_table =
{
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
};

static const struct cpu_addrcost_table exynosm1_addrcost_table =
{
  1, /* register_offset  */
  1, /* register_sextend  */
  2, /* register_zextend  */
};

static const struct cpu_addrcost_table xgene1_addrcost_table =
{
  0, /* register_offset  */
  1, /* register_sextend  */
  1, /* register_zextend  */
};

static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
{
  2, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
};

static const struct cpu_addrcost_table tsv110_addrcost_table =
{
  0, /* register_offset  */
  1, /* register_sextend  */
  1, /* register_zextend  */
};

static const struct cpu_addrcost_table qdf24xx_addrcost_table =
{
  3, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
};
static const struct cpu_regmove_cost generic_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost cortexa53_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost exynosm1_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost (actual, 4 and 9).  */
};

static const struct cpu_regmove_cost thunderx_regmove_cost =
{
};

static const struct cpu_regmove_cost xgene1_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost qdf24xx_regmove_cost =
{
  /* Avoid the use of int<->fp moves for spilling.  */
};

static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
{
  /* Avoid the use of int<->fp moves for spilling.  */
};

static const struct cpu_regmove_cost tsv110_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};
/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost generic_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  1, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* vec_int_stmt_cost  */
  1, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  1, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* vec_align_load_cost  */
  1, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  1  /* cond_not_taken_branch_cost  */
};

/* QDF24XX costs for vector insn classes.  */
static const struct cpu_vector_cost qdf24xx_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  1, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* vec_int_stmt_cost  */
  3, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  1, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* vec_align_load_cost  */
  1, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  1  /* cond_not_taken_branch_cost  */
};

/* ThunderX costs for vector insn classes.  */
static const struct cpu_vector_cost thunderx_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  3, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  4, /* vec_int_stmt_cost  */
  1, /* vec_fp_stmt_cost  */
  4, /* vec_permute_cost  */
  2, /* vec_to_scalar_cost  */
  2, /* scalar_to_vec_cost  */
  3, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  5, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  3  /* cond_not_taken_branch_cost  */
};

static const struct cpu_vector_cost tsv110_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_int_stmt_cost  */
  2, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  3, /* vec_to_scalar_cost  */
  2, /* scalar_to_vec_cost  */
  5, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1  /* cond_not_taken_branch_cost  */
};

/* Costs for vector insn classes for Cortex-A57.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_int_stmt_cost  */
  2, /* vec_fp_stmt_cost  */
  3, /* vec_permute_cost  */
  8, /* vec_to_scalar_cost  */
  8, /* scalar_to_vec_cost  */
  4, /* vec_align_load_cost  */
  4, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1  /* cond_not_taken_branch_cost  */
};

static const struct cpu_vector_cost exynosm1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  3, /* vec_int_stmt_cost  */
  3, /* vec_fp_stmt_cost  */
  3, /* vec_permute_cost  */
  3, /* vec_to_scalar_cost  */
  3, /* scalar_to_vec_cost  */
  5, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1  /* cond_not_taken_branch_cost  */
};

/* Costs for vector insn classes for X-Gene 1.  */
static const struct cpu_vector_cost xgene1_vector_cost =
{
  1,  /* scalar_int_stmt_cost  */
  1,  /* scalar_fp_stmt_cost  */
  5,  /* scalar_load_cost  */
  1,  /* scalar_store_cost  */
  2,  /* vec_int_stmt_cost  */
  2,  /* vec_fp_stmt_cost  */
  2,  /* vec_permute_cost  */
  4,  /* vec_to_scalar_cost  */
  4,  /* scalar_to_vec_cost  */
  10, /* vec_align_load_cost  */
  10, /* vec_unalign_load_cost  */
  2,  /* vec_unalign_store_cost  */
  2,  /* vec_store_cost  */
  2,  /* cond_taken_branch_cost  */
  1   /* cond_not_taken_branch_cost  */
};

/* Costs for vector insn classes for Vulcan.  */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  6, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  5, /* vec_int_stmt_cost  */
  6, /* vec_fp_stmt_cost  */
  3, /* vec_permute_cost  */
  6, /* vec_to_scalar_cost  */
  5, /* scalar_to_vec_cost  */
  8, /* vec_align_load_cost  */
  8, /* vec_unalign_load_cost  */
  4, /* vec_unalign_store_cost  */
  4, /* vec_store_cost  */
  2, /* cond_taken_branch_cost  */
  1  /* cond_not_taken_branch_cost  */
};
/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
{
  1,  /* Predictable.  */
  3   /* Unpredictable.  */
};

/* Generic approximation modes.  */
static const cpu_approx_modes generic_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_NONE,	/* sqrt  */
  AARCH64_APPROX_NONE	/* recip_sqrt  */
};

/* Approximation modes for Exynos M1.  */
static const cpu_approx_modes exynosm1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_ALL,	/* sqrt  */
  AARCH64_APPROX_ALL	/* recip_sqrt  */
};

/* Approximation modes for X-Gene 1.  */
static const cpu_approx_modes xgene1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_NONE,	/* sqrt  */
  AARCH64_APPROX_ALL	/* recip_sqrt  */
};
/* Generic prefetch settings (which disable prefetch).  */
static const cpu_prefetch_tune generic_prefetch_tune =
{
  -1,		/* l1_cache_size  */
  -1,		/* l1_cache_line_size  */
  -1,		/* l2_cache_size  */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  -1		/* default_opt_level  */
};

static const cpu_prefetch_tune exynosm1_prefetch_tune =
{
  -1,		/* l1_cache_size  */
  64,		/* l1_cache_line_size  */
  -1,		/* l2_cache_size  */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  -1		/* default_opt_level  */
};

static const cpu_prefetch_tune qdf24xx_prefetch_tune =
{
  32,		/* l1_cache_size  */
  64,		/* l1_cache_line_size  */
  512,		/* l2_cache_size  */
  false,	/* prefetch_dynamic_strides */
  2048,		/* minimum_stride */
  3		/* default_opt_level  */
};

static const cpu_prefetch_tune thunderxt88_prefetch_tune =
{
  32,		/* l1_cache_size  */
  128,		/* l1_cache_line_size  */
  16*1024,	/* l2_cache_size  */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  3		/* default_opt_level  */
};

static const cpu_prefetch_tune thunderx_prefetch_tune =
{
  32,		/* l1_cache_size  */
  128,		/* l1_cache_line_size  */
  -1,		/* l2_cache_size  */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  -1		/* default_opt_level  */
};

static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
{
  32,		/* l1_cache_size  */
  64,		/* l1_cache_line_size  */
  256,		/* l2_cache_size  */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  -1		/* default_opt_level  */
};

static const cpu_prefetch_tune tsv110_prefetch_tune =
{
  64,		/* l1_cache_size  */
  64,		/* l1_cache_line_size  */
  512,		/* l2_cache_size  */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  -1		/* default_opt_level  */
};

static const cpu_prefetch_tune xgene1_prefetch_tune =
{
  32,		/* l1_cache_size  */
  64,		/* l1_cache_line_size  */
  256,		/* l2_cache_size  */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  -1		/* default_opt_level  */
};
static const struct tune_params generic_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
  "16:12",	/* function_align.  */
  "4",		/* jump_align.  */
  "8",		/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa35_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa53_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params cortexa57_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa72_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa73_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params exynosm1_tunings =
{
  &exynosm1_extra_costs,
  &exynosm1_addrcost_table,
  &exynosm1_regmove_cost,
  &exynosm1_vector_cost,
  &generic_branch_cost,
  &exynosm1_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
  "4",	/* function_align.  */
  "4",	/* jump_align.  */
  "4",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  48,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &exynosm1_prefetch_tune
};

static const struct tune_params thunderxt88_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
  "8",	/* function_align.  */
  "8",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),	/* tune_flags.  */
  &thunderxt88_prefetch_tune
};

static const struct tune_params thunderx_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
  "8",	/* function_align.  */
  "8",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
  &thunderx_prefetch_tune
};
static const struct tune_params tsv110_tunings =
{
  &tsv110_extra_costs,
  &tsv110_addrcost_table,
  &tsv110_regmove_cost,
  &tsv110_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &tsv110_prefetch_tune
};

static const struct tune_params xgene1_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  "16",	/* function_align.  */
  "16",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  17,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
  &xgene1_prefetch_tune
};

static const struct tune_params emag_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  SVE_NOT_IMPLEMENTED,
  6, /* memmov_cost  */
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  "16",	/* function_align.  */
  "16",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  17,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
  &xgene1_prefetch_tune
};
static const struct tune_params qdf24xx_tunings =
{
  &qdf24xx_extra_costs,
  &qdf24xx_addrcost_table,
  &qdf24xx_regmove_cost,
  &qdf24xx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4, /* memmov_cost  */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
  &qdf24xx_prefetch_tune
};

/* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
   for now.  */
static const struct tune_params saphira_tunings =
{
  &generic_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4, /* memmov_cost  */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params thunderx2t99_tunings =
{
  &thunderx2t99_extra_costs,
  &thunderx2t99_addrcost_table,
  &thunderx2t99_regmove_cost,
  &thunderx2t99_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4, /* memmov_cost.  */
  4, /* issue_rate.  */
  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  3,	/* int_reassoc_width.  */
  2,	/* fp_reassoc_width.  */
  2,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &thunderx2t99_prefetch_tune
};

static const struct tune_params neoversen1_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4, /* memmov_cost  */
  AARCH64_FUSE_AES_AESMC, /* fusible_ops  */
  "32:16",	/* function_align.  */
  "32:16",	/* jump_align.  */
  "32:16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  2,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char* name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);
static void aarch64_parse_sve_width_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
  aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { "sve_width", aarch64_parse_sve_width_string },
  { NULL, NULL }
};
/* A processor implementing AArch64.  */
struct processor
{
  const char *const name;
  enum aarch64_processor ident;
  enum aarch64_processor sched_core;
  enum aarch64_arch arch;
  unsigned architecture_version;
  const uint64_t flags;
  const struct tune_params *const tune;
};

/* Architectures implementing AArch64.  */
static const struct processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
   all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
   FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
   AARCH64_FL_FOR_ARCH8, &generic_tunings},
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};
/* Target specification.  These are populated by the -march, -mtune, -mcpu
   handling code or by target attributes.  */
static const struct processor *selected_arch;
static const struct processor *selected_cpu;
static const struct processor *selected_tune;

enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;

/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;

/* Table of machine attributes.  */
static const struct attribute_spec aarch64_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  { "aarch64_vector_pcs", 0, 0, false, true,  true,  true,  NULL, NULL },
  { NULL,                 0, 0, false, false, false, false, NULL, NULL }
};
#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)

/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
{
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;
};

typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
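/* The enumeration above pairs each condition with its inverse at the
   adjacent index, so flipping bit 0 inverts a condition.  For example:

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT  */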
struct aarch64_branch_protect_type
{
  /* The type's name that the user passes to the branch-protection option
     string.  */
  const char* name;
  /* Function to handle the protection type and set global variables.
     First argument is the string token corresponding with this type and the
     second argument is the next token in the option string.
     Return values:
     * AARCH64_PARSE_OK: Handling was successful.
     * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
       should print an error.
     * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
       own error.  */
  enum aarch64_parse_opt_result (*handler)(char*, char*);
  /* A list of types that can follow this type in the option string.  */
  const aarch64_branch_protect_type* subtypes;
  unsigned int num_subtypes;
};
static enum aarch64_parse_opt_result
aarch64_handle_no_branch_protection (char* str, char* rest)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
  aarch64_enable_bti = 0;
  if (rest)
    {
      error ("unexpected %<%s%> after %<%s%>", rest, str);
      return AARCH64_PARSE_INVALID_FEATURE;
    }
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_standard_branch_protection (char* str, char* rest)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
  aarch64_enable_bti = 1;
  if (rest)
    {
      error ("unexpected %<%s%> after %<%s%>", rest, str);
      return AARCH64_PARSE_INVALID_FEATURE;
    }
  return AARCH64_PARSE_OK;
}
static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
				   char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
			     char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
			      char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_key = AARCH64_KEY_B;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
			       char* rest ATTRIBUTE_UNUSED)
{
  aarch64_enable_bti = 1;
  return AARCH64_PARSE_OK;
}
static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
  { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
  { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
  { NULL, NULL, NULL, 0 }
};

static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
  { "none", aarch64_handle_no_branch_protection, NULL, 0 },
  { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
  { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
    ARRAY_SIZE (aarch64_pac_ret_subtypes) },
  { "bti", aarch64_handle_bti_protection, NULL, 0 },
  { NULL, NULL, NULL, 0 }
};
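/* As an illustration of how these tables are consumed: an option such as
   -mbranch-protection=pac-ret+leaf first matches the "pac-ret" entry,
   whose handler enables return-address signing for non-leaf functions,
   and then the "leaf" entry in aarch64_pac_ret_subtypes, whose handler
   widens the signing scope to all functions.  The walk itself is done by
   aarch64_parse_branch_protection, declared earlier in this file.  */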
/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};

/* The preferred condition codes for SVE conditions.  */
static const char *const aarch64_sve_condition_codes[] =
{
  "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
  "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
};
/* Return the assembly token for svpattern value VALUE.  */

static const char *
svpattern_token (enum aarch64_svpattern pattern)
{
  switch (pattern)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPATTERN (CASE)
#undef CASE
    case AARCH64_NUM_SVPATTERNS:
      break;
    }
  gcc_unreachable ();
}
/* Return the descriptor of the SIMD ABI.  */

static const predefined_function_abi &
aarch64_simd_abi (void)
{
  predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
  if (!simd_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
	if (FP_SIMD_SAVED_REGNUM_P (regno))
	  CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
    }
  return simd_abi;
}
/* Generate code to enable conditional branches in functions over 1 MiB.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
  rtx_code_label * tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[128];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
  return "";
}
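/* Illustrative output (register and label names invented): when the caller
   passes an inverted short branch such as "cbnz\tx0, " as BRANCH_FORMAT,
   the emitted sequence looks roughly like

	cbnz	x0, .Ltmp	// short-range branch over the jump
	b	.Ldest		// unconditional branch reaches +/- 128 MiB
   .Ltmp:

   so the conditional test stays within its limited branch range while the
   real destination is reached through the unconditional branch.  */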
void
aarch64_err_no_fpadvsimd (machine_mode mode)
{
  if (TARGET_GENERAL_REGS_ONLY)
    if (FLOAT_MODE_P (mode))
      error ("%qs is incompatible with the use of floating-point types",
	     "-mgeneral-regs-only");
    else
      error ("%qs is incompatible with the use of vector types",
	     "-mgeneral-regs-only");
  else
    if (FLOAT_MODE_P (mode))
      error ("%qs feature modifier is incompatible with the use of"
	     " floating-point types", "+nofp");
    else
      error ("%qs feature modifier is incompatible with the use of"
	     " vector types", "+nofp");
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
   GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
   higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
   and GENERAL_REGS is lower than the memory cost (in this case the best class
   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
   cost results in bad allocations with many redundant int<->FP moves which
   are expensive on various cores.
   To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
   force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
   if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
   POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.  */

static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  machine_mode mode;

  if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
      || !reg_class_subset_p (FP_REGS, allocno_class))
    return allocno_class;

  if (!reg_class_subset_p (GENERAL_REGS, best_class)
      || !reg_class_subset_p (FP_REGS, best_class))
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}
static unsigned int
aarch64_min_divisions_for_recip_mul (machine_mode mode)
{
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
}

/* Return the reassociation width of treeop OPC with mode MODE.  */
static int
aarch64_reassociation_width (unsigned opc, machine_mode mode)
{
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  /* Avoid reassociating floating point addition so we emit more FMAs.  */
  if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
    return aarch64_tune_params.fp_reassoc_width;
  return 1;
}
/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
unsigned
aarch64_dbx_register_number (unsigned regno)
{
   if (GP_REGNUM_P (regno))
     return AARCH64_DWARF_R0 + regno - R0_REGNUM;
   else if (regno == SP_REGNUM)
     return AARCH64_DWARF_SP;
   else if (FP_REGNUM_P (regno))
     return AARCH64_DWARF_V0 + regno - V0_REGNUM;
   else if (PR_REGNUM_P (regno))
     return AARCH64_DWARF_P0 + regno - P0_REGNUM;
   else if (regno == VG_REGNUM)
     return AARCH64_DWARF_VG;

   /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
      equivalent DWARF register.  */
   return DWARF_FRAME_REGISTERS;
}
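/* For example, following the mapping above, general register x7 maps to
   AARCH64_DWARF_R0 + 7, vector register v3 to AARCH64_DWARF_V0 + 3, and
   SVE predicate register p2 to AARCH64_DWARF_P0 + 2; anything else
   (e.g. the condition flags) yields DWARF_FRAME_REGISTERS, meaning that
   there is no DWARF equivalent.  */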
/* If X is a CONST_DOUBLE, return its bit representation as a constant
   integer, otherwise return X unmodified.  */
static rtx
aarch64_bit_representation (rtx x)
{
  if (CONST_DOUBLE_P (x))
    x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
  return x;
}

/* Return true if MODE is any of the Advanced SIMD structure modes.  */
static bool
aarch64_advsimd_struct_mode_p (machine_mode mode)
{
  return (TARGET_SIMD
	  && (mode == OImode || mode == CImode || mode == XImode));
}

/* Return true if MODE is an SVE predicate mode.  */
static bool
aarch64_sve_pred_mode_p (machine_mode mode)
{
  return (TARGET_SVE
	  && (mode == VNx16BImode
	      || mode == VNx8BImode
	      || mode == VNx4BImode
	      || mode == VNx2BImode));
}
/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
   a structure of 2, 3 or 4 vectors.  */
const unsigned int VEC_STRUCT   = 8;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
/* Return a set of flags describing the vector properties of mode MODE.
   Ignore modes that are not supported by the current target.  */
static unsigned int
aarch64_classify_vector_mode (machine_mode mode)
{
  if (aarch64_advsimd_struct_mode_p (mode))
    return VEC_ADVSIMD | VEC_STRUCT;

  if (aarch64_sve_pred_mode_p (mode))
    return VEC_SVE_PRED;

  /* Make the decision based on the mode's enum value rather than its
     properties, so that we keep the correct classification regardless
     of -msve-vector-bits.  */
  switch (mode)
    {
    /* Single SVE vectors.  */
      return TARGET_SVE ? VEC_SVE_DATA : 0;

    /* x2 SVE vectors.  */
    /* x3 SVE vectors.  */
    /* x4 SVE vectors.  */
      return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;

    /* 64-bit Advanced SIMD vectors.  */
    /* ...E_V1DImode doesn't exist.  */
    /* 128-bit Advanced SIMD vectors.  */
      return TARGET_SIMD ? VEC_ADVSIMD : 0;

    default:
      return 0;
    }
}
/* Return true if MODE is any of the data vector modes, including
   structure modes.  */
static bool
aarch64_vector_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
}

/* Return true if MODE is any form of SVE mode, including predicates,
   vectors and structures.  */
bool
aarch64_sve_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
}

/* Return true if MODE is an SVE data vector mode; either a single vector
   or a structure of vectors.  */
static bool
aarch64_sve_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
}
/* Implement target hook TARGET_ARRAY_MODE.  */
static opt_machine_mode
aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
{
  if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
      && IN_RANGE (nelems, 2, 4))
    return mode_for_vector (GET_MODE_INNER (mode),
			    GET_MODE_NUNITS (mode) * nelems);

  return opt_machine_mode ();
}

/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
static bool
aarch64_array_mode_supported_p (machine_mode mode,
				unsigned HOST_WIDE_INT nelems)
{
  if (TARGET_SIMD
      && (AARCH64_VALID_SIMD_QREG_MODE (mode)
	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
      && (nelems >= 2 && nelems <= 4))
    return true;

  return false;
}
/* Return the SVE predicate mode to use for elements that have
   ELEM_NBYTES bytes, if such a mode exists.  */

opt_machine_mode
aarch64_sve_pred_mode (unsigned int elem_nbytes)
{
  if (TARGET_SVE)
    {
      if (elem_nbytes == 1)
	return VNx16BImode;
      if (elem_nbytes == 2)
	return VNx8BImode;
      if (elem_nbytes == 4)
	return VNx4BImode;
      if (elem_nbytes == 8)
	return VNx2BImode;
    }
  return opt_machine_mode ();
}

/* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */

static opt_machine_mode
aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
{
  if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
    {
      unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
      machine_mode pred_mode;
      if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
	return pred_mode;
    }

  return default_get_mask_mode (nunits, nbytes);
}
/* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE.  */

static opt_machine_mode
aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
{
  enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
			    ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
  machine_mode mode;
  FOR_EACH_MODE_IN_CLASS (mode, mclass)
    if (inner_mode == GET_MODE_INNER (mode)
	&& known_eq (nunits, GET_MODE_NUNITS (mode))
	&& aarch64_sve_data_mode_p (mode))
      return mode;
  return opt_machine_mode ();
}

/* Return the integer element mode associated with SVE mode MODE.  */

static scalar_int_mode
aarch64_sve_element_int_mode (machine_mode mode)
{
  unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
					       GET_MODE_NUNITS (mode));
  return int_mode_for_size (elt_bits, 0).require ();
}

/* Return the integer vector mode associated with SVE mode MODE.
   Unlike mode_for_int_vector, this can handle the case in which
   MODE is a predicate (and thus has a different total size).  */

static machine_mode
aarch64_sve_int_mode (machine_mode mode)
{
  scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
  return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
}
/* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
   prefer to use the first arithmetic operand as the else value if
   the else value doesn't matter, since that exactly matches the SVE
   destructive merging form.  For ternary operations we could either
   pick the first operand and use FMAD-like instructions or the last
   operand and use FMLA-like instructions; the latter seems more
   natural.  */

static tree
aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
{
  return nops == 3 ? ops[2] : ops[0];
}
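/* For example, when the vectorizer asks about a fused multiply-add
   a * b + c it passes NOPS == 3 with OPS == { a, b, c }, so the
   accumulator c is returned as the preferred else value; that matches
   the FMLA form, in which the accumulator register is the one that gets
   overwritten.  */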
/* Implement TARGET_HARD_REGNO_NREGS.  */

static unsigned int
aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
{
  /* ??? Logically we should only need to provide a value when
     HARD_REGNO_MODE_OK says that the combination is valid,
     but at the moment we need to handle all modes.  Just ignore
     any runtime parts for registers that can't store them.  */
  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
  switch (aarch64_regno_regclass (regno))
    {
    case FP_REGS:
    case FP_LO_REGS:
      if (aarch64_sve_data_mode_p (mode))
	return exact_div (GET_MODE_SIZE (mode),
			  BYTES_PER_SVE_VECTOR).to_constant ();
      return CEIL (lowest_size, UNITS_PER_VREG);
    default:
      return CEIL (lowest_size, UNITS_PER_WORD);
    }
}
/* Implement TARGET_HARD_REGNO_MODE_OK.  */

static bool
aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
{
  if (GET_MODE_CLASS (mode) == MODE_CC)
    return regno == CC_REGNUM;

  if (regno == VG_REGNUM)
    /* This must have the same size as _Unwind_Word.  */
    return mode == DImode;

  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags & VEC_SVE_PRED)
    return PR_REGNUM_P (regno);

  if (PR_REGNUM_P (regno))
    return false;

  if (regno == SP_REGNUM)
    /* The purpose of comparing with ptr_mode is to support the
       global register variable associated with the stack pointer
       register via the syntax of asm ("wsp") in ILP32.  */
    return mode == Pmode || mode == ptr_mode;

  if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
    return mode == Pmode;

  if (GP_REGNUM_P (regno))
    {
      if (known_le (GET_MODE_SIZE (mode), 8))
	return true;
      else if (known_le (GET_MODE_SIZE (mode), 16))
	return (regno & 1) == 0;
    }
  else if (FP_REGNUM_P (regno))
    {
      if (vec_flags & VEC_STRUCT)
	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
      else
	return !VECTOR_MODE_P (mode) || vec_flags != 0;
    }

  return false;
}
/* Implement TARGET_FNTYPE_ABI.  */

static const predefined_function_abi &
aarch64_fntype_abi (const_tree fntype)
{
  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
    return aarch64_simd_abi ();
  return default_function_abi;
}
/* Return true if this is a definition of a vectorized simd function.  */

static bool
aarch64_simd_decl_p (tree fndecl)
{
  tree fntype;

  if (fndecl == NULL)
    return false;
  fntype = TREE_TYPE (fndecl);
  if (fntype == NULL)
    return false;

  /* Functions with the aarch64_vector_pcs attribute use the simd ABI.  */
  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
    return true;

  return false;
}

/* Return the mode a register save/restore should use.  DImode for integer
   registers, DFmode for FP registers in non-SIMD functions (they only save
   the bottom half of a 128 bit register), or TFmode for FP registers in
   SIMD functions.  */

static machine_mode
aarch64_reg_save_mode (tree fndecl, unsigned regno)
{
  return GP_REGNUM_P (regno)
	   ? E_DImode
	   : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
}
/* Return true if the instruction is a call to a SIMD function, false
   if it is not a SIMD function or if we do not know anything about
   the function.  */

static bool
aarch64_simd_call_p (const rtx_insn *insn)
{
  rtx symbol;
  rtx call;
  tree fndecl;

  gcc_assert (CALL_P (insn));
  call = get_call_rtx_from (insn);
  symbol = XEXP (XEXP (call, 0), 0);
  if (GET_CODE (symbol) != SYMBOL_REF)
    return false;
  fndecl = SYMBOL_REF_DECL (symbol);
  if (!fndecl)
    return false;

  return aarch64_simd_decl_p (fndecl);
}

/* Implement TARGET_INSN_CALLEE_ABI.  */

const predefined_function_abi &
aarch64_insn_callee_abi (const rtx_insn *insn)
{
  if (aarch64_simd_call_p (insn))
    return aarch64_simd_abi ();
  return default_function_abi;
}
/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
   the lower 64 bits of a 128-bit register.  Tell the compiler the callee
   clobbers the top 64 bits when restoring the bottom 64 bits.  */

static bool
aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
					unsigned int regno,
					machine_mode mode)
{
  if (FP_REGNUM_P (regno))
    {
      bool simd_p = (abi_id == ARM_PCS_SIMD);
      poly_int64 per_register_size = GET_MODE_SIZE (mode);
      unsigned int nregs = hard_regno_nregs (regno, mode);
      if (nregs > 1)
	per_register_size = exact_div (per_register_size, nregs);
      return maybe_gt (per_register_size, simd_p ? 16 : 8);
    }
  return false;
}
/* Implement REGMODE_NATURAL_SIZE.  */
poly_uint64
aarch64_regmode_natural_size (machine_mode mode)
{
  /* The natural size for SVE data modes is one SVE data vector,
     and similarly for predicates.  We can't independently modify
     anything smaller than that.  */
  /* ??? For now, only do this for variable-width SVE registers.
     Doing it for constant-sized registers breaks lower-subreg.c.  */
  /* ??? And once that's fixed, we should probably have similar
     code for Advanced SIMD.  */
  if (!aarch64_sve_vg.is_constant ())
    {
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags & VEC_SVE_PRED)
	return BYTES_PER_SVE_PRED;
      if (vec_flags & VEC_SVE_DATA)
	return BYTES_PER_SVE_VECTOR;
    }
  return UNITS_PER_WORD;
}
/* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
machine_mode
aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
				     machine_mode mode)
{
  /* The predicate mode determines which bits are significant and
     which are "don't care".  Decreasing the number of lanes would
     lose data while increasing the number of lanes would make bits
     unnecessarily significant.  */
  if (PR_REGNUM_P (regno))
    return mode;
  if (known_ge (GET_MODE_SIZE (mode), 4))
    return mode;
  else
    return SImode;
}

/* Return true if I's bits are consecutive ones from the MSB.  */
bool
aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
{
  return exact_log2 (-i) != HOST_WIDE_INT_M1;
}
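/* Worked example: i == 0xffffffffffffff00 gives -i == 0x100, a power of
   two, so exact_log2 returns 8 (not HOST_WIDE_INT_M1) and the result is
   true.  By contrast i == 0x00ffffffffffffff gives -i == 0xff00000000000001,
   which is not a power of two, so the result is false.  */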
/* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
   that strcpy from constants will be faster.  */

static HOST_WIDE_INT
aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
{
  if (TREE_CODE (exp) == STRING_CST && !optimize_size)
    return MAX (align, BITS_PER_WORD);
  return align;
}

/* Return true if calls to DECL should be treated as
   long-calls (ie called via a register).  */
static bool
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
{
  return false;
}

/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (ie called via a register).  */
bool
aarch64_is_long_call_p (rtx sym)
{
  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
}
/* Return true if calls to symbol-ref SYM should not go through
   plt stubs.  */

bool
aarch64_is_noplt_call_p (rtx sym)
{
  const_tree decl = SYMBOL_REF_DECL (sym);

  if (flag_pic
      && decl
      && (!flag_plt
	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
      && !targetm.binds_local_p (decl))
    return true;

  return false;
}
/* Return true if the offsets to a zero/sign-extract operation
   represent an expression that matches an extend operation.  The
   operands represent the parameters from

     (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
bool
aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
				rtx extract_imm)
{
  HOST_WIDE_INT mult_val, extract_val;

  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
    return false;

  mult_val = INTVAL (mult_imm);
  extract_val = INTVAL (extract_imm);

  if (extract_val > 8
      && extract_val < GET_MODE_BITSIZE (mode)
      && exact_log2 (extract_val & ~7) > 0
      && (extract_val & 7) <= 4
      && mult_val == (1 << (extract_val & 7)))
    return true;

  return false;
}
/* Emit an insn that's a simple single-set.  Both the operands must be
   known to be valid.  */
inline static rtx_insn *
emit_set_insn (rtx x, rtx y)
{
  return emit_insn (gen_rtx_SET (x, y));
}
/* X and Y are two things to compare using CODE.  Emit the compare insn and
   return the rtx for register 0 in the proper mode.  */
rtx
aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
{
  machine_mode cmp_mode = GET_MODE (x);
  machine_mode cc_mode;
  rtx cc_reg;

  if (cmp_mode == TImode)
    {
      gcc_assert (code == NE);

      cc_mode = CC_NZmode;
      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);

      rtx x_lo = operand_subword (x, 0, 0, TImode);
      rtx y_lo = operand_subword (y, 0, 0, TImode);
      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));

      rtx x_hi = operand_subword (x, 1, 0, TImode);
      rtx y_hi = operand_subword (y, 1, 0, TImode);
      emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
			     gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
			     GEN_INT (AARCH64_EQ)));
    }
  else
    {
      cc_mode = SELECT_CC_MODE (code, x, y);
      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
    }
  return cc_reg;
}
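/* A sketch of what the TImode path above produces for a 128-bit
   inequality (register names are illustrative only): the low halves are
   compared first and the high halves only conditionally, roughly

	cmp	x0, x2
	ccmp	x1, x3, 0, eq

   leaving the EQ/NE result in the condition flags for the caller to
   branch on.  */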
/* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
rtx
aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
				  machine_mode y_mode)
{
  if (y_mode == E_QImode || y_mode == E_HImode)
    {
      if (CONST_INT_P (y))
	y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
      else
	{
	  rtx t, cc_reg;
	  machine_mode cc_mode;

	  t = gen_rtx_ZERO_EXTEND (SImode, y);
	  t = gen_rtx_COMPARE (CC_SWPmode, t, x);
	  cc_mode = CC_SWPmode;
	  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
	  emit_set_insn (cc_reg, t);
	  return cc_reg;
	}
    }

  if (!aarch64_plus_operand (y, y_mode))
    y = force_reg (y_mode, y);

  return aarch64_gen_compare_reg (code, x, y);
}
/* Build the SYMBOL_REF for __tls_get_addr.  */

static GTY(()) rtx tls_get_addr_libfunc;

rtx
aarch64_tls_get_addr (void)
{
  if (!tls_get_addr_libfunc)
    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
  return tls_get_addr_libfunc;
}

/* Return the TLS model to use for ADDR.  */

static enum tls_model
tls_symbolic_operand_type (rtx addr)
{
  enum tls_model tls_kind = TLS_MODEL_NONE;
  if (GET_CODE (addr) == CONST)
    {
      poly_int64 addend;
      rtx sym = strip_offset (addr, &addend);
      if (GET_CODE (sym) == SYMBOL_REF)
	tls_kind = SYMBOL_REF_TLS_MODEL (sym);
    }
  else if (GET_CODE (addr) == SYMBOL_REF)
    tls_kind = SYMBOL_REF_TLS_MODEL (addr);

  return tls_kind;
}
/* We'll allow lo_sum's in addresses in our legitimate addresses
   so that combine would take care of combining addresses where
   necessary, but for generation purposes, we'll generate the address
   as:

   RTL                               Absolute
   tmp = hi (symbol_ref);            adrp  x1, foo
   dest = lo_sum (tmp, symbol_ref);  add   dest, x1, :lo_12:foo

   PIC                               TLS
   adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
   ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
				     bl   __tls_get_addr

   Load TLS symbol, depending on TLS mechanism and TLS access model.

   Global Dynamic - Traditional TLS:
   adrp tmp, :tlsgd:imm
   add  dest, tmp, #:tlsgd_lo12:imm
   bl   __tls_get_addr

   Global Dynamic - TLS Descriptors:
   adrp dest, :tlsdesc:imm
   ldr  tmp, [dest, #:tlsdesc_lo12:imm]
   add  dest, dest, #:tlsdesc_lo12:imm
   blr  tmp

   Initial Exec:
   mrs  tp, tpidr_el0
   adrp tmp, :gottprel:imm
   ldr  dest, [tmp, #:gottprel_lo12:imm]

   Local Exec:
   mrs  tp, tpidr_el0
   add  t0, tp, #:tprel_hi12:imm, lsl #12
   add  t0, t0, #:tprel_lo12_nc:imm  */
static void
aarch64_load_symref_appropriately (rtx dest, rtx imm,
				   enum aarch64_symbol_type type)
{
  switch (type)
    {
    case SYMBOL_SMALL_ABSOLUTE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode.  */
	rtx tmp_reg = dest;
	machine_mode mode = GET_MODE (dest);

	gcc_assert (mode == Pmode || mode == ptr_mode);

	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	emit_insn (gen_add_losym (dest, tmp_reg, imm));
	return;
      }

    case SYMBOL_TINY_ABSOLUTE:
      emit_insn (gen_rtx_SET (dest, imm));
      return;

2219 case SYMBOL_SMALL_GOT_28K
:
2221 machine_mode mode
= GET_MODE (dest
);
2222 rtx gp_rtx
= pic_offset_table_rtx
;
	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
	   here before rtl expand.  Tree IVOPT will generate rtl pattern to
	   decide rtx costs, in which case pic_offset_table_rtx is not
	   initialized.  For that case no need to generate the first adrp
	   instruction as the final cost for global variable access is
	   one instruction.  */
	if (gp_rtx != NULL)
	  {
	    /* -fpic for -mcmodel=small allows a 32K GOT table size (but we
	       are using the page base as the GOT base, so the first page may
	       be wasted; in the worst scenario there is only 28K space for
	       the GOT).

	       The generated instruction sequence for accessing a global
	       variable is:

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]

	       Only one instruction is needed.  But we must initialize
	       pic_offset_table_rtx properly.  We generate an initialization
	       insn for every global access, and allow CSE to remove all
	       redundant copies.

	       The final instruction sequence will look like the following
	       for multiple global variable accesses:

		 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
		 ...  */
2257 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
2258 crtl
->uses_pic_offset_table
= 1;
2259 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
2261 if (mode
!= GET_MODE (gp_rtx
))
2262 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
2266 if (mode
== ptr_mode
)
2269 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
2271 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
2273 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2277 gcc_assert (mode
== Pmode
);
2279 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
2280 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
	/* The operand is expected to be a MEM.  Whenever the related insn
	   pattern changes, the code above which calculates MEM should be
	   updated.  */
2286 gcc_assert (GET_CODE (mem
) == MEM
);
2287 MEM_READONLY_P (mem
) = 1;
2288 MEM_NOTRAP_P (mem
) = 1;
2293 case SYMBOL_SMALL_GOT_4G
:
      /* In ILP32, the mode of dest can be either SImode or DImode,
	 while the GOT entry is always of SImode size.  The mode of
	 dest depends on how dest is used: if dest is assigned to a
	 pointer (e.g. in the memory), it has SImode; it may have
	 DImode if dest is dereferenced to access the memory.
	 This is why we have to handle three different ldr_got_small
	 patterns here (two patterns for ILP32).  */
2306 machine_mode mode
= GET_MODE (dest
);
2308 if (can_create_pseudo_p ())
2309 tmp_reg
= gen_reg_rtx (mode
);
2311 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2312 if (mode
== ptr_mode
)
2315 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
2317 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
2319 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2323 gcc_assert (mode
== Pmode
);
2325 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
2326 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2329 gcc_assert (GET_CODE (mem
) == MEM
);
2330 MEM_READONLY_P (mem
) = 1;
2331 MEM_NOTRAP_P (mem
) = 1;
2336 case SYMBOL_SMALL_TLSGD
:
2339 machine_mode mode
= GET_MODE (dest
);
2340 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
2344 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
2346 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
2347 insns
= get_insns ();
2350 RTL_CONST_CALL_P (insns
) = 1;
2351 emit_libcall_block (insns
, dest
, result
, imm
);
2355 case SYMBOL_SMALL_TLSDESC
:
2357 machine_mode mode
= GET_MODE (dest
);
2358 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
2361 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2363 /* In ILP32, the got entry is always of SImode size. Unlike
2364 small GOT, the dest is fixed at reg 0. */
2366 emit_insn (gen_tlsdesc_small_si (imm
));
2368 emit_insn (gen_tlsdesc_small_di (imm
));
2369 tp
= aarch64_load_tp (NULL
);
2372 tp
= gen_lowpart (mode
, tp
);
2374 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
2376 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2380 case SYMBOL_SMALL_TLSIE
:
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the GOT entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
	   This is why we have to handle three different tlsie_small
	   patterns here (two patterns for ILP32).  */
2389 machine_mode mode
= GET_MODE (dest
);
2390 rtx tmp_reg
= gen_reg_rtx (mode
);
2391 rtx tp
= aarch64_load_tp (NULL
);
2393 if (mode
== ptr_mode
)
2396 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
2399 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
2400 tp
= gen_lowpart (mode
, tp
);
2405 gcc_assert (mode
== Pmode
);
2406 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
2409 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
2411 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2415 case SYMBOL_TLSLE12
:
2416 case SYMBOL_TLSLE24
:
2417 case SYMBOL_TLSLE32
:
2418 case SYMBOL_TLSLE48
:
2420 machine_mode mode
= GET_MODE (dest
);
2421 rtx tp
= aarch64_load_tp (NULL
);
2424 tp
= gen_lowpart (mode
, tp
);
2428 case SYMBOL_TLSLE12
:
2429 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
2432 case SYMBOL_TLSLE24
:
2433 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
2436 case SYMBOL_TLSLE32
:
2437 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
2439 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2442 case SYMBOL_TLSLE48
:
2443 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
2445 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2453 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2457 case SYMBOL_TINY_GOT
:
2458 emit_insn (gen_ldr_got_tiny (dest
, imm
));
2461 case SYMBOL_TINY_TLSIE
:
2463 machine_mode mode
= GET_MODE (dest
);
2464 rtx tp
= aarch64_load_tp (NULL
);
2466 if (mode
== ptr_mode
)
2469 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
2472 tp
= gen_lowpart (mode
, tp
);
2473 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
2478 gcc_assert (mode
== Pmode
);
2479 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
2483 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
/* Emit a move from SRC to DEST.  Assume that the move expanders can
   handle all moves if !can_create_pseudo_p ().  The distinction is
   important because, unlike emit_move_insn, the move expanders know
   how to force Pmode objects into the constant pool even when the
   constant pool address is not itself legitimate.  */
static rtx_insn *
aarch64_emit_move (rtx dest, rtx src)
{
  return (can_create_pseudo_p ()
	  ? emit_move_insn (dest, src)
	  : emit_move_insn_1 (dest, src));
}

/* Apply UNOPTAB to OP and store the result in DEST.  */

static void
aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
{
  rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}

/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */

static void
aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
{
  rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
			  OPTAB_DIRECT);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}
/* Split a 128-bit move operation into two 64-bit move operations,
   taking care to handle partial overlap of register to register
   copies.  Special cases are needed when moving between GP regs and
   FP regs.  SRC can be a register, constant or memory; DST a register
   or memory.  If either operand is memory it must not have any side
   effects.  */
void
aarch64_split_128bit_move (rtx dst, rtx src)
{
  rtx dst_lo, dst_hi;
  rtx src_lo, src_hi;

  machine_mode mode = GET_MODE (dst);

  gcc_assert (mode == TImode || mode == TFmode);
  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);

  if (REG_P (dst) && REG_P (src))
    {
      int src_regno = REGNO (src);
      int dst_regno = REGNO (dst);

      /* Handle FP <-> GP regs.  */
      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
	{
	  src_lo = gen_lowpart (word_mode, src);
	  src_hi = gen_highpart (word_mode, src);

	  emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
	  emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
	  return;
	}
      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
	{
	  dst_lo = gen_lowpart (word_mode, dst);
	  dst_hi = gen_highpart (word_mode, dst);

	  emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
	  emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
	  return;
	}
    }

  dst_lo = gen_lowpart (word_mode, dst);
  dst_hi = gen_highpart (word_mode, dst);
  src_lo = gen_lowpart (word_mode, src);
  src_hi = gen_highpart_mode (word_mode, mode, src);

  /* At most one pairing may overlap.  */
  if (reg_overlap_mentioned_p (dst_lo, src_hi))
    {
      aarch64_emit_move (dst_hi, src_hi);
      aarch64_emit_move (dst_lo, src_lo);
    }
  else
    {
      aarch64_emit_move (dst_lo, src_lo);
      aarch64_emit_move (dst_hi, src_hi);
    }
}
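/* Overlap example (illustrative, not part of the original source): when
   splitting a TImode copy whose source is the register pair {x0, x1} and
   whose destination is {x1, x2}, dst_lo is x1 and src_hi is also x1, so the
   high halves are moved first (x2 = x1) and only then the low halves
   (x1 = x0), preserving the value.  */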
/* Return true if we should split a move from 128-bit value SRC
   to 128-bit register DEST.  */
bool
aarch64_split_128bit_move_p (rtx dst, rtx src)
{
  return (! REG_P (src)
	  || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
}

/* Split a complex SIMD combine.  */

void
aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
{
  machine_mode src_mode = GET_MODE (src1);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));
  gcc_assert (register_operand (dst, dst_mode)
	      && register_operand (src1, src_mode)
	      && register_operand (src2, src_mode));

  emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
}

/* Split a complex SIMD move.  */

void
aarch64_split_simd_move (rtx dst, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src))
    {
      gcc_assert (VECTOR_MODE_P (src_mode));
      emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
    }
}
bool
aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
			      machine_mode ymode, rtx y)
{
  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
  gcc_assert (r != NULL);
  return rtx_equal_p (x, r);
}

/* Return TARGET if it is nonnull and a register of mode MODE.
   Otherwise, return a fresh register of mode MODE if we can,
   or TARGET reinterpreted as MODE if we can't.  */

static rtx
aarch64_target_reg (rtx target, machine_mode mode)
{
  if (target && REG_P (target) && GET_MODE (target) == mode)
    return target;
  if (!can_create_pseudo_p ())
    {
      gcc_assert (target);
      return gen_lowpart (mode, target);
    }
  return gen_reg_rtx (mode);
}

/* Return a register that contains the constant in BUILDER, given that
   the constant is a legitimate move operand.  Use TARGET as the register
   if it is nonnull and convenient.  */

static rtx
aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
{
  rtx src = builder.build ();
  target = aarch64_target_reg (target, GET_MODE (src));
  emit_insn (gen_rtx_SET (target, src));
  return target;
}

static rtx
aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
{
  if (can_create_pseudo_p ())
    return force_reg (mode, value);
  else
    {
      gcc_assert (x);
      aarch64_emit_move (x, value);
      return x;
    }
}
/* Return true if predicate value X is a constant in which every element
   is a CONST_INT.  When returning true, describe X in BUILDER as a VNx16BI
   value, i.e. as a predicate in which all bits are significant.  */

static bool
aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
{
  if (GET_CODE (x) != CONST_VECTOR)
    return false;

  unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
					      GET_MODE_NUNITS (GET_MODE (x)));
  unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
  unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
  builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);

  unsigned int nelts = const_vector_encoded_nelts (x);
  for (unsigned int i = 0; i < nelts; ++i)
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
      if (!CONST_INT_P (elt))
	return false;

      builder.quick_push (elt);
      for (unsigned int j = 1; j < factor; ++j)
	builder.quick_push (const0_rtx);
    }
  builder.finalize ();
  return true;
}
/* BUILDER contains a predicate constant of mode VNx16BI.  Return the
   widest predicate element size it can have (that is, the largest size
   for which each element would still be 0 or 1).  */

unsigned int
aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
{
  /* Start with the most optimistic assumption: that we only need
     one bit per pattern.  This is what we will use if only the first
     bit in each pattern is ever set.  */
  unsigned int mask = GET_MODE_SIZE (DImode);
  mask |= builder.npatterns ();

  /* Look for set bits.  */
  unsigned int nelts = builder.encoded_nelts ();
  for (unsigned int i = 1; i < nelts; ++i)
    if (INTVAL (builder.elt (i)) != 0)
      {
	if (i & 1)
	  return 1;
	mask |= i;
      }
  return mask & -mask;
}
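/* Worked example (illustrative, not part of the original source): with
   builder.npatterns () == 4 and nonzero elements only at indices 0, 4 and 8,
   MASK accumulates 8 | 4 | 4 | 8 == 12 and the function returns 4: the
   constant is representable with one significant bit per 4-byte element,
   i.e. as a .S predicate.  */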
2737 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2738 that the constant would have with predicate element size ELT_SIZE
2739 (ignoring the upper bits in each element) and return:
2741 * -1 if all bits are set
2742 * N if the predicate has N leading set bits followed by all clear bits
2743 * 0 if the predicate does not have any of these forms. */
2746 aarch64_partial_ptrue_length (rtx_vector_builder
&builder
,
2747 unsigned int elt_size
)
2749 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2750 followed by set bits. */
2751 if (builder
.nelts_per_pattern () == 3)
2754 /* Skip over leading set bits. */
2755 unsigned int nelts
= builder
.encoded_nelts ();
2757 for (; i
< nelts
; i
+= elt_size
)
2758 if (INTVAL (builder
.elt (i
)) == 0)
2760 unsigned int vl
= i
/ elt_size
;
2762 /* Check for the all-true case. */
2766 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2767 repeating pattern of set bits followed by clear bits. */
2768 if (builder
.nelts_per_pattern () != 2)
2771 /* We have a "foreground" value and a duplicated "background" value.
2772 If the background might repeat and the last set bit belongs to it,
2773 we might have set bits followed by clear bits followed by set bits. */
2774 if (i
> builder
.npatterns () && maybe_ne (nelts
, builder
.full_nelts ()))
2777 /* Make sure that the rest are all clear. */
2778 for (; i
< nelts
; i
+= elt_size
)
2779 if (INTVAL (builder
.elt (i
)) != 0)
2785 /* See if there is an svpattern that encodes an SVE predicate of mode
2786 PRED_MODE in which the first VL bits are set and the rest are clear.
2787 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2788 A VL of -1 indicates an all-true vector. */
2791 aarch64_svpattern_for_vl (machine_mode pred_mode
, int vl
)
2794 return AARCH64_SV_ALL
;
2796 if (maybe_gt (vl
, GET_MODE_NUNITS (pred_mode
)))
2797 return AARCH64_NUM_SVPATTERNS
;
2799 if (vl
>= 1 && vl
<= 8)
2800 return aarch64_svpattern (AARCH64_SV_VL1
+ (vl
- 1));
2802 if (vl
>= 16 && vl
<= 256 && pow2p_hwi (vl
))
2803 return aarch64_svpattern (AARCH64_SV_VL16
+ (exact_log2 (vl
) - 4));
2806 if (GET_MODE_NUNITS (pred_mode
).is_constant (&max_vl
))
2808 if (vl
== (max_vl
/ 3) * 3)
2809 return AARCH64_SV_MUL3
;
2810 /* These would only trigger for non-power-of-2 lengths. */
2811 if (vl
== (max_vl
& -4))
2812 return AARCH64_SV_MUL4
;
2813 if (vl
== (1 << floor_log2 (max_vl
)))
2814 return AARCH64_SV_POW2
;
2816 return AARCH64_SV_ALL
;
2818 return AARCH64_NUM_SVPATTERNS
;
/* Return a VNx16BImode constant in which every sequence of ELT_SIZE
   bits has the lowest bit set and the upper bits clear.  This is the
   VNx16BImode equivalent of a PTRUE for controlling elements of
   ELT_SIZE bytes.  However, because the constant is VNx16BImode,
   all bits are significant, even the upper zeros.  */

rtx
aarch64_ptrue_all (unsigned int elt_size)
{
  rtx_vector_builder builder (VNx16BImode, elt_size, 1);
  builder.quick_push (const1_rtx);
  for (unsigned int i = 1; i < elt_size; ++i)
    builder.quick_push (const0_rtx);
  return builder.build ();
}
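/* For example (illustrative, not part of the original source),
   aarch64_ptrue_all (4) builds the repeating VNx16BI pattern
   { 1, 0, 0, 0, 1, 0, 0, 0, ... }, i.e. the bit image of "ptrue p0.s, all"
   with every byte of the predicate significant.  */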
/* Return an all-true predicate register of mode MODE.  */

rtx
aarch64_ptrue_reg (machine_mode mode)
{
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
  rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}

/* Return an all-false predicate register of mode MODE.  */

rtx
aarch64_pfalse_reg (machine_mode mode)
{
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
  rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}
2857 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2858 true, or alternatively if we know that the operation predicated by
2859 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
2860 aarch64_sve_gp_strictness operand that describes the operation
2861 predicated by PRED1[0]. */
2864 aarch64_sve_pred_dominates_p (rtx
*pred1
, rtx pred2
)
2866 machine_mode mode
= GET_MODE (pred2
);
2867 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
2868 && mode
== GET_MODE (pred1
[0])
2869 && aarch64_sve_gp_strictness (pred1
[1], SImode
));
2870 return (pred1
[0] == CONSTM1_RTX (mode
)
2871 || INTVAL (pred1
[1]) == SVE_RELAXED_GP
2872 || rtx_equal_p (pred1
[0], pred2
));
2875 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2876 for it. PRED2[0] is the predicate for the instruction whose result
2877 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2878 for it. Return true if we can prove that the two predicates are
2879 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2880 with PRED1[0] without changing behavior. */
2883 aarch64_sve_same_pred_for_ptest_p (rtx
*pred1
, rtx
*pred2
)
2885 machine_mode mode
= GET_MODE (pred1
[0]);
2886 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
2887 && mode
== GET_MODE (pred2
[0])
2888 && aarch64_sve_ptrue_flag (pred1
[1], SImode
)
2889 && aarch64_sve_ptrue_flag (pred2
[1], SImode
));
2891 bool ptrue1_p
= (pred1
[0] == CONSTM1_RTX (mode
)
2892 || INTVAL (pred1
[1]) == SVE_KNOWN_PTRUE
);
2893 bool ptrue2_p
= (pred2
[0] == CONSTM1_RTX (mode
)
2894 || INTVAL (pred2
[1]) == SVE_KNOWN_PTRUE
);
2895 return (ptrue1_p
&& ptrue2_p
) || rtx_equal_p (pred1
[0], pred2
[0]);
2898 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
2899 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2900 Use TARGET as the target register if nonnull and convenient. */
2903 aarch64_sve_emit_int_cmp (rtx target
, machine_mode pred_mode
, rtx_code cmp
,
2904 machine_mode data_mode
, rtx op1
, rtx op2
)
2906 insn_code icode
= code_for_aarch64_pred_cmp (cmp
, data_mode
);
2907 expand_operand ops
[5];
2908 create_output_operand (&ops
[0], target
, pred_mode
);
2909 create_input_operand (&ops
[1], CONSTM1_RTX (pred_mode
), pred_mode
);
2910 create_integer_operand (&ops
[2], SVE_KNOWN_PTRUE
);
2911 create_input_operand (&ops
[3], op1
, data_mode
);
2912 create_input_operand (&ops
[4], op2
, data_mode
);
2913 expand_insn (icode
, 5, ops
);
2914 return ops
[0].value
;
2917 /* Use a comparison to convert integer vector SRC into MODE, which is
2918 the corresponding SVE predicate mode. Use TARGET for the result
2919 if it's nonnull and convenient. */
2922 aarch64_convert_sve_data_to_pred (rtx target
, machine_mode mode
, rtx src
)
2924 machine_mode src_mode
= GET_MODE (src
);
2925 return aarch64_sve_emit_int_cmp (target
, mode
, NE
, src_mode
,
2926 src
, CONST0_RTX (src_mode
));
/* Return true if we can move VALUE into a register using a single
   CNT[BHWD] instruction.  */

static bool
aarch64_sve_cnt_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
  return (value.coeffs[1] == factor
	  && IN_RANGE (factor, 2, 16 * 16)
	  && (factor & 1) == 0
	  && factor <= 16 * (factor & -factor));
}

/* Likewise for rtx X.  */

bool
aarch64_sve_cnt_immediate_p (rtx x)
{
  poly_int64 value;
  return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
}
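/* Worked example (illustrative, not part of the original source):
   poly_int64 (8, 8) means 8 + 8 * (VQ - 1) elements, which is exactly what
   CNTH returns, so it passes the check above.  poly_int64 (48, 48) also
   passes (CNTB with MUL #3, or CNTH with MUL #6), whereas poly_int64 (34, 34)
   fails because 34 > 16 * 2: the only element size that divides it (D)
   would need a multiplier greater than 16.  */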
2952 /* Return the asm string for an instruction with a CNT-like vector size
2953 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2954 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2955 first part of the operands template (the part that comes before the
2956 vector size itself). PATTERN is the pattern to use. FACTOR is the
2957 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
2958 in each quadword. If it is zero, we can use any element size. */
2961 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2962 aarch64_svpattern pattern
,
2963 unsigned int factor
,
2964 unsigned int nelts_per_vq
)
2966 static char buffer
[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
  if (nelts_per_vq == 0)
    /* There is some overlap in the ranges of the four CNT instructions.
       Here we always use the smallest possible element size, so that the
       multiplier is 1 wherever possible.  */
    nelts_per_vq = factor & -factor;
2973 int shift
= std::min (exact_log2 (nelts_per_vq
), 4);
2974 gcc_assert (IN_RANGE (shift
, 1, 4));
2975 char suffix
= "dwhb"[shift
- 1];
2978 unsigned int written
;
2979 if (pattern
== AARCH64_SV_ALL
&& factor
== 1)
2980 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s",
2981 prefix
, suffix
, operands
);
2982 else if (factor
== 1)
2983 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, %s",
2984 prefix
, suffix
, operands
, svpattern_token (pattern
));
2986 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, %s, mul #%d",
2987 prefix
, suffix
, operands
, svpattern_token (pattern
),
2989 gcc_assert (written
< sizeof (buffer
));
2993 /* Return the asm string for an instruction with a CNT-like vector size
2994 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2995 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2996 first part of the operands template (the part that comes before the
2997 vector size itself). X is the value of the vector size operand,
2998 as a polynomial integer rtx; we need to convert this into an "all"
2999 pattern with a multiplier. */
3002 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
3005 poly_int64 value
= rtx_to_poly_int64 (x
);
3006 gcc_assert (aarch64_sve_cnt_immediate_p (value
));
3007 return aarch64_output_sve_cnt_immediate (prefix
, operands
, AARCH64_SV_ALL
,
3008 value
.coeffs
[1], 0);
/* Return true if we can add X using a single SVE INC or DEC instruction.  */

bool
aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && (aarch64_sve_cnt_immediate_p (value)
	      || aarch64_sve_cnt_immediate_p (-value)));
}

/* Return the asm string for adding SVE INC/DEC immediate OFFSET
   to register operand 0.  */

char *
aarch64_output_sve_scalar_inc_dec (rtx offset)
{
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
  if (offset_value.coeffs[1] > 0)
    return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
					     offset_value.coeffs[1], 0);
  else
    return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
					     -offset_value.coeffs[1], 0);
}
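/* For example (illustrative, not part of the original source), an OFFSET of
   poly_int64 (4, 4) prints as "incw\t%x0" (add the number of 32-bit elements
   in a vector), while poly_int64 (-32, -32) prints as
   "decb\t%x0, all, mul #2".  */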
/* Return true if we can add VALUE to a register using a single ADDVL
   or ADDPL instruction.  */

static bool
aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  if (factor == 0 || value.coeffs[1] != factor)
    return false;
  /* FACTOR counts VG / 2, so a value of 2 is one predicate width
     and a value of 16 is one vector width.  */
  return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
	  || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
}

/* Likewise for rtx X.  */

bool
aarch64_sve_addvl_addpl_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && aarch64_sve_addvl_addpl_immediate_p (value));
}

/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
   to operand 1 and storing the result in operand 0.  */

char *
aarch64_output_sve_addvl_addpl (rtx offset)
{
  static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));

  int factor = offset_value.coeffs[1];
  if ((factor & 15) == 0)
    snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
  else
    snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
  return buffer;
}
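/* Worked example (illustrative, not part of the original source): an offset
   of poly_int64 (16, 16) is one full vector length and, with operands x0/x1,
   is emitted as "addvl x0, x1, #1"; poly_int64 (6, 6) is three predicate
   lengths and becomes "addpl x0, x1, #3".  poly_int64 (512, 512) is rejected,
   since 32 vector lengths is outside the ADDVL range of [-32, 31].  */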
3081 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3082 instruction. If it is, store the number of elements in each vector
3083 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3084 factor in *FACTOR_OUT (if nonnull). */
3087 aarch64_sve_vector_inc_dec_immediate_p (rtx x
, int *factor_out
,
3088 unsigned int *nelts_per_vq_out
)
3093 if (!const_vec_duplicate_p (x
, &elt
)
3094 || !poly_int_rtx_p (elt
, &value
))
3097 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x
));
3098 if (nelts_per_vq
!= 8 && nelts_per_vq
!= 4 && nelts_per_vq
!= 2)
3099 /* There's no vector INCB. */
3102 HOST_WIDE_INT factor
= value
.coeffs
[0];
3103 if (value
.coeffs
[1] != factor
)
3106 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3107 if ((factor
% nelts_per_vq
) != 0
3108 || !IN_RANGE (abs (factor
), nelts_per_vq
, 16 * nelts_per_vq
))
3112 *factor_out
= factor
;
3113 if (nelts_per_vq_out
)
3114 *nelts_per_vq_out
= nelts_per_vq
;
3118 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3122 aarch64_sve_vector_inc_dec_immediate_p (rtx x
)
3124 return aarch64_sve_vector_inc_dec_immediate_p (x
, NULL
, NULL
);
3127 /* Return the asm template for an SVE vector INC or DEC instruction.
3128 OPERANDS gives the operands before the vector count and X is the
3129 value of the vector count operand itself. */
3132 aarch64_output_sve_vector_inc_dec (const char *operands
, rtx x
)
3135 unsigned int nelts_per_vq
;
3136 if (!aarch64_sve_vector_inc_dec_immediate_p (x
, &factor
, &nelts_per_vq
))
3139 return aarch64_output_sve_cnt_immediate ("dec", operands
, AARCH64_SV_ALL
,
3140 -factor
, nelts_per_vq
);
3142 return aarch64_output_sve_cnt_immediate ("inc", operands
, AARCH64_SV_ALL
,
3143 factor
, nelts_per_vq
);
3147 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
3148 scalar_int_mode mode
)
3151 unsigned HOST_WIDE_INT val
, val2
, mask
;
3152 int one_match
, zero_match
;
3157 if (aarch64_move_imm (val
, mode
))
3160 emit_insn (gen_rtx_SET (dest
, imm
));
3164 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3165 (with XXXX non-zero). In that case check to see if the move can be done in
3167 val2
= val
& 0xffffffff;
3169 && aarch64_move_imm (val2
, SImode
)
3170 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
3173 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
3175 /* Check if we have to emit a second instruction by checking to see
3176 if any of the upper 32 bits of the original DI mode value is set. */
3180 i
= (val
>> 48) ? 48 : 32;
3183 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3184 GEN_INT ((val
>> i
) & 0xffff)));
3189 if ((val
>> 32) == 0 || mode
== SImode
)
3193 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
3195 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
3196 GEN_INT ((val
>> 16) & 0xffff)));
3198 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
3199 GEN_INT ((val
>> 16) & 0xffff)));
3204 /* Remaining cases are all for DImode. */
3207 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
3208 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
3209 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
3210 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
3212 if (zero_match
!= 2 && one_match
!= 2)
3214 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3215 For a 64-bit bitmask try whether changing 16 bits to all ones or
3216 zeroes creates a valid bitmask. To check any repeated bitmask,
3217 try using 16 bits from the other 32-bit half of val. */
3219 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
3222 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3225 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3227 val2
= val2
& ~mask
;
3228 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
3229 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3236 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
3237 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3238 GEN_INT ((val
>> i
) & 0xffff)));
3244 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3245 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3246 otherwise skip zero bits. */
3250 val2
= one_match
> zero_match
? ~val
: val
;
3251 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
3254 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
3255 ? (val
| ~(mask
<< i
))
3256 : (val
& (mask
<< i
)))));
3257 for (i
+= 16; i
< 64; i
+= 16)
3259 if ((val2
& (mask
<< i
)) == 0)
3262 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3263 GEN_INT ((val
>> i
) & 0xffff)));
/* Return whether imm is a 128-bit immediate which is simple enough to
   expand inline.  */
bool
aarch64_mov128_immediate (rtx imm)
{
  if (GET_CODE (imm) == CONST_INT)
    return true;

  gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);

  rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
  rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));

  return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
	 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
}


/* Return the number of temporary registers that aarch64_add_offset_1
   would need to add OFFSET to a register.  */

static unsigned int
aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
{
  return abs_hwi (offset) < 0x1000000 ? 0 : 1;
}
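/* For instance (illustrative, not part of the original source), an offset of
   0x12340 needs no temporary: it can be added as "add sp, sp, #0x12000"
   followed by "add sp, sp, #0x340".  An offset of 0x2000000 does not fit in
   24 bits, so a scratch register is needed to hold the immediate for a
   register-register add.  */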
3297 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3298 a non-polynomial OFFSET. MODE is the mode of the addition.
3299 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3300 be set and CFA adjustments added to the generated instructions.
3302 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3303 temporary if register allocation is already complete. This temporary
3304 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3305 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3306 the immediate again.
3308 Since this function may be used to adjust the stack pointer, we must
3309 ensure that it cannot cause transient stack deallocation (for example
3310 by first incrementing SP and then decrementing when adjusting by a
3311 large immediate). */
3314 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
3315 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
3316 bool frame_related_p
, bool emit_move_imm
)
3318 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3319 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
3321 HOST_WIDE_INT moffset
= abs_hwi (offset
);
3326 if (!rtx_equal_p (dest
, src
))
3328 insn
= emit_insn (gen_rtx_SET (dest
, src
));
3329 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3334 /* Single instruction adjustment. */
3335 if (aarch64_uimm12_shift (moffset
))
3337 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
3338 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3342 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3345 a) the offset cannot be loaded by a 16-bit move or
3346 b) there is no spare register into which we can move it. */
3347 if (moffset
< 0x1000000
3348 && ((!temp1
&& !can_create_pseudo_p ())
3349 || !aarch64_move_imm (moffset
, mode
)))
3351 HOST_WIDE_INT low_off
= moffset
& 0xfff;
3353 low_off
= offset
< 0 ? -low_off
: low_off
;
3354 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
3355 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3356 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
3357 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3361 /* Emit a move immediate if required and an addition/subtraction. */
3364 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
3365 temp1
= aarch64_force_temporary (mode
, temp1
, GEN_INT (moffset
));
3367 insn
= emit_insn (offset
< 0
3368 ? gen_sub3_insn (dest
, src
, temp1
)
3369 : gen_add3_insn (dest
, src
, temp1
));
3370 if (frame_related_p
)
3372 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3373 rtx adj
= plus_constant (mode
, src
, offset
);
3374 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
3378 /* Return the number of temporary registers that aarch64_add_offset
3379 would need to move OFFSET into a register or add OFFSET to a register;
3380 ADD_P is true if we want the latter rather than the former. */
3383 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
3385 /* This follows the same structure as aarch64_add_offset. */
3386 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
3389 unsigned int count
= 0;
3390 HOST_WIDE_INT factor
= offset
.coeffs
[1];
3391 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
3392 poly_int64
poly_offset (factor
, factor
);
3393 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
3394 /* Need one register for the ADDVL/ADDPL result. */
3396 else if (factor
!= 0)
3398 factor
= abs (factor
);
3399 if (factor
> 16 * (factor
& -factor
))
3400 /* Need one register for the CNT result and one for the multiplication
3401 factor. If necessary, the second temporary can be reused for the
3402 constant part of the offset. */
3404 /* Need one register for the CNT result (which might then
3408 return count
+ aarch64_add_offset_1_temporaries (constant
);
3411 /* If X can be represented as a poly_int64, return the number
3412 of temporaries that are required to add it to a register.
3413 Return -1 otherwise. */
3416 aarch64_add_offset_temporaries (rtx x
)
3419 if (!poly_int_rtx_p (x
, &offset
))
3421 return aarch64_offset_temporaries (true, offset
);
3424 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3425 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3426 be set and CFA adjustments added to the generated instructions.
3428 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3429 temporary if register allocation is already complete. This temporary
3430 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3431 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3432 false to avoid emitting the immediate again.
3434 TEMP2, if nonnull, is a second temporary register that doesn't
3435 overlap either DEST or REG.
3437 Since this function may be used to adjust the stack pointer, we must
3438 ensure that it cannot cause transient stack deallocation (for example
3439 by first incrementing SP and then decrementing when adjusting by a
3440 large immediate). */
3443 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
3444 poly_int64 offset
, rtx temp1
, rtx temp2
,
3445 bool frame_related_p
, bool emit_move_imm
= true)
3447 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3448 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
3449 gcc_assert (temp1
== NULL_RTX
3451 || !reg_overlap_mentioned_p (temp1
, dest
));
3452 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
3454 /* Try using ADDVL or ADDPL to add the whole value. */
3455 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
3457 rtx offset_rtx
= gen_int_mode (offset
, mode
);
3458 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3459 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3463 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3464 SVE vector register, over and above the minimum size of 128 bits.
3465 This is equivalent to half the value returned by CNTD with a
3466 vector shape of ALL. */
3467 HOST_WIDE_INT factor
= offset
.coeffs
[1];
3468 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
3470 /* Try using ADDVL or ADDPL to add the VG-based part. */
3471 poly_int64
poly_offset (factor
, factor
);
3472 if (src
!= const0_rtx
3473 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
3475 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
3476 if (frame_related_p
)
3478 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3479 RTX_FRAME_RELATED_P (insn
) = true;
3484 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
3485 src
= aarch64_force_temporary (mode
, temp1
, addr
);
3490 /* Otherwise use a CNT-based sequence. */
3491 else if (factor
!= 0)
3493 /* Use a subtraction if we have a negative factor. */
3494 rtx_code code
= PLUS
;
3501 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3502 into the multiplication. */
3506 /* Use a right shift by 1. */
3510 HOST_WIDE_INT low_bit
= factor
& -factor
;
3511 if (factor
<= 16 * low_bit
)
3513 if (factor
> 16 * 8)
3515 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3516 the value with the minimum multiplier and shift it into
3518 int extra_shift
= exact_log2 (low_bit
);
3519 shift
+= extra_shift
;
3520 factor
>>= extra_shift
;
3522 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
3526 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3527 directly, since that should increase the chances of being
3528 able to use a shift and add sequence. If LOW_BIT itself
3529 is out of range, just use CNTD. */
3530 if (low_bit
<= 16 * 8)
3535 val
= gen_int_mode (poly_int64 (low_bit
* 2, low_bit
* 2), mode
);
3536 val
= aarch64_force_temporary (mode
, temp1
, val
);
3538 if (can_create_pseudo_p ())
3540 rtx coeff1
= gen_int_mode (factor
, mode
);
3541 val
= expand_mult (mode
, val
, coeff1
, NULL_RTX
, false, true);
3545 /* Go back to using a negative multiplication factor if we have
3546 no register from which to subtract. */
3547 if (code
== MINUS
&& src
== const0_rtx
)
3552 rtx coeff1
= gen_int_mode (factor
, mode
);
3553 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
3554 val
= gen_rtx_MULT (mode
, val
, coeff1
);
3560 /* Multiply by 1 << SHIFT. */
3561 val
= aarch64_force_temporary (mode
, temp1
, val
);
3562 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
3564 else if (shift
== -1)
3567 val
= aarch64_force_temporary (mode
, temp1
, val
);
3568 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
3571 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3572 if (src
!= const0_rtx
)
3574 val
= aarch64_force_temporary (mode
, temp1
, val
);
3575 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
3577 else if (code
== MINUS
)
3579 val
= aarch64_force_temporary (mode
, temp1
, val
);
3580 val
= gen_rtx_NEG (mode
, val
);
3583 if (constant
== 0 || frame_related_p
)
3585 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
3586 if (frame_related_p
)
3588 RTX_FRAME_RELATED_P (insn
) = true;
3589 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
3590 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
3599 src
= aarch64_force_temporary (mode
, temp1
, val
);
3604 emit_move_imm
= true;
3607 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
3608 frame_related_p
, emit_move_imm
);
/* Like aarch64_add_offset, but the offset is given as an rtx rather
   than a poly_int64.  */

void
aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
			  rtx offset_rtx, rtx temp1, rtx temp2)
{
  aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
		      temp1, temp2, false);
}

/* Add DELTA to the stack pointer, marking the instructions frame-related.
   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
   if TEMP1 already contains abs (DELTA).  */

static inline void
aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
		      temp1, temp2, true, emit_move_imm);
}

/* Subtract DELTA from the stack pointer, marking the instructions
   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
   if nonnull.  */

static inline void
aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
		bool emit_move_imm = true)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
		      temp1, temp2, frame_related_p, emit_move_imm);
}
/* Set DEST to (vec_series BASE STEP).  */

static void
aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
{
  machine_mode mode = GET_MODE (dest);
  scalar_mode inner = GET_MODE_INNER (mode);

  /* Each operand can be a register or an immediate in the range [-16, 15].  */
  if (!aarch64_sve_index_immediate_p (base))
    base = force_reg (inner, base);
  if (!aarch64_sve_index_immediate_p (step))
    step = force_reg (inner, step);

  emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
}
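/* For example (illustrative, not part of the original source), a VNx4SImode
   series with BASE 0 and STEP 1 becomes a single "index z0.s, #0, #1",
   whereas a step of 20 is outside the [-16, 15] immediate range and is first
   forced into a scalar register, giving "index z0.s, #0, w1".  */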
3662 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3663 register of mode MODE. Use TARGET for the result if it's nonnull
3666 The two vector modes must have the same element mode. The behavior
3667 is to duplicate architectural lane N of SRC into architectural lanes
3668 N + I * STEP of the result. On big-endian targets, architectural
3669 lane 0 of an Advanced SIMD vector is the last element of the vector
3670 in memory layout, so for big-endian targets this operation has the
3671 effect of reversing SRC before duplicating it. Callers need to
3672 account for this. */
3675 aarch64_expand_sve_dupq (rtx target
, machine_mode mode
, rtx src
)
3677 machine_mode src_mode
= GET_MODE (src
);
3678 gcc_assert (GET_MODE_INNER (mode
) == GET_MODE_INNER (src_mode
));
3679 insn_code icode
= (BYTES_BIG_ENDIAN
3680 ? code_for_aarch64_vec_duplicate_vq_be (mode
)
3681 : code_for_aarch64_vec_duplicate_vq_le (mode
));
3684 expand_operand ops
[3];
3685 create_output_operand (&ops
[i
++], target
, mode
);
3686 create_output_operand (&ops
[i
++], src
, src_mode
);
3687 if (BYTES_BIG_ENDIAN
)
3689 /* Create a PARALLEL describing the reversal of SRC. */
3690 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (mode
);
3691 rtx sel
= aarch64_gen_stepped_int_parallel (nelts_per_vq
,
3692 nelts_per_vq
- 1, -1);
3693 create_fixed_operand (&ops
[i
++], sel
);
3695 expand_insn (icode
, i
, ops
);
3696 return ops
[0].value
;
/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
   the memory image into DEST.  Return true on success.  */

static bool
aarch64_expand_sve_ld1rq (rtx dest, rtx src)
{
  src = force_const_mem (GET_MODE (src), src);
  if (!src)
    return false;

  /* Make sure that the address is legitimate.  */
  if (!aarch64_sve_ld1rq_operand_p (src))
    {
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      src = replace_equiv_address (src, addr);
    }

  machine_mode mode = GET_MODE (dest);
  unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
  machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
  return true;
}
3724 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3725 SVE data mode and isn't a legitimate constant. Use TARGET for the
3726 result if convenient.
3728 The returned register can have whatever mode seems most natural
3729 given the contents of SRC. */
3732 aarch64_expand_sve_const_vector (rtx target
, rtx src
)
3734 machine_mode mode
= GET_MODE (src
);
3735 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
3736 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
3737 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
3738 unsigned int elt_bits
= GET_MODE_BITSIZE (elt_mode
);
3739 unsigned int encoded_bits
= npatterns
* nelts_per_pattern
* elt_bits
;
3741 if (nelts_per_pattern
== 1 && encoded_bits
== 128)
3743 /* The constant is a duplicated quadword but can't be narrowed
3744 beyond a quadword. Get the memory image of the first quadword
3745 as a 128-bit vector and try using LD1RQ to load it from memory.
3747 The effect for both endiannesses is to load memory lane N into
3748 architectural lanes N + I * STEP of the result. On big-endian
3749 targets, the layout of the 128-bit vector in an Advanced SIMD
3750 register would be different from its layout in an SVE register,
3751 but this 128-bit vector is a memory value only. */
3752 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
3753 rtx vq_value
= simplify_gen_subreg (vq_mode
, src
, mode
, 0);
3754 if (vq_value
&& aarch64_expand_sve_ld1rq (target
, vq_value
))
3758 if (nelts_per_pattern
== 1 && encoded_bits
< 128)
3760 /* The vector is a repeating sequence of 64 bits or fewer.
3761 See if we can load them using an Advanced SIMD move and then
3762 duplicate it to fill a vector. This is better than using a GPR
3763 move because it keeps everything in the same register file. */
3764 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
3765 rtx_vector_builder
builder (vq_mode
, npatterns
, 1);
3766 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3768 /* We want memory lane N to go into architectural lane N,
3769 so reverse for big-endian targets. The DUP .Q pattern
3770 has a compensating reverse built-in. */
3771 unsigned int srci
= BYTES_BIG_ENDIAN
? npatterns
- i
- 1 : i
;
3772 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, srci
));
3774 rtx vq_src
= builder
.build ();
3775 if (aarch64_simd_valid_immediate (vq_src
, NULL
))
3777 vq_src
= force_reg (vq_mode
, vq_src
);
3778 return aarch64_expand_sve_dupq (target
, mode
, vq_src
);
3781 /* Get an integer representation of the repeating part of Advanced
3782 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3783 which for big-endian targets is lane-swapped wrt a normal
3784 Advanced SIMD vector. This means that for both endiannesses,
3785 memory lane N of SVE vector SRC corresponds to architectural
3786 lane N of a register holding VQ_SRC. This in turn means that
3787 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3788 as a single 128-bit value) and thus that memory lane 0 of SRC is
3789 in the lsb of the integer. Duplicating the integer therefore
3790 ensures that memory lane N of SRC goes into architectural lane
3791 N + I * INDEX of the SVE register. */
3792 scalar_mode int_mode
= int_mode_for_size (encoded_bits
, 0).require ();
3793 rtx elt_value
= simplify_gen_subreg (int_mode
, vq_src
, vq_mode
, 0);
3796 /* Pretend that we had a vector of INT_MODE to start with. */
3797 elt_mode
= int_mode
;
3798 mode
= aarch64_full_sve_mode (int_mode
).require ();
3800 /* If the integer can be moved into a general register by a
3801 single instruction, do that and duplicate the result. */
3802 if (CONST_INT_P (elt_value
)
3803 && aarch64_move_imm (INTVAL (elt_value
), elt_mode
))
3805 elt_value
= force_reg (elt_mode
, elt_value
);
3806 return expand_vector_broadcast (mode
, elt_value
);
3809 else if (npatterns
== 1)
3810 /* We're duplicating a single value, but can't do better than
3811 force it to memory and load from there. This handles things
3812 like symbolic constants. */
3813 elt_value
= CONST_VECTOR_ENCODED_ELT (src
, 0);
3817 /* Load the element from memory if we can, otherwise move it into
3818 a register and use a DUP. */
3819 rtx op
= force_const_mem (elt_mode
, elt_value
);
3821 op
= force_reg (elt_mode
, elt_value
);
3822 return expand_vector_broadcast (mode
, op
);
3826 /* Try using INDEX. */
3828 if (const_vec_series_p (src
, &base
, &step
))
3830 aarch64_expand_vec_series (target
, base
, step
);
3834 /* From here on, it's better to force the whole constant to memory
3836 if (GET_MODE_NUNITS (mode
).is_constant ())
3839 /* Expand each pattern individually. */
3840 gcc_assert (npatterns
> 1);
3841 rtx_vector_builder builder
;
3842 auto_vec
<rtx
, 16> vectors (npatterns
);
3843 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3845 builder
.new_vector (mode
, 1, nelts_per_pattern
);
3846 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
3847 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
3848 vectors
.quick_push (force_reg (mode
, builder
.build ()));
3851 /* Use permutes to interleave the separate vectors. */
3852 while (npatterns
> 1)
3855 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3857 rtx tmp
= (npatterns
== 1 ? target
: gen_reg_rtx (mode
));
3858 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
3859 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
3863 gcc_assert (vectors
[0] == target
);
/* Use WHILE to set a predicate register of mode MODE in which the first
   VL bits are set and the rest are clear.  Use TARGET for the register
   if it's nonnull and convenient.  */

static rtx
aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
				 unsigned int vl)
{
  rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
  target = aarch64_target_reg (target, mode);
  emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
  return target;
}

static rtx
aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3884 /* BUILDER is a constant predicate in which the index of every set bit
3885 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3886 by inverting every element at a multiple of ELT_SIZE and EORing the
3887 result with an ELT_SIZE PTRUE.
3889 Return a register that contains the constant on success, otherwise
3890 return null. Use TARGET as the register if it is nonnull and
3894 aarch64_expand_sve_const_pred_eor (rtx target
, rtx_vector_builder
&builder
,
3895 unsigned int elt_size
)
3897 /* Invert every element at a multiple of ELT_SIZE, keeping the
3899 rtx_vector_builder
inv_builder (VNx16BImode
, builder
.npatterns (),
3900 builder
.nelts_per_pattern ());
3901 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
3902 if ((i
& (elt_size
- 1)) == 0 && INTVAL (builder
.elt (i
)) == 0)
3903 inv_builder
.quick_push (const1_rtx
);
3905 inv_builder
.quick_push (const0_rtx
);
3906 inv_builder
.finalize ();
3908 /* See if we can load the constant cheaply. */
3909 rtx inv
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, inv_builder
, false);
3913 /* EOR the result with an ELT_SIZE PTRUE. */
3914 rtx mask
= aarch64_ptrue_all (elt_size
);
3915 mask
= force_reg (VNx16BImode
, mask
);
3916 target
= aarch64_target_reg (target
, VNx16BImode
);
3917 emit_insn (gen_aarch64_pred_z (XOR
, VNx16BImode
, target
, mask
, inv
, mask
));
3921 /* BUILDER is a constant predicate in which the index of every set bit
3922 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3923 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3924 register on success, otherwise return null. Use TARGET as the register
3925 if nonnull and convenient. */
3928 aarch64_expand_sve_const_pred_trn (rtx target
, rtx_vector_builder
&builder
,
3929 unsigned int elt_size
,
3930 unsigned int permute_size
)
3932 /* We're going to split the constant into two new constants A and B,
3933 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3934 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3936 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3937 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3939 where _ indicates elements that will be discarded by the permute.
3941 First calculate the ELT_SIZEs for A and B. */
3942 unsigned int a_elt_size
= GET_MODE_SIZE (DImode
);
3943 unsigned int b_elt_size
= GET_MODE_SIZE (DImode
);
3944 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); i
+= elt_size
)
3945 if (INTVAL (builder
.elt (i
)) != 0)
3947 if (i
& permute_size
)
3948 b_elt_size
|= i
- permute_size
;
3952 a_elt_size
&= -a_elt_size
;
3953 b_elt_size
&= -b_elt_size
;
3955 /* Now construct the vectors themselves. */
3956 rtx_vector_builder
a_builder (VNx16BImode
, builder
.npatterns (),
3957 builder
.nelts_per_pattern ());
3958 rtx_vector_builder
b_builder (VNx16BImode
, builder
.npatterns (),
3959 builder
.nelts_per_pattern ());
3960 unsigned int nelts
= builder
.encoded_nelts ();
3961 for (unsigned int i
= 0; i
< nelts
; ++i
)
3962 if (i
& (elt_size
- 1))
3964 a_builder
.quick_push (const0_rtx
);
3965 b_builder
.quick_push (const0_rtx
);
3967 else if ((i
& permute_size
) == 0)
3969 /* The A and B elements are significant. */
3970 a_builder
.quick_push (builder
.elt (i
));
3971 b_builder
.quick_push (builder
.elt (i
+ permute_size
));
3975 /* The A and B elements are going to be discarded, so pick whatever
3976 is likely to give a nice constant. We are targeting element
3977 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3978 with the aim of each being a sequence of ones followed by
3979 a sequence of zeros. So:
3981 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3982 duplicate the last X_ELT_SIZE element, to extend the
3983 current sequence of ones or zeros.
3985 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3986 zero, so that the constant really does have X_ELT_SIZE and
3987 not a smaller size. */
3988 if (a_elt_size
> permute_size
)
3989 a_builder
.quick_push (const0_rtx
);
3991 a_builder
.quick_push (a_builder
.elt (i
- a_elt_size
));
3992 if (b_elt_size
> permute_size
)
3993 b_builder
.quick_push (const0_rtx
);
3995 b_builder
.quick_push (b_builder
.elt (i
- b_elt_size
));
3997 a_builder
.finalize ();
3998 b_builder
.finalize ();
4000 /* Try loading A into a register. */
4001 rtx_insn
*last
= get_last_insn ();
4002 rtx a
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, a_builder
, false);
4006 /* Try loading B into a register. */
4008 if (a_builder
!= b_builder
)
4010 b
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, b_builder
, false);
4013 delete_insns_since (last
);
4018 /* Emit the TRN1 itself. */
4019 machine_mode mode
= aarch64_sve_pred_mode (permute_size
).require ();
4020 target
= aarch64_target_reg (target
, mode
);
4021 emit_insn (gen_aarch64_sve (UNSPEC_TRN1
, mode
, target
,
4022 gen_lowpart (mode
, a
),
4023 gen_lowpart (mode
, b
)));
/* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
   constant in BUILDER into an SVE predicate register.  Return the register
   on success, otherwise return null.  Use TARGET for the register if
   nonnull and convenient.

   ALLOW_RECURSE_P is true if we can use methods that would call this
   function recursively.  */

static rtx
aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
                                 bool allow_recurse_p)
{
  if (builder.encoded_nelts () == 1)
    /* A PFALSE or a PTRUE .B ALL.  */
    return aarch64_emit_set_immediate (target, builder);

  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
  if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
    {
      /* If we can load the constant using PTRUE, use it as-is.  */
      machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
      if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
        return aarch64_emit_set_immediate (target, builder);

      /* Otherwise use WHILE to set the first VL bits.  */
      return aarch64_sve_move_pred_via_while (target, mode, vl);
    }

  if (!allow_recurse_p)
    return NULL_RTX;

  /* Try inverting the vector in element size ELT_SIZE and then EORing
     the result with an ELT_SIZE PTRUE.  */
  if (INTVAL (builder.elt (0)) == 0)
    if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
                                                     elt_size))
      return res;

  /* Try using TRN1 to permute two simpler constants.  */
  for (unsigned int i = elt_size; i <= 8; i *= 2)
    if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
                                                     elt_size, i))
      return res;

  return NULL_RTX;
}
/* Return an SVE predicate register that contains the VNx16BImode
   constant in BUILDER, without going through the move expanders.

   The returned register can have whatever mode seems most natural
   given the contents of BUILDER.  Use TARGET for the result if
   convenient.  */

static rtx
aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
{
  /* Try loading the constant using pure predicate operations.  */
  if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
    return res;

  /* Try forcing the constant to memory.  */
  if (builder.full_nelts ().is_constant ())
    if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
      {
        target = aarch64_target_reg (target, VNx16BImode);
        emit_move_insn (target, mem);
        return target;
      }

  /* The last resort is to load the constant as an integer and then
     compare it against zero.  Use -1 for set bits in order to increase
     the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
  rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
                                  builder.nelts_per_pattern ());
  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
    int_builder.quick_push (INTVAL (builder.elt (i))
                            ? constm1_rtx : const0_rtx);
  return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
                                           int_builder.build ());
}
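
/* As an illustration of the last-resort path above (values chosen only
   as an example): a predicate whose .B lanes are { 1, 0, 1, 0, ... } is
   rebuilt as the VNx16QI data vector { -1, 0, -1, 0, ... }, which is a
   good candidate for DUPM or an Advanced SIMD byte mask, and is then
   converted back into a predicate by comparing it against zero.  */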
/* Set DEST to immediate IMM.  */

void
aarch64_expand_mov_immediate (rtx dest, rtx imm)
{
  machine_mode mode = GET_MODE (dest);

  /* Check on what type of symbol it is.  */
  scalar_int_mode int_mode;
  if ((GET_CODE (imm) == SYMBOL_REF
       || GET_CODE (imm) == LABEL_REF
       || GET_CODE (imm) == CONST
       || GET_CODE (imm) == CONST_POLY_INT)
      && is_a <scalar_int_mode> (mode, &int_mode))
    {
      rtx mem;
      poly_int64 offset;
      HOST_WIDE_INT const_offset;
      enum aarch64_symbol_type sty;

      /* If we have (const (plus symbol offset)), separate out the offset
         before we start classifying the symbol.  */
      rtx base = strip_offset (imm, &offset);

      /* We must always add an offset involving VL separately, rather than
         folding it into the relocation.  */
      if (!offset.is_constant (&const_offset))
        {
          if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
            emit_insn (gen_rtx_SET (dest, imm));
          else
            {
              /* Do arithmetic on 32-bit values if the result is smaller
                 than that.  */
              if (partial_subreg_p (int_mode, SImode))
                {
                  /* It is invalid to do symbol calculations in modes
                     narrower than SImode.  */
                  gcc_assert (base == const0_rtx);
                  dest = gen_lowpart (SImode, dest);
                  int_mode = SImode;
                }
              if (base != const0_rtx)
                {
                  base = aarch64_force_temporary (int_mode, dest, base);
                  aarch64_add_offset (int_mode, dest, base, offset,
                                      NULL_RTX, NULL_RTX, false);
                }
              else
                aarch64_add_offset (int_mode, dest, base, offset,
                                    dest, NULL_RTX, false);
            }
          return;
        }

      sty = aarch64_classify_symbol (base, const_offset);
      switch (sty)
        {
        case SYMBOL_FORCE_TO_MEM:
          if (const_offset != 0
              && targetm.cannot_force_const_mem (int_mode, imm))
            {
              gcc_assert (can_create_pseudo_p ());
              base = aarch64_force_temporary (int_mode, dest, base);
              aarch64_add_offset (int_mode, dest, base, const_offset,
                                  NULL_RTX, NULL_RTX, false);
              return;
            }

          mem = force_const_mem (ptr_mode, imm);
          gcc_assert (mem);

          /* If we aren't generating PC relative literals, then
             we need to expand the literal pool access carefully.
             This is something that needs to be done in a number
             of places, so could well live as a separate function.  */
          if (!aarch64_pcrelative_literal_loads)
            {
              gcc_assert (can_create_pseudo_p ());
              base = gen_reg_rtx (ptr_mode);
              aarch64_expand_mov_immediate (base, XEXP (mem, 0));
              if (ptr_mode != Pmode)
                base = convert_memory_address (Pmode, base);
              mem = gen_rtx_MEM (ptr_mode, base);
            }

          if (int_mode != ptr_mode)
            mem = gen_rtx_ZERO_EXTEND (int_mode, mem);

          emit_insn (gen_rtx_SET (dest, mem));

          return;

        case SYMBOL_SMALL_TLSGD:
        case SYMBOL_SMALL_TLSDESC:
        case SYMBOL_SMALL_TLSIE:
        case SYMBOL_SMALL_GOT_28K:
        case SYMBOL_SMALL_GOT_4G:
        case SYMBOL_TINY_GOT:
        case SYMBOL_TINY_TLSIE:
          if (const_offset != 0)
            {
              gcc_assert (can_create_pseudo_p ());
              base = aarch64_force_temporary (int_mode, dest, base);
              aarch64_add_offset (int_mode, dest, base, const_offset,
                                  NULL_RTX, NULL_RTX, false);
              return;
            }
          /* FALLTHRU */

        case SYMBOL_SMALL_ABSOLUTE:
        case SYMBOL_TINY_ABSOLUTE:
        case SYMBOL_TLSLE12:
        case SYMBOL_TLSLE24:
        case SYMBOL_TLSLE32:
        case SYMBOL_TLSLE48:
          aarch64_load_symref_appropriately (dest, imm, sty);
          return;

        default:
          gcc_unreachable ();
        }
    }

  if (!CONST_INT_P (imm))
    {
      if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
        {
          /* Only the low bit of each .H, .S and .D element is defined,
             so we can set the upper bits to whatever we like.  If the
             predicate is all-true in MODE, prefer to set all the undefined
             bits as well, so that we can share a single .B predicate for
             all modes.  */
          if (imm == CONSTM1_RTX (mode))
            imm = CONSTM1_RTX (VNx16BImode);

          /* All methods for constructing predicate modes wider than VNx16BI
             will set the upper bits of each element to zero.  Expose this
             by moving such constants as a VNx16BI, so that all bits are
             significant and so that constants for different modes can be
             shared.  The wider constant will still be available as a
             REG_EQUAL note.  */
          rtx_vector_builder builder;
          if (aarch64_get_sve_pred_bits (builder, imm))
            {
              rtx res = aarch64_expand_sve_const_pred (dest, builder);
              if (dest != res)
                emit_move_insn (dest, gen_lowpart (mode, res));
              return;
            }
        }

      if (GET_CODE (imm) == HIGH
          || aarch64_simd_valid_immediate (imm, NULL))
        {
          emit_insn (gen_rtx_SET (dest, imm));
          return;
        }

      if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
        if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
          {
            if (dest != res)
              emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
            return;
          }

      rtx mem = force_const_mem (mode, imm);
      gcc_assert (mem);
      emit_move_insn (dest, mem);
      return;
    }

  aarch64_internal_mov_immediate (dest, imm, true,
                                  as_a <scalar_int_mode> (mode));
}
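
/* A small worked example of the MODE_VECTOR_BOOL handling above
   (illustrative only): moving CONSTM1_RTX (VNx8BImode) into a .H
   predicate first widens the constant to CONSTM1_RTX (VNx16BImode),
   so the same all-true .B PTRUE can be shared by .B, .H, .S and .D
   uses instead of materialising a separate constant for each mode.  */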
/* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
   that is known to contain PTRUE.  */

void
aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
{
  expand_operand ops[3];
  machine_mode mode = GET_MODE (dest);
  create_output_operand (&ops[0], dest, mode);
  create_input_operand (&ops[1], pred, GET_MODE (pred));
  create_input_operand (&ops[2], src, mode);
  temporary_volatile_ok v (true);
  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
}
/* Expand a pre-RA SVE data move from SRC to DEST in which at least one
   operand is in memory.  In this case we need to use the predicated LD1
   and ST1 instead of LDR and STR, both for correctness on big-endian
   targets and because LD1 and ST1 support a wider range of addressing modes.
   PRED_MODE is the mode of the predicate.

   See the comment at the head of aarch64-sve.md for details about the
   big-endian handling.  */

void
aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
{
  machine_mode mode = GET_MODE (dest);
  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  if (!register_operand (src, mode)
      && !register_operand (dest, mode))
    {
      rtx tmp = gen_reg_rtx (mode);
      if (MEM_P (src))
        aarch64_emit_sve_pred_move (tmp, ptrue, src);
      else
        emit_move_insn (tmp, src);
      src = tmp;
    }
  aarch64_emit_sve_pred_move (dest, ptrue, src);
}
/* Called only on big-endian targets.  See whether an SVE vector move
   from SRC to DEST is effectively a REV[BHW] instruction, because at
   least one operand is a subreg of an SVE vector that has wider or
   narrower elements.  Return true and emit the instruction if so.

   For example:

     (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))

   represents a VIEW_CONVERT between the following vectors, viewed
   in memory order:

     R2: { [0].high, [0].low, [1].high, [1].low, ... }
     R1: { [0], [1], [2], [3], ... }

   The high part of lane X in R2 should therefore correspond to lane X*2
   of R1, but the register representations are:

     R2: ...... [1].high [1].low [0].high [0].low
     R1: ...... [3] [2] [1] [0]

   where the low part of lane X in R2 corresponds to lane X*2 in R1.
   We therefore need a reverse operation to swap the high and low values
   around.

   This is purely an optimization.  Without it we would spill the
   subreg operand to the stack in one mode and reload it in the
   other mode, which has the same effect as the REV.  */

bool
aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
{
  gcc_assert (BYTES_BIG_ENDIAN);
  if (GET_CODE (dest) == SUBREG)
    dest = SUBREG_REG (dest);
  if (GET_CODE (src) == SUBREG)
    src = SUBREG_REG (src);

  /* The optimization handles two single SVE REGs with different element
     sizes.  */
  if (!REG_P (dest)
      || !REG_P (src)
      || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
      || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
      || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
          == GET_MODE_UNIT_SIZE (GET_MODE (src))))
    return false;

  /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
  rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
                               UNSPEC_REV_SUBREG);
  emit_insn (gen_rtx_SET (dest, unspec));
  return true;
}
/* Return a copy of X with mode MODE, without changing its other
   attributes.  Unlike gen_lowpart, this doesn't care whether the
   mode change is valid.  */

static rtx
aarch64_replace_reg_mode (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == mode)
    return x;

  x = shallow_copy_rtx (x);
  set_mode_and_regno (x, mode, REGNO (x));
  return x;
}
/* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
   stored in wider integer containers.  */

static unsigned int
aarch64_sve_rev_unspec (machine_mode mode)
{
  switch (GET_MODE_UNIT_SIZE (mode))
    {
    case 1: return UNSPEC_REVB;
    case 2: return UNSPEC_REVH;
    case 4: return UNSPEC_REVW;
    default: gcc_unreachable ();
    }
}
/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
   operands.  */

void
aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
{
  /* Decide which REV operation we need.  The mode with wider elements
     determines the mode of the operands and the mode with the narrower
     elements determines the reverse width.  */
  machine_mode mode_with_wider_elts = GET_MODE (dest);
  machine_mode mode_with_narrower_elts = GET_MODE (src);
  if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
      < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
    std::swap (mode_with_wider_elts, mode_with_narrower_elts);

  unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
  unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
  machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();

  /* Get the operands in the appropriate modes and emit the instruction.  */
  ptrue = gen_lowpart (pred_mode, ptrue);
  dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
  src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
  emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
                               dest, ptrue, src));
}
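
/* For example (an illustrative case, not compiler output): on a
   big-endian target, a move whose SRC is a VNx8HI view of a VNx16QI
   register has unit sizes 2 and 1, so aarch64_sve_rev_unspec picks
   UNSPEC_REVB and the split above emits a REVB acting on .H
   containers, i.e. something like:

       revb     z0.h, p0/m, z1.h

   which swaps the two bytes within each halfword, exactly the lane
   reordering described before aarch64_maybe_expand_sve_subreg_move.  */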
/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */

static bool
aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
                                 tree exp ATTRIBUTE_UNUSED)
{
  if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
    return false;

  return true;
}
/* Implement TARGET_PASS_BY_REFERENCE.  */

static bool
aarch64_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
{
  HOST_WIDE_INT size;
  machine_mode dummymode;
  int nregs;

  /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
  if (arg.mode == BLKmode && arg.type)
    size = int_size_in_bytes (arg.type);
  else
    /* No frontends can create types with variable-sized modes, so we
       shouldn't be asked to pass or return them.  */
    size = GET_MODE_SIZE (arg.mode).to_constant ();

  /* Aggregates are passed by reference based on their size.  */
  if (arg.aggregate_type_p ())
    size = int_size_in_bytes (arg.type);

  /* Variable sized arguments are always passed by reference.  */
  if (size < 0)
    return true;

  /* Can this be a candidate to be passed in fp/simd register(s)?  */
  if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
                                               &dummymode, &nregs, NULL))
    return false;

  /* Arguments which are variable sized or larger than 2 registers are
     passed by reference unless they are a homogeneous floating-point
     aggregate.  */
  return size > 2 * UNITS_PER_WORD;
}
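
/* Two illustrative cases for the rules above (sizes assume the usual
   8-byte UNITS_PER_WORD): a plain 24-byte struct of three 64-bit
   integers is larger than 2 * UNITS_PER_WORD and so is passed by
   reference, whereas a 32-byte homogeneous aggregate of four doubles
   is an fp/simd candidate and is still passed by value in V registers.  */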
4489 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4491 aarch64_return_in_msb (const_tree valtype
)
4493 machine_mode dummy_mode
;
4496 /* Never happens in little-endian mode. */
4497 if (!BYTES_BIG_ENDIAN
)
4500 /* Only composite types smaller than or equal to 16 bytes can
4501 be potentially returned in registers. */
4502 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
4503 || int_size_in_bytes (valtype
) <= 0
4504 || int_size_in_bytes (valtype
) > 16)
4507 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4508 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4509 is always passed/returned in the least significant bits of fp/simd
4511 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
4512 &dummy_mode
, &dummy_int
, NULL
))
4518 /* Implement TARGET_FUNCTION_VALUE.
4519 Define how to find the value returned by a function. */
4522 aarch64_function_value (const_tree type
, const_tree func
,
4523 bool outgoing ATTRIBUTE_UNUSED
)
4528 machine_mode ag_mode
;
4530 mode
= TYPE_MODE (type
);
4531 if (INTEGRAL_TYPE_P (type
))
4532 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
4534 if (aarch64_return_in_msb (type
))
4536 HOST_WIDE_INT size
= int_size_in_bytes (type
);
4538 if (size
% UNITS_PER_WORD
!= 0)
4540 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
4541 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
4545 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
4546 &ag_mode
, &count
, NULL
))
4548 if (!aarch64_composite_type_p (type
, mode
))
4550 gcc_assert (count
== 1 && mode
== ag_mode
);
4551 return gen_rtx_REG (mode
, V0_REGNUM
);
4558 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
4559 for (i
= 0; i
< count
; i
++)
4561 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
4562 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
4563 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
4564 XVECEXP (par
, 0, i
) = tmp
;
4570 return gen_rtx_REG (mode
, R0_REGNUM
);
4573 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4574 Return true if REGNO is the number of a hard register in which the values
4575 of called function may come back. */
4578 aarch64_function_value_regno_p (const unsigned int regno
)
4580 /* Maximum of 16 bytes can be returned in the general registers. Examples
4581 of 16-byte return values are: 128-bit integers and 16-byte small
4582 structures (excluding homogeneous floating-point aggregates). */
4583 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
4586 /* Up to four fp/simd registers can return a function value, e.g. a
4587 homogeneous floating-point aggregate having four members. */
4588 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
4589 return TARGET_FLOAT
;
4594 /* Implement TARGET_RETURN_IN_MEMORY.
4596 If the type T of the result of a function is such that
4598 would require that arg be passed as a value in a register (or set of
4599 registers) according to the parameter passing rules, then the result
4600 is returned in the same registers as would be used for such an
4604 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
4607 machine_mode ag_mode
;
4610 if (!AGGREGATE_TYPE_P (type
)
4611 && TREE_CODE (type
) != COMPLEX_TYPE
4612 && TREE_CODE (type
) != VECTOR_TYPE
)
4613 /* Simple scalar types always returned in registers. */
4616 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
4623 /* Types larger than 2 registers returned in memory. */
4624 size
= int_size_in_bytes (type
);
4625 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
4629 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
4630 const_tree type
, int *nregs
)
4632 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4633 return aarch64_vfp_is_call_or_return_candidate (mode
,
4635 &pcum
->aapcs_vfp_rmode
,
4640 /* Given MODE and TYPE of a function argument, return the alignment in
4641 bits. The idea is to suppress any stronger alignment requested by
4642 the user and opt for the natural alignment (specified in AAPCS64 \S
4643 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4644 calculated in versions of GCC prior to GCC-9. This is a helper
4645 function for local use only. */
4648 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
,
4653 return GET_MODE_ALIGNMENT (mode
);
4655 if (integer_zerop (TYPE_SIZE (type
)))
4658 gcc_assert (TYPE_MODE (type
) == mode
);
4660 if (!AGGREGATE_TYPE_P (type
))
4661 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
4663 if (TREE_CODE (type
) == ARRAY_TYPE
)
4664 return TYPE_ALIGN (TREE_TYPE (type
));
4666 unsigned int alignment
= 0;
4667 unsigned int bitfield_alignment
= 0;
4668 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
4669 if (TREE_CODE (field
) == FIELD_DECL
)
4671 alignment
= std::max (alignment
, DECL_ALIGN (field
));
4672 if (DECL_BIT_FIELD_TYPE (field
))
4674 = std::max (bitfield_alignment
,
4675 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field
)));
4678 if (bitfield_alignment
> alignment
)
4681 return bitfield_alignment
;
4687 /* Layout a function argument according to the AAPCS64 rules. The rule
4688 numbers refer to the rule numbers in the AAPCS64. */
4691 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
4693 bool named ATTRIBUTE_UNUSED
)
4695 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4696 int ncrn
, nvrn
, nregs
;
4697 bool allocate_ncrn
, allocate_nvrn
;
4701 /* We need to do this once per argument. */
4702 if (pcum
->aapcs_arg_processed
)
4705 pcum
->aapcs_arg_processed
= true;
4707 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4709 size
= int_size_in_bytes (type
);
4711 /* No frontends can create types with variable-sized modes, so we
4712 shouldn't be asked to pass or return them. */
4713 size
= GET_MODE_SIZE (mode
).to_constant ();
4714 size
= ROUND_UP (size
, UNITS_PER_WORD
);
4716 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
4717 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
4722 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
4723 The following code thus handles passing by SIMD/FP registers first. */
4725 nvrn
= pcum
->aapcs_nvrn
;
  /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
     and homogeneous short-vector aggregates (HVA).  */
4732 aarch64_err_no_fpadvsimd (mode
);
4734 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
4736 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
4737 if (!aarch64_composite_type_p (type
, mode
))
4739 gcc_assert (nregs
== 1);
4740 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
4746 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
4747 for (i
= 0; i
< nregs
; i
++)
4749 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
4750 V0_REGNUM
+ nvrn
+ i
);
4751 rtx offset
= gen_int_mode
4752 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
4753 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
4754 XVECEXP (par
, 0, i
) = tmp
;
4756 pcum
->aapcs_reg
= par
;
4762 /* C.3 NSRN is set to 8. */
4763 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
4768 ncrn
= pcum
->aapcs_ncrn
;
4769 nregs
= size
/ UNITS_PER_WORD
;
  /* C6 - C9, though the sign and zero extension semantics are
     handled elsewhere.  This is the case where the argument fits
     entirely in general registers.  */
4774 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
4776 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
4778 /* C.8 if the argument has an alignment of 16 then the NGRN is
4779 rounded up to the next even number. */
4782 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4783 comparison is there because for > 16 * BITS_PER_UNIT
4784 alignment nregs should be > 2 and therefore it should be
4785 passed by reference rather than value. */
4786 && (aarch64_function_arg_alignment (mode
, type
, &abi_break
)
4787 == 16 * BITS_PER_UNIT
))
4789 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
4790 inform (input_location
, "parameter passing for argument of type "
4791 "%qT changed in GCC 9.1", type
);
4793 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
4796 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4797 A reg is still generated for it, but the caller should be smart
4798 enough not to use it. */
4799 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
4800 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
4806 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
4807 for (i
= 0; i
< nregs
; i
++)
4809 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
4810 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
4811 GEN_INT (i
* UNITS_PER_WORD
));
4812 XVECEXP (par
, 0, i
) = tmp
;
4814 pcum
->aapcs_reg
= par
;
4817 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
4822 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
4824 /* The argument is passed on stack; record the needed number of words for
4825 this argument and align the total size if necessary. */
4827 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
4829 if (aarch64_function_arg_alignment (mode
, type
, &abi_break
)
4830 == 16 * BITS_PER_UNIT
)
4832 int new_size
= ROUND_UP (pcum
->aapcs_stack_size
, 16 / UNITS_PER_WORD
);
4833 if (pcum
->aapcs_stack_size
!= new_size
)
4835 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
4836 inform (input_location
, "parameter passing for argument of type "
4837 "%qT changed in GCC 9.1", type
);
4838 pcum
->aapcs_stack_size
= new_size
;
4844 /* Implement TARGET_FUNCTION_ARG. */
4847 aarch64_function_arg (cumulative_args_t pcum_v
, const function_arg_info
&arg
)
4849 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4850 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
4852 if (arg
.end_marker_p ())
4855 aarch64_layout_arg (pcum_v
, arg
.mode
, arg
.type
, arg
.named
);
4856 return pcum
->aapcs_reg
;
4860 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
4861 const_tree fntype ATTRIBUTE_UNUSED
,
4862 rtx libname ATTRIBUTE_UNUSED
,
4863 const_tree fndecl ATTRIBUTE_UNUSED
,
4864 unsigned n_named ATTRIBUTE_UNUSED
)
4866 pcum
->aapcs_ncrn
= 0;
4867 pcum
->aapcs_nvrn
= 0;
4868 pcum
->aapcs_nextncrn
= 0;
4869 pcum
->aapcs_nextnvrn
= 0;
4870 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
4871 pcum
->aapcs_reg
= NULL_RTX
;
4872 pcum
->aapcs_arg_processed
= false;
4873 pcum
->aapcs_stack_words
= 0;
4874 pcum
->aapcs_stack_size
= 0;
4877 && fndecl
&& TREE_PUBLIC (fndecl
)
4878 && fntype
&& fntype
!= error_mark_node
)
4880 const_tree type
= TREE_TYPE (fntype
);
4881 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
4882 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
4883 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
4884 &mode
, &nregs
, NULL
))
4885 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
4891 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
4892 const function_arg_info
&arg
)
4894 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4895 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
4897 aarch64_layout_arg (pcum_v
, arg
.mode
, arg
.type
, arg
.named
);
4898 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
4899 != (pcum
->aapcs_stack_words
!= 0));
4900 pcum
->aapcs_arg_processed
= false;
4901 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
4902 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
4903 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
4904 pcum
->aapcs_stack_words
= 0;
4905 pcum
->aapcs_reg
= NULL_RTX
;
4910 aarch64_function_arg_regno_p (unsigned regno
)
4912 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
4913 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
4916 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4917 PARM_BOUNDARY bits of alignment, but will be given anything up
4918 to STACK_BOUNDARY bits if the type requires it. This makes sure
4919 that both before and after the layout of each argument, the Next
4920 Stacked Argument Address (NSAA) will have a minimum alignment of
4924 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
4927 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
,
4929 if (abi_break
& warn_psabi
)
4930 inform (input_location
, "parameter passing for argument of type "
4931 "%qT changed in GCC 9.1", type
);
4933 return MIN (MAX (alignment
, PARM_BOUNDARY
), STACK_BOUNDARY
);
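
/* Illustrative values for the boundary computation above: a plain
   'char' argument has a natural alignment of 8 bits, which MAX raises
   to PARM_BOUNDARY (64 bits); an '__int128' or other 16-byte-aligned
   type reports 128 bits, which MIN then caps at STACK_BOUNDARY.  These
   numbers assume the usual AArch64 values of PARM_BOUNDARY == 64 and
   STACK_BOUNDARY == 128.  */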
4936 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4938 static fixed_size_mode
4939 aarch64_get_reg_raw_mode (int regno
)
4941 if (TARGET_SVE
&& FP_REGNUM_P (regno
))
4942 /* Don't use the SVE part of the register for __builtin_apply and
4943 __builtin_return. The SVE registers aren't used by the normal PCS,
4944 so using them there would be a waste of time. The PCS extensions
4945 for SVE types are fundamentally incompatible with the
4946 __builtin_return/__builtin_apply interface. */
4947 return as_a
<fixed_size_mode
> (V16QImode
);
4948 return default_get_reg_raw_mode (regno
);
4951 /* Implement TARGET_FUNCTION_ARG_PADDING.
4953 Small aggregate types are placed in the lowest memory address.
4955 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4957 static pad_direction
4958 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
4960 /* On little-endian targets, the least significant byte of every stack
4961 argument is passed at the lowest byte address of the stack slot. */
4962 if (!BYTES_BIG_ENDIAN
)
4965 /* Otherwise, integral, floating-point and pointer types are padded downward:
4966 the least significant byte of a stack argument is passed at the highest
4967 byte address of the stack slot. */
4969 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
4970 || POINTER_TYPE_P (type
))
4971 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
4972 return PAD_DOWNWARD
;
4974 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4978 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4980 It specifies padding for the last (may also be the only)
4981 element of a block move between registers and memory. If
4982 assuming the block is in the memory, padding upward means that
   the last element is padded after its most significant byte,
   while in downward padding, the last element is padded at its
   least significant byte side.
4987 Small aggregates and small complex types are always padded
4990 We don't need to worry about homogeneous floating-point or
4991 short-vector aggregates; their move is not affected by the
4992 padding direction determined here. Regardless of endianness,
4993 each element of such an aggregate is put in the least
4994 significant bits of a fp/simd register.
4996 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4997 register has useful data, and return the opposite if the most
4998 significant byte does. */
5001 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
5002 bool first ATTRIBUTE_UNUSED
)
5005 /* Small composite types are always padded upward. */
5006 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
5010 size
= int_size_in_bytes (type
);
5012 /* No frontends can create types with variable-sized modes, so we
5013 shouldn't be asked to pass or return them. */
5014 size
= GET_MODE_SIZE (mode
).to_constant ();
5015 if (size
< 2 * UNITS_PER_WORD
)
5019 /* Otherwise, use the default padding. */
5020 return !BYTES_BIG_ENDIAN
;
5023 static scalar_int_mode
5024 aarch64_libgcc_cmp_return_mode (void)
5029 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5031 /* We use the 12-bit shifted immediate arithmetic instructions so values
5032 must be multiple of (1 << 12), i.e. 4096. */
5033 #define ARITH_FACTOR 4096
5035 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5036 #error Cannot use simple address calculation for stack probing
5039 /* The pair of scratch registers used for stack probing. */
5040 #define PROBE_STACK_FIRST_REG R9_REGNUM
5041 #define PROBE_STACK_SECOND_REG R10_REGNUM
5043 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5044 inclusive. These are offsets from the current stack pointer. */
5047 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
5050 if (!poly_size
.is_constant (&size
))
5052 sorry ("stack probes for SVE frames");
5056 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
5058 /* See the same assertion on PROBE_INTERVAL above. */
5059 gcc_assert ((first
% ARITH_FACTOR
) == 0);
5061 /* See if we have a constant small number of probes to generate. If so,
5062 that's the easy case. */
5063 if (size
<= PROBE_INTERVAL
)
5065 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
5067 emit_set_insn (reg1
,
5068 plus_constant (Pmode
,
5069 stack_pointer_rtx
, -(first
+ base
)));
5070 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
5073 /* The run-time loop is made up of 8 insns in the generic case while the
5074 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
5075 else if (size
<= 4 * PROBE_INTERVAL
)
5077 HOST_WIDE_INT i
, rem
;
5079 emit_set_insn (reg1
,
5080 plus_constant (Pmode
,
5082 -(first
+ PROBE_INTERVAL
)));
5083 emit_stack_probe (reg1
);
5085 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5086 it exceeds SIZE. If only two probes are needed, this will not
5087 generate any code. Then probe at FIRST + SIZE. */
5088 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
5090 emit_set_insn (reg1
,
5091 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
5092 emit_stack_probe (reg1
);
5095 rem
= size
- (i
- PROBE_INTERVAL
);
5098 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
5100 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
5101 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
5104 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
5107 /* Otherwise, do the same as above, but in a loop. Note that we must be
5108 extra careful with variables wrapping around because we might be at
5109 the very top (or the very bottom) of the address space and we have
5110 to be able to handle this case properly; in particular, we use an
5111 equality test for the loop condition. */
5114 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
5116 /* Step 1: round SIZE to the previous multiple of the interval. */
5118 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
5121 /* Step 2: compute initial and final value of the loop counter. */
5123 /* TEST_ADDR = SP + FIRST. */
5124 emit_set_insn (reg1
,
5125 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
5127 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5128 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
5129 if (! aarch64_uimm12_shift (adjustment
))
5131 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
5133 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
5136 emit_set_insn (reg2
,
5137 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
5143 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5146 while (TEST_ADDR != LAST_ADDR)
5148 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5149 until it is equal to ROUNDED_SIZE. */
5151 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
5154 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5155 that SIZE is equal to ROUNDED_SIZE. */
5157 if (size
!= rounded_size
)
5159 HOST_WIDE_INT rem
= size
- rounded_size
;
5163 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
5165 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
5166 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
5169 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
5173 /* Make sure nothing is scheduled before we are done. */
5174 emit_insn (gen_blockage ());
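
/* As a rough example of the simplest case above (FIRST == 0 and
   SIZE == PROBE_INTERVAL == 4096, shown only as an illustration), the
   emitted sequence is essentially:

       sub      x9, sp, #4096
       str      xzr, [x9]

   i.e. one address computation into the scratch register followed by a
   single probe at the lowest address of the region.  */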
5177 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5178 absolute addresses. */
5181 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
5183 static int labelno
= 0;
5187 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
5190 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
5192 HOST_WIDE_INT stack_clash_probe_interval
5193 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
5195 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5197 HOST_WIDE_INT interval
;
5198 if (flag_stack_clash_protection
)
5199 interval
= stack_clash_probe_interval
;
5201 interval
= PROBE_INTERVAL
;
5203 gcc_assert (aarch64_uimm12_shift (interval
));
5204 xops
[1] = GEN_INT (interval
);
5206 output_asm_insn ("sub\t%0, %0, %1", xops
);
5208 /* If doing stack clash protection then we probe up by the ABI specified
5209 amount. We do this because we're dropping full pages at a time in the
5210 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5211 if (flag_stack_clash_protection
)
5212 xops
[1] = GEN_INT (STACK_CLASH_CALLER_GUARD
);
5214 xops
[1] = CONST0_RTX (GET_MODE (xops
[1]));
5216 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5217 by this amount for each iteration. */
5218 output_asm_insn ("str\txzr, [%0, %1]", xops
);
5220 /* Test if TEST_ADDR == LAST_ADDR. */
5222 output_asm_insn ("cmp\t%0, %1", xops
);
5225 fputs ("\tb.ne\t", asm_out_file
);
5226 assemble_name_raw (asm_out_file
, loop_lab
);
5227 fputc ('\n', asm_out_file
);
5232 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5233 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5234 of GUARD_SIZE. When a probe is emitted it is done at most
5235 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5236 at most MIN_PROBE_THRESHOLD. By the end of this function
5237 BASE = BASE - ADJUSTMENT. */
5240 aarch64_output_probe_sve_stack_clash (rtx base
, rtx adjustment
,
5241 rtx min_probe_threshold
, rtx guard_size
)
5243 /* This function is not allowed to use any instruction generation function
5244 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5245 so instead emit the code you want using output_asm_insn. */
5246 gcc_assert (flag_stack_clash_protection
);
5247 gcc_assert (CONST_INT_P (min_probe_threshold
) && CONST_INT_P (guard_size
));
5248 gcc_assert (INTVAL (guard_size
) > INTVAL (min_probe_threshold
));
5250 /* The minimum required allocation before the residual requires probing. */
5251 HOST_WIDE_INT residual_probe_guard
= INTVAL (min_probe_threshold
);
5253 /* Clamp the value down to the nearest value that can be used with a cmp. */
5254 residual_probe_guard
= aarch64_clamp_to_uimm12_shift (residual_probe_guard
);
5255 rtx probe_offset_value_rtx
= gen_int_mode (residual_probe_guard
, Pmode
);
5257 gcc_assert (INTVAL (min_probe_threshold
) >= residual_probe_guard
);
5258 gcc_assert (aarch64_uimm12_shift (residual_probe_guard
));
5260 static int labelno
= 0;
5261 char loop_start_lab
[32];
5262 char loop_end_lab
[32];
5265 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab
, "SVLPSPL", labelno
);
5266 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab
, "SVLPEND", labelno
++);
5268 /* Emit loop start label. */
5269 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_start_lab
);
5271 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5272 xops
[0] = adjustment
;
5273 xops
[1] = probe_offset_value_rtx
;
5274 output_asm_insn ("cmp\t%0, %1", xops
);
5276 /* Branch to end if not enough adjustment to probe. */
5277 fputs ("\tb.lt\t", asm_out_file
);
5278 assemble_name_raw (asm_out_file
, loop_end_lab
);
5279 fputc ('\n', asm_out_file
);
5281 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5283 xops
[1] = probe_offset_value_rtx
;
5284 output_asm_insn ("sub\t%0, %0, %1", xops
);
5286 /* Probe at BASE. */
5287 xops
[1] = const0_rtx
;
5288 output_asm_insn ("str\txzr, [%0, %1]", xops
);
5290 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5291 xops
[0] = adjustment
;
5292 xops
[1] = probe_offset_value_rtx
;
5293 output_asm_insn ("sub\t%0, %0, %1", xops
);
5295 /* Branch to start if still more bytes to allocate. */
5296 fputs ("\tb\t", asm_out_file
);
5297 assemble_name_raw (asm_out_file
, loop_start_lab
);
5298 fputc ('\n', asm_out_file
);
5300 /* No probe leave. */
5301 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_end_lab
);
5303 /* BASE = BASE - ADJUSTMENT. */
5305 xops
[1] = adjustment
;
5306 output_asm_insn ("sub\t%0, %0, %1", xops
);
5310 /* Determine whether a frame chain needs to be generated. */
5312 aarch64_needs_frame_chain (void)
5314 /* Force a frame chain for EH returns so the return address is at FP+8. */
5315 if (frame_pointer_needed
|| crtl
->calls_eh_return
)
5318 /* A leaf function cannot have calls or write LR. */
5319 bool is_leaf
= crtl
->is_leaf
&& !df_regs_ever_live_p (LR_REGNUM
);
5321 /* Don't use a frame chain in leaf functions if leaf frame pointers
5323 if (flag_omit_leaf_frame_pointer
&& is_leaf
)
5326 return aarch64_use_frame_pointer
;
5329 /* Mark the registers that need to be saved by the callee and calculate
5330 the size of the callee-saved registers area and frame record (both FP
5331 and LR may be omitted). */
5333 aarch64_layout_frame (void)
5335 HOST_WIDE_INT offset
= 0;
5336 int regno
, last_fp_reg
= INVALID_REGNUM
;
5337 bool simd_function
= aarch64_simd_decl_p (cfun
->decl
);
5339 cfun
->machine
->frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
5341 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5342 the mid-end is doing. */
5343 crtl
->outgoing_args_size
= STACK_DYNAMIC_OFFSET (cfun
);
5345 #define SLOT_NOT_REQUIRED (-2)
5346 #define SLOT_REQUIRED (-1)
5348 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
5349 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
5351 /* If this is a non-leaf simd function with calls we assume that
5352 at least one of those calls is to a non-simd function and thus
5353 we must save V8 to V23 in the prologue. */
5355 if (simd_function
&& !crtl
->is_leaf
)
5357 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
5358 if (FP_SIMD_SAVED_REGNUM_P (regno
))
5359 df_set_regs_ever_live (regno
, true);
5362 /* First mark all the registers that really need to be saved... */
5363 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
5364 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
5366 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
5367 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
5369 /* ... that includes the eh data registers (if needed)... */
5370 if (crtl
->calls_eh_return
)
5371 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
5372 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
5375 /* ... and any callee saved register that dataflow says is live. */
5376 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
5377 if (df_regs_ever_live_p (regno
)
5378 && (regno
== R30_REGNUM
5379 || !call_used_or_fixed_reg_p (regno
)))
5380 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
5382 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
5383 if (df_regs_ever_live_p (regno
)
5384 && (!call_used_or_fixed_reg_p (regno
)
5385 || (simd_function
&& FP_SIMD_SAVED_REGNUM_P (regno
))))
5387 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
5388 last_fp_reg
= regno
;
5391 if (cfun
->machine
->frame
.emit_frame_chain
)
5393 /* FP and LR are placed in the linkage record. */
5394 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
5395 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
5396 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
5397 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
5398 offset
= 2 * UNITS_PER_WORD
;
5401 /* With stack-clash, LR must be saved in non-leaf functions. */
5402 gcc_assert (crtl
->is_leaf
5403 || (cfun
->machine
->frame
.reg_offset
[R30_REGNUM
]
5404 != SLOT_NOT_REQUIRED
));
5406 /* Now assign stack slots for them. */
5407 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
5408 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
5410 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
5411 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
5412 cfun
->machine
->frame
.wb_candidate1
= regno
;
5413 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
)
5414 cfun
->machine
->frame
.wb_candidate2
= regno
;
5415 offset
+= UNITS_PER_WORD
;
5418 HOST_WIDE_INT max_int_offset
= offset
;
5419 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
5420 bool has_align_gap
= offset
!= max_int_offset
;
5422 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
5423 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
5425 /* If there is an alignment gap between integer and fp callee-saves,
5426 allocate the last fp register to it if possible. */
5427 if (regno
== last_fp_reg
5430 && (offset
& 8) == 0)
5432 cfun
->machine
->frame
.reg_offset
[regno
] = max_int_offset
;
5436 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
5437 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
5438 cfun
->machine
->frame
.wb_candidate1
= regno
;
5439 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
5440 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
5441 cfun
->machine
->frame
.wb_candidate2
= regno
;
5442 offset
+= simd_function
? UNITS_PER_VREG
: UNITS_PER_WORD
;
5445 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
5447 cfun
->machine
->frame
.saved_regs_size
= offset
;
5449 HOST_WIDE_INT varargs_and_saved_regs_size
5450 = offset
+ cfun
->machine
->frame
.saved_varargs_size
;
5452 cfun
->machine
->frame
.hard_fp_offset
5453 = aligned_upper_bound (varargs_and_saved_regs_size
5454 + get_frame_size (),
5455 STACK_BOUNDARY
/ BITS_PER_UNIT
);
5457 /* Both these values are already aligned. */
5458 gcc_assert (multiple_p (crtl
->outgoing_args_size
,
5459 STACK_BOUNDARY
/ BITS_PER_UNIT
));
5460 cfun
->machine
->frame
.frame_size
5461 = (cfun
->machine
->frame
.hard_fp_offset
5462 + crtl
->outgoing_args_size
);
5464 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.saved_varargs_size
;
5466 cfun
->machine
->frame
.initial_adjust
= 0;
5467 cfun
->machine
->frame
.final_adjust
= 0;
5468 cfun
->machine
->frame
.callee_adjust
= 0;
5469 cfun
->machine
->frame
.callee_offset
= 0;
5471 HOST_WIDE_INT max_push_offset
= 0;
5472 if (cfun
->machine
->frame
.wb_candidate2
!= INVALID_REGNUM
)
5473 max_push_offset
= 512;
5474 else if (cfun
->machine
->frame
.wb_candidate1
!= INVALID_REGNUM
)
5475 max_push_offset
= 256;
5477 HOST_WIDE_INT const_size
, const_fp_offset
;
5478 if (cfun
->machine
->frame
.frame_size
.is_constant (&const_size
)
5479 && const_size
< max_push_offset
5480 && known_eq (crtl
->outgoing_args_size
, 0))
5482 /* Simple, small frame with no outgoing arguments:
5483 stp reg1, reg2, [sp, -frame_size]!
5484 stp reg3, reg4, [sp, 16] */
5485 cfun
->machine
->frame
.callee_adjust
= const_size
;
5487 else if (known_lt (crtl
->outgoing_args_size
5488 + cfun
->machine
->frame
.saved_regs_size
, 512)
5489 && !(cfun
->calls_alloca
5490 && known_lt (cfun
->machine
->frame
.hard_fp_offset
,
5493 /* Frame with small outgoing arguments:
5494 sub sp, sp, frame_size
5495 stp reg1, reg2, [sp, outgoing_args_size]
5496 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5497 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.frame_size
;
5498 cfun
->machine
->frame
.callee_offset
5499 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.hard_fp_offset
;
5501 else if (cfun
->machine
->frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
5502 && const_fp_offset
< max_push_offset
)
5504 /* Frame with large outgoing arguments but a small local area:
5505 stp reg1, reg2, [sp, -hard_fp_offset]!
5506 stp reg3, reg4, [sp, 16]
5507 sub sp, sp, outgoing_args_size */
5508 cfun
->machine
->frame
.callee_adjust
= const_fp_offset
;
5509 cfun
->machine
->frame
.final_adjust
5510 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
5514 /* Frame with large local area and outgoing arguments using frame pointer:
5515 sub sp, sp, hard_fp_offset
5516 stp x29, x30, [sp, 0]
5518 stp reg3, reg4, [sp, 16]
5519 sub sp, sp, outgoing_args_size */
5520 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
5521 cfun
->machine
->frame
.final_adjust
5522 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.initial_adjust
;
5525 cfun
->machine
->frame
.laid_out
= true;
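
/* A small worked example of the first layout case above (numbers are
   only illustrative): a function that saves x29, x30 and x19, has 16
   bytes of locals and no outgoing arguments gets saved_regs_size == 32
   and hard_fp_offset == frame_size == 48.  Since 48 < 512, the whole
   frame is allocated by the callee push, roughly:

       stp      x29, x30, [sp, -48]!
       str      x19, [sp, 16]  */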
5528 /* Return true if the register REGNO is saved on entry to
5529 the current function. */
5532 aarch64_register_saved_on_entry (int regno
)
5534 return cfun
->machine
->frame
.reg_offset
[regno
] >= 0;
/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  */

static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}
5548 /* Push the register number REGNO of mode MODE to the stack with write-back
5549 adjusting the stack by ADJUSTMENT. */
5552 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
5553 HOST_WIDE_INT adjustment
)
5555 rtx base_rtx
= stack_pointer_rtx
;
5558 reg
= gen_rtx_REG (mode
, regno
);
5559 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
5560 plus_constant (Pmode
, base_rtx
, -adjustment
));
5561 mem
= gen_frame_mem (mode
, mem
);
5563 insn
= emit_move_insn (mem
, reg
);
5564 RTX_FRAME_RELATED_P (insn
) = 1;
5567 /* Generate and return an instruction to store the pair of registers
5568 REG and REG2 of mode MODE to location BASE with write-back adjusting
5569 the stack location BASE by ADJUSTMENT. */
5572 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
5573 HOST_WIDE_INT adjustment
)
5578 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
5579 GEN_INT (-adjustment
),
5580 GEN_INT (UNITS_PER_WORD
- adjustment
));
5582 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
5583 GEN_INT (-adjustment
),
5584 GEN_INT (UNITS_PER_WORD
- adjustment
));
5586 return gen_storewb_pairtf_di (base
, base
, reg
, reg2
,
5587 GEN_INT (-adjustment
),
5588 GEN_INT (UNITS_PER_VREG
- adjustment
));
5594 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5595 stack pointer by ADJUSTMENT. */
5598 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
5601 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno1
);
5603 if (regno2
== INVALID_REGNUM
)
5604 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
5606 rtx reg1
= gen_rtx_REG (mode
, regno1
);
5607 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5609 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
5611 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
5612 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
5613 RTX_FRAME_RELATED_P (insn
) = 1;
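
/* For instance (illustrative only), pushing the pair x29/x30 with an
   adjustment of 32 produces the frame-related store

       stp      x29, x30, [sp, -32]!

   with both registers and the stack adjustment recorded via the
   RTX_FRAME_RELATED markings set up above.  */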
5616 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
5617 adjusting it by ADJUSTMENT afterwards. */
5620 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
5621 HOST_WIDE_INT adjustment
)
5626 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5627 GEN_INT (UNITS_PER_WORD
));
5629 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5630 GEN_INT (UNITS_PER_WORD
));
5632 return gen_loadwb_pairtf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5633 GEN_INT (UNITS_PER_VREG
));
5639 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5640 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5644 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
5647 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno1
);
5648 rtx reg1
= gen_rtx_REG (mode
, regno1
);
5650 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
5652 if (regno2
== INVALID_REGNUM
)
5654 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
5655 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
5656 emit_move_insn (reg1
, gen_frame_mem (mode
, mem
));
5660 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5661 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
5662 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
5667 /* Generate and return a store pair instruction of mode MODE to store
5668 register REG1 to MEM1 and register REG2 to MEM2. */
5671 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
5677 return gen_store_pair_dw_didi (mem1
, reg1
, mem2
, reg2
);
5680 return gen_store_pair_dw_dfdf (mem1
, reg1
, mem2
, reg2
);
5683 return gen_store_pair_dw_tftf (mem1
, reg1
, mem2
, reg2
);
/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */
5694 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
5700 return gen_load_pair_dw_didi (reg1
, mem1
, reg2
, mem2
);
5703 return gen_load_pair_dw_dfdf (reg1
, mem1
, reg2
, mem2
);
5706 return gen_load_pair_dw_tftf (reg1
, mem1
, reg2
, mem2
);
/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */

bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after the frame is laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
     function if its LR is pushed onto the stack.  */
  return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
          || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
              && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
}

/* Return TRUE if the Branch Target Identification Mechanism is enabled.  */
bool
aarch64_bti_enabled (void)
{
  return (aarch64_enable_bti == 1);
}
5736 /* Emit code to save the callee-saved registers from register number START
5737 to LIMIT to the stack at the location starting at offset START_OFFSET,
5738 skipping any write-back candidates if SKIP_WB is true. */
5741 aarch64_save_callee_saves (machine_mode mode
, poly_int64 start_offset
,
5742 unsigned start
, unsigned limit
, bool skip_wb
)
5748 for (regno
= aarch64_next_callee_save (start
, limit
);
5750 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
5757 && (regno
== cfun
->machine
->frame
.wb_candidate1
5758 || regno
== cfun
->machine
->frame
.wb_candidate2
))
5761 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
5764 reg
= gen_rtx_REG (mode
, regno
);
5765 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
5766 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
5769 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
5770 offset_diff
= cfun
->machine
->frame
.reg_offset
[regno2
]
5771 - cfun
->machine
->frame
.reg_offset
[regno
];
5774 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
5775 && known_eq (GET_MODE_SIZE (mode
), offset_diff
))
5777 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5780 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
5781 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
5783 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
5786 /* The first part of a frame-related parallel insn is
5787 always assumed to be relevant to the frame
5788 calculations; subsequent parts, are only
5789 frame-related if explicitly marked. */
5790 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
5794 insn
= emit_move_insn (mem
, reg
);
5796 RTX_FRAME_RELATED_P (insn
) = 1;
5800 /* Emit code to restore the callee registers of mode MODE from register
5801 number START up to and including LIMIT. Restore from the stack offset
5802 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5803 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5806 aarch64_restore_callee_saves (machine_mode mode
,
5807 poly_int64 start_offset
, unsigned start
,
5808 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
5810 rtx base_rtx
= stack_pointer_rtx
;
5815 for (regno
= aarch64_next_callee_save (start
, limit
);
5817 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
5819 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
5826 && (regno
== cfun
->machine
->frame
.wb_candidate1
5827 || regno
== cfun
->machine
->frame
.wb_candidate2
))
5830 reg
= gen_rtx_REG (mode
, regno
);
5831 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
5832 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
5834 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
5835 offset_diff
= cfun
->machine
->frame
.reg_offset
[regno2
]
5836 - cfun
->machine
->frame
.reg_offset
[regno
];
5839 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
5840 && known_eq (GET_MODE_SIZE (mode
), offset_diff
))
5842 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5845 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
5846 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
5847 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
5849 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
5853 emit_move_insn (reg
, mem
);
5854 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
5858 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5862 offset_4bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5864 HOST_WIDE_INT multiple
;
5865 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5866 && IN_RANGE (multiple
, -8, 7));
/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5873 offset_6bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
5875 HOST_WIDE_INT multiple
;
5876 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5877 && IN_RANGE (multiple
, 0, 63));
5880 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5884 aarch64_offset_7bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5886 HOST_WIDE_INT multiple
;
5887 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5888 && IN_RANGE (multiple
, -64, 63));
5891 /* Return true if OFFSET is a signed 9-bit value. */
5894 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED
,
5897 HOST_WIDE_INT const_offset
;
5898 return (offset
.is_constant (&const_offset
)
5899 && IN_RANGE (const_offset
, -256, 255));
5902 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5906 offset_9bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5908 HOST_WIDE_INT multiple
;
5909 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5910 && IN_RANGE (multiple
, -256, 255));
5913 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5917 offset_12bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
5919 HOST_WIDE_INT multiple
;
5920 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5921 && IN_RANGE (multiple
, 0, 4095));
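
/* Some illustrative ranges for the predicates above, taking DImode
   (8-byte) accesses as the example: the signed 7-bit scaled form spans
   -512..504 bytes in steps of 8, the unsigned 12-bit scaled form spans
   0..32760 bytes in steps of 8, and the unscaled signed 9-bit form
   spans -256..255 bytes at any alignment.  */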
/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */

static sbitmap
aarch64_get_separate_components (void)
{
  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* The registers we need saved to the frame.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (aarch64_register_saved_on_entry (regno))
      {
	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
	if (!frame_pointer_needed)
	  offset += cfun->machine->frame.frame_size
		    - cfun->machine->frame.hard_fp_offset;
	/* Check that we can access the stack slot of the register with one
	   direct load with no adjustments needed.  */
	if (offset_12bit_unsigned_scaled_p (DImode, offset))
	  bitmap_set_bit (components, regno);
      }

  /* Don't mess with the hard frame pointer.  */
  if (frame_pointer_needed)
    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);

  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  /* If registers have been chosen to be stored/restored with
     writeback don't interfere with them to avoid having to output explicit
     stack adjustment instructions.  */
  if (reg2 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg2);
  if (reg1 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg1);

  bitmap_clear_bit (components, LR_REGNUM);
  bitmap_clear_bit (components, SP_REGNUM);

  return components;
}
/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */

static sbitmap
aarch64_components_for_bb (basic_block bb)
{
  bitmap in = DF_LIVE_IN (bb);
  bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
  bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
  bool simd_function = aarch64_simd_decl_p (cfun->decl);

  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if ((!call_used_or_fixed_reg_p (regno)
	 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
	&& (bitmap_bit_p (in, regno)
	    || bitmap_bit_p (gen, regno)
	    || bitmap_bit_p (kill, regno)))
      {
	unsigned regno2, offset, offset2;
	bitmap_set_bit (components, regno);

	/* If there is a callee-save at an adjacent offset, add it too
	   to increase the use of LDP/STP.  */
	offset = cfun->machine->frame.reg_offset[regno];
	regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;

	if (regno2 <= LAST_SAVED_REGNUM)
	  {
	    offset2 = cfun->machine->frame.reg_offset[regno2];
	    if ((offset & ~8) == (offset2 & ~8))
	      bitmap_set_bit (components, regno2);
	  }
      }

  return components;
}
/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
   Nothing to do for aarch64.  */

static void
aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
{
}

/* Return the next set bit in BMP from START onwards.  Return the total number
   of bits in BMP if no set bit is found at or after START.  */

static unsigned int
aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
{
  unsigned int nbits = SBITMAP_SIZE (bmp);

  gcc_assert (start < nbits);
  for (unsigned int i = start; i < nbits; i++)
    if (bitmap_bit_p (bmp, i))
      return i;

  return nbits;
}
/* Do the work for aarch64_emit_prologue_components and
   aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
   to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
   for these components or the epilogue sequence.  That is, it determines
   whether we should emit stores or loads and what kind of CFA notes to attach
   to the insns.  Otherwise the logic for the two sequences is very
   similar.  */

static void
aarch64_process_components (sbitmap components, bool prologue_p)
{
  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
			     ? HARD_FRAME_POINTER_REGNUM
			     : STACK_POINTER_REGNUM);

  unsigned last_regno = SBITMAP_SIZE (components);
  unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
  rtx_insn *insn = NULL;

  while (regno != last_regno)
    {
      /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
	 so DFmode for the vector registers is enough.  For simd functions
	 we want to save the low 128 bits.  */
      machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);

      rtx reg = gen_rtx_REG (mode, regno);
      poly_int64 offset = cfun->machine->frame.reg_offset[regno];
      if (!frame_pointer_needed)
	offset += cfun->machine->frame.frame_size
		  - cfun->machine->frame.hard_fp_offset;
      rtx addr = plus_constant (Pmode, ptr_reg, offset);
      rtx mem = gen_frame_mem (mode, addr);

      rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
      unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
      /* No more registers to handle after REGNO.
	 Emit a single save/restore and exit.  */
      if (regno2 == last_regno)
	{
	  insn = emit_insn (set);
	  RTX_FRAME_RELATED_P (insn) = 1;
	  if (prologue_p)
	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	  else
	    add_reg_note (insn, REG_CFA_RESTORE, reg);
	  break;
	}

      poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
      /* The next register is not of the same class or its offset is not
	 mergeable with the current one into a pair.  */
      if (!satisfies_constraint_Ump (mem)
	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
	  || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
		       GET_MODE_SIZE (mode)))
	{
	  insn = emit_insn (set);
	  RTX_FRAME_RELATED_P (insn) = 1;
	  if (prologue_p)
	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	  else
	    add_reg_note (insn, REG_CFA_RESTORE, reg);

	  regno = regno2;
	  continue;
	}

      /* REGNO2 can be saved/restored in a pair with REGNO.  */
      rtx reg2 = gen_rtx_REG (mode, regno2);
      if (!frame_pointer_needed)
	offset2 += cfun->machine->frame.frame_size
		   - cfun->machine->frame.hard_fp_offset;
      rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
      rtx mem2 = gen_frame_mem (mode, addr2);
      rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
			    : gen_rtx_SET (reg2, mem2);

      if (prologue_p)
	insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
      else
	insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

      RTX_FRAME_RELATED_P (insn) = 1;
      if (prologue_p)
	{
	  add_reg_note (insn, REG_CFA_OFFSET, set);
	  add_reg_note (insn, REG_CFA_OFFSET, set2);
	}
      else
	{
	  add_reg_note (insn, REG_CFA_RESTORE, reg);
	  add_reg_note (insn, REG_CFA_RESTORE, reg2);
	}

      regno = aarch64_get_next_set_bit (components, regno2 + 1);
    }
}
/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */

static void
aarch64_emit_prologue_components (sbitmap components)
{
  aarch64_process_components (components, true);
}

/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */

static void
aarch64_emit_epilogue_components (sbitmap components)
{
  aarch64_process_components (components, false);
}

/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */

static void
aarch64_set_handled_components (sbitmap components)
{
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (bitmap_bit_p (components, regno))
      cfun->machine->reg_is_wrapped_separately[regno] = true;
}
/* On AArch64 we have an ABI defined safe buffer.  This constant is used to
   determine the probe offset for alloca.  */

static HOST_WIDE_INT
aarch64_stack_clash_protection_alloca_probe_range (void)
{
  return STACK_CLASH_CALLER_GUARD;
}

/* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
   registers.  If POLY_SIZE is not large enough to require a probe this function
   will only adjust the stack.  When allocating the stack space
   FRAME_RELATED_P is then used to indicate if the allocation is frame related.
   FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
   arguments.  If we are then we ensure that any allocation larger than the ABI
   defined buffer needs a probe so that the invariant of having a 1KB buffer is
   maintained.

   We emit barriers after each stack adjustment to prevent optimizations from
   breaking the invariant that we never drop the stack more than a page.  This
   invariant is needed to make it easier to correctly handle asynchronous
   events, e.g. if we were to allow the stack to be dropped by more than a page
   and then have multiple probes up and we take a signal somewhere in between
   then the signal handler doesn't know the state of the stack and can make no
   assumptions about which pages have been probed.  */
static void
aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
					poly_int64 poly_size,
					bool frame_related_p,
					bool final_adjustment_p)
{
  HOST_WIDE_INT guard_size
    = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
  HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
  /* When doing the final adjustment for the outgoing argument size we can't
     assume that LR was saved at position 0.  So subtract its offset from the
     ABI safe buffer so that we don't accidentally allow an adjustment that
     would result in an allocation larger than the ABI buffer without
     probing.  */
  HOST_WIDE_INT min_probe_threshold
    = final_adjustment_p
      ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
      : guard_size - guard_used_by_caller;

  poly_int64 frame_size = cfun->machine->frame.frame_size;

  /* We should always have a positive probe threshold.  */
  gcc_assert (min_probe_threshold > 0);

  if (flag_stack_clash_protection && !final_adjustment_p)
    {
      poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
      poly_int64 final_adjust = cfun->machine->frame.final_adjust;

      if (known_eq (frame_size, 0))
	dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
      else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
	       && known_lt (final_adjust, guard_used_by_caller))
	dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
    }

  /* If SIZE is not large enough to require probing, just adjust the stack and
     exit.  */
  if (known_lt (poly_size, min_probe_threshold)
      || !flag_stack_clash_protection)
    {
      aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
      return;
    }

  HOST_WIDE_INT size;
  /* Handle the SVE non-constant case first.  */
  if (!poly_size.is_constant (&size))
    {
      if (dump_file)
	{
	  fprintf (dump_file, "Stack clash SVE prologue: ");
	  print_dec (poly_size, dump_file);
	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
	}

      /* First calculate the amount of bytes we're actually spilling.  */
      aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
			  poly_size, temp1, temp2, false, true);

      rtx_insn *insn = get_last_insn ();

      if (frame_related_p)
	{
	  /* This is done to provide unwinding information for the stack
	     adjustments we're about to do, however to prevent the optimizers
	     from removing the R11 move and leaving the CFA note (which would be
	     very wrong) we tie the old and new stack pointer together.
	     The tie will expand to nothing but the optimizers will not touch
	     the instruction.  */
	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));

	  /* We want the CFA independent of the stack pointer for the
	     duration of the loop.  */
	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
      rtx guard_const = gen_int_mode (guard_size, Pmode);

      insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
						   stack_pointer_rtx, temp1,
						   probe_const, guard_const));

      /* Now reset the CFA register if needed.  */
      if (frame_related_p)
	{
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
				      gen_int_mode (poly_size, Pmode)));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
	     " bytes, probing will be required.\n", size);

  /* Round size to the nearest multiple of guard_size, and calculate the
     residual as the difference between the original size and the rounded
     size.  */
  HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
  HOST_WIDE_INT residual = size - rounded_size;

  /* We can handle a small number of allocations/probes inline.  Otherwise
     punt to a loop.  */
  if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
    {
      for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
	{
	  aarch64_sub_sp (NULL, temp2, guard_size, true);
	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
					   guard_used_by_caller));
	  emit_insn (gen_blockage ());
	}
      dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
    }
  else
    {
      /* Compute the ending address.  */
      aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
			  temp1, NULL, false, true);
      rtx_insn *insn = get_last_insn ();

      /* For the initial allocation, we don't have a frame pointer
	 set up, so we always need CFI notes.  If we're doing the
	 final allocation, then we may have a frame pointer, in which
	 case it is the CFA, otherwise we need CFI notes.

	 We can determine which allocation we are doing by looking at
	 the value of FRAME_RELATED_P since the final allocations are not
	 frame related.  */
      if (frame_related_p)
	{
	  /* We want the CFA independent of the stack pointer for the
	     duration of the loop.  */
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			plus_constant (Pmode, temp1, rounded_size));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      /* This allocates and probes the stack.  Note that this re-uses some of
	 the existing Ada stack protection code.  However we are guaranteed not
	 to enter the non loop or residual branches of that code.

	 The non-loop part won't be entered because if our allocation amount
	 doesn't require a loop, the case above would handle it.

	 The residual amount won't be entered because TEMP1 is a multiple of
	 the allocation size.  The residual will always be 0.  As such, the only
	 part we are actually using from that code is the loop setup.  The
	 actual probing is done in aarch64_output_probe_stack_range.  */
      insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
					       stack_pointer_rtx, temp1));

      /* Now reset the CFA register if needed.  */
      if (frame_related_p)
	{
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			plus_constant (Pmode, stack_pointer_rtx, rounded_size));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      emit_insn (gen_blockage ());
      dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
    }

  /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
     be probed.  This maintains the requirement that each page is probed at
     least once.  For initial probing we probe only if the allocation is
     more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
     if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
     GUARD_SIZE.  This ensures that for any allocation that is large enough to
     trigger a probe here, we'll have at least one, and if they're not large
     enough for this code to emit anything for them, the page would have been
     probed by the saving of FP/LR either by this function or any callees.  If
     we don't have any callees then we won't have more stack adjustments and so
     are still safe.  */
  if (residual)
    {
      HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
      /* If we're doing final adjustments, and we've done any full page
	 allocations then any residual needs to be probed.  */
      if (final_adjustment_p && rounded_size != 0)
	min_probe_threshold = 0;
      /* If doing a small final adjustment, we always probe at offset 0.
	 This is done to avoid issues when LR is not at position 0 or when
	 the final adjustment is smaller than the probing offset.  */
      else if (final_adjustment_p && rounded_size == 0)
	residual_probe_offset = 0;

      aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
      if (residual >= min_probe_threshold)
	{
	  if (dump_file)
	    fprintf (dump_file,
		     "Stack clash AArch64 prologue residuals: "
		     HOST_WIDE_INT_PRINT_DEC
		     " bytes, probing will be required.\n", residual);

	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
					   residual_probe_offset));
	  emit_insn (gen_blockage ());
	}
    }
}
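
/* Illustrative sketch, not part of GCC: the probe-threshold arithmetic used
   above, evaluated for the default parameters (64KB guard, 1KB
   STACK_CLASH_CALLER_GUARD).  The function name and the lr_offset parameter
   are hypothetical and exist only for this example.  */
#if 0
static long
example_min_probe_threshold (int final_adjustment_p, long lr_offset)
{
  const long guard_size = 64 * 1024;		/* assumed default guard */
  const long guard_used_by_caller = 1024;	/* ABI-defined caller buffer */

  /* Initial allocation: probe once the adjustment exceeds 64KB - 1KB.
     Final (outgoing argument) allocation: probe once it exceeds the caller
     buffer minus wherever LR was saved, e.g. 1024 when LR is at offset 0.  */
  return final_adjustment_p
	 ? guard_used_by_caller - lr_offset
	 : guard_size - guard_used_by_caller;	/* 64512 */
}
#endif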
/* Return 1 if the register is used by the epilogue.  We need to say the
   return register is used, but only after epilogue generation is complete.
   Note that in the case of sibcalls, the values "used by the epilogue" are
   considered live at the start of the called function.

   For SIMD functions we need to return 1 for FP registers that are saved and
   restored by a function but are not zero in call_used_regs.  If we do not do
   this, optimizations may remove the restore of the register.  */

int
aarch64_epilogue_uses (int regno)
{
  if (epilogue_completed)
    {
      if (regno == LR_REGNUM)
	return 1;
      if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
	return 1;
    }
  return 0;
}

/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
   is saved at BASE + OFFSET.  */

static void
aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
			    rtx base, poly_int64 offset)
{
  rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
  add_reg_note (insn, REG_CFA_EXPRESSION,
		gen_rtx_SET (mem, regno_reg_rtx[reg]));
}
/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|                               |
	|  incoming stack arguments     |
	|                               |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	|                               |
	+-------------------------------+
	|  local variables              | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+
	|  padding                      | \
	+-------------------------------+  |
	|  callee-saved registers       |  | frame.saved_regs_size
	+-------------------------------+  |
	|  LR'                          |  |
	+-------------------------------+  |
	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  padding                      |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	|                               |
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx
   unchanged.

   By default for stack-clash we assume the guard is at least 64KB, but this
   value is configurable to either 4KB or 64KB.  We also force the guard size to
   be the same as the probing interval and both values are kept in sync.

   With those assumptions the callee can allocate up to 63KB (or 3KB depending
   on the guard size) of stack space without probing.

   When probing is needed, we emit a probe at the start of the prologue
   and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.

   We have to track how much space has been allocated and the only stores
   to the stack we track as implicit probes are the FP/LR stores.

   For outgoing arguments we probe if the size is larger than 1KB, such that
   the ABI specified buffer is maintained for the next callee.

   The following registers are reserved during frame layout and should not be
   used for any other purpose:

   - r11: Used by stack clash protection when SVE is enabled.
   - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
   - r14 and r15: Used for speculation tracking.
   - r16(IP0), r17(IP1): Used by indirect tailcalls.
   - r30(LR), r29(FP): Used by standard frame layout.

   These registers must be avoided in frame layout related code unless the
   explicit intention is to interact with one of the features listed above.  */
/* Generate the prologue instructions for entry into a function.
   Establish the stack frame by decreasing the stack pointer with a
   properly calculated size and, if necessary, create a frame record
   filled with the values of LR and previous frame pointer.  The
   current FP is also set up if it is in use.  */

void
aarch64_expand_prologue (void)
{
  poly_int64 frame_size = cfun->machine->frame.frame_size;
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
  rtx_insn *insn;

  /* Sign return address for functions.  */
  if (aarch64_return_address_signing_enabled ())
    {
      switch (aarch64_ra_sign_key)
	{
	case AARCH64_KEY_A:
	  insn = emit_insn (gen_paciasp ());
	  break;
	case AARCH64_KEY_B:
	  insn = emit_insn (gen_pacibsp ());
	  break;
	default:
	  gcc_unreachable ();
	}
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  if (flag_stack_usage_info)
    current_function_static_stack_size = constant_lower_bound (frame_size);

  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
    {
      if (crtl->is_leaf && !cfun->calls_alloca)
	{
	  if (maybe_gt (frame_size, PROBE_INTERVAL)
	      && maybe_gt (frame_size, get_stack_check_protect ()))
	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
					    (frame_size
					     - get_stack_check_protect ()));
	}
      else if (maybe_gt (frame_size, 0))
	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
    }

  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);

  /* In theory we should never have both an initial adjustment
     and a callee save adjustment.  Verify that is the case since the
     code below does not handle it for -fstack-clash-protection.  */
  gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);

  /* Will only probe if the initial adjustment is larger than the guard
     less the amount of the guard reserved for use by the caller's
     outgoing args.  */
  aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
					  true, false);

  if (callee_adjust != 0)
    aarch64_push_regs (reg1, reg2, callee_adjust);

  if (emit_frame_chain)
    {
      poly_int64 reg_offset = callee_adjust;
      if (callee_adjust == 0)
	{
	  reg1 = R29_REGNUM;
	  reg2 = R30_REGNUM;
	  reg_offset = callee_offset;
	  aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
	}
      aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
			  stack_pointer_rtx, callee_offset,
			  tmp1_rtx, tmp0_rtx, frame_pointer_needed);
      if (frame_pointer_needed && !frame_size.is_constant ())
	{
	  /* Variable-sized frames need to describe the save slot
	     address using DW_CFA_expression rather than DW_CFA_offset.
	     This means that, without taking further action, the
	     locations of the registers that we've already saved would
	     remain based on the stack pointer even after we redefine
	     the CFA based on the frame pointer.  We therefore need new
	     DW_CFA_expressions to re-express the save slots with addresses
	     based on the frame pointer.  */
	  rtx_insn *insn = get_last_insn ();
	  gcc_assert (RTX_FRAME_RELATED_P (insn));

	  /* Add an explicit CFA definition if this was previously
	     implicit.  */
	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
	    {
	      rtx src = plus_constant (Pmode, stack_pointer_rtx,
				       callee_offset);
	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
			    gen_rtx_SET (hard_frame_pointer_rtx, src));
	    }

	  /* Change the save slot expressions for the registers that
	     we've already saved.  */
	  reg_offset -= callee_offset;
	  aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
				      reg_offset + UNITS_PER_WORD);
	  aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
				      reg_offset);
	}
      emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
    }

  aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
			     callee_adjust != 0 || emit_frame_chain);
  if (aarch64_simd_decl_p (cfun->decl))
    aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
			       callee_adjust != 0 || emit_frame_chain);
  else
    aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
			       callee_adjust != 0 || emit_frame_chain);

  /* We may need to probe the final adjustment if it is larger than the guard
     that is assumed by the callee.  */
  aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
					  !frame_pointer_needed, true);
}
/* Return TRUE if we can use a simple_return insn.

   This function checks whether the callee saved stack is empty, which
   means no restore actions are needed.  The pro_and_epilogue will use
   this to check whether shrink-wrapping opt is feasible.  */

bool
aarch64_use_return_insn_p (void)
{
  if (!reload_completed)
    return false;

  return known_eq (cfun->machine->frame.frame_size, 0);
}

/* Return false for non-leaf SIMD functions in order to avoid
   shrink-wrapping them.  Doing this will lose the necessary
   save/restore of FP registers.  */

bool
aarch64_use_simple_return_insn_p (void)
{
  if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
    return false;

  return true;
}
/* Generate the epilogue instructions for returning from a function.
   This is almost exactly the reverse of the prologue sequence, except
   that we need to insert barriers to avoid scheduling loads that read
   from a deallocated stack, and we optimize the unwind records by
   emitting them all together if possible.  */
void
aarch64_expand_epilogue (bool for_sibcall)
{
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  rtx cfi_ops = NULL;
  rtx_insn *insn;

  /* A stack clash protection prologue may not have left EP0_REGNUM or
     EP1_REGNUM in a usable state.  The same is true for allocations
     with an SVE component, since we then need both temporary registers
     for each allocation.  For stack clash we are in a usable state if
     the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
  HOST_WIDE_INT guard_size
    = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
  HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;

  /* We can re-use the registers when the allocation amount is smaller than
     guard_size - guard_used_by_caller because we won't be doing any probes
     then.  In such situations the register should remain live with the correct
     value.  */
  bool can_inherit_p = (initial_adjust.is_constant ()
			&& final_adjust.is_constant ())
			&& (!flag_stack_clash_protection
			    || known_lt (initial_adjust,
					 guard_size - guard_used_by_caller));

  /* We need to add a memory barrier to prevent reads from the deallocated
     stack.  */
  bool need_barrier_p
    = maybe_ne (get_frame_size ()
		+ cfun->machine->frame.saved_varargs_size, 0);

  /* Emit a barrier to prevent loads from a deallocated stack.  */
  if (maybe_gt (final_adjust, crtl->outgoing_args_size)
      || cfun->calls_alloca
      || crtl->calls_eh_return)
    {
      emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
      need_barrier_p = false;
    }

  /* Restore the stack pointer from the frame pointer if it may not
     be the same as the stack pointer.  */
  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
  if (frame_pointer_needed
      && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
    /* If writeback is used when restoring callee-saves, the CFA
       is restored on the instruction doing the writeback.  */
    aarch64_add_offset (Pmode, stack_pointer_rtx,
			hard_frame_pointer_rtx, -callee_offset,
			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
  else
    /* The case where we need to re-use the register here is very rare, so
       avoid the complicated condition and just always emit a move if the
       immediate doesn't fit.  */
    aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);

  aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
				callee_adjust != 0, &cfi_ops);
  if (aarch64_simd_decl_p (cfun->decl))
    aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
				  callee_adjust != 0, &cfi_ops);
  else
    aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
				  callee_adjust != 0, &cfi_ops);

  if (need_barrier_p)
    emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));

  if (callee_adjust != 0)
    aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);

  if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
    {
      /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
      insn = get_last_insn ();
      rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
      REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
      RTX_FRAME_RELATED_P (insn) = 1;
      cfi_ops = NULL;
    }

  /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
     add restriction on emit_move optimization to leaf functions.  */
  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
		  (!can_inherit_p || !crtl->is_leaf
		   || df_regs_ever_live_p (EP0_REGNUM)));

  if (cfi_ops)
    {
      /* Emit delayed restores and reset the CFA to be SP.  */
      insn = get_last_insn ();
      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
      REG_NOTES (insn) = cfi_ops;
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* We prefer to emit the combined return/authenticate instruction RETAA,
     however there are three cases in which we must instead emit an explicit
     authentication instruction.

	1) Sibcalls don't return in a normal way, so if we're about to call one
	   we must authenticate.

	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
	   generating code for !TARGET_ARMV8_3 we can't use it and must
	   explicitly authenticate.

	3) On an eh_return path we make extra stack adjustments to update the
	   canonical frame address to be the exception handler's CFA.  We want
	   to authenticate using the CFA of the function which calls eh_return.
	  */
  if (aarch64_return_address_signing_enabled ()
      && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
    {
      switch (aarch64_ra_sign_key)
	{
	case AARCH64_KEY_A:
	  insn = emit_insn (gen_autiasp ());
	  break;
	case AARCH64_KEY_B:
	  insn = emit_insn (gen_autibsp ());
	  break;
	default:
	  gcc_unreachable ();
	}
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Stack adjustment for exception handler.  */
  if (crtl->calls_eh_return && !for_sibcall)
    {
      /* We need to unwind the stack by the offset computed by
	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
	 to be SP; letting the CFA move during this adjustment
	 is just as correct as retaining the CFA from the body
	 of the function.  Therefore, do nothing special.  */
      emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
    }

  emit_use (gen_rtx_REG (DImode, LR_REGNUM));
  if (!for_sibcall)
    emit_jump_insn (ret_rtx);
}
/* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
   normally or return to a previous frame after unwinding.

   An EH return uses a single shared return sequence.  The epilogue is
   exactly like a normal epilogue except that it has an extra input
   register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
   that must be applied after the frame has been destroyed.  An extra label
   is inserted before the epilogue which initializes this register to zero,
   and this is the entry point for a normal return.

   An actual EH return updates the return address, initializes the stack
   adjustment and jumps directly into the epilogue (bypassing the zeroing
   of the adjustment).  Since the return address is typically saved on the
   stack when a function makes a call, the saved LR must be updated outside
   the function.

   This poses problems as the store is generated well before the epilogue,
   so the offset of LR is not known yet.  Also optimizations will remove the
   store as it appears dead, even after the epilogue is generated (as the
   base or offset for loading LR is different in many cases).

   To avoid these problems this implementation forces the frame pointer
   in eh_return functions so that the location of LR is fixed and known early.
   It also marks the store volatile, so no optimization is permitted to
   remove the store.  */
rtx
aarch64_eh_return_handler_rtx (void)
{
  rtx tmp = gen_frame_mem (Pmode,
    plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));

  /* Mark the store volatile, so no optimization is permitted to remove it.  */
  MEM_VOLATILE_P (tmp) = true;
  return tmp;
}
/* Output code to add DELTA to the first argument, and then jump
   to FUNCTION.  Used for C++ multiple inheritance.  */
static void
aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
			 HOST_WIDE_INT delta,
			 HOST_WIDE_INT vcall_offset,
			 tree function)
{
  /* The this pointer is always in x0.  Note that this differs from
     Arm where the this pointer maybe bumped to r1 if r0 is required
     to return a pointer to an aggregate.  On AArch64 a result value
     pointer will be in x8.  */
  int this_regno = R0_REGNUM;
  rtx this_rtx, temp0, temp1, addr, funexp;
  rtx_insn *insn;
  const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));

  if (aarch64_bti_enabled ())
    emit_insn (gen_bti_c());

  reload_completed = 1;
  emit_note (NOTE_INSN_PROLOGUE_END);

  this_rtx = gen_rtx_REG (Pmode, this_regno);
  temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
  temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);

  if (vcall_offset == 0)
    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
  else
    {
      gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);

      addr = this_rtx;
      if (delta != 0)
	{
	  if (delta >= -256 && delta < 256)
	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
				       plus_constant (Pmode, this_rtx, delta));
	  else
	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
				temp1, temp0, false);
	}

      if (Pmode == ptr_mode)
	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
      else
	aarch64_emit_move (temp0,
			   gen_rtx_ZERO_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
	addr = plus_constant (Pmode, temp0, vcall_offset);
      else
	{
	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
					  Pmode);
	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
	}

      if (Pmode == ptr_mode)
	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
      else
	aarch64_emit_move (temp1,
			   gen_rtx_SIGN_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      emit_insn (gen_add2_insn (this_rtx, temp1));
    }

  /* Generate a tail call to the target function.  */
  if (!TREE_USED (function))
    {
      assemble_external (function);
      TREE_USED (function) = 1;
    }
  funexp = XEXP (DECL_RTL (function), 0);
  funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
  insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
  SIBLING_CALL_P (insn) = 1;

  insn = get_insns ();
  shorten_branches (insn);

  assemble_start_function (thunk, fnname);
  final_start_function (insn, file, 1);
  final (insn, file, 1);
  final_end_function ();
  assemble_end_function (thunk, fnname);

  /* Stop pretending to be a post-reload pass.  */
  reload_completed = 0;
}
static bool
aarch64_tls_referenced_p (rtx x)
{
  if (!TARGET_HAVE_TLS)
    return false;
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    {
      const_rtx x = *iter;
      if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
	return true;
      /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
	 TLS offsets, not real symbol references.  */
      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
	iter.skip_subrtxes ();
    }
  return false;
}
/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
bool
aarch64_uimm12_shift (HOST_WIDE_INT val)
{
  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
}

/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
   that can be created with a left shift of 0 or 12.  */
static HOST_WIDE_INT
aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
{
  /* Check to see if the value fits in 24 bits, as that is the maximum we can
     handle correctly.  */
  gcc_assert ((val & 0xffffff) == val);

  if (((val & 0xfff) << 0) == val)
    return val;

  return val & (0xfff << 12);
}

/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction.  */
static bool
aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
	return 1;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
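
/* Illustrative sketch, not part of GCC: the check above accepts any value
   that is a single 16-bit chunk placed at bit 0, 16, 32 or 48, i.e. anything
   a single MOVZ can materialise.  The function below is hypothetical and
   only restates the 64-bit branch of the test with plain C types.  */
#if 0
static int
example_movw_imm (void)
{
  unsigned long long ok  = 0xabcdULL << 16;	/* fits one MOVZ */
  unsigned long long bad = (1ULL << 32) | ok;	/* spans two 16-bit fields */

  /* ok matches the field at bit 16; bad matches none of the fields.  */
  return ((ok & (0xffffULL << 16)) == ok)
	 && ((bad & (0xffffULL << 16)) != bad)
	 && ((bad & (0xffffULL << 32)) != bad);	/* returns 1 */
}
#endif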
/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
   64-bit (DImode) integer.  */

static unsigned HOST_WIDE_INT
aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
{
  unsigned int size = GET_MODE_UNIT_PRECISION (mode);
  while (size < 64)
    {
      val &= (HOST_WIDE_INT_1U << size) - 1;
      val |= val << size;
      size *= 2;
    }
  return val;
}

/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,
    0x0001000100010001ull,
    0x0101010101010101ull,
    0x1111111111111111ull,
    0x5555555555555555ull,
  };


/* Return true if val is a valid bitmask immediate.  */

bool
aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
{
  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
  int bits;

  /* Check for a single sequence of one bits and return quickly if so.
     The special cases of all ones and all zeroes returns false.  */
  val = aarch64_replicate_bitmask_imm (val_in, mode);
  tmp = val + (val & -val);

  if (tmp == (tmp & -tmp))
    return (val + 1) > 1;

  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
  if (mode == SImode)
    val = (val << 32) | (val & 0xffffffff);

  /* Invert if the immediate doesn't start with a zero bit - this means we
     only need to search for sequences of one bits.  */
  if (val & 1)
    val = ~val;

  /* Find the first set bit and set tmp to val with the first sequence of one
     bits removed.  Return success if there is a single sequence of ones.  */
  first_one = val & -val;
  tmp = val & (val + first_one);
  if (tmp == 0)
    return true;

  /* Find the next set bit and compute the difference in bit position.  */
  next_one = tmp & -tmp;
  bits = clz_hwi (first_one) - clz_hwi (next_one);
  mask = val ^ tmp;

  /* Check the bit position difference is a power of 2, and that the first
     sequence of one bits fits within 'bits' bits.  */
  if ((mask >> bits) != 0 || bits != (bits & -bits))
    return false;

  /* Check the sequence of one bits is repeated 64/bits times.  */
  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
}
/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
   Assumed precondition: VAL_IN is not zero.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
{
  int lowest_bit_set = ctz_hwi (val_in);
  int highest_bit_set = floor_log2 (val_in);
  gcc_assert (val_in != 0);

  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
	  (HOST_WIDE_INT_1U << lowest_bit_set));
}

/* Create constant where bits outside of lowest bit set to highest bit set
   are set to 1.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
{
  return val_in | ~aarch64_and_split_imm1 (val_in);
}

/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */

bool
aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_bitmask_imm (val_in, int_mode))
    return false;

  if (aarch64_move_imm (val_in, int_mode))
    return false;

  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);

  return aarch64_bitmask_imm (imm2, int_mode);
}
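
/* Illustrative sketch, not part of GCC: splitting an AND mask.  imm1 is the
   contiguous run from the lowest to the highest set bit of the value, imm2
   re-adds the holes plus every bit outside that run, so imm1 & imm2 equals
   the original value while each half may itself be a legal bitmask
   immediate.  The value below is a made-up example.  */
#if 0
static int
example_and_split (void)
{
  unsigned long long val  = 0x000ff0000000ff00ULL;	 /* two runs of ones */
  unsigned long long imm1 = (2ULL << 51) - (1ULL << 8); /* bits 8..51 set */
  unsigned long long imm2 = val | ~imm1;
  return (imm1 & imm2) == val;				 /* always 1 */
}
#endif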
/* Return true if val is an immediate that can be loaded into a
   register in a single instruction.  */
bool
aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
    return 1;
  return aarch64_bitmask_imm (val, int_mode);
}
static bool
aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
  rtx base, offset;

  if (GET_CODE (x) == HIGH)
    return true;

  /* There's no way to calculate VL-based values using relocations.  */
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    if (GET_CODE (*iter) == CONST_POLY_INT)
      return true;

  split_const (x, &base, &offset);
  if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
    {
      if (aarch64_classify_symbol (base, INTVAL (offset))
	  != SYMBOL_FORCE_TO_MEM)
	return true;
      else
	/* Avoid generating a 64-bit relocation in ILP32; leave
	   to aarch64_expand_mov_immediate to handle it properly.  */
	return mode != ptr_mode;
    }

  return aarch64_tls_referenced_p (x);
}
/* Implement TARGET_CASE_VALUES_THRESHOLD.
   The expansion for a table switch is quite expensive due to the number
   of instructions, the table lookup and hard to predict indirect jump.
   When optimizing for speed, and -O3 enabled, use the per-core tuning if
   set, otherwise use tables for > 16 cases as a tradeoff between size and
   performance.  When optimizing for size, use the default setting.  */

static unsigned int
aarch64_case_values_threshold (void)
{
  /* Use the specified limit for the number of cases before using jump
     tables at higher optimization levels.  */
  if (optimize > 2
      && selected_cpu->tune->max_case_values != 0)
    return selected_cpu->tune->max_case_values;
  else
    return optimize_size ? default_case_values_threshold () : 17;
}
/* Return true if register REGNO is a valid index register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_index_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }
  return GP_REGNUM_P (regno);
}

/* Return true if register REGNO is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_base_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  /* The fake registers will be eliminated to either the stack or
     hard frame pointer, both of which are usually valid base registers.
     Reload deals with the cases where the eliminated form isn't valid.  */
  return (GP_REGNUM_P (regno)
	  || regno == SP_REGNUM
	  || regno == FRAME_POINTER_REGNUM
	  || regno == ARG_POINTER_REGNUM);
}

/* Return true if X is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_base_register_rtx_p (rtx x, bool strict_p)
{
  if (!strict_p
      && GET_CODE (x) == SUBREG
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
    x = SUBREG_REG (x);

  return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
}
/* Return true if address offset is a valid index.  If it is, fill in INFO
   appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_classify_index (struct aarch64_address_info *info, rtx x,
			machine_mode mode, bool strict_p)
{
  enum aarch64_address_type type;
  rtx index;
  int shift;

  /* (reg:P) */
  if ((REG_P (x) || GET_CODE (x) == SUBREG)
      && GET_MODE (x) == Pmode)
    {
      type = ADDRESS_REG_REG;
      index = x;
      shift = 0;
    }
  /* (sign_extend:DI (reg:SI)) */
  else if ((GET_CODE (x) == SIGN_EXTEND
	    || GET_CODE (x) == ZERO_EXTEND)
	   && GET_MODE (x) == DImode
	   && GET_MODE (XEXP (x, 0)) == SImode)
    {
      type = (GET_CODE (x) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (x, 0);
      shift = 0;
    }
  /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
  else if (GET_CODE (x) == MULT
	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
	   && GET_MODE (XEXP (x, 0)) == DImode
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
	   && GET_MODE (XEXP (x, 0)) == DImode
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (x, 1));
    }
  /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
	    || GET_CODE (x) == ZERO_EXTRACT)
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == MULT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      if (INTVAL (XEXP (x, 1)) != 32 + shift
	  || INTVAL (XEXP (x, 2)) != 0)
	shift = -1;
    }
  /* (and:DI (mult:DI (reg:DI) (const_int scale))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == MULT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
	shift = -1;
    }
  /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
	    || GET_CODE (x) == ZERO_EXTRACT)
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == ASHIFT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != 32 + shift
	  || INTVAL (XEXP (x, 2)) != 0)
	shift = -1;
    }
  /* (and:DI (ashift:DI (reg:DI) (const_int shift))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == ASHIFT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
	shift = -1;
    }
  /* (mult:P (reg:P) (const_int scale)) */
  else if (GET_CODE (x) == MULT
	   && GET_MODE (x) == Pmode
	   && GET_MODE (XEXP (x, 0)) == Pmode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:P (reg:P) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
	   && GET_MODE (x) == Pmode
	   && GET_MODE (XEXP (x, 0)) == Pmode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = INTVAL (XEXP (x, 1));
    }
  else
    return false;

  if (!strict_p
      && GET_CODE (index) == SUBREG
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
    index = SUBREG_REG (index);

  if (aarch64_sve_data_mode_p (mode))
    {
      if (type != ADDRESS_REG_REG
	  || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
	return false;
    }
  else
    {
      if (shift != 0
	  && !(IN_RANGE (shift, 1, 3)
	       && known_eq (1 << shift, GET_MODE_SIZE (mode))))
	return false;
    }

  if (REG_P (index)
      && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
    {
      info->type = type;
      info->offset = index;
      info->shift = shift;
      return true;
    }

  return false;
}
/* Return true if MODE is one of the modes for which we
   support LDP/STP operations.  */

static bool
aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
{
  return mode == SImode || mode == DImode
	 || mode == SFmode || mode == DFmode
	 || (aarch64_vector_mode_supported_p (mode)
	     && (known_eq (GET_MODE_SIZE (mode), 8)
		 || (known_eq (GET_MODE_SIZE (mode), 16)
		     && (aarch64_tune_params.extra_tuning_flags
			 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
}
/* Return true if REGNO is a virtual pointer register, or an eliminable
   "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
   include stack_pointer or hard_frame_pointer.  */
static bool
virt_or_elim_regno_p (unsigned regno)
{
  return ((regno >= FIRST_VIRTUAL_REGISTER
	   && regno <= LAST_VIRTUAL_POINTER_REGISTER)
	  || regno == FRAME_POINTER_REGNUM
	  || regno == ARG_POINTER_REGNUM);
}
7452 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7453 If it is, fill in INFO appropriately. STRICT_P is true if
7454 REG_OK_STRICT is in effect. */
7457 aarch64_classify_address (struct aarch64_address_info
*info
,
7458 rtx x
, machine_mode mode
, bool strict_p
,
7459 aarch64_addr_query_type type
)
7461 enum rtx_code code
= GET_CODE (x
);
7465 HOST_WIDE_INT const_size
;
7467 /* On BE, we use load/store pair for all large int mode load/stores.
7468 TI/TFmode may also use a load/store pair. */
7469 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
7470 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
7471 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
7472 || type
== ADDR_QUERY_LDP_STP_N
7475 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
7477 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
7478 corresponds to the actual size of the memory being loaded/stored and the
7479 mode of the corresponding addressing mode is half of that. */
7480 if (type
== ADDR_QUERY_LDP_STP_N
7481 && known_eq (GET_MODE_SIZE (mode
), 16))
7484 bool allow_reg_index_p
= (!load_store_pair_p
7485 && (known_lt (GET_MODE_SIZE (mode
), 16)
7486 || vec_flags
== VEC_ADVSIMD
7487 || vec_flags
& VEC_SVE_DATA
));
7489 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7490 [Rn, #offset, MUL VL]. */
7491 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
7492 && (code
!= REG
&& code
!= PLUS
))
7495 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7497 if (advsimd_struct_p
7498 && !BYTES_BIG_ENDIAN
7499 && (code
!= POST_INC
&& code
!= REG
))
7502 gcc_checking_assert (GET_MODE (x
) == VOIDmode
7503 || SCALAR_INT_MODE_P (GET_MODE (x
)));
7509 info
->type
= ADDRESS_REG_IMM
;
7511 info
->offset
= const0_rtx
;
7512 info
->const_offset
= 0;
7513 return aarch64_base_register_rtx_p (x
, strict_p
);
7521 && virt_or_elim_regno_p (REGNO (op0
))
7522 && poly_int_rtx_p (op1
, &offset
))
7524 info
->type
= ADDRESS_REG_IMM
;
7527 info
->const_offset
= offset
;
7532 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
7533 && aarch64_base_register_rtx_p (op0
, strict_p
)
7534 && poly_int_rtx_p (op1
, &offset
))
7536 info
->type
= ADDRESS_REG_IMM
;
7539 info
->const_offset
= offset
;
7541 /* TImode and TFmode values are allowed in both pairs of X
7542 registers and individual Q registers. The available
7544 X,X: 7-bit signed scaled offset
7545 Q: 9-bit signed offset
7546 We conservatively require an offset representable in either mode.
7547 When performing the check for pairs of X registers i.e. LDP/STP
7548 pass down DImode since that is the natural size of the LDP/STP
7549 instruction memory accesses. */
7550 if (mode
== TImode
|| mode
== TFmode
)
7551 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
7552 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
7553 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
7555 /* A 7bit offset check because OImode will emit a ldp/stp
7556 instruction (only big endian will get here).
7557 For ldp/stp instructions, the offset is scaled for the size of a
7558 single element of the pair. */
7560 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
7562 /* Three 9/12 bit offsets checks because CImode will emit three
7563 ldr/str instructions (only big endian will get here). */
7565 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
7566 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode
,
7568 || offset_12bit_unsigned_scaled_p (V16QImode
,
7571 /* Two 7bit offsets checks because XImode will emit two ldp/stp
7572 instructions (only big endian will get here). */
7574 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
7575 && aarch64_offset_7bit_signed_scaled_p (TImode
,
7578 /* Make "m" use the LD1 offset range for SVE data modes, so
7579 that pre-RTL optimizers like ivopts will work to that
7580 instead of the wider LDR/STR range. */
7581 if (vec_flags
== VEC_SVE_DATA
)
7582 return (type
== ADDR_QUERY_M
7583 ? offset_4bit_signed_scaled_p (mode
, offset
)
7584 : offset_9bit_signed_scaled_p (mode
, offset
));
7586 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
7588 poly_int64 end_offset
= (offset
7589 + GET_MODE_SIZE (mode
)
7590 - BYTES_PER_SVE_VECTOR
);
7591 return (type
== ADDR_QUERY_M
7592 ? offset_4bit_signed_scaled_p (mode
, offset
)
7593 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
7594 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
7598 if (vec_flags
== VEC_SVE_PRED
)
7599 return offset_9bit_signed_scaled_p (mode
, offset
);
7601 if (load_store_pair_p
)
7602 return ((known_eq (GET_MODE_SIZE (mode
), 4)
7603 || known_eq (GET_MODE_SIZE (mode
), 8)
7604 || known_eq (GET_MODE_SIZE (mode
), 16))
7605 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
7607 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
7608 || offset_12bit_unsigned_scaled_p (mode
, offset
));
7611 if (allow_reg_index_p
)
7613 /* Look for base + (scaled/extended) index register. */
7614 if (aarch64_base_register_rtx_p (op0
, strict_p
)
7615 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
7620 if (aarch64_base_register_rtx_p (op1
, strict_p
)
7621 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
7634 info
->type
= ADDRESS_REG_WB
;
7635 info
->base
= XEXP (x
, 0);
7636 info
->offset
= NULL_RTX
;
7637 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
7641 info
->type
= ADDRESS_REG_WB
;
7642 info
->base
= XEXP (x
, 0);
7643 if (GET_CODE (XEXP (x
, 1)) == PLUS
7644 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
7645 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
7646 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
7648 info
->offset
= XEXP (XEXP (x
, 1), 1);
7649 info
->const_offset
= offset
;
7651 /* TImode and TFmode values are allowed in both pairs of X
7652 registers and individual Q registers. The available
7654 X,X: 7-bit signed scaled offset
7655 Q: 9-bit signed offset
7656 We conservatively require an offset representable in either mode.
7658 if (mode
== TImode
|| mode
== TFmode
)
7659 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
7660 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
7662 if (load_store_pair_p
)
7663 return ((known_eq (GET_MODE_SIZE (mode
), 4)
7664 || known_eq (GET_MODE_SIZE (mode
), 8)
7665 || known_eq (GET_MODE_SIZE (mode
), 16))
7666 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
7668 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
7675 /* load literal: pc-relative constant pool entry. Only supported
7676 for SI mode or larger. */
7677 info
->type
= ADDRESS_SYMBOLIC
;
7679 if (!load_store_pair_p
7680 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
7685 split_const (x
, &sym
, &addend
);
7686 return ((GET_CODE (sym
) == LABEL_REF
7687 || (GET_CODE (sym
) == SYMBOL_REF
7688 && CONSTANT_POOL_ADDRESS_P (sym
)
7689 && aarch64_pcrelative_literal_loads
)));
7694	      info->type = ADDRESS_LO_SUM;
7695	      info->base = XEXP (x, 0);
7696	      info->offset = XEXP (x, 1);
7697	      if (allow_reg_index_p
7698		  && aarch64_base_register_rtx_p (info->base, strict_p))
7701		  split_const (info->offset, &sym, &offs);
7702		  if (GET_CODE (sym) == SYMBOL_REF
7703		      && (aarch64_classify_symbol (sym, INTVAL (offs))
7704			  == SYMBOL_SMALL_ABSOLUTE))
7706		      /* The symbol and offset must be aligned to the access size.  */
7709		      if (CONSTANT_POOL_ADDRESS_P (sym))
7710			align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7711		      else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7713			  tree exp = SYMBOL_REF_DECL (sym);
7714			  align = TYPE_ALIGN (TREE_TYPE (exp));
7715			  align = aarch64_constant_alignment (exp, align);
7717		      else if (SYMBOL_REF_DECL (sym))
7718			align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7719		      else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7720			       && SYMBOL_REF_BLOCK (sym) != NULL)
7721			align = SYMBOL_REF_BLOCK (sym)->alignment;
7723			align = BITS_PER_UNIT;
7725		      poly_int64 ref_size = GET_MODE_SIZE (mode);
7726		      if (known_eq (ref_size, 0))
7727			ref_size = GET_MODE_SIZE (DImode);
7729		      return (multiple_p (INTVAL (offs), ref_size)
7730			      && multiple_p (align / BITS_PER_UNIT, ref_size));
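/* A minimal standalone sketch of what the offset-range predicates used in
   aarch64_classify_address above appear to test, under the assumption that
   "9-bit signed unscaled" is the LDUR-style byte offset in [-256, 255] and
   "7-bit signed scaled" is the LDP-style immediate: a multiple of the
   access size whose scaled value fits in [-64, 63].  The helper names below
   are hypothetical; the real predicates elsewhere in this file also handle
   polynomial (SVE) sizes.  */

#include <stdbool.h>
#include <stdint.h>

static bool
sketch_offset_9bit_signed_unscaled_p (int64_t offset)
{
  return offset >= -256 && offset <= 255;
}

static bool
sketch_offset_7bit_signed_scaled_p (int64_t access_size, int64_t offset)
{
  return (offset % access_size == 0
	  && offset / access_size >= -64
	  && offset / access_size <= 63);
}

/* Under these assumptions, a 16-byte TImode access passes the scaled check
   for multiples of 16 in [-1024, 1008]; an offset that must satisfy both
   checks (the TImode/TFmode case above) is therefore a multiple of 16 in
   [-256, 255].  */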
7740	/* Return true if the address X is valid for a PRFM instruction.
7741	   STRICT_P is true if we should do strict checking with
7742	   aarch64_classify_address.  */
7745	aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7747	  struct aarch64_address_info addr;
7749	  /* PRFM accepts the same addresses as DImode... */
7750	  bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7754	  /* ... except writeback forms.  */
7755	  return addr.type != ADDRESS_REG_WB;
7759	aarch64_symbolic_address_p (rtx x)
7763	  split_const (x, &x, &offset);
7764	  return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7767	/* Classify the base of symbolic expression X.  */
7769	enum aarch64_symbol_type
7770	aarch64_classify_symbolic_expression (rtx x)
7774	  split_const (x, &x, &offset);
7775	  return aarch64_classify_symbol (x, INTVAL (offset));
7779	/* Return TRUE if X is a legitimate address for accessing memory in mode MODE.  */
7782	aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7784	  struct aarch64_address_info addr;
7786	  return aarch64_classify_address (&addr, x, mode, strict_p);
7789	/* Return TRUE if X is a legitimate address of type TYPE for accessing
7790	   memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
7792	aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7793				      aarch64_addr_query_type type)
7795	  struct aarch64_address_info addr;
7797	  return aarch64_classify_address (&addr, x, mode, strict_p, type);
7800	/* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */
7803	aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7804						 poly_int64 orig_offset,
7808	  if (GET_MODE_SIZE (mode).is_constant (&size))
7810	      HOST_WIDE_INT const_offset, second_offset;
7812	      /* A general SVE offset is A * VQ + B.  Remove the A component from
7813		 coefficient 0 in order to get the constant B.  */
7814	      const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7816	      /* Split an out-of-range address displacement into a base and
7817		 offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
7818		 range otherwise to increase opportunities for sharing the base
7819		 address of different sizes.  Unaligned accesses use the signed
7820		 9-bit range, TImode/TFmode use the intersection of signed
7821		 scaled 7-bit and signed 9-bit offset.  */
7822	      if (mode == TImode || mode == TFmode)
7823		second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7824	      else if ((const_offset & (size - 1)) != 0)
7825		second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7827		second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7829	      if (second_offset == 0 || known_eq (orig_offset, second_offset))
7832	      /* Split the offset into second_offset and the rest.  */
7833	      *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7834	      *offset2 = gen_int_mode (second_offset, Pmode);
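      /* Worked example of the signed 9-bit split above (an unaligned
	 access, so the second branch applies): for const_offset = 0x2345,
	   second_offset = ((0x2345 + 0x100) & 0x1ff) - 0x100
			 = 0x45 - 0x100 = -0xbb (-187),
	 which lies in the signed 9-bit range [-256, 255], and the anchor
	 0x2345 - (-0xbb) = 0x2400 becomes the shared base in *offset1.
	 (Illustrative arithmetic only.)  */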
7839	  /* Get the mode we should use as the basis of the range.  For structure
7840	     modes this is the mode of one vector.  */
7841	  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7842	  machine_mode step_mode
7843	    = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7845	  /* Get the "mul vl" multiplier we'd like to use.  */
7846	  HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7847	  HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7848	  if (vec_flags & VEC_SVE_DATA)
7849	    /* LDR supports a 9-bit range, but the move patterns for
7850	       structure modes require all vectors to be in range of the
7851	       same base.  The simplest way of accommodating that while still
7852	       promoting reuse of anchor points between different modes is
7853	       to use an 8-bit range unconditionally.  */
7854	    vnum = ((vnum + 128) & 255) - 128;
7856	    /* Predicates are only handled singly, so we might as well use the full range.  */
7858	    vnum = ((vnum + 256) & 511) - 256;
7862	  /* Convert the "mul vl" multiplier into a byte offset.  */
7863	  poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7864	  if (known_eq (second_offset, orig_offset))
7867	  /* Split the offset into second_offset and the rest.  */
7868	  *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7869	  *offset2 = gen_int_mode (second_offset, Pmode);
7874	/* Return the binary representation of floating point constant VALUE in INTVAL.
7875	   If the value cannot be converted, return false without setting INTVAL.
7876	   The conversion is done in the given MODE.  */
7878	aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7881	  /* We make a general exception for 0.  */
7882	  if (aarch64_float_const_zero_rtx_p (value))
7888	  scalar_float_mode mode;
7889	  if (GET_CODE (value) != CONST_DOUBLE
7890	      || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7891	      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7892	      /* Only support up to DF mode.  */
7893	      || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7896	  unsigned HOST_WIDE_INT ival = 0;
7899	  real_to_target (res,
7900			  CONST_DOUBLE_REAL_VALUE (value),
7901			  REAL_MODE_FORMAT (mode));
7905	      int order = BYTES_BIG_ENDIAN ? 1 : 0;
7906	      ival = zext_hwi (res[order], 32);
7907	      ival |= (zext_hwi (res[1 - order], 32) << 32);
7910	    ival = zext_hwi (res[0], 32);
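/* A standalone sketch of the underlying idea above: viewing an IEEE
   double's bit pattern as a 64-bit integer.  The function itself cannot do
   this with host arithmetic because the constant is a compile-time
   REAL_VALUE_TYPE rather than a host double, hence real_to_target and the
   endian-aware reassembly of the two 32-bit halves.  Hypothetical,
   host-only helper.  */

#include <stdint.h>
#include <string.h>

static uint64_t
sketch_double_bits (double d)
{
  uint64_t bits;
  memcpy (&bits, &d, sizeof bits);   /* e.g. 1.0 -> 0x3ff0000000000000.  */
  return bits;
}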
7916	/* Return TRUE if rtx X is an immediate constant that can be moved using a
7917	   single MOV(+MOVK) followed by an FMOV.  */
7919	aarch64_float_const_rtx_p (rtx x)
7921	  machine_mode mode = GET_MODE (x);
7922	  if (mode == VOIDmode)
7925	  /* Determine whether it's cheaper to write float constants as
7926	     mov/movk pairs over ldr/adrp pairs.  */
7927	  unsigned HOST_WIDE_INT ival;
7929	  if (GET_CODE (x) == CONST_DOUBLE
7930	      && SCALAR_FLOAT_MODE_P (mode)
7931	      && aarch64_reinterpret_float_as_int (x, &ival))
7933	      scalar_int_mode imode = (mode == HFmode
7935				       : int_mode_for_mode (mode).require ());
7936	      int num_instr = aarch64_internal_mov_immediate
7937			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
7938	      return num_instr < 3;
7944	/* Return TRUE if rtx X is immediate constant 0.0  */
7946	aarch64_float_const_zero_rtx_p (rtx x)
7948	  if (GET_MODE (x) == VOIDmode)
7951	  if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7952	    return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7953	  return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7956	/* Return TRUE if rtx X is immediate constant that fits in a single
7957	   MOVI immediate operation.  */
7959	aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7965	  scalar_int_mode imode;
7966	  unsigned HOST_WIDE_INT ival;
7968	  if (GET_CODE (x) == CONST_DOUBLE
7969	      && SCALAR_FLOAT_MODE_P (mode))
7971	      if (!aarch64_reinterpret_float_as_int (x, &ival))
7974	      /* We make a general exception for 0.  */
7975	      if (aarch64_float_const_zero_rtx_p (x))
7978	      imode = int_mode_for_mode (mode).require ();
7980	  else if (GET_CODE (x) == CONST_INT
7981		   && is_a <scalar_int_mode> (mode, &imode))
7986	  /* use a 64 bit mode for everything except for DI/DF mode, where we use
7987	     a 128 bit vector mode.  */
7988	  int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7990	  vmode = aarch64_simd_container_mode (imode, width);
7991	  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7993	  return aarch64_simd_valid_immediate (v_op, NULL);
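/* Illustrative example (an assumption about the AdvSIMD "bytemask"
   immediate form, not taken from this file): a DImode constant such as
   0x00ff00ff00ff00ff, in which every byte is either 0x00 or 0xff, can be
   duplicated into a vector and emitted as a single MOVI, so the function
   above would be expected to return true for it; a constant like
   0x0123456789abcdef cannot, and would fall back to MOV/MOVK sequences.  */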
7997	/* Return the fixed registers used for condition codes.  */
8000	aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
8003	  *p2 = INVALID_REGNUM;
8007	/* This function is used by the call expanders of the machine description.
8008	   RESULT is the register in which the result is returned.  It's NULL for
8009	   "call" and "sibcall".
8010	   MEM is the location of the function call.
8011	   SIBCALL indicates whether this is a normal call or a sibling call,
8012	   and a different pattern is generated accordingly.  */
8015	aarch64_expand_call (rtx result, rtx mem, bool sibcall)
8017	  rtx call, callee, tmp;
8021	  gcc_assert (MEM_P (mem));
8022	  callee = XEXP (mem, 0);
8023	  mode = GET_MODE (callee);
8024	  gcc_assert (mode == Pmode);
8026	  /* Decide if we should generate indirect calls by loading the
8027	     address of the callee into a register before performing
8028	     the branch-and-link.  */
8029	  if (SYMBOL_REF_P (callee)
8030	      ? (aarch64_is_long_call_p (callee)
8031		 || aarch64_is_noplt_call_p (callee))
8033	    XEXP (mem, 0) = force_reg (mode, callee);
8035	  call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
8037	  if (result != NULL_RTX)
8038	    call = gen_rtx_SET (result, call);
8043	    tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8045	  vec = gen_rtvec (2, call, tmp);
8046	  call = gen_rtx_PARALLEL (VOIDmode, vec);
8048	  aarch64_emit_call_insn (call);
8051	/* Emit call insn with PAT and do aarch64-specific handling.  */
8054	aarch64_emit_call_insn (rtx pat)
8056	  rtx insn = emit_call_insn (pat);
8058	  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8059	  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8060	  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
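/* The RTL built by aarch64_expand_call above is, for a plain call that
   returns a value, roughly of the form

     (parallel [(set (reg:DI x0)
		     (call (mem:DI (symbol_ref "fn")) (const_int 0)))
		(clobber (reg:DI lr))])

   before aarch64_emit_call_insn additionally records ip0/ip1 as clobbered
   in CALL_INSN_FUNCTION_USAGE (those registers may be used by
   linker-generated veneers or PLT stubs).  The register modes and names
   here are illustrative, not lifted from a real dump.  */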
8064 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
8066 machine_mode mode_x
= GET_MODE (x
);
8067 rtx_code code_x
= GET_CODE (x
);
8069 /* All floating point compares return CCFP if it is an equality
8070 comparison, and CCFPE otherwise. */
8071 if (GET_MODE_CLASS (mode_x
) == MODE_FLOAT
)
8098 /* Equality comparisons of short modes against zero can be performed
8099 using the TST instruction with the appropriate bitmask. */
8100 if (y
== const0_rtx
&& (REG_P (x
) || SUBREG_P (x
))
8101 && (code
== EQ
|| code
== NE
)
8102 && (mode_x
== HImode
|| mode_x
== QImode
))
8105 /* Similarly, comparisons of zero_extends from shorter modes can
8106 be performed using an ANDS with an immediate mask. */
8107 if (y
== const0_rtx
&& code_x
== ZERO_EXTEND
8108 && (mode_x
== SImode
|| mode_x
== DImode
)
8109 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
8110 && (code
== EQ
|| code
== NE
))
8113 if ((mode_x
== SImode
|| mode_x
== DImode
)
8115 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
8116 && (code_x
== PLUS
|| code_x
== MINUS
|| code_x
== AND
8118 || (code_x
== ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
8119 && CONST_INT_P (XEXP (x
, 2)))))
8122 /* A compare with a shifted operand. Because of canonicalization,
8123 the comparison will have to be swapped when we emit the assembly
8125 if ((mode_x
== SImode
|| mode_x
== DImode
)
8126 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
8127 && (code_x
== ASHIFT
|| code_x
== ASHIFTRT
8128 || code_x
== LSHIFTRT
8129 || code_x
== ZERO_EXTEND
|| code_x
== SIGN_EXTEND
))
8132 /* Similarly for a negated operand, but we can only do this for
8134 if ((mode_x
== SImode
|| mode_x
== DImode
)
8135 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
8136 && (code
== EQ
|| code
== NE
)
8140 /* A test for unsigned overflow from an addition. */
8141 if ((mode_x
== DImode
|| mode_x
== TImode
)
8142 && (code
== LTU
|| code
== GEU
)
8144 && rtx_equal_p (XEXP (x
, 0), y
))
8147 /* A test for unsigned overflow from an add with carry. */
8148 if ((mode_x
== DImode
|| mode_x
== TImode
)
8149 && (code
== LTU
|| code
== GEU
)
8151 && CONST_SCALAR_INT_P (y
)
8152 && (rtx_mode_t (y
, mode_x
)
8153 == (wi::shwi (1, mode_x
)
8154 << (GET_MODE_BITSIZE (mode_x
).to_constant () / 2))))
8157 /* A test for signed overflow. */
8158 if ((mode_x
== DImode
|| mode_x
== TImode
)
8161 && GET_CODE (y
) == SIGN_EXTEND
)
8164 /* For everything else, return CCmode. */
8169 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
8172 aarch64_get_condition_code (rtx x
)
8174 machine_mode mode
= GET_MODE (XEXP (x
, 0));
8175 enum rtx_code comp_code
= GET_CODE (x
);
8177 if (GET_MODE_CLASS (mode
) != MODE_CC
)
8178 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
8179 return aarch64_get_condition_code_1 (mode
, comp_code
);
8183 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
8191 case GE
: return AARCH64_GE
;
8192 case GT
: return AARCH64_GT
;
8193 case LE
: return AARCH64_LS
;
8194 case LT
: return AARCH64_MI
;
8195 case NE
: return AARCH64_NE
;
8196 case EQ
: return AARCH64_EQ
;
8197 case ORDERED
: return AARCH64_VC
;
8198 case UNORDERED
: return AARCH64_VS
;
8199 case UNLT
: return AARCH64_LT
;
8200 case UNLE
: return AARCH64_LE
;
8201 case UNGT
: return AARCH64_HI
;
8202 case UNGE
: return AARCH64_PL
;
8210 case NE
: return AARCH64_NE
;
8211 case EQ
: return AARCH64_EQ
;
8212 case GE
: return AARCH64_GE
;
8213 case GT
: return AARCH64_GT
;
8214 case LE
: return AARCH64_LE
;
8215 case LT
: return AARCH64_LT
;
8216 case GEU
: return AARCH64_CS
;
8217 case GTU
: return AARCH64_HI
;
8218 case LEU
: return AARCH64_LS
;
8219 case LTU
: return AARCH64_CC
;
8227 case NE
: return AARCH64_NE
;
8228 case EQ
: return AARCH64_EQ
;
8229 case GE
: return AARCH64_LE
;
8230 case GT
: return AARCH64_LT
;
8231 case LE
: return AARCH64_GE
;
8232 case LT
: return AARCH64_GT
;
8233 case GEU
: return AARCH64_LS
;
8234 case GTU
: return AARCH64_CC
;
8235 case LEU
: return AARCH64_CS
;
8236 case LTU
: return AARCH64_HI
;
8244 case NE
: return AARCH64_NE
; /* = any */
8245 case EQ
: return AARCH64_EQ
; /* = none */
8246 case GE
: return AARCH64_PL
; /* = nfrst */
8247 case LT
: return AARCH64_MI
; /* = first */
8248 case GEU
: return AARCH64_CS
; /* = nlast */
8249 case GTU
: return AARCH64_HI
; /* = pmore */
8250 case LEU
: return AARCH64_LS
; /* = plast */
8251 case LTU
: return AARCH64_CC
; /* = last */
8259 case NE
: return AARCH64_NE
;
8260 case EQ
: return AARCH64_EQ
;
8261 case GE
: return AARCH64_PL
;
8262 case LT
: return AARCH64_MI
;
8270 case NE
: return AARCH64_NE
;
8271 case EQ
: return AARCH64_EQ
;
8279 case LTU
: return AARCH64_CS
;
8280 case GEU
: return AARCH64_CC
;
8288 case GEU
: return AARCH64_CS
;
8289 case LTU
: return AARCH64_CC
;
8297 case NE
: return AARCH64_VS
;
8298 case EQ
: return AARCH64_VC
;
8311 aarch64_const_vec_all_same_in_range_p (rtx x
,
8312 HOST_WIDE_INT minval
,
8313 HOST_WIDE_INT maxval
)
8316 return (const_vec_duplicate_p (x
, &elt
)
8317 && CONST_INT_P (elt
)
8318 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
8322 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
8324 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
8327 /* Return true if VEC is a constant in which every element is in the range
8328 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8331 aarch64_const_vec_all_in_range_p (rtx vec
,
8332 HOST_WIDE_INT minval
,
8333 HOST_WIDE_INT maxval
)
8335 if (GET_CODE (vec
) != CONST_VECTOR
8336 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
8340 if (!CONST_VECTOR_STEPPED_P (vec
))
8341 nunits
= const_vector_encoded_nelts (vec
);
8342 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
8345 for (int i
= 0; i
< nunits
; i
++)
8347 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
8348 if (!CONST_INT_P (vec_elem
)
8349 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
8356	#define AARCH64_CC_V 1
8357	#define AARCH64_CC_C (1 << 1)
8358	#define AARCH64_CC_Z (1 << 2)
8359	#define AARCH64_CC_N (1 << 3)
8361	/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
8362	static const int aarch64_nzcv_codes[] =
8364	  0,		/* EQ, Z == 1.  */
8365	  AARCH64_CC_Z,	/* NE, Z == 0.  */
8366	  0,		/* CS, C == 1.  */
8367	  AARCH64_CC_C,	/* CC, C == 0.  */
8368	  0,		/* MI, N == 1.  */
8369	  AARCH64_CC_N,	/* PL, N == 0.  */
8370	  0,		/* VS, V == 1.  */
8371	  AARCH64_CC_V,	/* VC, V == 0.  */
8372	  0,		/* HI, C == 1 && Z == 0.  */
8373	  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
8374	  AARCH64_CC_V,	/* GE, N == V.  */
8375	  0,		/* LT, N != V.  */
8376	  AARCH64_CC_Z,	/* GT, Z == 0 && N == V.  */
8377	  0,		/* LE, !(Z == 0 && N == V).  */
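/* Example reading of the table above (an interpretation, not stated in
   this file): each entry appears to be a flag setting under which the
   named condition is false.  For instance aarch64_nzcv_codes[AARCH64_GE]
   is AARCH64_CC_V (value 1), i.e. V set but N clear, so N != V and GE does
   not hold; the operand printer's %k code would emit "1" for it as the
   NZCV immediate of a conditional compare.  */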
8382 /* Print floating-point vector immediate operand X to F, negating it
8383 first if NEGATE is true. Return true on success, false if it isn't
8384 a constant we can handle. */
8387 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
8391 if (!const_vec_duplicate_p (x
, &elt
))
8394 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
8396 r
= real_value_negate (&r
);
8398 /* Handle the SVE single-bit immediates specially, since they have a
8399 fixed form in the assembly syntax. */
8400 if (real_equal (&r
, &dconst0
))
8401 asm_fprintf (f
, "0.0");
8402 else if (real_equal (&r
, &dconst2
))
8403 asm_fprintf (f
, "2.0");
8404 else if (real_equal (&r
, &dconst1
))
8405 asm_fprintf (f
, "1.0");
8406 else if (real_equal (&r
, &dconsthalf
))
8407 asm_fprintf (f
, "0.5");
8410 const int buf_size
= 20;
8411 char float_buf
[buf_size
] = {'\0'};
8412 real_to_decimal_for_mode (float_buf
, &r
, buf_size
, buf_size
,
8414 asm_fprintf (f
, "%s", float_buf
);
8420 /* Return the equivalent letter for size. */
8422	sizetochar (int size)
8426 case 64: return 'd';
8427 case 32: return 's';
8428 case 16: return 'h';
8429 case 8 : return 'b';
8430 default: gcc_unreachable ();
8434 /* Print operand X to file F in a target specific manner according to CODE.
8435 The acceptable formatting commands given by CODE are:
8436 'c': An integer or symbol address without a preceding #
8438 'C': Take the duplicated element in a vector constant
8439 and print it in hex.
8440 'D': Take the duplicated element in a vector constant
8441 and print it as an unsigned integer, in decimal.
8442 'e': Print the sign/zero-extend size as a character 8->b,
8443 16->h, 32->w. Can also be used for masks:
8444 0xff->b, 0xffff->h, 0xffffffff->w.
8445 'I': If the operand is a duplicated vector constant,
8446 replace it with the duplicated scalar. If the
8447 operand is then a floating-point constant, replace
8448 it with the integer bit representation. Print the
8449 transformed constant as a signed decimal number.
8450 'p': Prints N such that 2^N == X (X must be power of 2 and
8452 'P': Print the number of non-zero bits in X (a const_int).
8453 'H': Print the higher numbered register of a pair (TImode)
8455 'm': Print a condition (eq, ne, etc).
8456 'M': Same as 'm', but invert condition.
8457 'N': Take the duplicated element in a vector constant
8458 and print the negative of it in decimal.
8459 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8460 'S/T/U/V': Print a FP/SIMD register name for a register list.
8461 The register printed is the FP/SIMD register name
8462 of X + 0/1/2/3 for S/T/U/V.
8463 'R': Print a scalar Integer/FP/SIMD register name + 1.
8464 'X': Print bottom 16 bits of integer constant in hex.
8465 'w/x': Print a general register name or the zero register
8467 '0': Print a normal operand, if it's a general register,
8468 then we assume DImode.
8469 'k': Print NZCV for conditional compare instructions.
8470 'A': Output address constant representing the first
8471 argument of X, specifying a relocation offset
8473 'L': Output constant address specified by X
8474 with a relocation offset if appropriate.
8475 'G': Prints address of X, specifying a PC relative
8476 relocation mode if appropriate.
8477 'y': Output address of LDP or STP - this is used for
8478 some LDP/STPs which don't use a PARALLEL in their
8479 pattern (so the mode needs to be adjusted).
8480 'z': Output address of a typical LDP or STP. */
8483 aarch64_print_operand (FILE *f
, rtx x
, int code
)
8489 switch (GET_CODE (x
))
8492 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
8496 output_addr_const (f
, x
);
8500 if (GET_CODE (XEXP (x
, 0)) == PLUS
8501 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
8503 output_addr_const (f
, x
);
8509 output_operand_lossage ("unsupported operand for code '%c'", code
);
8515 x
= unwrap_const_vec_duplicate (x
);
8516 if (!CONST_INT_P (x
))
8518 output_operand_lossage ("invalid operand for '%%%c'", code
);
8522 HOST_WIDE_INT val
= INTVAL (x
);
8523 if ((val
& ~7) == 8 || val
== 0xff)
8525 else if ((val
& ~7) == 16 || val
== 0xffff)
8527 else if ((val
& ~7) == 32 || val
== 0xffffffff)
8531 output_operand_lossage ("invalid operand for '%%%c'", code
);
8541 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
8543 output_operand_lossage ("invalid operand for '%%%c'", code
);
8547 asm_fprintf (f
, "%d", n
);
8552 if (!CONST_INT_P (x
))
8554 output_operand_lossage ("invalid operand for '%%%c'", code
);
8558 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
8562 if (x
== const0_rtx
)
8564 asm_fprintf (f
, "xzr");
8568 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
8570 output_operand_lossage ("invalid operand for '%%%c'", code
);
8574 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
8579 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
8580 if (CONST_INT_P (x
))
8581 asm_fprintf (f
, "%wd", INTVAL (x
));
8584 output_operand_lossage ("invalid operand for '%%%c'", code
);
8594 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8595 if (x
== const_true_rtx
)
8602 if (!COMPARISON_P (x
))
8604 output_operand_lossage ("invalid operand for '%%%c'", code
);
8608 cond_code
= aarch64_get_condition_code (x
);
8609 gcc_assert (cond_code
>= 0);
8611 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
8612 if (GET_MODE (XEXP (x
, 0)) == CC_NZCmode
)
8613 fputs (aarch64_sve_condition_codes
[cond_code
], f
);
8615 fputs (aarch64_condition_codes
[cond_code
], f
);
8620 if (!const_vec_duplicate_p (x
, &elt
))
8622 output_operand_lossage ("invalid vector constant");
8626 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
8627 asm_fprintf (f
, "%wd", -INTVAL (elt
));
8628 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
8629 && aarch64_print_vector_float_operand (f
, x
, true))
8633 output_operand_lossage ("invalid vector constant");
8643 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
8645 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
8648 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
8655 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
8657 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
8660 asm_fprintf (f
, "%c%d",
8661 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
8662 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
8666 if (REG_P (x
) && FP_REGNUM_P (REGNO (x
)))
8667 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
8668 else if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
8669 asm_fprintf (f
, "x%d", REGNO (x
) - R0_REGNUM
+ 1);
8671 output_operand_lossage ("incompatible register operand for '%%%c'",
8676 if (!CONST_INT_P (x
))
8678 output_operand_lossage ("invalid operand for '%%%c'", code
);
8681 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
8686 /* Print a replicated constant in hex. */
8687 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
8689 output_operand_lossage ("invalid operand for '%%%c'", code
);
8692 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
8693 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
8699 /* Print a replicated constant in decimal, treating it as
8701 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
8703 output_operand_lossage ("invalid operand for '%%%c'", code
);
8706 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
8707 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
8714 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
8716 asm_fprintf (f
, "%czr", code
);
8720 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
8722 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
8726 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
8728 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
8737 output_operand_lossage ("missing operand");
8741 switch (GET_CODE (x
))
8744 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
8746 if (REG_NREGS (x
) == 1)
8747 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
8751 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
8752 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
8753 REGNO (x
) - V0_REGNUM
, suffix
,
8754 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
8758 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
8762 output_address (GET_MODE (x
), XEXP (x
, 0));
8767 output_addr_const (asm_out_file
, x
);
8771 asm_fprintf (f
, "%wd", INTVAL (x
));
8775 if (!VECTOR_MODE_P (GET_MODE (x
)))
8777 output_addr_const (asm_out_file
, x
);
8783 if (!const_vec_duplicate_p (x
, &elt
))
8785 output_operand_lossage ("invalid vector constant");
8789 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
8790 asm_fprintf (f
, "%wd", INTVAL (elt
));
8791 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
8792 && aarch64_print_vector_float_operand (f
, x
, false))
8796 output_operand_lossage ("invalid vector constant");
8802 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8803 be getting CONST_DOUBLEs holding integers. */
8804 gcc_assert (GET_MODE (x
) != VOIDmode
);
8805 if (aarch64_float_const_zero_rtx_p (x
))
8810 else if (aarch64_float_const_representable_p (x
))
8813 char float_buf
[buf_size
] = {'\0'};
8814 real_to_decimal_for_mode (float_buf
,
8815 CONST_DOUBLE_REAL_VALUE (x
),
8818 asm_fprintf (asm_out_file
, "%s", float_buf
);
8822 output_operand_lossage ("invalid constant");
8825 output_operand_lossage ("invalid operand");
8831 if (GET_CODE (x
) == HIGH
)
8834 switch (aarch64_classify_symbolic_expression (x
))
8836 case SYMBOL_SMALL_GOT_4G
:
8837 asm_fprintf (asm_out_file
, ":got:");
8840 case SYMBOL_SMALL_TLSGD
:
8841 asm_fprintf (asm_out_file
, ":tlsgd:");
8844 case SYMBOL_SMALL_TLSDESC
:
8845 asm_fprintf (asm_out_file
, ":tlsdesc:");
8848 case SYMBOL_SMALL_TLSIE
:
8849 asm_fprintf (asm_out_file
, ":gottprel:");
8852 case SYMBOL_TLSLE24
:
8853 asm_fprintf (asm_out_file
, ":tprel:");
8856 case SYMBOL_TINY_GOT
:
8863 output_addr_const (asm_out_file
, x
);
8867 switch (aarch64_classify_symbolic_expression (x
))
8869 case SYMBOL_SMALL_GOT_4G
:
8870 asm_fprintf (asm_out_file
, ":lo12:");
8873 case SYMBOL_SMALL_TLSGD
:
8874 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
8877 case SYMBOL_SMALL_TLSDESC
:
8878 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
8881 case SYMBOL_SMALL_TLSIE
:
8882 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
8885 case SYMBOL_TLSLE12
:
8886 asm_fprintf (asm_out_file
, ":tprel_lo12:");
8889 case SYMBOL_TLSLE24
:
8890 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
8893 case SYMBOL_TINY_GOT
:
8894 asm_fprintf (asm_out_file
, ":got:");
8897 case SYMBOL_TINY_TLSIE
:
8898 asm_fprintf (asm_out_file
, ":gottprel:");
8904 output_addr_const (asm_out_file
, x
);
8908 switch (aarch64_classify_symbolic_expression (x
))
8910 case SYMBOL_TLSLE24
:
8911 asm_fprintf (asm_out_file
, ":tprel_hi12:");
8916 output_addr_const (asm_out_file
, x
);
8921 HOST_WIDE_INT cond_code
;
8923 if (!CONST_INT_P (x
))
8925 output_operand_lossage ("invalid operand for '%%%c'", code
);
8929 cond_code
= INTVAL (x
);
8930 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
8931 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
8938 machine_mode mode
= GET_MODE (x
);
8940 if (GET_CODE (x
) != MEM
8941 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
8943 output_operand_lossage ("invalid operand for '%%%c'", code
);
8947 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
8949 ? ADDR_QUERY_LDP_STP_N
8950 : ADDR_QUERY_LDP_STP
))
8951 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
8956 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
8961 /* Print address 'x' of a memory access with mode 'mode'.
8962 'op' is the context required by aarch64_classify_address. It can either be
8963 MEM for a normal memory access or PARALLEL for LDP/STP. */
8965 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
8966 aarch64_addr_query_type type
)
8968 struct aarch64_address_info addr
;
8971 /* Check all addresses are Pmode - including ILP32. */
8972 if (GET_MODE (x
) != Pmode
8973 && (!CONST_INT_P (x
)
8974 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
8976 output_operand_lossage ("invalid address mode");
8980 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
8983 case ADDRESS_REG_IMM
:
8984 if (known_eq (addr
.const_offset
, 0))
8985 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
8986 else if (aarch64_sve_data_mode_p (mode
))
8989 = exact_div (addr
.const_offset
,
8990 BYTES_PER_SVE_VECTOR
).to_constant ();
8991 asm_fprintf (f
, "[%s, #%wd, mul vl]",
8992 reg_names
[REGNO (addr
.base
)], vnum
);
8994 else if (aarch64_sve_pred_mode_p (mode
))
8997 = exact_div (addr
.const_offset
,
8998 BYTES_PER_SVE_PRED
).to_constant ();
8999 asm_fprintf (f
, "[%s, #%wd, mul vl]",
9000 reg_names
[REGNO (addr
.base
)], vnum
);
9003 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
9004 INTVAL (addr
.offset
));
9007 case ADDRESS_REG_REG
:
9008 if (addr
.shift
== 0)
9009 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
9010 reg_names
[REGNO (addr
.offset
)]);
9012 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
9013 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
9016 case ADDRESS_REG_UXTW
:
9017 if (addr
.shift
== 0)
9018 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
9019 REGNO (addr
.offset
) - R0_REGNUM
);
9021 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
9022 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
9025 case ADDRESS_REG_SXTW
:
9026 if (addr
.shift
== 0)
9027 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
9028 REGNO (addr
.offset
) - R0_REGNUM
);
9030 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
9031 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
9034 case ADDRESS_REG_WB
:
9035 /* Writeback is only supported for fixed-width modes. */
9036 size
= GET_MODE_SIZE (mode
).to_constant ();
9037 switch (GET_CODE (x
))
9040 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
9043 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
9046 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
9049 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
9052 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
9053 INTVAL (addr
.offset
));
9056 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
9057 INTVAL (addr
.offset
));
9064 case ADDRESS_LO_SUM
:
9065 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
9066 output_addr_const (f
, addr
.offset
);
9067 asm_fprintf (f
, "]");
9070 case ADDRESS_SYMBOLIC
:
9071 output_addr_const (f
, x
);
9078 /* Print address 'x' of a memory access with mode 'mode'. */
9080 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
9082 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
9083 output_addr_const (f
, x
);
9087 aarch64_label_mentioned_p (rtx x
)
9092 if (GET_CODE (x
) == LABEL_REF
)
9095 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9096 referencing instruction, but they are constant offsets, not
9098 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
9101 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
9102 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
9108 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
9109 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
9112 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
9119	/* Implement REGNO_REG_CLASS.  */
9122	aarch64_regno_regclass (unsigned regno)
9124	  if (GP_REGNUM_P (regno))
9125	    return GENERAL_REGS;
9127	  if (regno == SP_REGNUM)
9130	  if (regno == FRAME_POINTER_REGNUM
9131	      || regno == ARG_POINTER_REGNUM)
9132	    return POINTER_REGS;
9134	  if (FP_REGNUM_P (regno))
9135	    return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9136		    : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9138	  if (PR_REGNUM_P (regno))
9139	    return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9144	/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9145	   If OFFSET is out of range, return an offset of an anchor point
9146	   that is in range.  Return 0 otherwise.  */
9148	static HOST_WIDE_INT
9149	aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9152	  /* Does it look like we'll need a 16-byte load/store-pair operation?  */
9154	    return (offset + 0x400) & ~0x7f0;
9156	  /* For offsets that aren't a multiple of the access size, the limit is
9158	  if (offset & (size - 1))
9160	      /* BLKmode typically uses LDP of X-registers.  */
9161	      if (mode == BLKmode)
9162		return (offset + 512) & ~0x3ff;
9163	      return (offset + 0x100) & ~0x1ff;
9166	  /* Small negative offsets are supported.  */
9167	  if (IN_RANGE (offset, -256, 0))
9170	  if (mode == TImode || mode == TFmode)
9171	    return (offset + 0x100) & ~0x1ff;
9173	  /* Use 12-bit offset by access size.  */
9174	  return offset & (~0xfff * size);
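/* A small standalone illustration of the final case above, under the
   assumption (from the function comment) that a return value of 0 means
   "no anchor needed" and a non-zero value is the anchor point the caller
   should split the address at.  For a 4-byte access at offset 0x5678 the
   expression returns 0x5678 & (~0xfff * 4) = 0x4000; the residual 0x1678
   is a multiple of 4 and no larger than 0x3ffc, so it fits the 12-bit
   unsigned scaled LDR/STR offset range.  Hypothetical helper name, host
   arithmetic only.  */

#include <stdint.h>

static int64_t
sketch_anchor_12bit_scaled (int64_t offset, int64_t size)
{
  return offset & (~(int64_t) 0xfff * size);
}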
9178 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
9180 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9181 where mask is selected by alignment and size of the offset.
9182 We try to pick as large a range for the offset as possible to
9183 maximize the chance of a CSE. However, for aligned addresses
9184 we limit the range to 4k so that structures with different sized
9185 elements are likely to use the same base. We need to be careful
9186 not to split a CONST for some forms of address expression, otherwise
9187 it will generate sub-optimal code. */
9189 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
9191 rtx base
= XEXP (x
, 0);
9192 rtx offset_rtx
= XEXP (x
, 1);
9193 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
9195 if (GET_CODE (base
) == PLUS
)
9197 rtx op0
= XEXP (base
, 0);
9198 rtx op1
= XEXP (base
, 1);
9200 /* Force any scaling into a temp for CSE. */
9201 op0
= force_reg (Pmode
, op0
);
9202 op1
= force_reg (Pmode
, op1
);
9204 /* Let the pointer register be in op0. */
9205 if (REG_POINTER (op1
))
9206 std::swap (op0
, op1
);
9208 /* If the pointer is virtual or frame related, then we know that
9209 virtual register instantiation or register elimination is going
9210 to apply a second constant. We want the two constants folded
9211 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9212 if (virt_or_elim_regno_p (REGNO (op0
)))
9214 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
9215 NULL_RTX
, true, OPTAB_DIRECT
);
9216 return gen_rtx_PLUS (Pmode
, base
, op1
);
9219 /* Otherwise, in order to encourage CSE (and thence loop strength
9220 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9221 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
9222 NULL_RTX
, true, OPTAB_DIRECT
);
9223 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
9227 if (GET_MODE_SIZE (mode
).is_constant (&size
))
9229 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
9231 if (base_offset
!= 0)
9233 base
= plus_constant (Pmode
, base
, base_offset
);
9234 base
= force_operand (base
, NULL_RTX
);
9235 return plus_constant (Pmode
, base
, offset
- base_offset
);
9244 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
9247 secondary_reload_info
*sri
)
9249 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9250 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9251 comment at the head of aarch64-sve.md for more details about the
9252 big-endian handling. */
9253 if (BYTES_BIG_ENDIAN
9254 && reg_class_subset_p (rclass
, FP_REGS
)
9255 && !((REG_P (x
) && HARD_REGISTER_P (x
))
9256 || aarch64_simd_valid_immediate (x
, NULL
))
9257 && aarch64_sve_data_mode_p (mode
))
9259 sri
->icode
= CODE_FOR_aarch64_sve_reload_be
;
9263 /* If we have to disable direct literal pool loads and stores because the
9264 function is too big, then we need a scratch register. */
9265 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
9266 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
9267 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
9268 && !aarch64_pcrelative_literal_loads
)
9270 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
9274 /* Without the TARGET_SIMD instructions we cannot move a Q register
9275 to a Q register directly. We need a scratch. */
9276 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
9277 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
9278 && reg_class_subset_p (rclass
, FP_REGS
))
9280 sri
->icode
= code_for_aarch64_reload_mov (mode
);
9284 /* A TFmode or TImode memory access should be handled via an FP_REGS
9285 because AArch64 has richer addressing modes for LDR/STR instructions
9286 than LDP/STP instructions. */
9287 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
9288 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
9291 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
9292 return GENERAL_REGS
;
9298	aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9300	  gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9302	  /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9303	     can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
9304	  if (frame_pointer_needed)
9305	    return to == HARD_FRAME_POINTER_REGNUM;
9310	aarch64_initial_elimination_offset (unsigned from, unsigned to)
9312	  if (to == HARD_FRAME_POINTER_REGNUM)
9314	      if (from == ARG_POINTER_REGNUM)
9315		return cfun->machine->frame.hard_fp_offset;
9317	      if (from == FRAME_POINTER_REGNUM)
9318		return cfun->machine->frame.hard_fp_offset
9319		       - cfun->machine->frame.locals_offset;
9322	  if (to == STACK_POINTER_REGNUM)
9324	      if (from == FRAME_POINTER_REGNUM)
9325		return cfun->machine->frame.frame_size
9326		       - cfun->machine->frame.locals_offset;
9329	  return cfun->machine->frame.frame_size;
9332 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9336 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
9340 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
9345 aarch64_asm_trampoline_template (FILE *f
)
9350 if (aarch64_bti_enabled ())
9352 asm_fprintf (f
, "\thint\t34 // bti c\n");
9359 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", IP1_REGNUM
- R0_REGNUM
, offset1
);
9360 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
,
9365 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[IP1_REGNUM
], offset1
);
9366 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[STATIC_CHAIN_REGNUM
],
9369 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
9371	  /* The trampoline needs an extra padding instruction.  If BTI is
9372	     enabled, the padding instruction is replaced by the BTI instruction at
9374 if (!aarch64_bti_enabled ())
9375 assemble_aligned_integer (4, const0_rtx
);
9377 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
9378 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
9382 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
9384 rtx fnaddr
, mem
, a_tramp
;
9385 const int tramp_code_sz
= 16;
9387 /* Don't need to copy the trailing D-words, we fill those in below. */
9388 emit_block_move (m_tramp
, assemble_trampoline_template (),
9389 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
9390 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
9391 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
9392 if (GET_MODE (fnaddr
) != ptr_mode
)
9393 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
9394 emit_move_insn (mem
, fnaddr
);
9396 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
9397 emit_move_insn (mem
, chain_value
);
9399 /* XXX We should really define a "clear_cache" pattern and use
9400 gen_clear_cache(). */
9401 a_tramp
= XEXP (m_tramp
, 0);
9402 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
9403 LCT_NORMAL
, VOIDmode
, a_tramp
, ptr_mode
,
9404 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
9408 static unsigned char
9409 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
9411 /* ??? Logically we should only need to provide a value when
9412 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9413 can hold MODE, but at the moment we need to handle all modes.
9414 Just ignore any runtime parts for registers that can't store them. */
9415 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
9419 case TAILCALL_ADDR_REGS
:
9423 case POINTER_AND_FP_REGS
:
9427 if (aarch64_sve_data_mode_p (mode
)
9428 && constant_multiple_p (GET_MODE_SIZE (mode
),
9429 BYTES_PER_SVE_VECTOR
, &nregs
))
9431 return (aarch64_vector_data_mode_p (mode
)
9432 ? CEIL (lowest_size
, UNITS_PER_VREG
)
9433 : CEIL (lowest_size
, UNITS_PER_WORD
));
9450 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
9452 if (regclass
== POINTER_REGS
)
9453 return GENERAL_REGS
;
9455 if (regclass
== STACK_REG
)
9458 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
9464	  /* Register elimination can result in a request for
9465 SP+constant->FP_REGS. We cannot support such operations which
9466 use SP as source and an FP_REG as destination, so reject out
9468 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
9470 rtx lhs
= XEXP (x
, 0);
9472 /* Look through a possible SUBREG introduced by ILP32. */
9473 if (GET_CODE (lhs
) == SUBREG
)
9474 lhs
= SUBREG_REG (lhs
);
9476 gcc_assert (REG_P (lhs
));
9477 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
9486 aarch64_asm_output_labelref (FILE* f
, const char *name
)
9488 asm_fprintf (f
, "%U%s", name
);
9492 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
9494 if (priority
== DEFAULT_INIT_PRIORITY
)
9495 default_ctor_section_asm_out_constructor (symbol
, priority
);
9499 /* While priority is known to be in range [0, 65535], so 18 bytes
9500 would be enough, the compiler might not know that. To avoid
9501 -Wformat-truncation false positive, use a larger size. */
9503 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
9504 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
9505 switch_to_section (s
);
9506 assemble_align (POINTER_SIZE
);
9507 assemble_aligned_integer (POINTER_BYTES
, symbol
);
9512 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
9514 if (priority
== DEFAULT_INIT_PRIORITY
)
9515 default_dtor_section_asm_out_destructor (symbol
, priority
);
9519 /* While priority is known to be in range [0, 65535], so 18 bytes
9520 would be enough, the compiler might not know that. To avoid
9521 -Wformat-truncation false positive, use a larger size. */
9523 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
9524 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
9525 switch_to_section (s
);
9526 assemble_align (POINTER_SIZE
);
9527 assemble_aligned_integer (POINTER_BYTES
, symbol
);
9532	aarch64_output_casesi (rtx *operands)
9536	  rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9538	  static const char *const patterns[4][2] =
9541	      "ldrb\t%w3, [%0,%w1,uxtw]",
9542	      "add\t%3, %4, %w3, sxtb #2"
9545	      "ldrh\t%w3, [%0,%w1,uxtw #1]",
9546	      "add\t%3, %4, %w3, sxth #2"
9549	      "ldr\t%w3, [%0,%w1,uxtw #2]",
9550	      "add\t%3, %4, %w3, sxtw #2"
9552	    /* We assume that DImode is only generated when not optimizing and
9553	       that we don't really need 64-bit address offsets.  That would
9554	       imply an object file with 8GB of code in a single function!  */
9556	      "ldr\t%w3, [%0,%w1,uxtw #2]",
9557	      "add\t%3, %4, %w3, sxtw #2"
9561	  gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9563	  scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9564	  index = exact_log2 (GET_MODE_SIZE (mode));
9566	  gcc_assert (index >= 0 && index <= 3);
9568	  /* Need to implement table size reduction, by changing the code below.  */
9569	  output_asm_insn (patterns[index][0], operands);
9570	  ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9571	  snprintf (buf, sizeof (buf),
9572		    "adr\t%%4, %s", targetm.strip_name_encoding (label));
9573	  output_asm_insn (buf, operands);
9574	  output_asm_insn (patterns[index][1], operands);
9575	  output_asm_insn ("br\t%3", operands);
9576	  assemble_label (asm_out_file, label);
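/* For a HImode dispatch table (index == 1) the code above emits a sequence
   along the lines of:

	ldrh	w3, [x0, w1, uxtw #1]	// load 16-bit table entry
	adr	x4, .Lrtx<N>		// address of the anchor label
	add	x3, x4, w3, sxth #2	// scale the entry by 4 and add
	br	x3			// dispatch
   .Lrtx<N>:

   The register numbers follow the %0..%4 operands of the casesi pattern
   and, like the <N> label suffix, are only illustrative.  */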
9581	/* Return size in bits of an arithmetic operand which is shifted/scaled and
9582	   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9586	aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9588	  if (shift >= 0 && shift <= 3)
9591	      for (size = 8; size <= 32; size *= 2)
9593		  HOST_WIDE_INT bits = ((HOST_WIDE_INT) 1U << size) - 1;
9594		  if (mask == bits << shift)
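/* A standalone sketch of the check above with concrete values, assuming
   the function returns SIZE on a match and 0 otherwise (the returning
   statements are not shown above).  A mask of 0xff << 1 = 0x1fe with
   shift 1 matches the 8-bit case, i.e. the operand can use a UXTB extend,
   and 0xffff << 2 = 0x3fffc with shift 2 matches UXTH.  Hypothetical
   helper name.  */

#include <stdint.h>

static int
sketch_uxt_size (int shift, int64_t mask)
{
  if (shift >= 0 && shift <= 3)
    for (int size = 8; size <= 32; size *= 2)
      {
	int64_t bits = ((int64_t) 1 << size) - 1;
	if (mask == (bits << shift))
	  return size;
      }
  return 0;
}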
9601	/* Constant pools are per function only when PC relative
9602	   literal loads are true or we are in the large memory
9606	aarch64_can_use_per_function_literal_pools_p (void)
9608	  return (aarch64_pcrelative_literal_loads
9609		  || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9613	aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9615	  /* We can't use blocks for constants when we're using a per-function
9617	  return !aarch64_can_use_per_function_literal_pools_p ();
9620	/* Select appropriate section for constants depending
9621	   on where we place literal pools.  */
9624	aarch64_select_rtx_section (machine_mode mode,
9626			    unsigned HOST_WIDE_INT align)
9628	  if (aarch64_can_use_per_function_literal_pools_p ())
9629	    return function_section (current_function_decl);
9631	  return default_elf_select_rtx_section (mode, x, align);
9634	/* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
9636	aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9637				  HOST_WIDE_INT offset)
9639	  /* When using per-function literal pools, we must ensure that any code
9640	     section is aligned to the minimal instruction length, lest we get
9641	     errors from the assembler re "unaligned instructions".  */
9642	  if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9643	    ASM_OUTPUT_ALIGN (f, 2);
9648 /* Helper function for rtx cost calculation. Strip a shift expression
9649 from X. Returns the inner operand if successful, or the original
9650 expression on failure. */
9652 aarch64_strip_shift (rtx x
)
9656 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9657 we can convert both to ROR during final output. */
9658 if ((GET_CODE (op
) == ASHIFT
9659 || GET_CODE (op
) == ASHIFTRT
9660 || GET_CODE (op
) == LSHIFTRT
9661 || GET_CODE (op
) == ROTATERT
9662 || GET_CODE (op
) == ROTATE
)
9663 && CONST_INT_P (XEXP (op
, 1)))
9664 return XEXP (op
, 0);
9666 if (GET_CODE (op
) == MULT
9667 && CONST_INT_P (XEXP (op
, 1))
9668 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
9669 return XEXP (op
, 0);
9674 /* Helper function for rtx cost calculation. Strip an extend
9675 expression from X. Returns the inner operand if successful, or the
9676 original expression on failure. We deal with a number of possible
9677 canonicalization variations here. If STRIP_SHIFT is true, then
9678 we can strip off a shift also. */
9680 aarch64_strip_extend (rtx x
, bool strip_shift
)
9682 scalar_int_mode mode
;
9685 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
9688 /* Zero and sign extraction of a widened value. */
9689 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
9690 && XEXP (op
, 2) == const0_rtx
9691 && GET_CODE (XEXP (op
, 0)) == MULT
9692 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
9694 return XEXP (XEXP (op
, 0), 0);
9696 /* It can also be represented (for zero-extend) as an AND with an
9698 if (GET_CODE (op
) == AND
9699 && GET_CODE (XEXP (op
, 0)) == MULT
9700 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
9701 && CONST_INT_P (XEXP (op
, 1))
9702 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
9703 INTVAL (XEXP (op
, 1))) != 0)
9704 return XEXP (XEXP (op
, 0), 0);
9706 /* Now handle extended register, as this may also have an optional
9707 left shift by 1..4. */
9709 && GET_CODE (op
) == ASHIFT
9710 && CONST_INT_P (XEXP (op
, 1))
9711 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
9714 if (GET_CODE (op
) == ZERO_EXTEND
9715 || GET_CODE (op
) == SIGN_EXTEND
)
9724 /* Return true iff CODE is a shift supported in combination
9725 with arithmetic instructions. */
9728 aarch64_shift_p (enum rtx_code code
)
9730 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
9734 /* Return true iff X is a cheap shift without a sign extend. */
9737 aarch64_cheap_mult_shift_p (rtx x
)
9744 if (!(aarch64_tune_params
.extra_tuning_flags
9745 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
9748 if (GET_CODE (op0
) == SIGN_EXTEND
)
9751 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
9752 && UINTVAL (op1
) <= 4)
9755 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
9758 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
9760 if (l2
> 0 && l2
<= 4)
9766 /* Helper function for rtx cost calculation. Calculate the cost of
9767 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9768 Return the calculated cost of the expression, recursing manually in to
9769 operands where needed. */
9772 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
9775 const struct cpu_cost_table
*extra_cost
9776 = aarch64_tune_params
.insn_extra_cost
;
9778 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
9779 machine_mode mode
= GET_MODE (x
);
9781 gcc_checking_assert (code
== MULT
);
9786 if (VECTOR_MODE_P (mode
))
9787 mode
= GET_MODE_INNER (mode
);
9789 /* Integer multiply/fma. */
9790 if (GET_MODE_CLASS (mode
) == MODE_INT
)
9792 /* The multiply will be canonicalized as a shift, cost it as such. */
9793 if (aarch64_shift_p (GET_CODE (x
))
9794 || (CONST_INT_P (op1
)
9795 && exact_log2 (INTVAL (op1
)) > 0))
9797 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
9798 || GET_CODE (op0
) == SIGN_EXTEND
;
9803 /* If the shift is considered cheap,
9804 then don't add any cost. */
9805 if (aarch64_cheap_mult_shift_p (x
))
9807 else if (REG_P (op1
))
9808 /* ARITH + shift-by-register. */
9809 cost
+= extra_cost
->alu
.arith_shift_reg
;
9811 /* ARITH + extended register. We don't have a cost field
9812 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9813 cost
+= extra_cost
->alu
.extend_arith
;
9815 /* ARITH + shift-by-immediate. */
9816 cost
+= extra_cost
->alu
.arith_shift
;
9819 /* LSL (immediate). */
9820 cost
+= extra_cost
->alu
.shift
;
9823 /* Strip extends as we will have costed them in the case above. */
9825 op0
= aarch64_strip_extend (op0
, true);
9827 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
9832 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9833 compound and let the below cases handle it. After all, MNEG is a
9834 special-case alias of MSUB. */
9835 if (GET_CODE (op0
) == NEG
)
9837 op0
= XEXP (op0
, 0);
9841 /* Integer multiplies or FMAs have zero/sign extending variants. */
9842 if ((GET_CODE (op0
) == ZERO_EXTEND
9843 && GET_CODE (op1
) == ZERO_EXTEND
)
9844 || (GET_CODE (op0
) == SIGN_EXTEND
9845 && GET_CODE (op1
) == SIGN_EXTEND
))
9847 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
9848 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
9853 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9854 cost
+= extra_cost
->mult
[0].extend_add
;
9856 /* MUL/SMULL/UMULL. */
9857 cost
+= extra_cost
->mult
[0].extend
;
9863 /* This is either an integer multiply or a MADD. In both cases
9864 we want to recurse and cost the operands. */
9865 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
9866 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
9872 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
9875 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
9884 /* Floating-point FMA/FMUL can also support negations of the
9885 operands, unless the rounding mode is upward or downward in
9886 which case FNMUL is different than FMUL with operand negation. */
9887 bool neg0
= GET_CODE (op0
) == NEG
;
9888 bool neg1
= GET_CODE (op1
) == NEG
;
9889 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
9892 op0
= XEXP (op0
, 0);
9894 op1
= XEXP (op1
, 0);
9898 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9899 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
9902 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
9905 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
9906 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
9912 aarch64_address_cost (rtx x
,
9914 addr_space_t as ATTRIBUTE_UNUSED
,
9917 enum rtx_code c
= GET_CODE (x
);
9918 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
9919 struct aarch64_address_info info
;
9923 if (!aarch64_classify_address (&info
, x
, mode
, false))
9925 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
9927 /* This is a CONST or SYMBOL ref which will be split
9928 in a different way depending on the code model in use.
9929 Cost it through the generic infrastructure. */
9930 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
9931 /* Divide through by the cost of one instruction to
9932 bring it to the same units as the address costs. */
9933 cost_symbol_ref
/= COSTS_N_INSNS (1);
9934 /* The cost is then the cost of preparing the address,
9935 followed by an immediate (possibly 0) offset. */
9936 return cost_symbol_ref
+ addr_cost
->imm_offset
;
9940 /* This is most likely a jump table from a case
9942 return addr_cost
->register_offset
;
9948 case ADDRESS_LO_SUM
:
9949 case ADDRESS_SYMBOLIC
:
9950 case ADDRESS_REG_IMM
:
9951 cost
+= addr_cost
->imm_offset
;
9954 case ADDRESS_REG_WB
:
9955 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
9956 cost
+= addr_cost
->pre_modify
;
9957 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
9958 cost
+= addr_cost
->post_modify
;
9964 case ADDRESS_REG_REG
:
9965 cost
+= addr_cost
->register_offset
;
9968 case ADDRESS_REG_SXTW
:
9969 cost
+= addr_cost
->register_sextend
;
9972 case ADDRESS_REG_UXTW
:
9973 cost
+= addr_cost
->register_zextend
;
9983 /* For the sake of calculating the cost of the shifted register
9984 component, we can treat same sized modes in the same way. */
9985 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
9986 cost
+= addr_cost
->addr_scale_costs
.hi
;
9987 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
9988 cost
+= addr_cost
->addr_scale_costs
.si
;
9989 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
9990 cost
+= addr_cost
->addr_scale_costs
.di
;
9992 /* We can't tell, or this is a 128-bit vector. */
9993 cost
+= addr_cost
->addr_scale_costs
.ti
;
9999	/* Return the cost of a branch.  If SPEED_P is true then the compiler is
10000	   optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
10004	aarch64_branch_cost (bool speed_p, bool predictable_p)
10006	  /* When optimizing for speed, use the cost of unpredictable branches.  */
10007	  const struct cpu_branch_cost *branch_costs =
10008	    aarch64_tune_params.branch_costs;
10010	  if (!speed_p || predictable_p)
10011	    return branch_costs->predictable;
10013	  return branch_costs->unpredictable;
10016 /* Return true if the RTX X in mode MODE is a zero or sign extract
10017 usable in an ADD or SUB (extended register) instruction. */
10019 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
10021 /* Catch add with a sign extract.
10022 This is add_<optab><mode>_multp2. */
10023 if (GET_CODE (x
) == SIGN_EXTRACT
10024 || GET_CODE (x
) == ZERO_EXTRACT
)
10026 rtx op0
= XEXP (x
, 0);
10027 rtx op1
= XEXP (x
, 1);
10028 rtx op2
= XEXP (x
, 2);
10030 if (GET_CODE (op0
) == MULT
10031 && CONST_INT_P (op1
)
10032 && op2
== const0_rtx
10033 && CONST_INT_P (XEXP (op0
, 1))
10034 && aarch64_is_extend_from_extract (mode
,
10041 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10043 else if (GET_CODE (x
) == SIGN_EXTEND
10044 || GET_CODE (x
) == ZERO_EXTEND
)
10045 return REG_P (XEXP (x
, 0));
static bool
aarch64_frint_unspec_p (unsigned int u)
{
  switch (u)
    {
      case UNSPEC_FRINTZ:
      case UNSPEC_FRINTP:
      case UNSPEC_FRINTM:
      case UNSPEC_FRINTA:
      case UNSPEC_FRINTN:
      case UNSPEC_FRINTX:
      case UNSPEC_FRINTI:
	return true;

      default:
	return false;
    }
}
/* Return true iff X is an rtx that will match an extr instruction
   i.e. as described in the *extr<mode>5_insn family of patterns.
   OP0 and OP1 will be set to the operands of the shifts involved
   on success and will be NULL_RTX otherwise.  */

static bool
aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
{
  rtx op0, op1;
  scalar_int_mode mode;
  if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
    return false;

  *res_op0 = NULL_RTX;
  *res_op1 = NULL_RTX;

  if (GET_CODE (x) != IOR)
    return false;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
      || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
    {
      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
      if (GET_CODE (op1) == ASHIFT)
	std::swap (op0, op1);

      if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
	return false;

      unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
      unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));

      if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
	  && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
	{
	  *res_op0 = XEXP (op0, 0);
	  *res_op1 = XEXP (op1, 0);
	  return true;
	}
    }

  return false;
}
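/* Worked example (added for illustration): in DImode,
     (ior (ashift x (const_int 16)) (lshiftrt y (const_int 48)))
   has shift amounts 16 + 48 == 64 == GET_MODE_BITSIZE (DImode), so the test
   above succeeds and the expression maps onto a single EXTR instruction.  */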
10116 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10117 storing it in *COST. Result is true if the total cost of the operation
10118 has now been calculated. */
10120 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
10124 enum rtx_code cmpcode
;
10126 if (COMPARISON_P (op0
))
10128 inner
= XEXP (op0
, 0);
10129 comparator
= XEXP (op0
, 1);
10130 cmpcode
= GET_CODE (op0
);
10135 comparator
= const0_rtx
;
10139 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
10141 /* Conditional branch. */
10142 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
10146 if (cmpcode
== NE
|| cmpcode
== EQ
)
10148 if (comparator
== const0_rtx
)
10150 /* TBZ/TBNZ/CBZ/CBNZ. */
10151 if (GET_CODE (inner
) == ZERO_EXTRACT
)
10153 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
10154 ZERO_EXTRACT
, 0, speed
);
10157 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
10162 else if (cmpcode
== LT
|| cmpcode
== GE
)
10165 if (comparator
== const0_rtx
)
10170 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
10173 if (GET_CODE (op1
) == COMPARE
)
10175 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10176 if (XEXP (op1
, 1) == const0_rtx
)
10180 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
10181 const struct cpu_cost_table
*extra_cost
10182 = aarch64_tune_params
.insn_extra_cost
;
10184 if (GET_MODE_CLASS (mode
) == MODE_INT
)
10185 *cost
+= extra_cost
->alu
.arith
;
10187 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
10192 /* It's a conditional operation based on the status flags,
10193 so it must be some flavor of CSEL. */
10195 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10196 if (GET_CODE (op1
) == NEG
10197 || GET_CODE (op1
) == NOT
10198 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
10199 op1
= XEXP (op1
, 0);
10200 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
10202 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10203 op1
= XEXP (op1
, 0);
10204 op2
= XEXP (op2
, 0);
10207 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
10208 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
10212 /* We don't know what this is, cost all operands. */
/* Check whether X is a bitfield operation of the form shift + extend that
   maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
   operand to which the bitfield operation is applied.  Otherwise return
   NULL_RTX.  */

static rtx
aarch64_extend_bitfield_pattern_p (rtx x)
{
  rtx_code outer_code = GET_CODE (x);
  machine_mode outer_mode = GET_MODE (x);

  if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
      && outer_mode != SImode && outer_mode != DImode)
    return NULL_RTX;

  rtx inner = XEXP (x, 0);
  rtx_code inner_code = GET_CODE (inner);
  machine_mode inner_mode = GET_MODE (inner);
  rtx op = NULL_RTX;

  switch (inner_code)
    {
      case ASHIFT:
	if (CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      case LSHIFTRT:
	if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      case ASHIFTRT:
	if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      default:
	break;
    }

  return op;
}
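/* Rough illustration (added; the exact instruction chosen depends on the
   patterns in aarch64.md):
     extend of a left shift of a QI/HI value      -> UBFIZ/SBFIZ
     zero_extend of a logical right shift          -> UBFX
     sign_extend of an arithmetic right shift      -> SBFX  */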
/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */

bool
aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
				    rtx shft_amnt)
{
  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
	 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
	 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
	 && (INTVAL (mask)
	     & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
}
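/* Worked example (added for illustration): for SImode, mask 0xff0 with
   shift 4 qualifies: 0xff0 >> 4 == 0xff, exact_log2 (0xff + 1) == 8 >= 0,
   and the low 4 bits of the mask are clear, so (x << 4) & 0xff0 can become
   UBFIZ w0, w1, #4, #8.  */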
/* Return true if the masks and a shift amount from an RTX of the form
   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
   a BFI instruction of mode MODE.  See the *aarch64_bfi patterns.  */

bool
aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
				   unsigned HOST_WIDE_INT mask1,
				   unsigned HOST_WIDE_INT shft_amnt,
				   unsigned HOST_WIDE_INT mask2)
{
  unsigned HOST_WIDE_INT t;

  /* Verify that there is no overlap in what bits are set in the two masks.  */
  if (mask1 != ~mask2)
    return false;

  /* Verify that mask2 is not all zeros or ones.  */
  if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
    return false;

  /* The shift amount should always be less than the mode size.  */
  gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));

  /* Verify that the mask being shifted is contiguous and would be in the
     least significant bits after shifting by shft_amnt.  */
  t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
  return (t == (t & -t));
}
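/* Worked example (added for illustration): mask2 == 0x0ff0 with
   shft_amnt == 4 gives t = 0x0ff0 + 0x10 = 0x1000, a power of two, so
   t == (t & -t) holds and the shifted-in field is a contiguous run of bits
   starting at bit 4.  A non-contiguous mask such as 0x0f30 gives
   t = 0x0f40, which is not a power of two and is rejected.  */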
10304 /* Calculate the cost of calculating X, storing it in *COST. Result
10305 is true if the total cost of the operation has now been calculated. */
10307 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
10308 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
10311 const struct cpu_cost_table
*extra_cost
10312 = aarch64_tune_params
.insn_extra_cost
;
10313 int code
= GET_CODE (x
);
10314 scalar_int_mode int_mode
;
10316 /* By default, assume that everything has equivalent cost to the
10317 cheapest instruction. Any additional costs are applied as a delta
10318 above this default. */
10319 *cost
= COSTS_N_INSNS (1);
10324 /* The cost depends entirely on the operands to SET. */
10326 op0
= SET_DEST (x
);
10329 switch (GET_CODE (op0
))
10334 rtx address
= XEXP (op0
, 0);
10335 if (VECTOR_MODE_P (mode
))
10336 *cost
+= extra_cost
->ldst
.storev
;
10337 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10338 *cost
+= extra_cost
->ldst
.store
;
10339 else if (mode
== SFmode
)
10340 *cost
+= extra_cost
->ldst
.storef
;
10341 else if (mode
== DFmode
)
10342 *cost
+= extra_cost
->ldst
.stored
;
10345 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10349 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
10353 if (! REG_P (SUBREG_REG (op0
)))
10354 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
10356 /* Fall through. */
10358 /* The cost is one per vector-register copied. */
10359 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
10361 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
10362 *cost
= COSTS_N_INSNS (nregs
);
10364 /* const0_rtx is in general free, but we will use an
10365 instruction to set a register to 0. */
10366 else if (REG_P (op1
) || op1
== const0_rtx
)
10368 /* The cost is 1 per register copied. */
10369 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
10370 *cost
= COSTS_N_INSNS (nregs
);
10373 /* Cost is just the cost of the RHS of the set. */
10374 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
10379 /* Bit-field insertion. Strip any redundant widening of
10380 the RHS to meet the width of the target. */
10381 if (GET_CODE (op1
) == SUBREG
)
10382 op1
= SUBREG_REG (op1
);
10383 if ((GET_CODE (op1
) == ZERO_EXTEND
10384 || GET_CODE (op1
) == SIGN_EXTEND
)
10385 && CONST_INT_P (XEXP (op0
, 1))
10386 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
10387 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
10388 op1
= XEXP (op1
, 0);
10390 if (CONST_INT_P (op1
))
10392 /* MOV immediate is assumed to always be cheap. */
10393 *cost
= COSTS_N_INSNS (1);
10399 *cost
+= extra_cost
->alu
.bfi
;
10400 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
10406 /* We can't make sense of this, assume default cost. */
10407 *cost
= COSTS_N_INSNS (1);
10413 /* If an instruction can incorporate a constant within the
10414 instruction, the instruction's expression avoids calling
10415 rtx_cost() on the constant. If rtx_cost() is called on a
10416 constant, then it is usually because the constant must be
10417 moved into a register by one or more instructions.
10419 The exception is constant 0, which can be expressed
10420 as XZR/WZR and is therefore free. The exception to this is
10421 if we have (set (reg) (const0_rtx)) in which case we must cost
10422 the move. However, we can catch that when we cost the SET, so
10423 we don't need to consider that here. */
10424 if (x
== const0_rtx
)
10428 /* To an approximation, building any other constant is
10429 proportionally expensive to the number of instructions
10430 required to build that constant. This is true whether we
10431 are compiling for SPEED or otherwise. */
10432 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
10433 int_mode
= word_mode
;
10434 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
10435 (NULL_RTX
, x
, false, int_mode
));
10441 /* First determine number of instructions to do the move
10442 as an integer constant. */
10443 if (!aarch64_float_const_representable_p (x
)
10444 && !aarch64_can_const_movi_rtx_p (x
, mode
)
10445 && aarch64_float_const_rtx_p (x
))
10447 unsigned HOST_WIDE_INT ival
;
10448 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
10449 gcc_assert (succeed
);
10451 scalar_int_mode imode
= (mode
== HFmode
10453 : int_mode_for_mode (mode
).require ());
10454 int ncost
= aarch64_internal_mov_immediate
10455 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
10456 *cost
+= COSTS_N_INSNS (ncost
);
10462 /* mov[df,sf]_aarch64. */
10463 if (aarch64_float_const_representable_p (x
))
10464 /* FMOV (scalar immediate). */
10465 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
10466 else if (!aarch64_float_const_zero_rtx_p (x
))
10468 /* This will be a load from memory. */
10469 if (mode
== DFmode
)
10470 *cost
+= extra_cost
->ldst
.loadd
;
10472 *cost
+= extra_cost
->ldst
.loadf
;
10475 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10476 or MOV v0.s[0], wzr - neither of which are modeled by the
10477 cost tables. Just use the default cost. */
10487 /* For loads we want the base cost of a load, plus an
10488 approximation for the additional cost of the addressing
10490 rtx address
= XEXP (x
, 0);
10491 if (VECTOR_MODE_P (mode
))
10492 *cost
+= extra_cost
->ldst
.loadv
;
10493 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10494 *cost
+= extra_cost
->ldst
.load
;
10495 else if (mode
== SFmode
)
10496 *cost
+= extra_cost
->ldst
.loadf
;
10497 else if (mode
== DFmode
)
10498 *cost
+= extra_cost
->ldst
.loadd
;
10501 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10510 if (VECTOR_MODE_P (mode
))
10515 *cost
+= extra_cost
->vect
.alu
;
10520 if (GET_MODE_CLASS (mode
) == MODE_INT
)
10522 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
10523 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
10526 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
10530 /* Cost this as SUB wzr, X. */
10531 op0
= CONST0_RTX (mode
);
10536 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10538 /* Support (neg(fma...)) as a single instruction only if
10539 sign of zeros is unimportant. This matches the decision
10540 making in aarch64.md. */
10541 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
10544 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
10547 if (GET_CODE (op0
) == MULT
)
10550 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
10555 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
10565 if (VECTOR_MODE_P (mode
))
10566 *cost
+= extra_cost
->vect
.alu
;
10568 *cost
+= extra_cost
->alu
.clz
;
10577 if (op1
== const0_rtx
10578 && GET_CODE (op0
) == AND
)
10581 mode
= GET_MODE (op0
);
10585 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
10587 /* TODO: A write to the CC flags possibly costs extra, this
10588 needs encoding in the cost tables. */
10590 mode
= GET_MODE (op0
);
10592 if (GET_CODE (op0
) == AND
)
10598 if (GET_CODE (op0
) == PLUS
)
10600 /* ADDS (and CMN alias). */
10605 if (GET_CODE (op0
) == MINUS
)
10612 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
10613 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
10614 && CONST_INT_P (XEXP (op0
, 2)))
10616 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10617 Handle it here directly rather than going to cost_logic
10618 since we know the immediate generated for the TST is valid
10619 so we can avoid creating an intermediate rtx for it only
10620 for costing purposes. */
10622 *cost
+= extra_cost
->alu
.logical
;
10624 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
10625 ZERO_EXTRACT
, 0, speed
);
10629 if (GET_CODE (op1
) == NEG
)
10633 *cost
+= extra_cost
->alu
.arith
;
10635 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
10636 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
10642 Compare can freely swap the order of operands, and
10643 canonicalization puts the more complex operation first.
10644 But the integer MINUS logic expects the shift/extend
10645 operation in op1. */
10647 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
10655 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
10659 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
10661 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
10663 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
10664 /* FCMP supports constant 0.0 for no extra cost. */
10670 if (VECTOR_MODE_P (mode
))
10672 /* Vector compare. */
10674 *cost
+= extra_cost
->vect
.alu
;
10676 if (aarch64_float_const_zero_rtx_p (op1
))
10678 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10692 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
10694 /* Detect valid immediates. */
10695 if ((GET_MODE_CLASS (mode
) == MODE_INT
10696 || (GET_MODE_CLASS (mode
) == MODE_CC
10697 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
10698 && CONST_INT_P (op1
)
10699 && aarch64_uimm12_shift (INTVAL (op1
)))
10702 /* SUB(S) (immediate). */
10703 *cost
+= extra_cost
->alu
.arith
;
10707 /* Look for SUB (extended register). */
10708 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
10709 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
10712 *cost
+= extra_cost
->alu
.extend_arith
;
10714 op1
= aarch64_strip_extend (op1
, true);
10715 *cost
+= rtx_cost (op1
, VOIDmode
,
10716 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
10720 rtx new_op1
= aarch64_strip_extend (op1
, false);
10722 /* Cost this as an FMA-alike operation. */
10723 if ((GET_CODE (new_op1
) == MULT
10724 || aarch64_shift_p (GET_CODE (new_op1
)))
10725 && code
!= COMPARE
)
10727 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
10728 (enum rtx_code
) code
,
10733 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
10737 if (VECTOR_MODE_P (mode
))
10740 *cost
+= extra_cost
->vect
.alu
;
10742 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10745 *cost
+= extra_cost
->alu
.arith
;
10747 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10750 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10764 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
10765 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
10768 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
10769 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
10773 if (GET_MODE_CLASS (mode
) == MODE_INT
10774 && (aarch64_plus_immediate (op1
, mode
)
10775 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
10777 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
10780 /* ADD (immediate). */
10781 *cost
+= extra_cost
->alu
.arith
;
10785 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
10787 /* Look for ADD (extended register). */
10788 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
10789 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
10792 *cost
+= extra_cost
->alu
.extend_arith
;
10794 op0
= aarch64_strip_extend (op0
, true);
10795 *cost
+= rtx_cost (op0
, VOIDmode
,
10796 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
10800 /* Strip any extend, leave shifts behind as we will
10801 cost them through mult_cost. */
10802 new_op0
= aarch64_strip_extend (op0
, false);
10804 if (GET_CODE (new_op0
) == MULT
10805 || aarch64_shift_p (GET_CODE (new_op0
)))
10807 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
10812 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
10816 if (VECTOR_MODE_P (mode
))
10819 *cost
+= extra_cost
->vect
.alu
;
10821 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10824 *cost
+= extra_cost
->alu
.arith
;
10826 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10829 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10836 *cost
= COSTS_N_INSNS (1);
10840 if (VECTOR_MODE_P (mode
))
10841 *cost
+= extra_cost
->vect
.alu
;
10843 *cost
+= extra_cost
->alu
.rev
;
10848 if (aarch_rev16_p (x
))
10850 *cost
= COSTS_N_INSNS (1);
10854 if (VECTOR_MODE_P (mode
))
10855 *cost
+= extra_cost
->vect
.alu
;
10857 *cost
+= extra_cost
->alu
.rev
;
10862 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
10864 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
10865 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
10867 *cost
+= extra_cost
->alu
.shift
;
10871 /* Fall through. */
10878 if (VECTOR_MODE_P (mode
))
10881 *cost
+= extra_cost
->vect
.alu
;
10886 && GET_CODE (op0
) == MULT
10887 && CONST_INT_P (XEXP (op0
, 1))
10888 && CONST_INT_P (op1
)
10889 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
10890 INTVAL (op1
)) != 0)
10892 /* This is a UBFM/SBFM. */
10893 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
10895 *cost
+= extra_cost
->alu
.bfx
;
10899 if (is_int_mode (mode
, &int_mode
))
10901 if (CONST_INT_P (op1
))
10903 /* We have a mask + shift version of a UBFIZ
10904 i.e. the *andim_ashift<mode>_bfiz pattern. */
10905 if (GET_CODE (op0
) == ASHIFT
10906 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
10909 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
10910 (enum rtx_code
) code
, 0, speed
);
10912 *cost
+= extra_cost
->alu
.bfx
;
10916 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
10918 /* We possibly get the immediate for free, this is not
10920 *cost
+= rtx_cost (op0
, int_mode
,
10921 (enum rtx_code
) code
, 0, speed
);
10923 *cost
+= extra_cost
->alu
.logical
;
10932 /* Handle ORN, EON, or BIC. */
10933 if (GET_CODE (op0
) == NOT
)
10934 op0
= XEXP (op0
, 0);
10936 new_op0
= aarch64_strip_shift (op0
);
10938 /* If we had a shift on op0 then this is a logical-shift-
10939 by-register/immediate operation. Otherwise, this is just
10940 a logical operation. */
10943 if (new_op0
!= op0
)
10945 /* Shift by immediate. */
10946 if (CONST_INT_P (XEXP (op0
, 1)))
10947 *cost
+= extra_cost
->alu
.log_shift
;
10949 *cost
+= extra_cost
->alu
.log_shift_reg
;
10952 *cost
+= extra_cost
->alu
.logical
;
10955 /* In both cases we want to cost both operands. */
10956 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
10958 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
10968 op0
= aarch64_strip_shift (x
);
10970 if (VECTOR_MODE_P (mode
))
10973 *cost
+= extra_cost
->vect
.alu
;
10977 /* MVN-shifted-reg. */
10980 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
10983 *cost
+= extra_cost
->alu
.log_shift
;
10987 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10988 Handle the second form here taking care that 'a' in the above can
10990 else if (GET_CODE (op0
) == XOR
)
10992 rtx newop0
= XEXP (op0
, 0);
10993 rtx newop1
= XEXP (op0
, 1);
10994 rtx op0_stripped
= aarch64_strip_shift (newop0
);
10996 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
10997 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
11001 if (op0_stripped
!= newop0
)
11002 *cost
+= extra_cost
->alu
.log_shift
;
11004 *cost
+= extra_cost
->alu
.logical
;
11011 *cost
+= extra_cost
->alu
.logical
;
11018 /* If a value is written in SI mode, then zero extended to DI
11019 mode, the operation will in general be free as a write to
11020 a 'w' register implicitly zeroes the upper bits of an 'x'
11021 register. However, if this is
11023 (set (reg) (zero_extend (reg)))
11025 we must cost the explicit register move. */
11027 && GET_MODE (op0
) == SImode
11030 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
11032 /* If OP_COST is non-zero, then the cost of the zero extend
11033 is effectively the cost of the inner operation. Otherwise
11034 we have a MOV instruction and we take the cost from the MOV
11035 itself. This is true independently of whether we are
11036 optimizing for space or time. */
11042 else if (MEM_P (op0
))
11044 /* All loads can zero extend to any size for free. */
11045 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
11049 op0
= aarch64_extend_bitfield_pattern_p (x
);
11052 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
11054 *cost
+= extra_cost
->alu
.bfx
;
11060 if (VECTOR_MODE_P (mode
))
11063 *cost
+= extra_cost
->vect
.alu
;
11067 /* We generate an AND instead of UXTB/UXTH. */
11068 *cost
+= extra_cost
->alu
.logical
;
11074 if (MEM_P (XEXP (x
, 0)))
11079 rtx address
= XEXP (XEXP (x
, 0), 0);
11080 *cost
+= extra_cost
->ldst
.load_sign_extend
;
11083 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
11089 op0
= aarch64_extend_bitfield_pattern_p (x
);
11092 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
11094 *cost
+= extra_cost
->alu
.bfx
;
11100 if (VECTOR_MODE_P (mode
))
11101 *cost
+= extra_cost
->vect
.alu
;
11103 *cost
+= extra_cost
->alu
.extend
;
11111 if (CONST_INT_P (op1
))
11115 if (VECTOR_MODE_P (mode
))
11117 /* Vector shift (immediate). */
11118 *cost
+= extra_cost
->vect
.alu
;
11122 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
11124 *cost
+= extra_cost
->alu
.shift
;
11128 /* We can incorporate zero/sign extend for free. */
11129 if (GET_CODE (op0
) == ZERO_EXTEND
11130 || GET_CODE (op0
) == SIGN_EXTEND
)
11131 op0
= XEXP (op0
, 0);
11133 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
11138 if (VECTOR_MODE_P (mode
))
11141 /* Vector shift (register). */
11142 *cost
+= extra_cost
->vect
.alu
;
11148 *cost
+= extra_cost
->alu
.shift_reg
;
11150 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
11151 && CONST_INT_P (XEXP (op1
, 1))
11152 && known_eq (INTVAL (XEXP (op1
, 1)),
11153 GET_MODE_BITSIZE (mode
) - 1))
11155 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
11156 /* We already demanded XEXP (op1, 0) to be REG_P, so
11157 don't recurse into it. */
11161 return false; /* All arguments need to be in registers. */
11171 if (CONST_INT_P (op1
))
11173 /* ASR (immediate) and friends. */
11176 if (VECTOR_MODE_P (mode
))
11177 *cost
+= extra_cost
->vect
.alu
;
11179 *cost
+= extra_cost
->alu
.shift
;
11182 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
11187 if (VECTOR_MODE_P (mode
))
11190 /* Vector shift (register). */
11191 *cost
+= extra_cost
->vect
.alu
;
11196 /* ASR (register) and friends. */
11197 *cost
+= extra_cost
->alu
.shift_reg
;
11199 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
11200 && CONST_INT_P (XEXP (op1
, 1))
11201 && known_eq (INTVAL (XEXP (op1
, 1)),
11202 GET_MODE_BITSIZE (mode
) - 1))
11204 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
11205 /* We already demanded XEXP (op1, 0) to be REG_P, so
11206 don't recurse into it. */
11210 return false; /* All arguments need to be in registers. */
11215 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
11216 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
11220 *cost
+= extra_cost
->ldst
.load
;
11222 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
11223 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
11225 /* ADRP, followed by ADD. */
11226 *cost
+= COSTS_N_INSNS (1);
11228 *cost
+= 2 * extra_cost
->alu
.arith
;
11230 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
11231 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
11235 *cost
+= extra_cost
->alu
.arith
;
11240 /* One extra load instruction, after accessing the GOT. */
11241 *cost
+= COSTS_N_INSNS (1);
11243 *cost
+= extra_cost
->ldst
.load
;
11249 /* ADRP/ADD (immediate). */
11251 *cost
+= extra_cost
->alu
.arith
;
11259 if (VECTOR_MODE_P (mode
))
11260 *cost
+= extra_cost
->vect
.alu
;
11262 *cost
+= extra_cost
->alu
.bfx
;
11265 /* We can trust that the immediates used will be correct (there
11266 are no by-register forms), so we need only cost op0. */
11267 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
11271 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
11272 /* aarch64_rtx_mult_cost always handles recursion to its
11277 /* We can expand signed mod by power of 2 using a NEGS, two parallel
11278 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
11279 an unconditional negate. This case should only ever be reached through
11280 the set_smod_pow2_cheap check in expmed.c. */
11281 if (CONST_INT_P (XEXP (x
, 1))
11282 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
11283 && (mode
== SImode
|| mode
== DImode
))
11285 /* We expand to 4 instructions. Reset the baseline. */
11286 *cost
= COSTS_N_INSNS (4);
11289 *cost
+= 2 * extra_cost
->alu
.logical
11290 + 2 * extra_cost
->alu
.arith
;
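	  /* Rough sketch (added for illustration, not taken from the md
	     patterns) of the expansion for SImode "x % 4":

	       negs  w1, w0          // w1 = -x, flags set from the negation
	       and   w0, w0, #3      // low remainder bits of x
	       and   w1, w1, #3      // low remainder bits of -x
	       csneg w0, w0, w1, mi  // select/negate based on the sign of x

	     i.e. four instructions, which matches the COSTS_N_INSNS (4)
	     baseline set above.  */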
11295 /* Fall-through. */
11299 /* Slightly prefer UMOD over SMOD. */
11300 if (VECTOR_MODE_P (mode
))
11301 *cost
+= extra_cost
->vect
.alu
;
11302 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11303 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
11304 + extra_cost
->mult
[mode
== DImode
].idiv
11305 + (code
== MOD
? 1 : 0));
11307 return false; /* All arguments need to be in registers. */
11314 if (VECTOR_MODE_P (mode
))
11315 *cost
+= extra_cost
->vect
.alu
;
11316 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11317 /* There is no integer SQRT, so only DIV and UDIV can get
11319 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
11320 /* Slightly prefer UDIV over SDIV. */
11321 + (code
== DIV
? 1 : 0));
11323 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
11325 return false; /* All arguments need to be in registers. */
11328 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
11329 XEXP (x
, 2), cost
, speed
);
11342 return false; /* All arguments must be in registers. */
11351 if (VECTOR_MODE_P (mode
))
11352 *cost
+= extra_cost
->vect
.alu
;
11354 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
11357 /* FMSUB, FNMADD, and FNMSUB are free. */
11358 if (GET_CODE (op0
) == NEG
)
11359 op0
= XEXP (op0
, 0);
11361 if (GET_CODE (op2
) == NEG
)
11362 op2
= XEXP (op2
, 0);
11364 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11365 and the by-element operand as operand 0. */
11366 if (GET_CODE (op1
) == NEG
)
11367 op1
= XEXP (op1
, 0);
11369 /* Catch vector-by-element operations. The by-element operand can
11370 either be (vec_duplicate (vec_select (x))) or just
11371 (vec_select (x)), depending on whether we are multiplying by
11372 a vector or a scalar.
11374 Canonicalization is not very good in these cases, FMA4 will put the
11375 by-element operand as operand 0, FNMA4 will have it as operand 1. */
11376 if (GET_CODE (op0
) == VEC_DUPLICATE
)
11377 op0
= XEXP (op0
, 0);
11378 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
11379 op1
= XEXP (op1
, 0);
11381 if (GET_CODE (op0
) == VEC_SELECT
)
11382 op0
= XEXP (op0
, 0);
11383 else if (GET_CODE (op1
) == VEC_SELECT
)
11384 op1
= XEXP (op1
, 0);
11386 /* If the remaining parameters are not registers,
11387 get the cost to put them into registers. */
11388 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
11389 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
11390 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
11394 case UNSIGNED_FLOAT
:
11396 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
11402 if (VECTOR_MODE_P (mode
))
11404 /* Vector truncate. */
11405 *cost
+= extra_cost
->vect
.alu
;
11408 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
11412 case FLOAT_TRUNCATE
:
11415 if (VECTOR_MODE_P (mode
))
11417 /* Vector conversion. */
11418 *cost
+= extra_cost
->vect
.alu
;
11421 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
11428 /* Strip the rounding part. They will all be implemented
11429 by the fcvt* family of instructions anyway. */
11430 if (GET_CODE (x
) == UNSPEC
)
11432 unsigned int uns_code
= XINT (x
, 1);
11434 if (uns_code
== UNSPEC_FRINTA
11435 || uns_code
== UNSPEC_FRINTM
11436 || uns_code
== UNSPEC_FRINTN
11437 || uns_code
== UNSPEC_FRINTP
11438 || uns_code
== UNSPEC_FRINTZ
)
11439 x
= XVECEXP (x
, 0, 0);
11444 if (VECTOR_MODE_P (mode
))
11445 *cost
+= extra_cost
->vect
.alu
;
11447 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
11450 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11451 fixed-point fcvt. */
11452 if (GET_CODE (x
) == MULT
11453 && ((VECTOR_MODE_P (mode
)
11454 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
11455 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
11457 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
11462 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
11466 if (VECTOR_MODE_P (mode
))
11468 /* ABS (vector). */
11470 *cost
+= extra_cost
->vect
.alu
;
11472 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11476 /* FABD, which is analogous to FADD. */
11477 if (GET_CODE (op0
) == MINUS
)
11479 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
11480 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
11482 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11486 /* Simple FABS is analogous to FNEG. */
11488 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
11492 /* Integer ABS will either be split to
11493 two arithmetic instructions, or will be an ABS
11494 (scalar), which we don't model. */
11495 *cost
= COSTS_N_INSNS (2);
11497 *cost
+= 2 * extra_cost
->alu
.arith
;
11505 if (VECTOR_MODE_P (mode
))
11506 *cost
+= extra_cost
->vect
.alu
;
11509 /* FMAXNM/FMINNM/FMAX/FMIN.
11510 TODO: This may not be accurate for all implementations, but
11511 we do not model this in the cost tables. */
11512 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11518 /* The floating point round to integer frint* instructions. */
11519 if (aarch64_frint_unspec_p (XINT (x
, 1)))
11522 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
11527 if (XINT (x
, 1) == UNSPEC_RBIT
)
11530 *cost
+= extra_cost
->alu
.rev
;
11538 /* Decompose <su>muldi3_highpart. */
11539 if (/* (truncate:DI */
11542 && GET_MODE (XEXP (x
, 0)) == TImode
11543 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
11545 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
11546 /* (ANY_EXTEND:TI (reg:DI))
11547 (ANY_EXTEND:TI (reg:DI))) */
11548 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
11549 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
11550 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
11551 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
11552 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
11553 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
11554 /* (const_int 64) */
11555 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
11556 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
11560 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
11561 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
11562 mode
, MULT
, 0, speed
);
11563 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
11564 mode
, MULT
, 1, speed
);
      /* Fall through.  */
    default:
      break;
    }

  if (dump_file
      && flag_aarch64_verbose_cost)
    fprintf (dump_file,
	     "\nFailed to cost RTX.  Assuming default cost.\n");

  return true;
}
/* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
   calculated for X.  This cost is stored in *COST.  Returns true
   if the total cost of X was calculated.  */
static bool
aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
			   int param, int *cost, bool speed)
{
  bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);

  if (dump_file
      && flag_aarch64_verbose_cost)
    {
      print_rtl_single (dump_file, x);
      fprintf (dump_file, "\n%s cost: %d (%s)\n",
	       speed ? "Hot" : "Cold",
	       *cost, result ? "final" : "partial");
    }

  return result;
}
static int
aarch64_register_move_cost (machine_mode mode,
			    reg_class_t from_i, reg_class_t to_i)
{
  enum reg_class from = (enum reg_class) from_i;
  enum reg_class to = (enum reg_class) to_i;
  const struct cpu_regmove_cost *regmove_cost
    = aarch64_tune_params.regmove_cost;

  /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
  if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
    to = GENERAL_REGS;

  if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
    from = GENERAL_REGS;

  /* Moving between GPR and stack cost is the same as GP2GP.  */
  if ((from == GENERAL_REGS && to == STACK_REG)
      || (to == GENERAL_REGS && from == STACK_REG))
    return regmove_cost->GP2GP;

  /* To/From the stack register, we move via the gprs.  */
  if (to == STACK_REG || from == STACK_REG)
    return aarch64_register_move_cost (mode, from, GENERAL_REGS)
	   + aarch64_register_move_cost (mode, GENERAL_REGS, to);

  if (known_eq (GET_MODE_SIZE (mode), 16))
    {
      /* 128-bit operations on general registers require 2 instructions.  */
      if (from == GENERAL_REGS && to == GENERAL_REGS)
	return regmove_cost->GP2GP * 2;
      else if (from == GENERAL_REGS)
	return regmove_cost->GP2FP * 2;
      else if (to == GENERAL_REGS)
	return regmove_cost->FP2GP * 2;

      /* When AdvSIMD instructions are disabled it is not possible to move
	 a 128-bit value directly between Q registers.  This is handled in
	 secondary reload.  A general register is used as a scratch to move
	 the upper DI value and the lower DI value is moved directly,
	 hence the cost is the sum of three moves.  */
      if (!TARGET_SIMD)
	return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;

      return regmove_cost->FP2FP;
    }

  if (from == GENERAL_REGS && to == GENERAL_REGS)
    return regmove_cost->GP2GP;
  else if (from == GENERAL_REGS)
    return regmove_cost->GP2FP;
  else if (to == GENERAL_REGS)
    return regmove_cost->FP2GP;

  return regmove_cost->FP2FP;
}
static int
aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
			  reg_class_t rclass ATTRIBUTE_UNUSED,
			  bool in ATTRIBUTE_UNUSED)
{
  return aarch64_tune_params.memmov_cost;
}
/* Implement TARGET_INIT_BUILTINS.  */
static void
aarch64_init_builtins ()
{
  aarch64_general_init_builtins ();
}
/* Implement TARGET_FOLD_BUILTIN.  */
static tree
aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
{
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  tree type = TREE_TYPE (TREE_TYPE (fndecl));
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_fold_builtin (subcode, type, nargs, args);
    }
  gcc_unreachable ();
}
/* Implement TARGET_GIMPLE_FOLD_BUILTIN.  */
static bool
aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
{
  gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
  tree fndecl = gimple_call_fndecl (stmt);
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  gimple *new_stmt = NULL;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
      break;
    }

  if (!new_stmt)
    return false;

  gsi_replace (gsi, new_stmt, true);
  return true;
}
/* Implement TARGET_EXPAND_BUILTIN.  */
static rtx
aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_expand_builtin (subcode, exp, target);
    }
  gcc_unreachable ();
}
/* Implement TARGET_BUILTIN_DECL.  */
static tree
aarch64_builtin_decl (unsigned int code, bool initialize_p)
{
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_builtin_decl (subcode, initialize_p);
    }
  gcc_unreachable ();
}
/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
   to optimize 1.0/sqrt.  */

static bool
use_rsqrt_p (machine_mode mode)
{
  return (!flag_trapping_math
	  && flag_unsafe_math_optimizations
	  && ((aarch64_tune_params.approx_modes->recip_sqrt
	       & AARCH64_APPROX_MODE (mode))
	      || flag_mrecip_low_precision_sqrt));
}
/* Function to decide when to use the approximate reciprocal square root
   builtin.  */

static tree
aarch64_builtin_reciprocal (tree fndecl)
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));

  if (!use_rsqrt_p (mode))
    return NULL_TREE;
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_builtin_rsqrt (subcode);
    }
  gcc_unreachable ();
}
11773 /* Emit instruction sequence to compute either the approximate square root
11774 or its approximate reciprocal, depending on the flag RECP, and return
11775 whether the sequence was emitted or not. */
11778 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
11780 machine_mode mode
= GET_MODE (dst
);
11782 if (GET_MODE_INNER (mode
) == HFmode
)
11784 gcc_assert (!recp
);
11790 if (!(flag_mlow_precision_sqrt
11791 || (aarch64_tune_params
.approx_modes
->sqrt
11792 & AARCH64_APPROX_MODE (mode
))))
11795 if (flag_finite_math_only
11796 || flag_trapping_math
11797 || !flag_unsafe_math_optimizations
11798 || optimize_function_for_size_p (cfun
))
11802 /* Caller assumes we cannot fail. */
11803 gcc_assert (use_rsqrt_p (mode
));
11805 machine_mode mmsk
= mode_for_int_vector (mode
).require ();
11806 rtx xmsk
= gen_reg_rtx (mmsk
);
11808 /* When calculating the approximate square root, compare the
11809 argument with 0.0 and create a mask. */
11810 emit_insn (gen_rtx_SET (xmsk
,
11812 gen_rtx_EQ (mmsk
, src
,
11813 CONST0_RTX (mode
)))));
11815 /* Estimate the approximate reciprocal square root. */
11816 rtx xdst
= gen_reg_rtx (mode
);
11817 emit_insn (gen_aarch64_rsqrte (mode
, xdst
, src
));
11819 /* Iterate over the series twice for SF and thrice for DF. */
11820 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
11822 /* Optionally iterate over the series once less for faster performance
11823 while sacrificing the accuracy. */
11824 if ((recp
&& flag_mrecip_low_precision_sqrt
)
11825 || (!recp
&& flag_mlow_precision_sqrt
))
11828 /* Iterate over the series to calculate the approximate reciprocal square
11830 rtx x1
= gen_reg_rtx (mode
);
11831 while (iterations
--)
11833 rtx x2
= gen_reg_rtx (mode
);
11834 emit_set_insn (x2
, gen_rtx_MULT (mode
, xdst
, xdst
));
11836 emit_insn (gen_aarch64_rsqrts (mode
, x1
, src
, x2
));
11838 if (iterations
> 0)
11839 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, x1
));
11844 /* Qualify the approximate reciprocal square root when the argument is
11845 0.0 by squashing the intermediary result to 0.0. */
11846 rtx xtmp
= gen_reg_rtx (mmsk
);
11847 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
11848 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
11849 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
11851 /* Calculate the approximate square root. */
11852 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, src
));
11855 /* Finalize the approximation. */
11856 emit_set_insn (dst
, gen_rtx_MULT (mode
, xdst
, x1
));
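/* Note added for clarity (not in the original comments): each FRSQRTS step
   in the loop above performs one Newton-Raphson iteration for 1/sqrt(d),

     x_{n+1} = x_n * (3 - d * x_n^2) / 2

   where FRSQRTS computes (3 - d * x_n^2) / 2 from d and the squared
   estimate, roughly doubling the number of accurate bits per step; hence
   two iterations for SF and three for DF.  */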
11861 /* Emit the instruction sequence to compute the approximation for the division
11862 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11865 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
11867 machine_mode mode
= GET_MODE (quo
);
11869 if (GET_MODE_INNER (mode
) == HFmode
)
11872 bool use_approx_division_p
= (flag_mlow_precision_div
11873 || (aarch64_tune_params
.approx_modes
->division
11874 & AARCH64_APPROX_MODE (mode
)));
11876 if (!flag_finite_math_only
11877 || flag_trapping_math
11878 || !flag_unsafe_math_optimizations
11879 || optimize_function_for_size_p (cfun
)
11880 || !use_approx_division_p
)
11883 if (!TARGET_SIMD
&& VECTOR_MODE_P (mode
))
11886 /* Estimate the approximate reciprocal. */
11887 rtx xrcp
= gen_reg_rtx (mode
);
11888 emit_insn (gen_aarch64_frecpe (mode
, xrcp
, den
));
11890 /* Iterate over the series twice for SF and thrice for DF. */
11891 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
11893 /* Optionally iterate over the series once less for faster performance,
11894 while sacrificing the accuracy. */
11895 if (flag_mlow_precision_div
)
11898 /* Iterate over the series to calculate the approximate reciprocal. */
11899 rtx xtmp
= gen_reg_rtx (mode
);
11900 while (iterations
--)
11902 emit_insn (gen_aarch64_frecps (mode
, xtmp
, xrcp
, den
));
11904 if (iterations
> 0)
11905 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
11908 if (num
!= CONST1_RTX (mode
))
11910 /* As the approximate reciprocal of DEN is already calculated, only
11911 calculate the approximate division when NUM is not 1.0. */
11912 rtx xnum
= force_reg (mode
, num
);
11913 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xnum
));
11916 /* Finalize the approximation. */
11917 emit_set_insn (quo
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
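/* Note added for clarity (not in the original comments): each FRECPS step
   in the loop above performs one Newton-Raphson iteration for 1/d,

     x_{n+1} = x_n * (2 - d * x_n)

   where FRECPE provides the initial estimate and FRECPS computes
   (2 - d * x_n); the quotient is then formed as num * (1/den).  */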
/* Return the number of instructions that can be issued per cycle.  */
static int
aarch64_sched_issue_rate (void)
{
  return aarch64_tune_params.issue_rate;
}
/* Implement TARGET_SCHED_VARIABLE_ISSUE.  */
static int
aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
{
  if (DEBUG_INSN_P (insn))
    return more;

  rtx_code code = GET_CODE (PATTERN (insn));
  if (code == USE || code == CLOBBER)
    return more;

  if (get_attr_type (insn) == TYPE_NO_INSN)
    return more;

  return more - 1;
}
static int
aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
{
  int issue_rate = aarch64_sched_issue_rate ();

  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
}

/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
   autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
   has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */

static int
aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
						   int ready_index)
{
  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
}
/* Vectorizer cost model target hooks.  */

/* Implement targetm.vectorize.builtin_vectorization_cost.  */
static int
aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
				    tree vectype,
				    int misalign ATTRIBUTE_UNUSED)
{
  unsigned elements;
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  bool fp = false;

  if (vectype != NULL)
    fp = FLOAT_TYPE_P (vectype);

  switch (type_of_cost)
    {
      case scalar_stmt:
	return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;

      case scalar_load:
	return costs->scalar_load_cost;

      case scalar_store:
	return costs->scalar_store_cost;

      case vector_stmt:
	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;

      case vector_load:
	return costs->vec_align_load_cost;

      case vector_store:
	return costs->vec_store_cost;

      case vec_to_scalar:
	return costs->vec_to_scalar_cost;

      case scalar_to_vec:
	return costs->scalar_to_vec_cost;

      case unaligned_load:
      case vector_gather_load:
	return costs->vec_unalign_load_cost;

      case unaligned_store:
      case vector_scatter_store:
	return costs->vec_unalign_store_cost;

      case cond_branch_taken:
	return costs->cond_taken_branch_cost;

      case cond_branch_not_taken:
	return costs->cond_not_taken_branch_cost;

      case vec_perm:
	return costs->vec_permute_cost;

      case vec_promote_demote:
	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;

      case vec_construct:
	elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
	return elements / 2 + 1;

      default:
	gcc_unreachable ();
    }
}
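/* Worked example (added for illustration): constructing a V4SI vector has
   TYPE_VECTOR_SUBPARTS == 4, so the vec_construct case above evaluates to
   4 / 2 + 1 == 3 cost units.  */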
/* Implement targetm.vectorize.add_stmt_cost.  */
static unsigned
aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
		       struct _stmt_vec_info *stmt_info, int misalign,
		       enum vect_cost_model_location where)
{
  unsigned *cost = (unsigned *) data;
  unsigned retval = 0;

  if (flag_vect_cost_model)
    {
      tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
      int stmt_cost =
	    aarch64_builtin_vectorization_cost (kind, vectype, misalign);

      /* Statements in an inner loop relative to the loop being
	 vectorized are weighted more heavily.  The value here is
	 arbitrary and could potentially be improved with analysis.  */
      if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
	count *= 50; /*  FIXME  */

      retval = (unsigned) (count * stmt_cost);
      cost[where] += retval;
    }

  return retval;
}
12064 static void initialize_aarch64_code_model (struct gcc_options
*);
12066 /* Parse the TO_PARSE string and put the architecture struct that it
12067 selects into RES and the architectural features into ISA_FLAGS.
12068 Return an aarch64_parse_opt_result describing the parse result.
12069 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
12070 When the TO_PARSE string contains an invalid extension,
12071 a copy of the string is created and stored to INVALID_EXTENSION. */
12073 static enum aarch64_parse_opt_result
12074 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
12075 uint64_t *isa_flags
, std::string
*invalid_extension
)
12078 const struct processor
*arch
;
12081 ext
= strchr (to_parse
, '+');
12084 len
= ext
- to_parse
;
12086 len
= strlen (to_parse
);
12089 return AARCH64_PARSE_MISSING_ARG
;
12092 /* Loop through the list of supported ARCHes to find a match. */
12093 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
12095 if (strlen (arch
->name
) == len
12096 && strncmp (arch
->name
, to_parse
, len
) == 0)
12098 uint64_t isa_temp
= arch
->flags
;
12102 /* TO_PARSE string contains at least one extension. */
12103 enum aarch64_parse_opt_result ext_res
12104 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
12106 if (ext_res
!= AARCH64_PARSE_OK
)
12109 /* Extension parsing was successful. Confirm the result
12110 arch and ISA flags. */
12112 *isa_flags
= isa_temp
;
12113 return AARCH64_PARSE_OK
;
12117 /* ARCH name not found in list. */
12118 return AARCH64_PARSE_INVALID_ARG
;
12121 /* Parse the TO_PARSE string and put the result tuning in RES and the
12122 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
12123 describing the parse result. If there is an error parsing, RES and
12124 ISA_FLAGS are left unchanged.
12125 When the TO_PARSE string contains an invalid extension,
12126 a copy of the string is created and stored to INVALID_EXTENSION. */
12128 static enum aarch64_parse_opt_result
12129 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
12130 uint64_t *isa_flags
, std::string
*invalid_extension
)
12133 const struct processor
*cpu
;
12136 ext
= strchr (to_parse
, '+');
12139 len
= ext
- to_parse
;
12141 len
= strlen (to_parse
);
12144 return AARCH64_PARSE_MISSING_ARG
;
12147 /* Loop through the list of supported CPUs to find a match. */
12148 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
12150 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
12152 uint64_t isa_temp
= cpu
->flags
;
12157 /* TO_PARSE string contains at least one extension. */
12158 enum aarch64_parse_opt_result ext_res
12159 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
12161 if (ext_res
!= AARCH64_PARSE_OK
)
12164 /* Extension parsing was successful. Confirm the result
12165 cpu and ISA flags. */
12167 *isa_flags
= isa_temp
;
12168 return AARCH64_PARSE_OK
;
12172 /* CPU name not found in list. */
12173 return AARCH64_PARSE_INVALID_ARG
;
/* Parse the TO_PARSE string and put the cpu it selects into RES.
   Return an aarch64_parse_opt_result describing the parse result.
   If the parsing fails the RES does not change.  */

static enum aarch64_parse_opt_result
aarch64_parse_tune (const char *to_parse, const struct processor **res)
{
  const struct processor *cpu;

  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)
    {
      if (strcmp (cpu->name, to_parse) == 0)
	{
	  *res = cpu;
	  return AARCH64_PARSE_OK;
	}
    }

  /* CPU name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;
}
12199 /* Parse TOKEN, which has length LENGTH to see if it is an option
12200 described in FLAG. If it is, return the index bit for that fusion type.
12201 If not, error (printing OPTION_NAME) and return zero. */
12203 static unsigned int
12204 aarch64_parse_one_option_token (const char *token
,
12206 const struct aarch64_flag_desc
*flag
,
12207 const char *option_name
)
12209 for (; flag
->name
!= NULL
; flag
++)
12211 if (length
== strlen (flag
->name
)
12212 && !strncmp (flag
->name
, token
, length
))
12216 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name
, token
);
12220 /* Parse OPTION which is a comma-separated list of flags to enable.
12221 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12222 default state we inherit from the CPU tuning structures. OPTION_NAME
12223 gives the top-level option we are parsing in the -moverride string,
12224 for use in error messages. */
12226 static unsigned int
12227 aarch64_parse_boolean_options (const char *option
,
12228 const struct aarch64_flag_desc
*flags
,
12229 unsigned int initial_state
,
12230 const char *option_name
)
12232 const char separator
= '.';
12233 const char* specs
= option
;
12234 const char* ntoken
= option
;
12235 unsigned int found_flags
= initial_state
;
12237 while ((ntoken
= strchr (specs
, separator
)))
12239 size_t token_length
= ntoken
- specs
;
12240 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
12244 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12245 in the token stream, reset the supported operations. So:
12247 adrp+add.cmp+branch.none.adrp+add
12249 would have the result of turning on only adrp+add fusion. */
12253 found_flags
|= token_ops
;
12257 /* We ended with a comma, print something. */
12260 error ("%s string ill-formed\n", option_name
);
12264 /* We still have one more token to parse. */
12265 size_t token_length
= strlen (specs
);
12266 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
12273 found_flags
|= token_ops
;
12274 return found_flags
;
12277 /* Support for overriding instruction fusion. */
12280 aarch64_parse_fuse_string (const char *fuse_string
,
12281 struct tune_params
*tune
)
12283 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
12284 aarch64_fusible_pairs
,
12289 /* Support for overriding other tuning flags. */
12292 aarch64_parse_tune_string (const char *tune_string
,
12293 struct tune_params
*tune
)
12295 tune
->extra_tuning_flags
12296 = aarch64_parse_boolean_options (tune_string
,
12297 aarch64_tuning_flags
,
12298 tune
->extra_tuning_flags
,
12302 /* Parse the sve_width tuning moverride string in TUNE_STRING.
12303 Accept the valid SVE vector widths allowed by
12304 aarch64_sve_vector_bits_enum and use it to override sve_width
12308 aarch64_parse_sve_width_string (const char *tune_string
,
12309 struct tune_params
*tune
)
12313 int n
= sscanf (tune_string
, "%d", &width
);
12316 error ("invalid format for sve_width");
12328 error ("invalid sve_width value: %d", width
);
12330 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
12333 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
12334 we understand. If it is, extract the option string and handoff to
12335 the appropriate function. */
12338 aarch64_parse_one_override_token (const char* token
,
12340 struct tune_params
*tune
)
12342 const struct aarch64_tuning_override_function
*fn
12343 = aarch64_tuning_override_functions
;
12345 const char *option_part
= strchr (token
, '=');
12348 error ("tuning string missing in option (%s)", token
);
12352 /* Get the length of the option name. */
12353 length
= option_part
- token
;
12354 /* Skip the '=' to get to the option string. */
12357 for (; fn
->name
!= NULL
; fn
++)
12359 if (!strncmp (fn
->name
, token
, length
))
12361 fn
->parse_override (option_part
, tune
);
12366 error ("unknown tuning option (%s)",token
);
12370 /* A checking mechanism for the implementation of the tls size. */
12373 initialize_aarch64_tls_size (struct gcc_options
*opts
)
12375 if (aarch64_tls_size
== 0)
12376 aarch64_tls_size
= 24;
12378 switch (opts
->x_aarch64_cmodel_var
)
12380 case AARCH64_CMODEL_TINY
:
12381 /* Both the default and maximum TLS size allowed under tiny is 1M which
12382 needs two instructions to address, so we clamp the size to 24. */
12383 if (aarch64_tls_size
> 24)
12384 aarch64_tls_size
= 24;
12386 case AARCH64_CMODEL_SMALL
:
12387 /* The maximum TLS size allowed under small is 4G. */
12388 if (aarch64_tls_size
> 32)
12389 aarch64_tls_size
= 32;
12391 case AARCH64_CMODEL_LARGE
:
12392 /* The maximum TLS size allowed under large is 16E.
12393 FIXME: 16E should be 64bit, we only support 48bit offset now. */
12394 if (aarch64_tls_size
> 48)
12395 aarch64_tls_size
= 48;
12398 gcc_unreachable ();
12404 /* Parse STRING looking for options in the format:
12405 string :: option:string
12406 option :: name=substring
12408 substring :: defined by option. */
12411 aarch64_parse_override_string (const char* input_string
,
12412 struct tune_params
* tune
)
12414 const char separator
= ':';
12415 size_t string_length
= strlen (input_string
) + 1;
12416 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
12417 char *string
= string_root
;
12418 strncpy (string
, input_string
, string_length
);
12419 string
[string_length
- 1] = '\0';
12421 char* ntoken
= string
;
12423 while ((ntoken
= strchr (string
, separator
)))
12425 size_t token_length
= ntoken
- string
;
12426 /* Make this substring look like a string. */
12428 aarch64_parse_one_override_token (string
, token_length
, tune
);
12432 /* One last option to parse. */
12433 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
12434 free (string_root
);
static void
aarch64_override_options_after_change_1 (struct gcc_options *opts)
{
  if (accepted_branch_protection_string)
    {
      opts->x_aarch64_branch_protection_string
        = xstrdup (accepted_branch_protection_string);
    }

  /* PR 70044: We have to be careful about being called multiple times for the
     same function.  This means all changes should be repeatable.  */

  /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
     Disable the frame pointer flag so the mid-end will not use a frame
     pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
     Set x_flag_omit_frame_pointer to the special value 2 to differentiate
     between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2).  */
  aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
  if (opts->x_flag_omit_frame_pointer == 0)
    opts->x_flag_omit_frame_pointer = 2;

  /* If not optimizing for size, set the default
     alignment to what the target wants.  */
  if (!opts->x_optimize_size)
    {
      if (opts->x_flag_align_loops && !opts->x_str_align_loops)
        opts->x_str_align_loops = aarch64_tune_params.loop_align;
      if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
        opts->x_str_align_jumps = aarch64_tune_params.jump_align;
      if (opts->x_flag_align_functions && !opts->x_str_align_functions)
        opts->x_str_align_functions = aarch64_tune_params.function_align;
    }

  /* We default to no pc-relative literal loads.  */
  aarch64_pcrelative_literal_loads = false;

  /* If -mpc-relative-literal-loads is set on the command line, this
     implies that the user asked for PC relative literal loads.  */
  if (opts->x_pcrelative_literal_loads == 1)
    aarch64_pcrelative_literal_loads = true;

  /* In the tiny memory model it makes no sense to disallow PC relative
     literal pool loads.  */
  if (aarch64_cmodel == AARCH64_CMODEL_TINY
      || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
    aarch64_pcrelative_literal_loads = true;

  /* When enabling the lower precision Newton series for the square root, also
     enable it for the reciprocal square root, since the latter is an
     intermediary step for the former.  */
  if (flag_mlow_precision_sqrt)
    flag_mrecip_low_precision_sqrt = true;
}
12493 /* 'Unpack' up the internal tuning structs and update the options
12494 in OPTS. The caller must have set up selected_tune and selected_arch
12495 as all the other target-specific codegen decisions are
12496 derived from them. */
12499 aarch64_override_options_internal (struct gcc_options
*opts
)
12501 aarch64_tune_flags
= selected_tune
->flags
;
12502 aarch64_tune
= selected_tune
->sched_core
;
12503 /* Make a copy of the tuning parameters attached to the core, which
12504 we may later overwrite. */
12505 aarch64_tune_params
= *(selected_tune
->tune
);
12506 aarch64_architecture_version
= selected_arch
->architecture_version
;
12508 if (opts
->x_aarch64_override_tune_string
)
12509 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
12510 &aarch64_tune_params
);
12512 /* This target defaults to strict volatile bitfields. */
12513 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
12514 opts
->x_flag_strict_volatile_bitfields
= 1;
12516 if (aarch64_stack_protector_guard
== SSP_GLOBAL
12517 && opts
->x_aarch64_stack_protector_guard_offset_str
)
12519 error ("incompatible options %<-mstack-protector-guard=global%> and "
12520 "%<-mstack-protector-guard-offset=%s%>",
12521 aarch64_stack_protector_guard_offset_str
);
12524 if (aarch64_stack_protector_guard
== SSP_SYSREG
12525 && !(opts
->x_aarch64_stack_protector_guard_offset_str
12526 && opts
->x_aarch64_stack_protector_guard_reg_str
))
12528 error ("both %<-mstack-protector-guard-offset%> and "
12529 "%<-mstack-protector-guard-reg%> must be used "
12530 "with %<-mstack-protector-guard=sysreg%>");
12533 if (opts
->x_aarch64_stack_protector_guard_reg_str
)
12535 if (strlen (opts
->x_aarch64_stack_protector_guard_reg_str
) > 100)
12536 error ("specify a system register with a small string length.");
12539 if (opts
->x_aarch64_stack_protector_guard_offset_str
)
12542 const char *str
= aarch64_stack_protector_guard_offset_str
;
12544 long offs
= strtol (aarch64_stack_protector_guard_offset_str
, &end
, 0);
12545 if (!*str
|| *end
|| errno
)
12546 error ("%qs is not a valid offset in %qs", str
,
12547 "-mstack-protector-guard-offset=");
12548 aarch64_stack_protector_guard_offset
= offs
;
12551 initialize_aarch64_code_model (opts
);
12552 initialize_aarch64_tls_size (opts
);
12554 int queue_depth
= 0;
12555 switch (aarch64_tune_params
.autoprefetcher_model
)
12557 case tune_params::AUTOPREFETCHER_OFF
:
12560 case tune_params::AUTOPREFETCHER_WEAK
:
12563 case tune_params::AUTOPREFETCHER_STRONG
:
12564 queue_depth
= max_insn_queue_index
+ 1;
12567 gcc_unreachable ();
12570 /* We don't mind passing in global_options_set here as we don't use
12571 the *options_set structs anyway. */
12572 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
,
12574 opts
->x_param_values
,
12575 global_options_set
.x_param_values
);
12577 /* Set up parameters to be used in prefetching algorithm. Do not
12578 override the defaults unless we are tuning for a core we have
12579 researched values for. */
12580 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
12581 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES
,
12582 aarch64_tune_params
.prefetch
->num_slots
,
12583 opts
->x_param_values
,
12584 global_options_set
.x_param_values
);
12585 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
12586 maybe_set_param_value (PARAM_L1_CACHE_SIZE
,
12587 aarch64_tune_params
.prefetch
->l1_cache_size
,
12588 opts
->x_param_values
,
12589 global_options_set
.x_param_values
);
12590 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
12591 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE
,
12592 aarch64_tune_params
.prefetch
->l1_cache_line_size
,
12593 opts
->x_param_values
,
12594 global_options_set
.x_param_values
);
12595 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
12596 maybe_set_param_value (PARAM_L2_CACHE_SIZE
,
12597 aarch64_tune_params
.prefetch
->l2_cache_size
,
12598 opts
->x_param_values
,
12599 global_options_set
.x_param_values
);
12600 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
12601 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES
,
12603 opts
->x_param_values
,
12604 global_options_set
.x_param_values
);
12605 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
12606 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE
,
12607 aarch64_tune_params
.prefetch
->minimum_stride
,
12608 opts
->x_param_values
,
12609 global_options_set
.x_param_values
);
12611 /* Use the alternative scheduling-pressure algorithm by default. */
12612 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM
, SCHED_PRESSURE_MODEL
,
12613 opts
->x_param_values
,
12614 global_options_set
.x_param_values
);
12616 /* If the user hasn't changed it via configure then set the default to 64 KB
12617 for the backend. */
12618 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
,
12619 DEFAULT_STK_CLASH_GUARD_SIZE
== 0
12620 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE
,
12621 opts
->x_param_values
,
12622 global_options_set
.x_param_values
);
12624 /* Validate the guard size. */
12625 int guard_size
= PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
12627 /* Enforce that interval is the same size as size so the mid-end does the
12629 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
,
12631 opts
->x_param_values
,
12632 global_options_set
.x_param_values
);
12634 /* The maybe_set calls won't update the value if the user has explicitly set
12635 one. Which means we need to validate that probing interval and guard size
12638 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
);
12639 if (guard_size
!= probe_interval
)
12640 error ("stack clash guard size %<%d%> must be equal to probing interval "
12641 "%<%d%>", guard_size
, probe_interval
);
12643 /* Enable sw prefetching at specified optimization level for
12644 CPUS that have prefetch. Lower optimization level threshold by 1
12645 when profiling is enabled. */
12646 if (opts
->x_flag_prefetch_loop_arrays
< 0
12647 && !opts
->x_optimize_size
12648 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
12649 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
12650 opts
->x_flag_prefetch_loop_arrays
= 1;
12652 if (opts
->x_aarch64_arch_string
== NULL
)
12653 opts
->x_aarch64_arch_string
= selected_arch
->name
;
12654 if (opts
->x_aarch64_cpu_string
== NULL
)
12655 opts
->x_aarch64_cpu_string
= selected_cpu
->name
;
12656 if (opts
->x_aarch64_tune_string
== NULL
)
12657 opts
->x_aarch64_tune_string
= selected_tune
->name
;
12659 aarch64_override_options_after_change_1 (opts
);
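/* Note on the stack clash protection parameters set up above: both the
   guard size and the probing interval are expressed as a power of two in
   bytes, so the default of 16 corresponds to a 64 KB guard region, and
   the probing interval is required to equal the guard size.  */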
/* Print a hint with a suggestion for a core or architecture name that
   most closely resembles what the user passed in STR.  ARCH is true if
   the user is asking for an architecture name.  ARCH is false if the user
   is asking for a core name.  */

static void
aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
{
  auto_vec<const char *> candidates;
  const struct processor *entry = arch ? all_architectures : all_cores;
  for (; entry->name != NULL; entry++)
    candidates.safe_push (entry->name);

#ifdef HAVE_LOCAL_CPU_DETECT
  /* Add also "native" as possible value.  */
  candidates.safe_push ("native");
#endif

  char *s;
  const char *hint = candidates_list_and_hint (str, s, candidates);
  if (hint)
    inform (input_location, "valid arguments are: %s;"
                            " did you mean %qs?", s, hint);
  else
    inform (input_location, "valid arguments are: %s", s);
  XDELETEVEC (s);
}

/* Print a hint with a suggestion for a core name that most closely resembles
   what the user passed in STR.  */

inline static void
aarch64_print_hint_for_core (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, false);
}

/* Print a hint with a suggestion for an architecture name that most closely
   resembles what the user passed in STR.  */

inline static void
aarch64_print_hint_for_arch (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, true);
}

/* Print a hint with a suggestion for an extension name
   that most closely resembles what the user passed in STR.  */

static void
aarch64_print_hint_for_extensions (const std::string &str)
{
  auto_vec<const char *> candidates;
  aarch64_get_all_extension_candidates (&candidates);
  char *s;
  const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
  if (hint)
    inform (input_location, "valid arguments are: %s;"
                            " did you mean %qs?", s, hint);
  else
    inform (input_location, "valid arguments are: %s;", s);
  XDELETEVEC (s);
}
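/* E.g. an unrecognised core such as -mcpu=cortex-a57x triggers
   aarch64_print_hint_for_core, which lists the known core names and may
   suggest "cortex-a57" as the closest match.  */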
/* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
   specified in STR and throw errors if appropriate.  Put the results if
   they are valid in RES and ISA_FLAGS.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mcpu (const char *str, const struct processor **res,
                       uint64_t *isa_flags)
{
  std::string invalid_extension;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing cpu name in %<-mcpu=%s%>", str);
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for %<-mcpu%>", str);
      aarch64_print_hint_for_core (str);
      break;
    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %qs in %<-mcpu=%s%>",
             invalid_extension.c_str (), str);
      aarch64_print_hint_for_extensions (invalid_extension);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
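/* Accepted forms (illustrative): -mcpu=cortex-a57 or
   -mcpu=cortex-a57+crypto+nofp, i.e. a core name optionally followed by
   '+'-separated feature modifiers, which aarch64_parse_cpu splits into
   RES and ISA_FLAGS.  */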
12767 /* Parses CONST_STR for branch protection features specified in
12768 aarch64_branch_protect_types, and set any global variables required. Returns
12769 the parsing result and assigns LAST_STR to the last processed token from
12770 CONST_STR so that it can be used for error reporting. */
12773 aarch64_parse_opt_result
aarch64_parse_branch_protection (const char *const_str
,
12776 char *str_root
= xstrdup (const_str
);
12777 char* token_save
= NULL
;
12778 char *str
= strtok_r (str_root
, "+", &token_save
);
12779 enum aarch64_parse_opt_result res
= AARCH64_PARSE_OK
;
12781 res
= AARCH64_PARSE_MISSING_ARG
;
12784 char *next_str
= strtok_r (NULL
, "+", &token_save
);
12785 /* Reset the branch protection features to their defaults. */
12786 aarch64_handle_no_branch_protection (NULL
, NULL
);
12788 while (str
&& res
== AARCH64_PARSE_OK
)
12790 const aarch64_branch_protect_type
* type
= aarch64_branch_protect_types
;
12791 bool found
= false;
12792 /* Search for this type. */
12793 while (type
&& type
->name
&& !found
&& res
== AARCH64_PARSE_OK
)
12795 if (strcmp (str
, type
->name
) == 0)
12798 res
= type
->handler (str
, next_str
);
12800 next_str
= strtok_r (NULL
, "+", &token_save
);
12805 if (found
&& res
== AARCH64_PARSE_OK
)
12807 bool found_subtype
= true;
12808 /* Loop through each token until we find one that isn't a
12810 while (found_subtype
)
12812 found_subtype
= false;
12813 const aarch64_branch_protect_type
*subtype
= type
->subtypes
;
12814 /* Search for the subtype. */
12815 while (str
&& subtype
&& subtype
->name
&& !found_subtype
12816 && res
== AARCH64_PARSE_OK
)
12818 if (strcmp (str
, subtype
->name
) == 0)
12820 found_subtype
= true;
12821 res
= subtype
->handler (str
, next_str
);
12823 next_str
= strtok_r (NULL
, "+", &token_save
);
12831 res
= AARCH64_PARSE_INVALID_ARG
;
12834 /* Copy the last processed token into the argument to pass it back.
12835 Used by option and attribute validation to print the offending token. */
12838 if (str
) strcpy (*last_str
, str
);
12839 else *last_str
= NULL
;
12841 if (res
== AARCH64_PARSE_OK
)
12843 /* If needed, alloc the accepted string then copy in const_str.
12844 Used by override_option_after_change_1. */
12845 if (!accepted_branch_protection_string
)
12846 accepted_branch_protection_string
= (char *) xmalloc (
12847 BRANCH_PROTECT_STR_MAX
12849 strncpy (accepted_branch_protection_string
, const_str
,
12850 BRANCH_PROTECT_STR_MAX
+ 1);
12851 /* Forcibly null-terminate. */
12852 accepted_branch_protection_string
[BRANCH_PROTECT_STR_MAX
] = '\0';
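/* Illustrative inputs: "-mbranch-protection=standard" or
   "-mbranch-protection=pac-ret+leaf".  "pac-ret" is one of the types in
   aarch64_branch_protect_types and "leaf" one of its subtypes; the outer
   loop above matches the type and the inner loop consumes any subtype
   tokens that follow it.  */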
static bool
aarch64_validate_mbranch_protection (const char *const_str)
{
  /* Buffer for the last processed token; + 1 for the terminating NUL.  */
  char *str = (char *) xmalloc (strlen (const_str) + 1);
  enum aarch64_parse_opt_result res =
    aarch64_parse_branch_protection (const_str, &str);
  if (res == AARCH64_PARSE_INVALID_ARG)
    error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
  else if (res == AARCH64_PARSE_MISSING_ARG)
    error ("missing argument for %<-mbranch-protection=%>");
  return res == AARCH64_PARSE_OK;
}
/* Validate a command-line -march option.  Parse the arch and extensions
   (if any) specified in STR and throw errors if appropriate.  Put the
   results, if they are valid, in RES and ISA_FLAGS.  Return whether the
   option is valid.  */

static bool
aarch64_validate_march (const char *str, const struct processor **res,
                        uint64_t *isa_flags)
{
  std::string invalid_extension;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing arch name in %<-march=%s%>", str);
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for %<-march%>", str);
      aarch64_print_hint_for_arch (str);
      break;
    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %qs in %<-march=%s%>",
             invalid_extension.c_str (), str);
      aarch64_print_hint_for_extensions (invalid_extension);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Validate a command-line -mtune option.  Parse the cpu
   specified in STR and throw errors if appropriate.  Put the
   result, if it is valid, in RES.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mtune (const char *str, const struct processor **res)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, res);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing cpu name in %<-mtune=%s%>", str);
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for %<-mtune%>", str);
      aarch64_print_hint_for_core (str);
      break;
    default:
      gcc_unreachable ();
    }
  return false;
}
/* Return the CPU corresponding to the enum CPU.
   If it doesn't specify a cpu, return the default.  */

static const struct processor *
aarch64_get_tune_cpu (enum aarch64_processor cpu)
{
  if (cpu != aarch64_none)
    return &all_cores[cpu];

  /* The & 0x3f is to extract the bottom 6 bits that encode the
     default cpu as selected by the --with-cpu GCC configure option
     in config.gcc.
     ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
     flags mechanism should be reworked to make it more sane.  */
  return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
}

/* Return the architecture corresponding to the enum ARCH.
   If it doesn't specify a valid architecture, return the default.  */

static const struct processor *
aarch64_get_arch (enum aarch64_arch arch)
{
  if (arch != aarch64_no_arch)
    return &all_architectures[arch];

  const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];

  return &all_architectures[cpu->arch];
}
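/* TARGET_CPU_DEFAULT packs the configure-time default core into its low
   6 bits (extracted with the & 0x3f above); the remaining bits carry the
   default ISA flags, which aarch64_override_options recovers with
   TARGET_CPU_DEFAULT >> 6.  */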
/* Return the VG value associated with -msve-vector-bits= value VALUE.  */

static poly_uint16
aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
{
  /* For now generate vector-length agnostic code for -msve-vector-bits=128.
     This ensures we can clearly distinguish SVE and Advanced SIMD modes when
     deciding which .md file patterns to use and when deciding whether
     something is a legitimate address or constant.  */
  if (value == SVE_SCALABLE || value == SVE_128)
    return poly_uint16 (2, 2);

  return (int) value / 64;
}
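/* For example, -msve-vector-bits=256 yields VG = 256 / 64 = 4 (four
   64-bit granules per vector), whereas both "scalable" and 128 map to
   the length-agnostic poly_uint16 (2, 2) above.  */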
12983 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12984 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12985 tuning structs. In particular it must set selected_tune and
12986 aarch64_isa_flags that define the available ISA features and tuning
12987 decisions. It must also set selected_arch as this will be used to
12988 output the .arch asm tags for each function. */
12991 aarch64_override_options (void)
12993 uint64_t cpu_isa
= 0;
12994 uint64_t arch_isa
= 0;
12995 aarch64_isa_flags
= 0;
12997 bool valid_cpu
= true;
12998 bool valid_tune
= true;
12999 bool valid_arch
= true;
13001 selected_cpu
= NULL
;
13002 selected_arch
= NULL
;
13003 selected_tune
= NULL
;
13005 if (aarch64_branch_protection_string
)
13006 aarch64_validate_mbranch_protection (aarch64_branch_protection_string
);
13008 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
13009 If either of -march or -mtune is given, they override their
13010 respective component of -mcpu. */
13011 if (aarch64_cpu_string
)
13012 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
13015 if (aarch64_arch_string
)
13016 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
13019 if (aarch64_tune_string
)
13020 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
13022 #ifdef SUBTARGET_OVERRIDE_OPTIONS
13023 SUBTARGET_OVERRIDE_OPTIONS
;
13026 /* If the user did not specify a processor, choose the default
13027 one for them. This will be the CPU set during configuration using
13028 --with-cpu, otherwise it is "generic". */
13033 selected_cpu
= &all_cores
[selected_arch
->ident
];
13034 aarch64_isa_flags
= arch_isa
;
13035 explicit_arch
= selected_arch
->arch
;
13039 /* Get default configure-time CPU. */
13040 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
13041 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
13045 explicit_tune_core
= selected_tune
->ident
;
13047 /* If both -mcpu and -march are specified check that they are architecturally
13048 compatible, warn if they're not and prefer the -march ISA flags. */
13049 else if (selected_arch
)
13051 if (selected_arch
->arch
!= selected_cpu
->arch
)
13053 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
13054 all_architectures
[selected_cpu
->arch
].name
,
13055 selected_arch
->name
);
13057 aarch64_isa_flags
= arch_isa
;
13058 explicit_arch
= selected_arch
->arch
;
13059 explicit_tune_core
= selected_tune
? selected_tune
->ident
13060 : selected_cpu
->ident
;
13064 /* -mcpu but no -march. */
13065 aarch64_isa_flags
= cpu_isa
;
13066 explicit_tune_core
= selected_tune
? selected_tune
->ident
13067 : selected_cpu
->ident
;
13068 gcc_assert (selected_cpu
);
13069 selected_arch
= &all_architectures
[selected_cpu
->arch
];
13070 explicit_arch
= selected_arch
->arch
;
13073 /* Set the arch as well as we will need it when outputing
13074 the .arch directive in assembly. */
13075 if (!selected_arch
)
13077 gcc_assert (selected_cpu
);
13078 selected_arch
= &all_architectures
[selected_cpu
->arch
];
13081 if (!selected_tune
)
13082 selected_tune
= selected_cpu
;
13084 if (aarch64_enable_bti
== 2)
13086 #ifdef TARGET_ENABLE_BTI
13087 aarch64_enable_bti
= 1;
13089 aarch64_enable_bti
= 0;
13093 /* Return address signing is currently not supported for ILP32 targets. For
13094 LP64 targets use the configured option in the absence of a command-line
13095 option for -mbranch-protection. */
13096 if (!TARGET_ILP32
&& accepted_branch_protection_string
== NULL
)
13098 #ifdef TARGET_ENABLE_PAC_RET
13099 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
13101 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
13105 #ifndef HAVE_AS_MABI_OPTION
13106 /* The compiler may have been configured with 2.23.* binutils, which does
13107 not have support for ILP32. */
13109 error ("assembler does not support %<-mabi=ilp32%>");
13112 /* Convert -msve-vector-bits to a VG count. */
13113 aarch64_sve_vg
= aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits
);
13115 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
13116 sorry ("return address signing is only supported for %<-mabi=lp64%>");
13118 /* Make sure we properly set up the explicit options. */
13119 if ((aarch64_cpu_string
&& valid_cpu
)
13120 || (aarch64_tune_string
&& valid_tune
))
13121 gcc_assert (explicit_tune_core
!= aarch64_none
);
13123 if ((aarch64_cpu_string
&& valid_cpu
)
13124 || (aarch64_arch_string
&& valid_arch
))
13125 gcc_assert (explicit_arch
!= aarch64_no_arch
);
13127 /* The pass to insert speculation tracking runs before
13128 shrink-wrapping and the latter does not know how to update the
13129 tracking status. So disable it in this case. */
13130 if (aarch64_track_speculation
)
13131 flag_shrink_wrap
= 0;
13133 aarch64_override_options_internal (&global_options
);
13135 /* Save these options as the default ones in case we push and pop them later
13136 while processing functions with potential target attributes. */
13137 target_option_default_node
= target_option_current_node
13138 = build_target_option_node (&global_options
);
13141 /* Implement targetm.override_options_after_change. */
13144 aarch64_override_options_after_change (void)
13146 aarch64_override_options_after_change_1 (&global_options
);
13149 static struct machine_function
*
13150 aarch64_init_machine_status (void)
13152 struct machine_function
*machine
;
13153 machine
= ggc_cleared_alloc
<machine_function
> ();
13158 aarch64_init_expanders (void)
13160 init_machine_status
= aarch64_init_machine_status
;
/* A checking mechanism for the implementation of the various code models.  */

static void
initialize_aarch64_code_model (struct gcc_options *opts)
{
  if (opts->x_flag_pic)
    {
      switch (opts->x_aarch64_cmodel_var)
        {
        case AARCH64_CMODEL_TINY:
          aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
          break;
        case AARCH64_CMODEL_SMALL:
#ifdef HAVE_AS_SMALL_PIC_RELOCS
          aarch64_cmodel = (flag_pic == 2
                            ? AARCH64_CMODEL_SMALL_PIC
                            : AARCH64_CMODEL_SMALL_SPIC);
#else
          aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
#endif
          break;
        case AARCH64_CMODEL_LARGE:
          sorry ("code model %qs with %<-f%s%>", "large",
                 opts->x_flag_pic > 1 ? "PIC" : "pic");
          break;
        default:
          gcc_unreachable ();
        }
    }
  else
    aarch64_cmodel = opts->x_aarch64_cmodel_var;
}
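/* For example, -fpic (flag_pic == 1) with the default small code model
   selects AARCH64_CMODEL_SMALL_SPIC when the assembler supports the
   small PIC relocations, while -fPIC (flag_pic == 2) selects
   AARCH64_CMODEL_SMALL_PIC.  */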
/* Implement TARGET_OPTION_SAVE.  */

static void
aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
{
  ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
  ptr->x_aarch64_branch_protection_string
    = opts->x_aarch64_branch_protection_string;
}

/* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
   using the information saved in PTR.  */

static void
aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
{
  opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
  selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  opts->x_explicit_arch = ptr->x_explicit_arch;
  selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
  opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
  opts->x_aarch64_branch_protection_string
    = ptr->x_aarch64_branch_protection_string;
  if (opts->x_aarch64_branch_protection_string)
    {
      aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
                                       NULL);
    }

  aarch64_override_options_internal (opts);
}
/* Implement TARGET_OPTION_PRINT.  */

static void
aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
{
  const struct processor *cpu
    = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  uint64_t isa_flags = ptr->x_aarch64_isa_flags;
  const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);

  fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
  fprintf (file, "%*sselected arch = %s%s\n", indent, "",
           arch->name, extension.c_str ());
}
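/* Typical output (illustrative), indented within the enclosing dump:
     selected tune = cortex-a57
     selected arch = armv8-a+crc  */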
13244 static GTY(()) tree aarch64_previous_fndecl
;
13247 aarch64_reset_previous_fndecl (void)
13249 aarch64_previous_fndecl
= NULL
;
13252 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13253 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13254 make sure optab availability predicates are recomputed when necessary. */
13257 aarch64_save_restore_target_globals (tree new_tree
)
13259 if (TREE_TARGET_GLOBALS (new_tree
))
13260 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
13261 else if (new_tree
== target_option_default_node
)
13262 restore_target_globals (&default_target_globals
);
13264 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
13267 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13268 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13269 of the function, if such exists. This function may be called multiple
13270 times on a single function so use aarch64_previous_fndecl to avoid
13271 setting up identical state. */
13274 aarch64_set_current_function (tree fndecl
)
13276 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
13279 tree old_tree
= (aarch64_previous_fndecl
13280 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
13283 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
13285 /* If current function has no attributes but the previous one did,
13286 use the default node. */
13287 if (!new_tree
&& old_tree
)
13288 new_tree
= target_option_default_node
;
13290 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13291 the default have been handled by aarch64_save_restore_target_globals from
13292 aarch64_pragma_target_parse. */
13293 if (old_tree
== new_tree
)
13296 aarch64_previous_fndecl
= fndecl
;
13298 /* First set the target options. */
13299 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
13301 aarch64_save_restore_target_globals (new_tree
);
13304 /* Enum describing the various ways we can handle attributes.
13305 In many cases we can reuse the generic option handling machinery. */
13307 enum aarch64_attr_opt_type
13309 aarch64_attr_mask
, /* Attribute should set a bit in target_flags. */
13310 aarch64_attr_bool
, /* Attribute sets or unsets a boolean variable. */
13311 aarch64_attr_enum
, /* Attribute sets an enum variable. */
13312 aarch64_attr_custom
/* Attribute requires a custom handling function. */
13315 /* All the information needed to handle a target attribute.
13316 NAME is the name of the attribute.
13317 ATTR_TYPE specifies the type of behavior of the attribute as described
13318 in the definition of enum aarch64_attr_opt_type.
13319 ALLOW_NEG is true if the attribute supports a "no-" form.
13320 HANDLER is the function that takes the attribute string as an argument
13321 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13322 OPT_NUM is the enum specifying the option that the attribute modifies.
13323 This is needed for attributes that mirror the behavior of a command-line
13324 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
13325 aarch64_attr_enum. */
13327 struct aarch64_attribute_info
13330 enum aarch64_attr_opt_type attr_type
;
13332 bool (*handler
) (const char *);
13333 enum opt_code opt_num
;
13336 /* Handle the ARCH_STR argument to the arch= target attribute. */
13339 aarch64_handle_attr_arch (const char *str
)
13341 const struct processor
*tmp_arch
= NULL
;
13342 std::string invalid_extension
;
13343 enum aarch64_parse_opt_result parse_res
13344 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
, &invalid_extension
);
13346 if (parse_res
== AARCH64_PARSE_OK
)
13348 gcc_assert (tmp_arch
);
13349 selected_arch
= tmp_arch
;
13350 explicit_arch
= selected_arch
->arch
;
13356 case AARCH64_PARSE_MISSING_ARG
:
13357 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13359 case AARCH64_PARSE_INVALID_ARG
:
13360 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str
);
13361 aarch64_print_hint_for_arch (str
);
13363 case AARCH64_PARSE_INVALID_FEATURE
:
13364 error ("invalid feature modifier %s of value (\"%s\") in "
13365 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
13366 aarch64_print_hint_for_extensions (invalid_extension
);
13369 gcc_unreachable ();
13375 /* Handle the argument CPU_STR to the cpu= target attribute. */
13378 aarch64_handle_attr_cpu (const char *str
)
13380 const struct processor
*tmp_cpu
= NULL
;
13381 std::string invalid_extension
;
13382 enum aarch64_parse_opt_result parse_res
13383 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
, &invalid_extension
);
13385 if (parse_res
== AARCH64_PARSE_OK
)
13387 gcc_assert (tmp_cpu
);
13388 selected_tune
= tmp_cpu
;
13389 explicit_tune_core
= selected_tune
->ident
;
13391 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
13392 explicit_arch
= selected_arch
->arch
;
13398 case AARCH64_PARSE_MISSING_ARG
:
13399 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13401 case AARCH64_PARSE_INVALID_ARG
:
13402 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str
);
13403 aarch64_print_hint_for_core (str
);
13405 case AARCH64_PARSE_INVALID_FEATURE
:
13406 error ("invalid feature modifier %s of value (\"%s\") in "
13407 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
13408 aarch64_print_hint_for_extensions (invalid_extension
);
13411 gcc_unreachable ();
13417 /* Handle the argument STR to the branch-protection= attribute. */
13420 aarch64_handle_attr_branch_protection (const char* str
)
13422 char *err_str
= (char *) xmalloc (strlen (str
));
13423 enum aarch64_parse_opt_result res
= aarch64_parse_branch_protection (str
,
13425 bool success
= false;
13428 case AARCH64_PARSE_MISSING_ARG
:
13429 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13432 case AARCH64_PARSE_INVALID_ARG
:
13433 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13434 "=\")%> pragma or attribute", err_str
);
13436 case AARCH64_PARSE_OK
:
13438 /* Fall through. */
13439 case AARCH64_PARSE_INVALID_FEATURE
:
13442 gcc_unreachable ();
13448 /* Handle the argument STR to the tune= target attribute. */
13451 aarch64_handle_attr_tune (const char *str
)
13453 const struct processor
*tmp_tune
= NULL
;
13454 enum aarch64_parse_opt_result parse_res
13455 = aarch64_parse_tune (str
, &tmp_tune
);
13457 if (parse_res
== AARCH64_PARSE_OK
)
13459 gcc_assert (tmp_tune
);
13460 selected_tune
= tmp_tune
;
13461 explicit_tune_core
= selected_tune
->ident
;
13467 case AARCH64_PARSE_INVALID_ARG
:
13468 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str
);
13469 aarch64_print_hint_for_core (str
);
13472 gcc_unreachable ();
13478 /* Parse an architecture extensions target attribute string specified in STR.
13479 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13480 if successful. Update aarch64_isa_flags to reflect the ISA features
13484 aarch64_handle_attr_isa_flags (char *str
)
13486 enum aarch64_parse_opt_result parse_res
;
13487 uint64_t isa_flags
= aarch64_isa_flags
;
13489 /* We allow "+nothing" in the beginning to clear out all architectural
13490 features if the user wants to handpick specific features. */
13491 if (strncmp ("+nothing", str
, 8) == 0)
13497 std::string invalid_extension
;
13498 parse_res
= aarch64_parse_extension (str
, &isa_flags
, &invalid_extension
);
13500 if (parse_res
== AARCH64_PARSE_OK
)
13502 aarch64_isa_flags
= isa_flags
;
13508 case AARCH64_PARSE_MISSING_ARG
:
13509 error ("missing value in %<target()%> pragma or attribute");
13512 case AARCH64_PARSE_INVALID_FEATURE
:
13513 error ("invalid feature modifier %s of value (\"%s\") in "
13514 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
13518 gcc_unreachable ();
13524 /* The target attributes that we support. On top of these we also support just
13525 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13526 handled explicitly in aarch64_process_one_target_attr. */
13528 static const struct aarch64_attribute_info aarch64_attributes
[] =
13530 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
13531 OPT_mgeneral_regs_only
},
13532 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
13533 OPT_mfix_cortex_a53_835769
},
13534 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
13535 OPT_mfix_cortex_a53_843419
},
13536 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
13537 { "strict-align", aarch64_attr_mask
, true, NULL
, OPT_mstrict_align
},
13538 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
13539 OPT_momit_leaf_frame_pointer
},
13540 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
13541 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
13543 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
13544 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
13546 { "branch-protection", aarch64_attr_custom
, false,
13547 aarch64_handle_attr_branch_protection
, OPT_mbranch_protection_
},
13548 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
13549 OPT_msign_return_address_
},
13550 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
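/* Usage examples (illustrative):
     __attribute__ ((target ("arch=armv8.2-a+fp16")))         - custom handler
     __attribute__ ((target ("no-omit-leaf-frame-pointer")))  - boolean entry;
   the "no-" form is accepted because allow_neg is true for that entry.  */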
13553 /* Parse ARG_STR which contains the definition of one target attribute.
13554 Show appropriate errors if any or return true if the attribute is valid. */
13557 aarch64_process_one_target_attr (char *arg_str
)
13559 bool invert
= false;
13561 size_t len
= strlen (arg_str
);
13565 error ("malformed %<target()%> pragma or attribute");
13569 char *str_to_check
= (char *) alloca (len
+ 1);
13570 strcpy (str_to_check
, arg_str
);
13572 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13573 It is easier to detect and handle it explicitly here rather than going
13574 through the machinery for the rest of the target attributes in this
13576 if (*str_to_check
== '+')
13577 return aarch64_handle_attr_isa_flags (str_to_check
);
13579 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
13584 char *arg
= strchr (str_to_check
, '=');
13586 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13587 and point ARG to "foo". */
13593 const struct aarch64_attribute_info
*p_attr
;
13594 bool found
= false;
13595 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
13597 /* If the names don't match up, or the user has given an argument
13598 to an attribute that doesn't accept one, or didn't give an argument
13599 to an attribute that expects one, fail to match. */
13600 if (strcmp (str_to_check
, p_attr
->name
) != 0)
13604 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
13605 || p_attr
->attr_type
== aarch64_attr_enum
;
13607 if (attr_need_arg_p
^ (arg
!= NULL
))
13609 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
13613 /* If the name matches but the attribute does not allow "no-" versions
13614 then we can't match. */
13615 if (invert
&& !p_attr
->allow_neg
)
13617 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
13621 switch (p_attr
->attr_type
)
13623 /* Has a custom handler registered.
13624 For example, cpu=, arch=, tune=. */
13625 case aarch64_attr_custom
:
13626 gcc_assert (p_attr
->handler
);
13627 if (!p_attr
->handler (arg
))
13631 /* Either set or unset a boolean option. */
13632 case aarch64_attr_bool
:
13634 struct cl_decoded_option decoded
;
13636 generate_option (p_attr
->opt_num
, NULL
, !invert
,
13637 CL_TARGET
, &decoded
);
13638 aarch64_handle_option (&global_options
, &global_options_set
,
13639 &decoded
, input_location
);
13642 /* Set or unset a bit in the target_flags. aarch64_handle_option
13643 should know what mask to apply given the option number. */
13644 case aarch64_attr_mask
:
13646 struct cl_decoded_option decoded
;
13647 /* We only need to specify the option number.
13648 aarch64_handle_option will know which mask to apply. */
13649 decoded
.opt_index
= p_attr
->opt_num
;
13650 decoded
.value
= !invert
;
13651 aarch64_handle_option (&global_options
, &global_options_set
,
13652 &decoded
, input_location
);
13655 /* Use the option setting machinery to set an option to an enum. */
13656 case aarch64_attr_enum
:
13661 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
13662 &value
, CL_TARGET
);
13665 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
13666 NULL
, DK_UNSPECIFIED
, input_location
,
13671 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
13676 gcc_unreachable ();
13680 /* If we reached here we either have found an attribute and validated
13681 it or didn't match any. If we matched an attribute but its arguments
13682 were malformed we will have returned false already. */
13686 /* Count how many times the character C appears in
13687 NULL-terminated string STR. */
13689 static unsigned int
13690 num_occurences_in_str (char c
, char *str
)
13692 unsigned int res
= 0;
13693 while (*str
!= '\0')
13704 /* Parse the tree in ARGS that contains the target attribute information
13705 and update the global target options space. */
13708 aarch64_process_target_attr (tree args
)
13710 if (TREE_CODE (args
) == TREE_LIST
)
13714 tree head
= TREE_VALUE (args
);
13717 if (!aarch64_process_target_attr (head
))
13720 args
= TREE_CHAIN (args
);
13726 if (TREE_CODE (args
) != STRING_CST
)
13728 error ("attribute %<target%> argument not a string");
13732 size_t len
= strlen (TREE_STRING_POINTER (args
));
13733 char *str_to_check
= (char *) alloca (len
+ 1);
13734 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
13738 error ("malformed %<target()%> pragma or attribute");
13742 /* Used to catch empty spaces between commas i.e.
13743 attribute ((target ("attr1,,attr2"))). */
13744 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
13746 /* Handle multiple target attributes separated by ','. */
13747 char *token
= strtok_r (str_to_check
, ",", &str_to_check
);
13749 unsigned int num_attrs
= 0;
13753 if (!aarch64_process_one_target_attr (token
))
13755 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
13759 token
= strtok_r (NULL
, ",", &str_to_check
);
13762 if (num_attrs
!= num_commas
+ 1)
13764 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
13771 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13772 process attribute ((target ("..."))). */
13775 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
13777 struct cl_target_option cur_target
;
13780 tree new_target
, new_optimize
;
13781 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
13783 /* If what we're processing is the current pragma string then the
13784 target option node is already stored in target_option_current_node
13785 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13786 having to re-parse the string. This is especially useful to keep
13787 arm_neon.h compile times down since that header contains a lot
13788 of intrinsics enclosed in pragmas. */
13789 if (!existing_target
&& args
== current_target_pragma
)
13791 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
13794 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
13796 old_optimize
= build_optimization_node (&global_options
);
13797 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
13799 /* If the function changed the optimization levels as well as setting
13800 target options, start with the optimizations specified. */
13801 if (func_optimize
&& func_optimize
!= old_optimize
)
13802 cl_optimization_restore (&global_options
,
13803 TREE_OPTIMIZATION (func_optimize
));
13805 /* Save the current target options to restore at the end. */
13806 cl_target_option_save (&cur_target
, &global_options
);
13808 /* If fndecl already has some target attributes applied to it, unpack
13809 them so that we add this attribute on top of them, rather than
13810 overwriting them. */
13811 if (existing_target
)
13813 struct cl_target_option
*existing_options
13814 = TREE_TARGET_OPTION (existing_target
);
13816 if (existing_options
)
13817 cl_target_option_restore (&global_options
, existing_options
);
13820 cl_target_option_restore (&global_options
,
13821 TREE_TARGET_OPTION (target_option_current_node
));
13823 ret
= aarch64_process_target_attr (args
);
13825 /* Set up any additional state. */
13828 aarch64_override_options_internal (&global_options
);
13829 /* Initialize SIMD builtins if we haven't already.
13830 Set current_target_pragma to NULL for the duration so that
13831 the builtin initialization code doesn't try to tag the functions
13832 being built with the attributes specified by any current pragma, thus
13833 going into an infinite recursion. */
13836 tree saved_current_target_pragma
= current_target_pragma
;
13837 current_target_pragma
= NULL
;
13838 aarch64_init_simd_builtins ();
13839 current_target_pragma
= saved_current_target_pragma
;
13841 new_target
= build_target_option_node (&global_options
);
13846 new_optimize
= build_optimization_node (&global_options
);
13850 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
13852 if (old_optimize
!= new_optimize
)
13853 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
13856 cl_target_option_restore (&global_options
, &cur_target
);
13858 if (old_optimize
!= new_optimize
)
13859 cl_optimization_restore (&global_options
,
13860 TREE_OPTIMIZATION (old_optimize
));
13864 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13865 tri-bool options (yes, no, don't care) and the default value is
13866 DEF, determine whether to reject inlining. */
13869 aarch64_tribools_ok_for_inlining_p (int caller
, int callee
,
13870 int dont_care
, int def
)
13872 /* If the callee doesn't care, always allow inlining. */
13873 if (callee
== dont_care
)
13876 /* If the caller doesn't care, always allow inlining. */
13877 if (caller
== dont_care
)
13880 /* Otherwise, allow inlining if either the callee and caller values
13881 agree, or if the callee is using the default value. */
13882 return (callee
== caller
|| callee
== def
);
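/* E.g. a callee built with the "don't care" value can always be inlined;
   otherwise the callee's setting must either match the caller's or be the
   default DEF for inlining to be allowed.  */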
13885 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13886 to inline CALLEE into CALLER based on target-specific info.
13887 Make sure that the caller and callee have compatible architectural
13888 features. Then go through the other possible target attributes
13889 and see if they can block inlining. Try not to reject always_inline
13890 callees unless they are incompatible architecturally. */
13893 aarch64_can_inline_p (tree caller
, tree callee
)
13895 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
13896 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
13898 struct cl_target_option
*caller_opts
13899 = TREE_TARGET_OPTION (caller_tree
? caller_tree
13900 : target_option_default_node
);
13902 struct cl_target_option
*callee_opts
13903 = TREE_TARGET_OPTION (callee_tree
? callee_tree
13904 : target_option_default_node
);
13906 /* Callee's ISA flags should be a subset of the caller's. */
13907 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
13908 != callee_opts
->x_aarch64_isa_flags
)
13911 /* Allow non-strict aligned functions inlining into strict
13913 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
13914 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
13915 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
13916 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
13919 bool always_inline
= lookup_attribute ("always_inline",
13920 DECL_ATTRIBUTES (callee
));
13922 /* If the architectural features match up and the callee is always_inline
13923 then the other attributes don't matter. */
13927 if (caller_opts
->x_aarch64_cmodel_var
13928 != callee_opts
->x_aarch64_cmodel_var
)
13931 if (caller_opts
->x_aarch64_tls_dialect
13932 != callee_opts
->x_aarch64_tls_dialect
)
13935 /* Honour explicit requests to workaround errata. */
13936 if (!aarch64_tribools_ok_for_inlining_p (
13937 caller_opts
->x_aarch64_fix_a53_err835769
,
13938 callee_opts
->x_aarch64_fix_a53_err835769
,
13939 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
13942 if (!aarch64_tribools_ok_for_inlining_p (
13943 caller_opts
->x_aarch64_fix_a53_err843419
,
13944 callee_opts
->x_aarch64_fix_a53_err843419
,
13945 2, TARGET_FIX_ERR_A53_843419
))
13948 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13949 caller and calle and they don't match up, reject inlining. */
13950 if (!aarch64_tribools_ok_for_inlining_p (
13951 caller_opts
->x_flag_omit_leaf_frame_pointer
,
13952 callee_opts
->x_flag_omit_leaf_frame_pointer
,
13956 /* If the callee has specific tuning overrides, respect them. */
13957 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
13958 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
13961 /* If the user specified tuning override strings for the
13962 caller and callee and they don't match up, reject inlining.
13963 We just do a string compare here, we don't analyze the meaning
13964 of the string, as it would be too costly for little gain. */
13965 if (callee_opts
->x_aarch64_override_tune_string
13966 && caller_opts
->x_aarch64_override_tune_string
13967 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
13968 caller_opts
->x_aarch64_override_tune_string
) != 0))
/* Return true if SYMBOL_REF X binds locally.  */

static bool
aarch64_symbol_binds_local_p (const_rtx x)
{
  return (SYMBOL_REF_DECL (x)
          ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
          : SYMBOL_REF_LOCAL_P (x));
}

/* Return true if SYMBOL_REF X is thread local.  */
static bool
aarch64_tls_symbol_p (rtx x)
{
  if (! TARGET_HAVE_TLS)
    return false;

  if (GET_CODE (x) != SYMBOL_REF)
    return false;

  return SYMBOL_REF_TLS_MODEL (x) != 0;
}
13997 /* Classify a TLS symbol into one of the TLS kinds. */
13998 enum aarch64_symbol_type
13999 aarch64_classify_tls_symbol (rtx x
)
14001 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
14005 case TLS_MODEL_GLOBAL_DYNAMIC
:
14006 case TLS_MODEL_LOCAL_DYNAMIC
:
14007 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
14009 case TLS_MODEL_INITIAL_EXEC
:
14010 switch (aarch64_cmodel
)
14012 case AARCH64_CMODEL_TINY
:
14013 case AARCH64_CMODEL_TINY_PIC
:
14014 return SYMBOL_TINY_TLSIE
;
14016 return SYMBOL_SMALL_TLSIE
;
14019 case TLS_MODEL_LOCAL_EXEC
:
14020 if (aarch64_tls_size
== 12)
14021 return SYMBOL_TLSLE12
;
14022 else if (aarch64_tls_size
== 24)
14023 return SYMBOL_TLSLE24
;
14024 else if (aarch64_tls_size
== 32)
14025 return SYMBOL_TLSLE32
;
14026 else if (aarch64_tls_size
== 48)
14027 return SYMBOL_TLSLE48
;
14029 gcc_unreachable ();
14031 case TLS_MODEL_EMULATED
:
14032 case TLS_MODEL_NONE
:
14033 return SYMBOL_FORCE_TO_MEM
;
14036 gcc_unreachable ();
14040 /* Return the correct method for accessing X + OFFSET, where X is either
14041 a SYMBOL_REF or LABEL_REF. */
14043 enum aarch64_symbol_type
14044 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
14046 if (GET_CODE (x
) == LABEL_REF
)
14048 switch (aarch64_cmodel
)
14050 case AARCH64_CMODEL_LARGE
:
14051 return SYMBOL_FORCE_TO_MEM
;
14053 case AARCH64_CMODEL_TINY_PIC
:
14054 case AARCH64_CMODEL_TINY
:
14055 return SYMBOL_TINY_ABSOLUTE
;
14057 case AARCH64_CMODEL_SMALL_SPIC
:
14058 case AARCH64_CMODEL_SMALL_PIC
:
14059 case AARCH64_CMODEL_SMALL
:
14060 return SYMBOL_SMALL_ABSOLUTE
;
14063 gcc_unreachable ();
14067 if (GET_CODE (x
) == SYMBOL_REF
)
14069 if (aarch64_tls_symbol_p (x
))
14070 return aarch64_classify_tls_symbol (x
);
14072 switch (aarch64_cmodel
)
14074 case AARCH64_CMODEL_TINY
:
14075 /* When we retrieve symbol + offset address, we have to make sure
14076 the offset does not cause overflow of the final address. But
14077 we have no way of knowing the address of symbol at compile time
14078 so we can't accurately say if the distance between the PC and
14079 symbol + offset is outside the addressible range of +/-1M in the
14080 TINY code model. So we rely on images not being greater than
14081 1M and cap the offset at 1M and anything beyond 1M will have to
14082 be loaded using an alternative mechanism. Furthermore if the
14083 symbol is a weak reference to something that isn't known to
14084 resolve to a symbol in this module, then force to memory. */
14085 if ((SYMBOL_REF_WEAK (x
)
14086 && !aarch64_symbol_binds_local_p (x
))
14087 || !IN_RANGE (offset
, -1048575, 1048575))
14088 return SYMBOL_FORCE_TO_MEM
;
14089 return SYMBOL_TINY_ABSOLUTE
;
14091 case AARCH64_CMODEL_SMALL
:
14092 /* Same reasoning as the tiny code model, but the offset cap here is
14094 if ((SYMBOL_REF_WEAK (x
)
14095 && !aarch64_symbol_binds_local_p (x
))
14096 || !IN_RANGE (offset
, HOST_WIDE_INT_C (-4294967263),
14097 HOST_WIDE_INT_C (4294967264)))
14098 return SYMBOL_FORCE_TO_MEM
;
14099 return SYMBOL_SMALL_ABSOLUTE
;
14101 case AARCH64_CMODEL_TINY_PIC
:
14102 if (!aarch64_symbol_binds_local_p (x
))
14103 return SYMBOL_TINY_GOT
;
14104 return SYMBOL_TINY_ABSOLUTE
;
14106 case AARCH64_CMODEL_SMALL_SPIC
:
14107 case AARCH64_CMODEL_SMALL_PIC
:
14108 if (!aarch64_symbol_binds_local_p (x
))
14109 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
14110 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
14111 return SYMBOL_SMALL_ABSOLUTE
;
14113 case AARCH64_CMODEL_LARGE
:
14114 /* This is alright even in PIC code as the constant
14115 pool reference is always PC relative and within
14116 the same translation unit. */
14117 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
14118 return SYMBOL_SMALL_ABSOLUTE
;
14120 return SYMBOL_FORCE_TO_MEM
;
14123 gcc_unreachable ();
14127 /* By default push everything into the constant pool. */
14128 return SYMBOL_FORCE_TO_MEM
;
14132 aarch64_constant_address_p (rtx x
)
14134 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
14138 aarch64_legitimate_pic_operand_p (rtx x
)
14140 if (GET_CODE (x
) == SYMBOL_REF
14141 || (GET_CODE (x
) == CONST
14142 && GET_CODE (XEXP (x
, 0)) == PLUS
14143 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
14149 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
14150 that should be rematerialized rather than spilled. */
14153 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
14155 /* Support CSE and rematerialization of common constants. */
14156 if (CONST_INT_P (x
)
14157 || (CONST_DOUBLE_P (x
) && GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14158 || GET_CODE (x
) == CONST_VECTOR
)
14161 /* Do not allow vector struct mode constants for Advanced SIMD.
14162 We could support 0 and -1 easily, but they need support in
14163 aarch64-simd.md. */
14164 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14165 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
14168 /* Only accept variable-length vector constants if they can be
14171 ??? It would be possible to handle rematerialization of other
14172 constants via secondary reloads. */
14173 if (vec_flags
& VEC_ANY_SVE
)
14174 return aarch64_simd_valid_immediate (x
, NULL
);
14176 if (GET_CODE (x
) == HIGH
)
14179 /* Accept polynomial constants that can be calculated by using the
14180 destination of a move as the sole temporary. Constants that
14181 require a second temporary cannot be rematerialized (they can't be
14182 forced to memory and also aren't legitimate constants). */
14184 if (poly_int_rtx_p (x
, &offset
))
14185 return aarch64_offset_temporaries (false, offset
) <= 1;
14187 /* If an offset is being added to something else, we need to allow the
14188 base to be moved into the destination register, meaning that there
14189 are no free temporaries for the offset. */
14190 x
= strip_offset (x
, &offset
);
14191 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
14194 /* Do not allow const (plus (anchor_symbol, const_int)). */
14195 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
14198 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
14199 so spilling them is better than rematerialization. */
14200 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
14203 /* Label references are always constant. */
14204 if (GET_CODE (x
) == LABEL_REF
)
14211 aarch64_load_tp (rtx target
)
14214 || GET_MODE (target
) != Pmode
14215 || !register_operand (target
, Pmode
))
14216 target
= gen_reg_rtx (Pmode
);
14218 /* Can return in any reg. */
14219 emit_insn (gen_aarch64_load_tp_hard (target
));
14223 /* On AAPCS systems, this is the "struct __va_list". */
14224 static GTY(()) tree va_list_type
;
14226 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14227 Return the type to use as __builtin_va_list.
14229 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
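     struct __va_list
     {
       void *__stack;
       void *__gr_top;
       void *__vr_top;
       int   __gr_offs;
       int   __vr_offs;
     };

   (The field names and types above mirror the FIELD_DECLs created in the
   function below.)  */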
14241 aarch64_build_builtin_va_list (void)
14244 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
14246 /* Create the type. */
14247 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
14248 /* Give it the required name. */
14249 va_list_name
= build_decl (BUILTINS_LOCATION
,
14251 get_identifier ("__va_list"),
14253 DECL_ARTIFICIAL (va_list_name
) = 1;
14254 TYPE_NAME (va_list_type
) = va_list_name
;
14255 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
14257 /* Create the fields. */
14258 f_stack
= build_decl (BUILTINS_LOCATION
,
14259 FIELD_DECL
, get_identifier ("__stack"),
14261 f_grtop
= build_decl (BUILTINS_LOCATION
,
14262 FIELD_DECL
, get_identifier ("__gr_top"),
14264 f_vrtop
= build_decl (BUILTINS_LOCATION
,
14265 FIELD_DECL
, get_identifier ("__vr_top"),
14267 f_groff
= build_decl (BUILTINS_LOCATION
,
14268 FIELD_DECL
, get_identifier ("__gr_offs"),
14269 integer_type_node
);
14270 f_vroff
= build_decl (BUILTINS_LOCATION
,
14271 FIELD_DECL
, get_identifier ("__vr_offs"),
14272 integer_type_node
);
14274 /* Tell tree-stdarg pass about our internal offset fields.
14275 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
14276 purpose to identify whether the code is updating va_list internal
14277 offset fields through irregular way. */
14278 va_list_gpr_counter_field
= f_groff
;
14279 va_list_fpr_counter_field
= f_vroff
;
14281 DECL_ARTIFICIAL (f_stack
) = 1;
14282 DECL_ARTIFICIAL (f_grtop
) = 1;
14283 DECL_ARTIFICIAL (f_vrtop
) = 1;
14284 DECL_ARTIFICIAL (f_groff
) = 1;
14285 DECL_ARTIFICIAL (f_vroff
) = 1;
14287 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
14288 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
14289 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
14290 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
14291 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
14293 TYPE_FIELDS (va_list_type
) = f_stack
;
14294 DECL_CHAIN (f_stack
) = f_grtop
;
14295 DECL_CHAIN (f_grtop
) = f_vrtop
;
14296 DECL_CHAIN (f_vrtop
) = f_groff
;
14297 DECL_CHAIN (f_groff
) = f_vroff
;
14299 /* Compute its layout. */
14300 layout_type (va_list_type
);
14302 return va_list_type
;
/* Implement TARGET_EXPAND_BUILTIN_VA_START.  */

aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)

  const CUMULATIVE_ARGS *cum;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
  tree stack, grtop, vrtop, groff, vroff;

  int gr_save_area_size = cfun->va_list_gpr_size;
  int vr_save_area_size = cfun->va_list_fpr_size;

  cum = &crtl->args.info;
  if (cfun->va_list_gpr_size)
    gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
                             cfun->va_list_gpr_size);
  if (cfun->va_list_fpr_size)
    vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
                             * UNITS_PER_VREG, cfun->va_list_fpr_size);

      gcc_assert (cum->aapcs_nvrn == 0);
      vr_save_area_size = 0;

  f_stack = TYPE_FIELDS (va_list_type_node);
  f_grtop = DECL_CHAIN (f_stack);
  f_vrtop = DECL_CHAIN (f_grtop);
  f_groff = DECL_CHAIN (f_vrtop);
  f_vroff = DECL_CHAIN (f_groff);

  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
  grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
  vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
  groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
  vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,

  /* Emit code to initialize STACK, which points to the next varargs stack
     argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
     by named arguments.  STACK is 8-byte aligned.  */
  t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
  if (cum->aapcs_stack_size > 0)
    t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GRTOP, the top of the GR save area.
     virtual_incoming_args_rtx should have been 16-byte aligned.  */
  t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
  t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize VRTOP, the top of the VR save area.
     This address is gr_save_area_bytes below GRTOP, rounded
     down to the next 16-byte boundary.  */
  t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
  vr_offset = ROUND_UP (gr_save_area_size,
                        STACK_BOUNDARY / BITS_PER_UNIT);

    t = fold_build_pointer_plus_hwi (t, -vr_offset);
  t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GROFF, the offset from GRTOP of the
     next GPR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
              build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Likewise emit code to initialize VROFF, the offset from VRTOP
     of the next VR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
              build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
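/* A worked example of the offsets set up above (a sketch, assuming the
   usual AArch64 values NUM_ARG_REGS == 8, NUM_FP_ARG_REGS == 8,
   UNITS_PER_WORD == 8 and UNITS_PER_VREG == 16, and that the
   cfun->va_list_gpr_size / va_list_fpr_size limits do not cap the result):
   for a function whose named arguments consume two general registers and
   one vector register, gr_save_area_size is (8 - 2) * 8 == 48 and
   vr_save_area_size is (8 - 1) * 16 == 112, so __gr_offs starts at -48,
   __vr_offs starts at -112, and __vr_top sits ROUND_UP (48, 16) == 48
   bytes below __gr_top.  */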
14388 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14391 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
14392 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
14396 bool is_ha
; /* is HFA or HVA. */
14397 bool dw_align
; /* double-word align. */
14398 machine_mode ag_mode
= VOIDmode
;
14402 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
14403 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
14404 HOST_WIDE_INT size
, rsize
, adjust
, align
;
14405 tree t
, u
, cond1
, cond2
;
14407 indirect_p
= pass_va_arg_by_reference (type
);
14409 type
= build_pointer_type (type
);
14411 mode
= TYPE_MODE (type
);
14413 f_stack
= TYPE_FIELDS (va_list_type_node
);
14414 f_grtop
= DECL_CHAIN (f_stack
);
14415 f_vrtop
= DECL_CHAIN (f_grtop
);
14416 f_groff
= DECL_CHAIN (f_vrtop
);
14417 f_vroff
= DECL_CHAIN (f_groff
);
14419 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
14420 f_stack
, NULL_TREE
);
14421 size
= int_size_in_bytes (type
);
14425 = aarch64_function_arg_alignment (mode
, type
, &abi_break
) / BITS_PER_UNIT
;
14429 if (aarch64_vfp_is_call_or_return_candidate (mode
,
14435 /* No frontends can create types with variable-sized modes, so we
14436 shouldn't be asked to pass or return them. */
14437 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
14439 /* TYPE passed in fp/simd registers. */
14441 aarch64_err_no_fpadvsimd (mode
);
14443 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
14444 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
14445 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
14446 unshare_expr (valist
), f_vroff
, NULL_TREE
);
14448 rsize
= nregs
* UNITS_PER_VREG
;
14452 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
14453 adjust
= UNITS_PER_VREG
- ag_size
;
14455 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
14456 && size
< UNITS_PER_VREG
)
14458 adjust
= UNITS_PER_VREG
- size
;
14463 /* TYPE passed in general registers. */
14464 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
14465 unshare_expr (valist
), f_grtop
, NULL_TREE
);
14466 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
14467 unshare_expr (valist
), f_groff
, NULL_TREE
);
14468 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
14469 nregs
= rsize
/ UNITS_PER_WORD
;
14473 if (abi_break
&& warn_psabi
)
14474 inform (input_location
, "parameter passing for argument of type "
14475 "%qT changed in GCC 9.1", type
);
14479 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
14480 && size
< UNITS_PER_WORD
)
14482 adjust
= UNITS_PER_WORD
- size
;
14486 /* Get a local temporary for the field value. */
14487 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
14489 /* Emit code to branch if off >= 0. */
14490 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
14491 build_int_cst (TREE_TYPE (off
), 0));
14492 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
14496 /* Emit: offs = (offs + 15) & -16. */
14497 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
14498 build_int_cst (TREE_TYPE (off
), 15));
14499 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
14500 build_int_cst (TREE_TYPE (off
), -16));
14501 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
14506 /* Update ap.__[g|v]r_offs */
14507 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
14508 build_int_cst (TREE_TYPE (off
), rsize
));
14509 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
14513 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
14515 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14516 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
14517 build_int_cst (TREE_TYPE (f_off
), 0));
14518 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
14520 /* String up: make sure the assignment happens before the use. */
14521 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
14522 COND_EXPR_ELSE (cond1
) = t
;
14524 /* Prepare the trees handling the argument that is passed on the stack;
14525 the top level node will store in ON_STACK. */
14526 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
14529 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14530 t
= fold_build_pointer_plus_hwi (arg
, 15);
14531 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
14532 build_int_cst (TREE_TYPE (t
), -16));
14533 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
14537 /* Advance ap.__stack */
14538 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
14539 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
14540 build_int_cst (TREE_TYPE (t
), -8));
14541 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
14542 /* String up roundup and advance. */
14544 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
14545 /* String up with arg */
14546 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
14547 /* Big-endianness related address adjustment. */
14548 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
14549 && size
< UNITS_PER_WORD
)
14551 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
14552 size_int (UNITS_PER_WORD
- size
));
14553 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
14556 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
14557 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
14559 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14562 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
14563 build_int_cst (TREE_TYPE (off
), adjust
));
14565 t
= fold_convert (sizetype
, t
);
14566 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
14570 /* type ha; // treat as "struct {ftype field[n];}"
14571 ... [computing offs]
14572 for (i = 0; i <nregs; ++i, offs += 16)
14573 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14576 tree tmp_ha
, field_t
, field_ptr_t
;
14578 /* Declare a local variable. */
14579 tmp_ha
= create_tmp_var_raw (type
, "ha");
14580 gimple_add_tmp_var (tmp_ha
);
14582 /* Establish the base type. */
14586 field_t
= float_type_node
;
14587 field_ptr_t
= float_ptr_type_node
;
14590 field_t
= double_type_node
;
14591 field_ptr_t
= double_ptr_type_node
;
14594 field_t
= long_double_type_node
;
14595 field_ptr_t
= long_double_ptr_type_node
;
14598 field_t
= aarch64_fp16_type_node
;
14599 field_ptr_t
= aarch64_fp16_ptr_type_node
;
14604 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
14605 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
14606 field_ptr_t
= build_pointer_type (field_t
);
14613 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
14614 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
14616 t
= fold_convert (field_ptr_t
, addr
);
14617 t
= build2 (MODIFY_EXPR
, field_t
,
14618 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
14619 build1 (INDIRECT_REF
, field_t
, t
));
14621 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14622 for (i
= 1; i
< nregs
; ++i
)
14624 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
14625 u
= fold_convert (field_ptr_t
, addr
);
14626 u
= build2 (MODIFY_EXPR
, field_t
,
14627 build2 (MEM_REF
, field_t
, tmp_ha
,
14628 build_int_cst (field_ptr_t
,
14630 int_size_in_bytes (field_t
)))),
14631 build1 (INDIRECT_REF
, field_t
, u
));
14632 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
14635 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
14636 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
14639 COND_EXPR_ELSE (cond2
) = t
;
14640 addr
= fold_convert (build_pointer_type (type
), cond1
);
14641 addr
= build_va_arg_indirect_ref (addr
);
14644 addr
= build_va_arg_indirect_ref (addr
);
/* Implement TARGET_SETUP_INCOMING_VARARGS.  */

aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
                                const function_arg_info &arg,
                                int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)

  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  CUMULATIVE_ARGS local_cum;
  int gr_saved = cfun->va_list_gpr_size;
  int vr_saved = cfun->va_list_fpr_size;

  /* The caller has advanced CUM up to, but not beyond, the last named
     argument.  Advance a local copy of CUM past the last "real" named
     argument, to find out how many registers are left over.  */
  aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);

  /* Find out how many registers we need to save.
     Honor the tree-stdarg analysis results.  */
  if (cfun->va_list_gpr_size)
    gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
                    cfun->va_list_gpr_size / UNITS_PER_WORD);
  if (cfun->va_list_fpr_size)
    vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
                    cfun->va_list_fpr_size / UNITS_PER_VREG);

      gcc_assert (local_cum.aapcs_nvrn == 0);

          /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
          ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
                               - gr_saved * UNITS_PER_WORD);
          mem = gen_frame_mem (BLKmode, ptr);
          set_mem_alias_set (mem, get_varargs_alias_set ());

          move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,

          /* We can't use move_block_from_reg, because it will use
             the wrong mode, storing D regs only.  */
          machine_mode mode = TImode;
          int off, i, vr_start;

          /* Set OFF to the offset from virtual_incoming_args_rtx of
             the first vector register.  The VR save area lies below
             the GR one, and is aligned to 16 bytes.  */
          off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
                           STACK_BOUNDARY / BITS_PER_UNIT);
          off -= vr_saved * UNITS_PER_VREG;

          vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
          for (i = 0; i < vr_saved; ++i)

              ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
              mem = gen_frame_mem (mode, ptr);
              set_mem_alias_set (mem, get_varargs_alias_set ());
              aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
              off += UNITS_PER_VREG;

  /* We don't save the size into *PRETEND_SIZE because we want to avoid
     any complication of having crtl->args.pretend_args_size changed.  */
  cfun->machine->frame.saved_varargs_size
    = (ROUND_UP (gr_saved * UNITS_PER_WORD,
                 STACK_BOUNDARY / BITS_PER_UNIT)
       + vr_saved * UNITS_PER_VREG);
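/* A sketch of the size computation above, assuming UNITS_PER_WORD == 8,
   UNITS_PER_VREG == 16 and a 16-byte STACK_BOUNDARY: with gr_saved == 6
   and vr_saved == 8 the saved-varargs area is
   ROUND_UP (6 * 8, 16) + 8 * 16 == 48 + 128 == 176 bytes.  */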
aarch64_conditional_register_usage (void)

  for (i = V0_REGNUM; i <= V31_REGNUM; i++)

        call_used_regs[i] = 1;

  for (i = P0_REGNUM; i <= P15_REGNUM; i++)

        call_used_regs[i] = 1;

  /* When tracking speculation, we need a couple of call-clobbered registers
     to track the speculation state.  It would be nice to just use
     IP0 and IP1, but currently there are numerous places that just
     assume these registers are free for other uses (eg pointer
     authentication).  */
  if (aarch64_track_speculation)

      fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
      call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
      fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
      call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
/* Walk down the type tree of TYPE counting consecutive base elements.
   If *MODEP is VOIDmode, then set it to the first valid floating point
   type.  If a non-floating point type is found, or if a floating point
   type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
   otherwise return the count in the sub-tree.  */

aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)

  HOST_WIDE_INT size;

  switch (TREE_CODE (type))

      mode = TYPE_MODE (type);
      if (mode != DFmode && mode != SFmode
          && mode != TFmode && mode != HFmode)

      if (*modep == VOIDmode)

      if (*modep == mode)

      mode = TYPE_MODE (TREE_TYPE (type));
      if (mode != DFmode && mode != SFmode
          && mode != TFmode && mode != HFmode)

      if (*modep == VOIDmode)

      if (*modep == mode)

      /* Use V2SImode and V4SImode as representatives of all 64-bit
         and 128-bit vector types.  */
      size = int_size_in_bytes (type);

      if (*modep == VOIDmode)

      /* Vector modes are considered to be opaque: two vectors are
         equivalent for the purposes of being homogeneous aggregates
         if they are the same size.  */
      if (*modep == mode)

        tree index = TYPE_DOMAIN (type);

        /* Can't handle incomplete types nor sizes that are not fixed.  */
        if (!COMPLETE_TYPE_P (type)
            || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)

        count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);

            || !TYPE_MAX_VALUE (index)
            || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
            || !TYPE_MIN_VALUE (index)
            || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))

        count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
                  - tree_to_uhwi (TYPE_MIN_VALUE (index)));

        /* There must be no padding.  */
        if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
                      count * GET_MODE_BITSIZE (*modep)))

        /* Can't handle incomplete types nor sizes that are not fixed.  */
        if (!COMPLETE_TYPE_P (type)
            || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)

        for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))

            if (TREE_CODE (field) != FIELD_DECL)

            sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);

            count += sub_count;

        /* There must be no padding.  */
        if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
                      count * GET_MODE_BITSIZE (*modep)))

    case QUAL_UNION_TYPE:

        /* These aren't very interesting except in a degenerate case.  */

        /* Can't handle incomplete types nor sizes that are not fixed.  */
        if (!COMPLETE_TYPE_P (type)
            || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)

        for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))

            if (TREE_CODE (field) != FIELD_DECL)

            sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);

            count = count > sub_count ? count : sub_count;

        /* There must be no padding.  */
        if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
                      count * GET_MODE_BITSIZE (*modep)))
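/* For example (a sketch of the intended behaviour): for
   "struct { double x, y, z; }" the walk above returns 3 with *MODEP set to
   DFmode, so the struct is a homogeneous floating-point aggregate, whereas
   "struct { double x; float y; }" returns -1 because the element modes
   differ.  */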
/* Return TRUE if the type, as described by TYPE and MODE, is a short vector
   type as described in AAPCS64 \S 4.1.2.

   See the comment above aarch64_composite_type_p for the notes on MODE.  */

aarch64_short_vector_p (const_tree type,

  poly_int64 size = -1;

  if (type && TREE_CODE (type) == VECTOR_TYPE)
    size = int_size_in_bytes (type);
  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
           || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
    size = GET_MODE_SIZE (mode);

  return known_eq (size, 8) || known_eq (size, 16);

/* Return TRUE if the type, as described by TYPE and MODE, is a composite
   type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
   array types.  The C99 floating-point complex types are also considered
   as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
   types, which are GCC extensions and out of the scope of AAPCS64, are
   treated as composite types here as well.

   Note that MODE itself is not sufficient in determining whether a type
   is such a composite type or not.  This is because
   stor-layout.c:compute_record_mode may have already changed the MODE
   (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
   structure with only one field may have its MODE set to the mode of the
   field.  Also an integer mode whose size matches the size of the
   RECORD_TYPE type may be used to substitute the original mode
   (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
   solely relied on.  */

aarch64_composite_type_p (const_tree type,

  if (aarch64_short_vector_p (type, mode))

  if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))

  if (mode == BLKmode
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
/* Return TRUE if an argument, whose type is described by TYPE and MODE,
   shall be passed or returned in simd/fp register(s) (providing these
   parameter passing registers are available).

   Upon successful return, *COUNT returns the number of needed registers,
   *BASE_MODE returns the mode of the individual register and when IS_HA
   is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
   floating-point aggregate or a homogeneous short-vector aggregate.  */

aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
                                         machine_mode *base_mode,

  machine_mode new_mode = VOIDmode;
  bool composite_p = aarch64_composite_type_p (type, mode);

  if (is_ha != NULL) *is_ha = false;

  if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
      || aarch64_short_vector_p (type, mode))

  else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)

      if (is_ha != NULL) *is_ha = true;

      new_mode = GET_MODE_INNER (mode);

  else if (type && composite_p)

      int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);

      if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)

          if (is_ha != NULL) *is_ha = true;

  *base_mode = new_mode;

/* Implement TARGET_STRUCT_VALUE_RTX.  */

aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
                          int incoming ATTRIBUTE_UNUSED)

  return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
/* Implements target hook vector_mode_supported_p.  */

aarch64_vector_mode_supported_p (machine_mode mode)

  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;

/* Return the full-width SVE vector mode for element mode MODE, if one

aarch64_full_sve_mode (scalar_mode mode)

      return VNx16QImode;

      return opt_machine_mode ();

/* Return the 128-bit Advanced SIMD vector mode for element mode MODE,

aarch64_vq_mode (scalar_mode mode)

      return opt_machine_mode ();

/* Return appropriate SIMD container
   for MODE within a vector of WIDTH bits.  */
static machine_mode
aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)

  if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
    return aarch64_full_sve_mode (mode).else_mode (word_mode);

  gcc_assert (known_eq (width, 64) || known_eq (width, 128));

      if (known_eq (width, 128))
        return aarch64_vq_mode (mode).else_mode (word_mode);

/* Return 128-bit container as the preferred SIMD mode for MODE.  */
static machine_mode
aarch64_preferred_simd_mode (scalar_mode mode)

  poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
  return aarch64_simd_container_mode (mode, bits);

/* Return a list of possible vector sizes for the vectorizer
   to iterate over.  */

aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)

    sizes->safe_push (BYTES_PER_SVE_VECTOR);
  sizes->safe_push (16);
  sizes->safe_push (8);
/* Implement TARGET_MANGLE_TYPE.  */

static const char *
aarch64_mangle_type (const_tree type)

  /* The AArch64 ABI documents say that "__va_list" has to be
     mangled as if it is in the "std" namespace.  */
  if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
    return "St9__va_list";

  /* Half-precision float.  */
  if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)

  /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
  if (TYPE_NAME (type) != NULL)
    return aarch64_general_mangle_builtin_type (type);

  /* Use the default mangling.  */
/* Find the first rtx_insn before insn that will generate an assembly

aarch64_prev_real_insn (rtx_insn *insn)

      insn = prev_real_insn (insn);

  while (insn && recog_memoized (insn) < 0);

is_madd_op (enum attr_type t1)

  /* A number of these may be AArch32 only.  */
  enum attr_type mlatypes[] = {
    TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
    TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
    TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD

  for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)

      if (t1 == mlatypes[i])

/* Check if there is a register dependency between a load and the insn
   for which we hold recog_data.  */

dep_between_memop_and_curr (rtx memop)

  gcc_assert (GET_CODE (memop) == SET);

  if (!REG_P (SET_DEST (memop)))

  load_reg = SET_DEST (memop);
  for (opno = 1; opno < recog_data.n_operands; opno++)

      rtx operand = recog_data.operand[opno];
      if (REG_P (operand)
          && reg_overlap_mentioned_p (load_reg, operand))

/* When working around the Cortex-A53 erratum 835769,
   given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
   instruction and has a preceding memory instruction such that a NOP
   should be inserted between them.  */

aarch64_madd_needs_nop (rtx_insn *insn)

  enum attr_type attr_type;

  if (!TARGET_FIX_ERR_A53_835769)

  if (!INSN_P (insn) || recog_memoized (insn) < 0)

  attr_type = get_attr_type (insn);
  if (!is_madd_op (attr_type))

  prev = aarch64_prev_real_insn (insn);
  /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
     Restore recog state to INSN to avoid state corruption.  */
  extract_constrain_insn_cached (insn);

  if (!prev || !contains_mem_rtx_p (PATTERN (prev)))

  body = single_set (prev);

  /* If the previous insn is a memory op and there is no dependency between
     it and the DImode madd, emit a NOP between them.  If body is NULL then we
     have a complex memory operation, probably a load/store pair.
     Be conservative for now and emit a NOP.  */
  if (GET_MODE (recog_data.operand[0]) == DImode
      && (!body || !dep_between_memop_and_curr (body)))

/* Implement FINAL_PRESCAN_INSN.  */

aarch64_final_prescan_insn (rtx_insn *insn)

  if (aarch64_madd_needs_nop (insn))
    fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
15308 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15312 aarch64_sve_index_immediate_p (rtx base_or_step
)
15314 return (CONST_INT_P (base_or_step
)
15315 && IN_RANGE (INTVAL (base_or_step
), -16, 15));
15318 /* Return true if X is a valid immediate for the SVE ADD and SUB
15319 instructions. Negate X first if NEGATE_P is true. */
15322 aarch64_sve_arith_immediate_p (rtx x
, bool negate_p
)
15326 if (!const_vec_duplicate_p (x
, &elt
)
15327 || !CONST_INT_P (elt
))
15330 HOST_WIDE_INT val
= INTVAL (elt
);
15333 val
&= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x
)));
15336 return IN_RANGE (val
, 0, 0xff);
15337 return IN_RANGE (val
, 0, 0xff00);
15340 /* Return true if X is a valid immediate operand for an SVE logical
15341 instruction such as AND. */
15344 aarch64_sve_bitmask_immediate_p (rtx x
)
15348 return (const_vec_duplicate_p (x
, &elt
)
15349 && CONST_INT_P (elt
)
15350 && aarch64_bitmask_imm (INTVAL (elt
),
15351 GET_MODE_INNER (GET_MODE (x
))));
15354 /* Return true if X is a valid immediate for the SVE DUP and CPY
15358 aarch64_sve_dup_immediate_p (rtx x
)
15360 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
15361 if (!CONST_INT_P (x
))
15364 HOST_WIDE_INT val
= INTVAL (x
);
15366 return IN_RANGE (val
, -0x80, 0x7f);
15367 return IN_RANGE (val
, -0x8000, 0x7f00);
15370 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15371 SIGNED_P says whether the operand is signed rather than unsigned. */
15374 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
15378 return (const_vec_duplicate_p (x
, &elt
)
15379 && CONST_INT_P (elt
)
15381 ? IN_RANGE (INTVAL (elt
), -16, 15)
15382 : IN_RANGE (INTVAL (elt
), 0, 127)));
15385 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15386 instruction. Negate X first if NEGATE_P is true. */
15389 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
15394 if (!const_vec_duplicate_p (x
, &elt
)
15395 || GET_CODE (elt
) != CONST_DOUBLE
)
15398 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
15401 r
= real_value_negate (&r
);
15403 if (real_equal (&r
, &dconst1
))
15405 if (real_equal (&r
, &dconsthalf
))
15410 /* Return true if X is a valid immediate operand for an SVE FMUL
15414 aarch64_sve_float_mul_immediate_p (rtx x
)
15418 return (const_vec_duplicate_p (x
, &elt
)
15419 && GET_CODE (elt
) == CONST_DOUBLE
15420 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
)
15421 || real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconst2
)));
/* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
   for the Advanced SIMD operation described by WHICH and INSN.  If INFO
   is nonnull, use it to describe valid immediates.  */

aarch64_advsimd_valid_immediate_hs (unsigned int val32,
                                    simd_immediate_info *info,
                                    enum simd_immediate_check which,
                                    simd_immediate_info::insn_type insn)

  /* Try a 4-byte immediate with LSL.  */
  for (unsigned int shift = 0; shift < 32; shift += 8)
    if ((val32 & (0xff << shift)) == val32)

          *info = simd_immediate_info (SImode, val32 >> shift, insn,
                                       simd_immediate_info::LSL, shift);

  /* Try a 2-byte immediate with LSL.  */
  unsigned int imm16 = val32 & 0xffff;
  if (imm16 == (val32 >> 16))
    for (unsigned int shift = 0; shift < 16; shift += 8)
      if ((imm16 & (0xff << shift)) == imm16)

            *info = simd_immediate_info (HImode, imm16 >> shift, insn,
                                         simd_immediate_info::LSL, shift);

  /* Try a 4-byte immediate with MSL, except for cases that MVN
  if (which == AARCH64_CHECK_MOV)
    for (unsigned int shift = 8; shift < 24; shift += 8)

        unsigned int low = (1 << shift) - 1;
        if (((val32 & (0xff << shift)) | low) == val32)

              *info = simd_immediate_info (SImode, val32 >> shift, insn,
                                           simd_immediate_info::MSL, shift);

/* Return true if replicating VAL64 is a valid immediate for the
   Advanced SIMD operation described by WHICH.  If INFO is nonnull,
   use it to describe valid immediates.  */

aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
                                 simd_immediate_info *info,
                                 enum simd_immediate_check which)

  unsigned int val32 = val64 & 0xffffffff;
  unsigned int val16 = val64 & 0xffff;
  unsigned int val8 = val64 & 0xff;

  if (val32 == (val64 >> 32))

      if ((which & AARCH64_CHECK_ORR) != 0
          && aarch64_advsimd_valid_immediate_hs (val32, info, which,
                                                 simd_immediate_info::MOV))

      if ((which & AARCH64_CHECK_BIC) != 0
          && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
                                                 simd_immediate_info::MVN))

      /* Try using a replicated byte.  */
      if (which == AARCH64_CHECK_MOV
          && val16 == (val32 >> 16)
          && val8 == (val16 >> 8))

            *info = simd_immediate_info (QImode, val8);

  /* Try using a bit-to-bytemask.  */
  if (which == AARCH64_CHECK_MOV)

      for (i = 0; i < 64; i += 8)

          unsigned char byte = (val64 >> i) & 0xff;
          if (byte != 0 && byte != 0xff)

        *info = simd_immediate_info (DImode, val64);
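/* For example (a sketch, not an exhaustive list): replicating the 64-bit
   value 0x0000ab000000ab00 repeats the 32-bit value 0x0000ab00, which is
   0xab shifted left by 8 and so matches the "4-byte immediate with LSL"
   case above, while 0xff00ff00ff00ff00 has every byte equal to 0x00 or
   0xff and so matches the bit-to-bytemask case.  */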
15528 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15529 instruction. If INFO is nonnull, use it to describe valid immediates. */
15532 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
15533 simd_immediate_info
*info
)
15535 scalar_int_mode mode
= DImode
;
15536 unsigned int val32
= val64
& 0xffffffff;
15537 if (val32
== (val64
>> 32))
15540 unsigned int val16
= val32
& 0xffff;
15541 if (val16
== (val32
>> 16))
15544 unsigned int val8
= val16
& 0xff;
15545 if (val8
== (val16
>> 8))
15549 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
15550 if (IN_RANGE (val
, -0x80, 0x7f))
15552 /* DUP with no shift. */
15554 *info
= simd_immediate_info (mode
, val
);
15557 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
15559 /* DUP with LSL #8. */
15561 *info
= simd_immediate_info (mode
, val
);
15564 if (aarch64_bitmask_imm (val64
, mode
))
15568 *info
= simd_immediate_info (mode
, val
);
15574 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15575 it to describe valid immediates. */
15578 aarch64_sve_pred_valid_immediate (rtx x
, simd_immediate_info
*info
)
15580 if (x
== CONST0_RTX (GET_MODE (x
)))
15583 *info
= simd_immediate_info (DImode
, 0);
15587 /* Analyze the value as a VNx16BImode. This should be relatively
15588 efficient, since rtx_vector_builder has enough built-in capacity
15589 to store all VLA predicate constants without needing the heap. */
15590 rtx_vector_builder builder
;
15591 if (!aarch64_get_sve_pred_bits (builder
, x
))
15594 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
15595 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
15597 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
15598 aarch64_svpattern pattern
= aarch64_svpattern_for_vl (mode
, vl
);
15599 if (pattern
!= AARCH64_NUM_SVPATTERNS
)
15603 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
15604 *info
= simd_immediate_info (int_mode
, pattern
);
15612 /* Return true if OP is a valid SIMD immediate for the operation
15613 described by WHICH. If INFO is nonnull, use it to describe valid
15616 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
15617 enum simd_immediate_check which
)
15619 machine_mode mode
= GET_MODE (op
);
15620 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
15621 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
15624 if (vec_flags
& VEC_SVE_PRED
)
15625 return aarch64_sve_pred_valid_immediate (op
, info
);
15627 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
15629 unsigned int n_elts
;
15630 if (GET_CODE (op
) == CONST_VECTOR
15631 && CONST_VECTOR_DUPLICATE_P (op
))
15632 n_elts
= CONST_VECTOR_NPATTERNS (op
);
15633 else if ((vec_flags
& VEC_SVE_DATA
)
15634 && const_vec_series_p (op
, &base
, &step
))
15636 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
15637 if (!aarch64_sve_index_immediate_p (base
)
15638 || !aarch64_sve_index_immediate_p (step
))
15642 *info
= simd_immediate_info (elt_mode
, base
, step
);
15645 else if (GET_CODE (op
) == CONST_VECTOR
15646 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
15647 /* N_ELTS set above. */;
15651 scalar_float_mode elt_float_mode
;
15653 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
15655 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
15656 if (aarch64_float_const_zero_rtx_p (elt
)
15657 || aarch64_float_const_representable_p (elt
))
15660 *info
= simd_immediate_info (elt_float_mode
, elt
);
15665 unsigned int elt_size
= GET_MODE_SIZE (elt_mode
);
15669 scalar_int_mode elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
15671 /* Expand the vector constant out into a byte vector, with the least
15672 significant byte of the register first. */
15673 auto_vec
<unsigned char, 16> bytes
;
15674 bytes
.reserve (n_elts
* elt_size
);
15675 for (unsigned int i
= 0; i
< n_elts
; i
++)
15677 /* The vector is provided in gcc endian-neutral fashion.
15678 For aarch64_be Advanced SIMD, it must be laid out in the vector
15679 register in reverse order. */
15680 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
15681 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
15683 if (elt_mode
!= elt_int_mode
)
15684 elt
= gen_lowpart (elt_int_mode
, elt
);
15686 if (!CONST_INT_P (elt
))
15689 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
15690 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
15692 bytes
.quick_push (elt_val
& 0xff);
15693 elt_val
>>= BITS_PER_UNIT
;
15697 /* The immediate must repeat every eight bytes. */
15698 unsigned int nbytes
= bytes
.length ();
15699 for (unsigned i
= 8; i
< nbytes
; ++i
)
15700 if (bytes
[i
] != bytes
[i
- 8])
15703 /* Get the repeating 8-byte value as an integer. No endian correction
15704 is needed here because bytes is already in lsb-first order. */
15705 unsigned HOST_WIDE_INT val64
= 0;
15706 for (unsigned int i
= 0; i
< 8; i
++)
15707 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
15708 << (i
* BITS_PER_UNIT
));
15710 if (vec_flags
& VEC_SVE_DATA
)
15711 return aarch64_sve_valid_immediate (val64
, info
);
15713 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
/* Check whether X is a VEC_SERIES-like constant that starts at 0 and
   has a step in the range of INDEX.  Return the index expression if so,
   otherwise return null.  */

aarch64_check_zero_based_sve_index_immediate (rtx x)

  if (const_vec_series_p (x, &base, &step)
      && base == const0_rtx
      && aarch64_sve_index_immediate_p (step))

/* Check if immediate shift constants are within range.  */

aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)

  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;

    return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
  return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);

/* Return the bitmask CONST_INT to select the bits required by a zero extract
   operation of width WIDTH at bit position POS.  */

aarch64_mask_from_zextract_ops (rtx width, rtx pos)

  gcc_assert (CONST_INT_P (width));
  gcc_assert (CONST_INT_P (pos));

  unsigned HOST_WIDE_INT mask
    = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
  return GEN_INT (mask << UINTVAL (pos));
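/* A sketch of the arithmetic: for WIDTH == 4 and POS == 8 the mask is
   ((1 << 4) - 1) << 8 == 0xf00, i.e. it selects bits 8..11.  */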
15756 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
15758 if (GET_CODE (x
) == HIGH
15759 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
15762 if (CONST_INT_P (x
))
15765 if (VECTOR_MODE_P (GET_MODE (x
)))
15767 /* Require predicate constants to be VNx16BI before RA, so that we
15768 force everything to have a canonical form. */
15769 if (!lra_in_progress
15770 && !reload_completed
15771 && GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_BOOL
15772 && GET_MODE (x
) != VNx16BImode
)
15775 return aarch64_simd_valid_immediate (x
, NULL
);
15778 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
15781 if (aarch64_sve_cnt_immediate_p (x
))
15784 return aarch64_classify_symbolic_expression (x
)
15785 == SYMBOL_TINY_ABSOLUTE
;
15788 /* Return a const_int vector of VAL. */
15790 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
15792 rtx c
= gen_int_mode (val
, GET_MODE_INNER (mode
));
15793 return gen_const_vec_duplicate (mode
, c
);
15796 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15799 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
15801 machine_mode vmode
;
15803 vmode
= aarch64_simd_container_mode (mode
, 64);
15804 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
15805 return aarch64_simd_valid_immediate (op_v
, NULL
);
/* Construct and return a PARALLEL RTX vector with elements numbering the
   lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
   the vector - from the perspective of the architecture.  This does not
   line up with GCC's perspective on lane numbers, so we end up with
   different masks depending on our target endian-ness.  The diagram
   below may help.  We must draw the distinction when building masks
   which select one half of the vector.  An instruction selecting
   architectural low-lanes for a big-endian target must be described using
   a mask selecting GCC high-lanes.

                    Big-Endian              Little-Endian

   GCC              0   1   2   3           3   2   1   0
                  | x | x | x | x |       | x | x | x | x |
   Architecture     3   2   1   0           3   2   1   0

   Low Mask:        { 2, 3 }                { 0, 1 }
   High Mask:       { 0, 1 }                { 2, 3 }

   MODE is the mode of the vector and NUNITS is the number of units in it.  */

aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)

  rtvec v = rtvec_alloc (nunits / 2);
  int high_base = nunits / 2;

  if (BYTES_BIG_ENDIAN)
    base = high ? low_base : high_base;
  else
    base = high ? high_base : low_base;

  for (i = 0; i < nunits / 2; i++)
    RTVEC_ELT (v, i) = GEN_INT (base + i);

  t1 = gen_rtx_PARALLEL (mode, v);
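/* A sketch of the result for V4SImode: on little-endian, HIGH == true
   yields (parallel [(const_int 2) (const_int 3)]) and HIGH == false yields
   (parallel [(const_int 0) (const_int 1)]); on big-endian the two results
   are swapped, matching the diagram above.  */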
15851 /* Check OP for validity as a PARALLEL RTX vector with elements
15852 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15853 from the perspective of the architecture. See the diagram above
15854 aarch64_simd_vect_par_cnst_half for more details. */
15857 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
15861 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
15864 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
15865 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
15866 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
15869 if (count_op
!= count_ideal
)
15872 for (i
= 0; i
< count_ideal
; i
++)
15874 rtx elt_op
= XVECEXP (op
, 0, i
);
15875 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
15877 if (!CONST_INT_P (elt_op
)
15878 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
15884 /* Return a PARALLEL containing NELTS elements, with element I equal
15885 to BASE + I * STEP. */
15888 aarch64_gen_stepped_int_parallel (unsigned int nelts
, int base
, int step
)
15890 rtvec vec
= rtvec_alloc (nelts
);
15891 for (unsigned int i
= 0; i
< nelts
; ++i
)
15892 RTVEC_ELT (vec
, i
) = gen_int_mode (base
+ i
* step
, DImode
);
15893 return gen_rtx_PARALLEL (VOIDmode
, vec
);
15896 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15897 series with step STEP. */
15900 aarch64_stepped_int_parallel_p (rtx op
, int step
)
15902 if (GET_CODE (op
) != PARALLEL
|| !CONST_INT_P (XVECEXP (op
, 0, 0)))
15905 unsigned HOST_WIDE_INT base
= UINTVAL (XVECEXP (op
, 0, 0));
15906 for (int i
= 1; i
< XVECLEN (op
, 0); ++i
)
15907 if (!CONST_INT_P (XVECEXP (op
, 0, i
))
15908 || UINTVAL (XVECEXP (op
, 0, i
)) != base
+ i
* step
)
/* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
   HIGH (exclusive).  */

aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,

  HOST_WIDE_INT lane;
  gcc_assert (CONST_INT_P (operand));
  lane = INTVAL (operand);

  if (lane < low || lane >= high)

        error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
        error ("lane %wd out of range %wd - %wd", lane, low, high - 1);

/* Perform endian correction on lane number N, which indexes a vector
   of mode MODE, and return the result as an SImode rtx.  */

aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)

  return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15942 /* Return TRUE if OP is a valid vector addressing mode. */
15945 aarch64_simd_mem_operand_p (rtx op
)
15947 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
15948 || REG_P (XEXP (op
, 0)));
15951 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15954 aarch64_sve_ld1r_operand_p (rtx op
)
15956 struct aarch64_address_info addr
;
15960 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
15961 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
15962 && addr
.type
== ADDRESS_REG_IMM
15963 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
15966 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15968 aarch64_sve_ld1rq_operand_p (rtx op
)
15970 struct aarch64_address_info addr
;
15971 scalar_mode elem_mode
= GET_MODE_INNER (GET_MODE (op
));
15973 || !aarch64_classify_address (&addr
, XEXP (op
, 0), elem_mode
, false))
15976 if (addr
.type
== ADDRESS_REG_IMM
)
15977 return offset_4bit_signed_scaled_p (TImode
, addr
.const_offset
);
15979 if (addr
.type
== ADDRESS_REG_REG
)
15980 return (1U << addr
.shift
) == GET_MODE_SIZE (elem_mode
);
15985 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15986 The conditions for STR are the same. */
15988 aarch64_sve_ldr_operand_p (rtx op
)
15990 struct aarch64_address_info addr
;
15993 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
15994 false, ADDR_QUERY_ANY
)
15995 && addr
.type
== ADDRESS_REG_IMM
);
15998 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15999 We need to be able to access the individual pieces, so the range
16000 is different from LD[234] and ST[234]. */
16002 aarch64_sve_struct_memory_operand_p (rtx op
)
16007 machine_mode mode
= GET_MODE (op
);
16008 struct aarch64_address_info addr
;
16009 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
16011 || addr
.type
!= ADDRESS_REG_IMM
)
16014 poly_int64 first
= addr
.const_offset
;
16015 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
16016 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
16017 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
/* Emit a register copy from operand to operand, taking care not to
   early-clobber source registers in the process.

   COUNT is the number of components into which the copy needs to be

aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
                                unsigned int count)

  int rdest = REGNO (operands[0]);
  int rsrc = REGNO (operands[1]);

  if (!reg_overlap_mentioned_p (operands[0], operands[1])

    for (i = 0; i < count; i++)
      emit_move_insn (gen_rtx_REG (mode, rdest + i),
                      gen_rtx_REG (mode, rsrc + i));

    for (i = 0; i < count; i++)
      emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
                      gen_rtx_REG (mode, rsrc + count - i - 1));

/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
   one of VSTRUCT modes: OI, CI, or XI.  */

aarch64_simd_attr_length_rglist (machine_mode mode)

  /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
  return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
16053 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
16054 alignment of a vector to 128 bits. SVE predicates have an alignment of
16056 static HOST_WIDE_INT
16057 aarch64_simd_vector_alignment (const_tree type
)
16059 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
16060 be set for non-predicate vectors of booleans. Modes are the most
16061 direct way we have of identifying real SVE predicate types. */
16062 if (GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
)
16064 if (TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
16066 return wi::umin (wi::to_wide (TYPE_SIZE (type
)), 128).to_uhwi ();
16069 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
16071 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
16073 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
16075 /* If the length of the vector is fixed, try to align to that length,
16076 otherwise don't try to align at all. */
16077 HOST_WIDE_INT result
;
16078 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
16079 result
= TYPE_ALIGN (TREE_TYPE (type
));
16082 return TYPE_ALIGN (type
);
16085 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
16087 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
16092 /* For fixed-length vectors, check that the vectorizer will aim for
16093 full-vector alignment. This isn't true for generic GCC vectors
16094 that are wider than the ABI maximum of 128 bits. */
16095 poly_uint64 preferred_alignment
=
16096 aarch64_vectorize_preferred_vector_alignment (type
);
16097 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
16098 && maybe_ne (wi::to_widest (TYPE_SIZE (type
)),
16099 preferred_alignment
))
16102 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
16106 /* Return true if the vector misalignment factor is supported by the
16109 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
16110 const_tree type
, int misalignment
,
16113 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
16115 /* Return if movmisalign pattern is not supported for this mode. */
16116 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
16119 /* Misalignment factor is unknown at compile time. */
16120 if (misalignment
== -1)
16123 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
16127 /* If VALS is a vector constant that can be loaded into a register
16128 using DUP, generate instructions to do so and return an RTX to
16129 assign to the register. Otherwise return NULL_RTX. */
16131 aarch64_simd_dup_constant (rtx vals
)
16133 machine_mode mode
= GET_MODE (vals
);
16134 machine_mode inner_mode
= GET_MODE_INNER (mode
);
16137 if (!const_vec_duplicate_p (vals
, &x
))
16140 /* We can load this constant by using DUP and a constant in a
16141 single ARM register. This will be cheaper than a vector
16143 x
= copy_to_mode_reg (inner_mode
, x
);
16144 return gen_vec_duplicate (mode
, x
);
16148 /* Generate code to load VALS, which is a PARALLEL containing only
16149 constants (for vec_init) or CONST_VECTOR, efficiently into a
16150 register. Returns an RTX to copy into the register, or NULL_RTX
16151 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
16153 aarch64_simd_make_constant (rtx vals
)
16155 machine_mode mode
= GET_MODE (vals
);
16157 rtx const_vec
= NULL_RTX
;
16161 if (GET_CODE (vals
) == CONST_VECTOR
)
16163 else if (GET_CODE (vals
) == PARALLEL
)
16165 /* A CONST_VECTOR must contain only CONST_INTs and
16166 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
16167 Only store valid constants in a CONST_VECTOR. */
16168 int n_elts
= XVECLEN (vals
, 0);
16169 for (i
= 0; i
< n_elts
; ++i
)
16171 rtx x
= XVECEXP (vals
, 0, i
);
16172 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
16175 if (n_const
== n_elts
)
16176 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
16179 gcc_unreachable ();
16181 if (const_vec
!= NULL_RTX
16182 && aarch64_simd_valid_immediate (const_vec
, NULL
))
16183 /* Load using MOVI/MVNI. */
16185 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
16186 /* Loaded using DUP. */
16188 else if (const_vec
!= NULL_RTX
)
16189 /* Load from constant pool. We cannot take advantage of single-cycle
16190 LD1 because we need a PC-relative addressing mode. */
16193 /* A PARALLEL containing something not valid inside CONST_VECTOR.
16194 We cannot construct an initializer. */
16198 /* Expand a vector initialisation sequence, such that TARGET is
16199 initialised to contain VALS. */
16202 aarch64_expand_vector_init (rtx target
, rtx vals
)
16204 machine_mode mode
= GET_MODE (target
);
16205 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
16206 /* The number of vector elements. */
16207 int n_elts
= XVECLEN (vals
, 0);
16208 /* The number of vector elements which are not constant. */
16210 rtx any_const
= NULL_RTX
;
16211 /* The first element of vals. */
16212 rtx v0
= XVECEXP (vals
, 0, 0);
16213 bool all_same
= true;
16215 /* This is a special vec_init<M><N> where N is not an element mode but a
16216 vector mode with half the elements of M. We expect to find two entries
16217 of mode N in VALS and we must put their concatentation into TARGET. */
16218 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
16220 gcc_assert (known_eq (GET_MODE_SIZE (mode
),
16221 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals
, 0, 0)))));
16222 rtx lo
= XVECEXP (vals
, 0, 0);
16223 rtx hi
= XVECEXP (vals
, 0, 1);
16224 machine_mode narrow_mode
= GET_MODE (lo
);
16225 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
);
16226 gcc_assert (narrow_mode
== GET_MODE (hi
));
16228 /* When we want to concatenate a half-width vector with zeroes we can
16229 use the aarch64_combinez[_be] patterns. Just make sure that the
16230 zeroes are in the right half. */
16231 if (BYTES_BIG_ENDIAN
16232 && aarch64_simd_imm_zero (lo
, narrow_mode
)
16233 && general_operand (hi
, narrow_mode
))
16234 emit_insn (gen_aarch64_combinez_be (narrow_mode
, target
, hi
, lo
));
16235 else if (!BYTES_BIG_ENDIAN
16236 && aarch64_simd_imm_zero (hi
, narrow_mode
)
16237 && general_operand (lo
, narrow_mode
))
16238 emit_insn (gen_aarch64_combinez (narrow_mode
, target
, lo
, hi
));
16241 /* Else create the two half-width registers and combine them. */
16243 lo
= force_reg (GET_MODE (lo
), lo
);
16245 hi
= force_reg (GET_MODE (hi
), hi
);
16247 if (BYTES_BIG_ENDIAN
)
16248 std::swap (lo
, hi
);
16249 emit_insn (gen_aarch64_simd_combine (narrow_mode
, target
, lo
, hi
));
16254 /* Count the number of variable elements to initialise. */
16255 for (int i
= 0; i
< n_elts
; ++i
)
16257 rtx x
= XVECEXP (vals
, 0, i
);
16258 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
16263 all_same
&= rtx_equal_p (x
, v0
);
16266 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16267 how best to handle this. */
16270 rtx constant
= aarch64_simd_make_constant (vals
);
16271 if (constant
!= NULL_RTX
)
16273 emit_move_insn (target
, constant
);
16278 /* Splat a single non-constant element if we can. */
16281 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
16282 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
16286 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
16287 gcc_assert (icode
!= CODE_FOR_nothing
);
16289 /* If there are only variable elements, try to optimize
16290 the insertion using dup for the most common element
16291 followed by insertions. */
16293 /* The algorithm will fill matches[*][0] with the earliest matching element,
16294 and matches[X][1] with the count of duplicate elements (if X is the
16295 earliest element which has duplicates). */
16297 if (n_var
== n_elts
&& n_elts
<= 16)
16299 int matches
[16][2] = {0};
16300 for (int i
= 0; i
< n_elts
; i
++)
16302 for (int j
= 0; j
<= i
; j
++)
16304 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
16312 int maxelement
= 0;
16314 for (int i
= 0; i
< n_elts
; i
++)
16315 if (matches
[i
][1] > maxv
)
16318 maxv
= matches
[i
][1];
16321 /* Create a duplicate of the most common element, unless all elements
16322 are equally useless to us, in which case just immediately set the
16323 vector register using the first element. */
16327 /* For vectors of two 64-bit elements, we can do even better. */
16329 && (inner_mode
== E_DImode
16330 || inner_mode
== E_DFmode
))
16333 rtx x0
= XVECEXP (vals
, 0, 0);
16334 rtx x1
= XVECEXP (vals
, 0, 1);
16335 /* Combine can pick up this case, but handling it directly
16336 here leaves clearer RTL.
16338 This is load_pair_lanes<mode>, and also gives us a clean-up
16339 for store_pair_lanes<mode>. */
16340 if (memory_operand (x0
, inner_mode
)
16341 && memory_operand (x1
, inner_mode
)
16342 && !STRICT_ALIGNMENT
16343 && rtx_equal_p (XEXP (x1
, 0),
16344 plus_constant (Pmode
,
16346 GET_MODE_SIZE (inner_mode
))))
16349 if (inner_mode
== DFmode
)
16350 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
16352 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
16357 /* The subreg-move sequence below will move into lane zero of the
16358 vector register. For big-endian we want that position to hold
16359 the last element of VALS. */
16360 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
16361 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
16362 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
16366 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
16367 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
16370 /* Insert the rest. */
16371 for (int i
= 0; i
< n_elts
; i
++)
16373 rtx x
= XVECEXP (vals
, 0, i
);
16374 if (matches
[i
][0] == maxelement
)
16376 x
= copy_to_mode_reg (inner_mode
, x
);
16377 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
16382 /* Initialise a vector which is part-variable. We want to first try
16383 to build those lanes which are constant in the most efficient way we
16385 if (n_var
!= n_elts
)
16387 rtx copy
= copy_rtx (vals
);
16389 /* Load constant part of vector. We really don't care what goes into the
16390 parts we will overwrite, but we're more likely to be able to load the
16391 constant efficiently if it has fewer, larger, repeating parts
16392 (see aarch64_simd_valid_immediate). */
16393 for (int i
= 0; i
< n_elts
; i
++)
16395 rtx x
= XVECEXP (vals
, 0, i
);
16396 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
16398 rtx subst
= any_const
;
16399 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
16401 /* Look in the copied vector, as more elements are const. */
16402 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
16403 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
16409 XVECEXP (copy
, 0, i
) = subst
;
16411 aarch64_expand_vector_init (target
, copy
);
16414 /* Insert the variable lanes directly. */
16415 for (int i
= 0; i
< n_elts
; i
++)
16417 rtx x
= XVECEXP (vals
, 0, i
);
16418 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
16420 x
= copy_to_mode_reg (inner_mode
, x
);
16421 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
16425 /* Emit RTL corresponding to:
16426 insr TARGET, ELEM. */
16429 emit_insr (rtx target
, rtx elem
)
16431 machine_mode mode
= GET_MODE (target
);
16432 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
16433 elem
= force_reg (elem_mode
, elem
);
16435 insn_code icode
= optab_handler (vec_shl_insert_optab
, mode
);
16436 gcc_assert (icode
!= CODE_FOR_nothing
);
16437 emit_insn (GEN_FCN (icode
) (target
, target
, elem
));
/* Subroutine of aarch64_sve_expand_vector_init for handling
   trailing constants.
   This function works as follows:
   (a) Create a new vector consisting of trailing constants.
   (b) Initialize TARGET with the constant vector using emit_move_insn.
   (c) Insert remaining elements in TARGET using insr.
   NELTS is the total number of elements in the original vector, while
   NELTS_REQD is the number of elements that are actually significant.

   ??? The heuristic used is to do above only if number of constants
   is at least half the total number of elements.  May need fine tuning.  */
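/* For example (an illustrative sketch; the values are hypothetical, not
   taken from any particular caller): with BUILDER = {x, y, 1, 2, 3, 4}
   and NELTS_REQD == 6, four of the six elements are constants, so the
   heuristic applies and the expansion is roughly:

     TARGET = {1, 2, 3, 4, ...}   (emit_move_insn of the constant part)
     insr TARGET, y
     insr TARGET, x

   Each INSR shifts the vector up by one element and inserts its scalar
   operand into element 0, giving {x, y, 1, 2, 3, 4, ...}.  */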
static bool
aarch64_sve_expand_vector_init_handle_trailing_constants
 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
{
  machine_mode mode = GET_MODE (target);
  scalar_mode elem_mode = GET_MODE_INNER (mode);
  int n_trailing_constants = 0;

  for (int i = nelts_reqd - 1;
       i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
       i--)
    n_trailing_constants++;

  if (n_trailing_constants >= nelts_reqd / 2)
    {
      rtx_vector_builder v (mode, 1, nelts);
      for (int i = 0; i < nelts; i++)
	v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
      rtx const_vec = v.build ();
      emit_move_insn (target, const_vec);

      for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
	emit_insr (target, builder.elt (i));

      return true;
    }

  return false;
}
/* Subroutine of aarch64_sve_expand_vector_init.
   The function works as follows:
   (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
   (b) Skip trailing elements from BUILDER, which are the same as
       element NELTS_REQD - 1.
   (c) Insert earlier elements in reverse order in TARGET using insr.  */

static void
aarch64_sve_expand_vector_init_insert_elems (rtx target,
					     const rtx_vector_builder &builder,
					     int nelts_reqd)
{
  machine_mode mode = GET_MODE (target);
  scalar_mode elem_mode = GET_MODE_INNER (mode);

  struct expand_operand ops[2];
  enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
  gcc_assert (icode != CODE_FOR_nothing);

  create_output_operand (&ops[0], target, mode);
  create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
  expand_insn (icode, 2, ops);

  int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
  for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
    emit_insr (target, builder.elt (i));
}
/* Subroutine of aarch64_sve_expand_vector_init to handle case
   when all trailing elements of builder are same.
   This works as follows:
   (a) Use expand_insn interface to broadcast last vector element in TARGET.
   (b) Insert remaining elements in TARGET using insr.

   ??? The heuristic used is to do above if number of same trailing elements
   is at least 3/4 of total number of elements, loosely based on
   heuristic from mostly_zeros_p.  May need fine-tuning.  */
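/* For example (an illustrative sketch; the values are hypothetical):
   with BUILDER = {a, b, c, c, c, c, c, c} and NELTS_REQD == 8, the last
   element is duplicated six times, which is at least 3/4 of the total,
   so the expansion is roughly:

     TARGET = dup (c)
     insr TARGET, b
     insr TARGET, a  */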
static bool
aarch64_sve_expand_vector_init_handle_trailing_same_elem
 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
{
  int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
  if (ndups >= (3 * nelts_reqd) / 4)
    {
      aarch64_sve_expand_vector_init_insert_elems (target, builder,
						   nelts_reqd - ndups + 1);
      return true;
    }

  return false;
}
/* Initialize register TARGET from BUILDER.  NELTS is the constant number
   of elements in BUILDER.

   The function tries to initialize TARGET from BUILDER if it fits one
   of the special cases outlined below.

   Failing that, the function divides BUILDER into two sub-vectors:
   v_even = even elements of BUILDER;
   v_odd = odd elements of BUILDER;

   and recursively calls itself with v_even and v_odd.

   if (recursive call succeeded for v_even or v_odd)
     TARGET = zip (v_even, v_odd)

   The function returns true if it managed to build TARGET from BUILDER
   with one of the special cases, false otherwise.

   Example: {a, 1, b, 2, c, 3, d, 4}

   The vector gets divided into:
   v_even = {a, b, c, d}
   v_odd = {1, 2, 3, 4}

   aarch64_sve_expand_vector_init(v_odd) hits case 1 and
   initializes tmp2 from constant vector v_odd using emit_move_insn.

   aarch64_sve_expand_vector_init(v_even) fails since v_even contains
   4 elements, so we construct tmp1 from v_even using insr:

   tmp1 = dup(d)
   insr tmp1, c
   insr tmp1, b
   insr tmp1, a

   TARGET = zip (tmp1, tmp2)
   which sets TARGET to {a, 1, b, 2, c, 3, d, 4}.  */
static bool
aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
				int nelts, int nelts_reqd)
{
  machine_mode mode = GET_MODE (target);

  /* Case 1: Vector contains trailing constants.  */

  if (aarch64_sve_expand_vector_init_handle_trailing_constants
      (target, builder, nelts, nelts_reqd))
    return true;

  /* Case 2: Vector contains leading constants.  */

  rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
  for (int i = 0; i < nelts_reqd; i++)
    rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
  rev_builder.finalize ();

  if (aarch64_sve_expand_vector_init_handle_trailing_constants
      (target, rev_builder, nelts, nelts_reqd))
    {
      emit_insn (gen_aarch64_sve_rev (mode, target, target));
      return true;
    }

  /* Case 3: Vector contains trailing same element.  */

  if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
      (target, builder, nelts_reqd))
    return true;

  /* Case 4: Vector contains leading same element.  */

  if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
      (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
    {
      emit_insn (gen_aarch64_sve_rev (mode, target, target));
      return true;
    }

  /* Avoid recursing below 4-elements.
     ??? The threshold 4 may need fine-tuning.  */

  if (nelts_reqd <= 4)
    return false;

  rtx_vector_builder v_even (mode, 1, nelts);
  rtx_vector_builder v_odd (mode, 1, nelts);

  for (int i = 0; i < nelts * 2; i += 2)
    {
      v_even.quick_push (builder.elt (i));
      v_odd.quick_push (builder.elt (i + 1));
    }

  v_even.finalize ();
  v_odd.finalize ();

  rtx tmp1 = gen_reg_rtx (mode);
  bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
						    nelts, nelts_reqd / 2);

  rtx tmp2 = gen_reg_rtx (mode);
  bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
						   nelts, nelts_reqd / 2);

  if (!did_even_p && !did_odd_p)
    return false;

  /* Initialize v_even and v_odd using INSR if it didn't match any of the
     special cases and zip v_even, v_odd.  */

  if (!did_even_p)
    aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);

  if (!did_odd_p)
    aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);

  rtvec v = gen_rtvec (2, tmp1, tmp2);
  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
  return true;
}
/* Initialize register TARGET from the elements in PARALLEL rtx VALS.  */

void
aarch64_sve_expand_vector_init (rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  int nelts = XVECLEN (vals, 0);

  rtx_vector_builder v (mode, 1, nelts);
  for (int i = 0; i < nelts; i++)
    v.quick_push (XVECEXP (vals, 0, i));
  v.finalize ();

  /* If neither sub-vectors of v could be initialized specially,
     then use INSR to insert all elements from v into TARGET.
     ??? This might not be optimal for vectors with large
     initializers like 16-element or above.
     For nelts < 4, it probably isn't useful to handle specially.  */

  if (nelts < 4
      || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
    aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
}
/* Check whether VALUE is a vector constant in which every element
   is either a power of 2 or a negated power of 2.  If so, return
   a constant vector of log2s, and flip CODE between PLUS and MINUS
   if VALUE contains negated powers of 2.  Return NULL_RTX otherwise.  */
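/* For example (an illustrative sketch; the values are hypothetical):
   VALUE = {4, 4, 4, 4} yields the shift vector {2, 2, 2, 2} and leaves
   CODE unchanged, whereas VALUE = {-8, -8, -8, -8} yields {3, 3, 3, 3}
   and flips CODE between PLUS and MINUS, since multiplying by -8 and
   adding is the same as shifting left by 3 and subtracting.  */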
static rtx
aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
{
  if (GET_CODE (value) != CONST_VECTOR)
    return NULL_RTX;

  rtx_vector_builder builder;
  if (!builder.new_unary_operation (GET_MODE (value), value, false))
    return NULL_RTX;

  scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
  /* 1 if the result of the multiplication must be negated,
     0 if it mustn't, or -1 if we don't yet care.  */
  int negate = -1;
  unsigned int encoded_nelts = const_vector_encoded_nelts (value);
  for (unsigned int i = 0; i < encoded_nelts; ++i)
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
      if (!CONST_SCALAR_INT_P (elt))
	return NULL_RTX;
      rtx_mode_t val (elt, int_mode);
      wide_int pow2 = wi::neg (val);
      if (val != pow2)
	{
	  /* It matters whether we negate or not.  Make that choice,
	     and make sure that it's consistent with previous elements.  */
	  if (negate == !wi::neg_p (val))
	    return NULL_RTX;
	  negate = wi::neg_p (val);
	  if (!negate)
	    pow2 = val;
	}
      /* POW2 is now the value that we want to be a power of 2.  */
      int shift = wi::exact_log2 (pow2);
      if (shift < 0)
	return NULL_RTX;
      builder.quick_push (gen_int_mode (shift, int_mode));
    }
  if (negate == -1)
    /* PLUS and MINUS are equivalent; canonicalize on PLUS.  */
    code = PLUS;
  else if (negate == 1)
    code = code == PLUS ? MINUS : PLUS;
  return builder.build ();
}
/* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
   CODE is PLUS for the former and MINUS for the latter.  OPERANDS is the
   operands array, in the same order as for fma_optab.  Return true if
   the function emitted all the necessary instructions, false if the caller
   should generate the pattern normally with the new OPERANDS array.  */
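/* For example (an illustrative sketch; the values are hypothetical):
   for (a * {4, 4, ...}) + b with CODE == PLUS, the multiplication is
   rewritten as a shift, roughly:

     product = a << {2, 2, ...}
     result  = b + product

   so no multiply-add instruction is needed.  When operand 2 is not such
   a constant, it is simply forced into a register and the caller emits
   the normal pattern.  */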
bool
aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
{
  machine_mode mode = GET_MODE (operands[0]);
  if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
    {
      rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
				  NULL_RTX, true, OPTAB_DIRECT);
      force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
			  operands[3], product, operands[0], true,
			  OPTAB_DIRECT);
      return true;
    }
  operands[2] = force_reg (mode, operands[2]);
  return false;
}
/* Likewise, but for a conditional pattern.  */

bool
aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
{
  machine_mode mode = GET_MODE (operands[0]);
  if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
    {
      rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
				  NULL_RTX, true, OPTAB_DIRECT);
      emit_insn (gen_cond (code, mode, operands[0], operands[1],
			   operands[4], product, operands[5]));
      return true;
    }
  operands[3] = force_reg (mode, operands[3]);
  return false;
}
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
  if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
    return 0;
  return GET_MODE_UNIT_BITSIZE (mode) - 1;
}
/* Select a format to encode pointers in exception handling data.  */
int
aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
{
  int type;
  switch (aarch64_cmodel)
    {
    case AARCH64_CMODEL_TINY:
    case AARCH64_CMODEL_TINY_PIC:
    case AARCH64_CMODEL_SMALL:
    case AARCH64_CMODEL_SMALL_PIC:
    case AARCH64_CMODEL_SMALL_SPIC:
      /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
	 for everything.  */
      type = DW_EH_PE_sdata4;
      break;
    default:
      /* No assumptions here.  8-byte relocs required.  */
      type = DW_EH_PE_sdata8;
      break;
    }
  return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
}
/* Output .variant_pcs for aarch64_vector_pcs function symbols.  */

static void
aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
{
  if (aarch64_simd_decl_p (decl))
    {
      fprintf (stream, "\t.variant_pcs\t");
      assemble_name (stream, name);
      fprintf (stream, "\n");
    }
}
/* The last .arch and .tune assembly strings that we printed.  */
static std::string aarch64_last_printed_arch_string;
static std::string aarch64_last_printed_tune_string;

/* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
   by the function fndecl.  */
16827 aarch64_declare_function_name (FILE *stream
, const char* name
,
16830 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
16832 struct cl_target_option
*targ_options
;
16834 targ_options
= TREE_TARGET_OPTION (target_parts
);
16836 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
16837 gcc_assert (targ_options
);
16839 const struct processor
*this_arch
16840 = aarch64_get_arch (targ_options
->x_explicit_arch
);
16842 uint64_t isa_flags
= targ_options
->x_aarch64_isa_flags
;
16843 std::string extension
16844 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
16846 /* Only update the assembler .arch string if it is distinct from the last
16847 such string we printed. */
16848 std::string to_print
= this_arch
->name
+ extension
;
16849 if (to_print
!= aarch64_last_printed_arch_string
)
16851 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
16852 aarch64_last_printed_arch_string
= to_print
;
16855 /* Print the cpu name we're tuning for in the comments, might be
16856 useful to readers of the generated asm. Do it only when it changes
16857 from function to function and verbose assembly is requested. */
16858 const struct processor
*this_tune
16859 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
16861 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
16863 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
16865 aarch64_last_printed_tune_string
= this_tune
->name
;
16868 aarch64_asm_output_variant_pcs (stream
, fndecl
, name
);
16870 /* Don't forget the type directive for ELF. */
16871 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
16872 ASM_OUTPUT_LABEL (stream
, name
);
16875 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16878 aarch64_asm_output_alias (FILE *stream
, const tree decl
, const tree target
)
16880 const char *name
= XSTR (XEXP (DECL_RTL (decl
), 0), 0);
16881 const char *value
= IDENTIFIER_POINTER (target
);
16882 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
16883 ASM_OUTPUT_DEF (stream
, name
, value
);
16886 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16887 function symbol references. */
16890 aarch64_asm_output_external (FILE *stream
, tree decl
, const char* name
)
16892 default_elf_asm_output_external (stream
, decl
, name
);
16893 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
16896 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16897 Used to output the .cfi_b_key_frame directive when signing the current
16898 function with the B key. */
16901 aarch64_post_cfi_startproc (FILE *f
, tree ignored ATTRIBUTE_UNUSED
)
16903 if (cfun
->machine
->frame
.laid_out
&& aarch64_return_address_signing_enabled ()
16904 && aarch64_ra_sign_key
== AARCH64_KEY_B
)
16905 asm_fprintf (f
, "\t.cfi_b_key_frame\n");
16908 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16911 aarch64_start_file (void)
16913 struct cl_target_option
*default_options
16914 = TREE_TARGET_OPTION (target_option_default_node
);
16916 const struct processor
*default_arch
16917 = aarch64_get_arch (default_options
->x_explicit_arch
);
16918 uint64_t default_isa_flags
= default_options
->x_aarch64_isa_flags
;
16919 std::string extension
16920 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
16921 default_arch
->flags
);
16923 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
16924 aarch64_last_printed_tune_string
= "";
16925 asm_fprintf (asm_out_file
, "\t.arch %s\n",
16926 aarch64_last_printed_arch_string
.c_str ());
16928 default_file_start ();
16931 /* Emit load exclusive. */
16934 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
16935 rtx mem
, rtx model_rtx
)
16937 if (mode
== TImode
)
16938 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode
, rval
),
16939 gen_highpart (DImode
, rval
),
16942 emit_insn (gen_aarch64_load_exclusive (mode
, rval
, mem
, model_rtx
));
16945 /* Emit store exclusive. */
16948 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
16949 rtx mem
, rtx rval
, rtx model_rtx
)
16951 if (mode
== TImode
)
16952 emit_insn (gen_aarch64_store_exclusive_pair
16953 (bval
, mem
, operand_subword (rval
, 0, 0, TImode
),
16954 operand_subword (rval
, 1, 0, TImode
), model_rtx
));
16956 emit_insn (gen_aarch64_store_exclusive (mode
, bval
, mem
, rval
, model_rtx
));
16959 /* Mark the previous jump instruction as unlikely. */
16962 aarch64_emit_unlikely_jump (rtx insn
)
16964 rtx_insn
*jump
= emit_jump_insn (insn
);
16965 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
16968 /* We store the names of the various atomic helpers in a 5x4 array.
16969 Return the libcall function given MODE, MODEL and NAMES. */
16972 aarch64_atomic_ool_func(machine_mode mode
, rtx model_rtx
,
16973 const atomic_ool_names
*names
)
16975 memmodel model
= memmodel_base (INTVAL (model_rtx
));
16976 int mode_idx
, model_idx
;
16996 gcc_unreachable ();
17001 case MEMMODEL_RELAXED
:
17004 case MEMMODEL_CONSUME
:
17005 case MEMMODEL_ACQUIRE
:
17008 case MEMMODEL_RELEASE
:
17011 case MEMMODEL_ACQ_REL
:
17012 case MEMMODEL_SEQ_CST
:
17016 gcc_unreachable ();
17019 return init_one_libfunc_visibility (names
->str
[mode_idx
][model_idx
],
17020 VISIBILITY_HIDDEN
);
#define DEF0(B, N) \
  { "__aarch64_" #B #N "_relax", \
    "__aarch64_" #B #N "_acq", \
    "__aarch64_" #B #N "_rel", \
    "__aarch64_" #B #N "_acq_rel" }

#define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
		 { NULL, NULL, NULL, NULL }
#define DEF5(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)

static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
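/* For example (an illustrative sketch; the exact indices depend on the
   switch statements above): a 4-byte compare-and-swap with
   MEMMODEL_ACQUIRE selects the SImode row and the "acq" column, so
   aarch64_atomic_ool_func returns the libfunc for "__aarch64_cas4_acq".  */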
17044 /* Expand a compare and swap pattern. */
17047 aarch64_expand_compare_and_swap (rtx operands
[])
17049 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
, cc_reg
;
17050 machine_mode mode
, r_mode
;
17052 bval
= operands
[0];
17053 rval
= operands
[1];
17055 oldval
= operands
[3];
17056 newval
= operands
[4];
17057 is_weak
= operands
[5];
17058 mod_s
= operands
[6];
17059 mod_f
= operands
[7];
17060 mode
= GET_MODE (mem
);
17062 /* Normally the succ memory model must be stronger than fail, but in the
17063 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
17064 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
17065 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
17066 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
17067 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
17070 if (mode
== QImode
|| mode
== HImode
)
17073 rval
= gen_reg_rtx (r_mode
);
17078 /* The CAS insn requires oldval and rval overlap, but we need to
17079 have a copy of oldval saved across the operation to tell if
17080 the operation is successful. */
17081 if (reg_overlap_mentioned_p (rval
, oldval
))
17082 rval
= copy_to_mode_reg (r_mode
, oldval
);
17084 emit_move_insn (rval
, gen_lowpart (r_mode
, oldval
));
17086 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
,
17088 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
17090 else if (TARGET_OUTLINE_ATOMICS
)
17092 /* Oldval must satisfy compare afterward. */
17093 if (!aarch64_plus_operand (oldval
, mode
))
17094 oldval
= force_reg (mode
, oldval
);
17095 rtx func
= aarch64_atomic_ool_func (mode
, mod_s
, &aarch64_ool_cas_names
);
17096 rval
= emit_library_call_value (func
, NULL_RTX
, LCT_NORMAL
, r_mode
,
17097 oldval
, mode
, newval
, mode
,
17098 XEXP (mem
, 0), Pmode
);
17099 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
17103 /* The oldval predicate varies by mode. Test it and force to reg. */
17104 insn_code code
= code_for_aarch64_compare_and_swap (mode
);
17105 if (!insn_data
[code
].operand
[2].predicate (oldval
, mode
))
17106 oldval
= force_reg (mode
, oldval
);
17108 emit_insn (GEN_FCN (code
) (rval
, mem
, oldval
, newval
,
17109 is_weak
, mod_s
, mod_f
));
17110 cc_reg
= gen_rtx_REG (CCmode
, CC_REGNUM
);
17113 if (r_mode
!= mode
)
17114 rval
= gen_lowpart (mode
, rval
);
17115 emit_move_insn (operands
[1], rval
);
17117 x
= gen_rtx_EQ (SImode
, cc_reg
, const0_rtx
);
17118 emit_insn (gen_rtx_SET (bval
, x
));
17121 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
17122 sequence implementing an atomic operation. */
17125 aarch64_emit_post_barrier (enum memmodel model
)
17127 const enum memmodel base_model
= memmodel_base (model
);
17129 if (is_mm_sync (model
)
17130 && (base_model
== MEMMODEL_ACQUIRE
17131 || base_model
== MEMMODEL_ACQ_REL
17132 || base_model
== MEMMODEL_SEQ_CST
))
17134 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
17138 /* Split a compare and swap pattern. */
17141 aarch64_split_compare_and_swap (rtx operands
[])
17143 rtx rval
, mem
, oldval
, newval
, scratch
, x
, model_rtx
;
17146 rtx_code_label
*label1
, *label2
;
17147 enum memmodel model
;
17149 rval
= operands
[0];
17151 oldval
= operands
[2];
17152 newval
= operands
[3];
17153 is_weak
= (operands
[4] != const0_rtx
);
17154 model_rtx
= operands
[5];
17155 scratch
= operands
[7];
17156 mode
= GET_MODE (mem
);
17157 model
= memmodel_from_int (INTVAL (model_rtx
));
  /* When OLDVAL is zero and we want the strong version we can emit a tighter
    loop:
    .label1:
	LD[A]XR	rval, [mem]
	CBNZ	rval, .label2
	ST[L]XR	scratch, newval, [mem]
	CBNZ	scratch, .label1
    .label2:
	CMP	rval, 0.  */
  bool strong_zero_p = (!is_weak && !aarch64_track_speculation
			&& oldval == const0_rtx && mode != TImode);
17174 label1
= gen_label_rtx ();
17175 emit_label (label1
);
17177 label2
= gen_label_rtx ();
17179 /* The initial load can be relaxed for a __sync operation since a final
17180 barrier will be emitted to stop code hoisting. */
17181 if (is_mm_sync (model
))
17182 aarch64_emit_load_exclusive (mode
, rval
, mem
, GEN_INT (MEMMODEL_RELAXED
));
17184 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
17187 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
17190 rtx cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
17191 x
= gen_rtx_NE (VOIDmode
, cc_reg
, const0_rtx
);
17193 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
17194 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
17195 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
17197 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
17201 if (aarch64_track_speculation
)
17203 /* Emit an explicit compare instruction, so that we can correctly
17204 track the condition codes. */
17205 rtx cc_reg
= aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
17206 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
17209 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
17211 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
17212 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
17213 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
17216 aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
17218 emit_label (label2
);
  /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
     to set the condition flags.  If this is not used it will be removed by
     later passes.  */
  if (strong_zero_p)
    aarch64_gen_compare_reg (NE, rval, const0_rtx);
17226 /* Emit any final barrier needed for a __sync operation. */
17227 if (is_mm_sync (model
))
17228 aarch64_emit_post_barrier (model
);
17231 /* Split an atomic operation. */
17234 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
17235 rtx value
, rtx model_rtx
, rtx cond
)
17237 machine_mode mode
= GET_MODE (mem
);
17238 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
17239 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
17240 const bool is_sync
= is_mm_sync (model
);
17241 rtx_code_label
*label
;
17244 /* Split the atomic operation into a sequence. */
17245 label
= gen_label_rtx ();
17246 emit_label (label
);
17249 new_out
= gen_lowpart (wmode
, new_out
);
17251 old_out
= gen_lowpart (wmode
, old_out
);
17254 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
17256 /* The initial load can be relaxed for a __sync operation since a final
17257 barrier will be emitted to stop code hoisting. */
17259 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
17260 GEN_INT (MEMMODEL_RELAXED
));
17262 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
17271 x
= gen_rtx_AND (wmode
, old_out
, value
);
17272 emit_insn (gen_rtx_SET (new_out
, x
));
17273 x
= gen_rtx_NOT (wmode
, new_out
);
17274 emit_insn (gen_rtx_SET (new_out
, x
));
17278 if (CONST_INT_P (value
))
17280 value
= GEN_INT (-INTVAL (value
));
17283 /* Fall through. */
17286 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
17287 emit_insn (gen_rtx_SET (new_out
, x
));
17291 aarch64_emit_store_exclusive (mode
, cond
, mem
,
17292 gen_lowpart (mode
, new_out
), model_rtx
);
17294 if (aarch64_track_speculation
)
17296 /* Emit an explicit compare instruction, so that we can correctly
17297 track the condition codes. */
17298 rtx cc_reg
= aarch64_gen_compare_reg (NE
, cond
, const0_rtx
);
17299 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
17302 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
17304 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
17305 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
17306 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
17308 /* Emit any final barrier needed for a __sync operation. */
17310 aarch64_emit_post_barrier (model
);
17314 aarch64_init_libfuncs (void)
17316 /* Half-precision float operations. The compiler handles all operations
17317 with NULL libfuncs by converting to SFmode. */
17320 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
17321 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
17324 set_optab_libfunc (add_optab
, HFmode
, NULL
);
17325 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
17326 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
17327 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
17328 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
17331 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
17332 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
17333 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
17334 set_optab_libfunc (le_optab
, HFmode
, NULL
);
17335 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
17336 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
17337 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
17340 /* Target hook for c_mode_for_suffix. */
17341 static machine_mode
17342 aarch64_c_mode_for_suffix (char suffix
)
/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
   by:

   (-1)^s * (n/16) * 2^r

   Where:
   's' is the sign bit.
   'n' is an integer in the range 16 <= n <= 31.
   'r' is an integer in the range -3 <= r <= 4.  */
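/* For example (an illustrative sketch): 1.5 = (24/16) * 2^0 and
   -31.0 = -(31/16) * 2^4 are representable, whereas 1.0/3.0 has no
   finite representation of this form and 0.0 is rejected explicitly.  */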
17362 /* Return true iff X can be represented by a quarter-precision
17363 floating point immediate operand X. Note, we cannot represent 0.0. */
17365 aarch64_float_const_representable_p (rtx x
)
17367 /* This represents our current view of how many bits
17368 make up the mantissa. */
17369 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
17371 unsigned HOST_WIDE_INT mantissa
, mask
;
17372 REAL_VALUE_TYPE r
, m
;
17375 x
= unwrap_const_vec_duplicate (x
);
17376 if (!CONST_DOUBLE_P (x
))
17379 if (GET_MODE (x
) == VOIDmode
17380 || (GET_MODE (x
) == HFmode
&& !TARGET_FP_F16INST
))
17383 r
= *CONST_DOUBLE_REAL_VALUE (x
);
17385 /* We cannot represent infinities, NaNs or +/-zero. We won't
17386 know if we have +zero until we analyse the mantissa, but we
17387 can reject the other invalid values. */
17388 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
17389 || REAL_VALUE_MINUS_ZERO (r
))
17392 /* Extract exponent. */
17393 r
= real_value_abs (&r
);
17394 exponent
= REAL_EXP (&r
);
17396 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17397 highest (sign) bit, with a fixed binary point at bit point_pos.
17398 m1 holds the low part of the mantissa, m2 the high part.
17399 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17400 bits for the mantissa, this can fail (low bits will be lost). */
17401 real_ldexp (&m
, &r
, point_pos
- exponent
);
17402 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
  /* If the low part of the mantissa has bits set we cannot represent
     the value.  */
  if (w.ulow () != 0)
    return false;
17408 /* We have rejected the lower HOST_WIDE_INT, so update our
17409 understanding of how many bits lie in the mantissa and
17410 look only at the high HOST_WIDE_INT. */
17411 mantissa
= w
.elt (1);
17412 point_pos
-= HOST_BITS_PER_WIDE_INT
;
17414 /* We can only represent values with a mantissa of the form 1.xxxx. */
17415 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
17416 if ((mantissa
& mask
) != 0)
17419 /* Having filtered unrepresentable values, we may now remove all
17420 but the highest 5 bits. */
17421 mantissa
>>= point_pos
- 5;
  /* We cannot represent the value 0.0, so reject it.  This is handled
     elsewhere.  */
  if (mantissa == 0)
    return false;

  /* Then, as bit 4 is always set, we can mask it off, leaving
     the mantissa in the range [0, 15].  */
  mantissa &= ~(1 << 4);
  gcc_assert (mantissa <= 15);
17433 /* GCC internally does not use IEEE754-like encoding (where normalized
17434 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
17435 Our mantissa values are shifted 4 places to the left relative to
17436 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17437 by 5 places to correct for GCC's representation. */
17438 exponent
= 5 - exponent
;
17440 return (exponent
>= 0 && exponent
<= 7);
17443 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17444 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17445 output MOVI/MVNI, ORR or BIC immediate. */
17447 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
17448 enum simd_immediate_check which
)
17451 static char templ
[40];
17452 const char *mnemonic
;
17453 const char *shift_op
;
17454 unsigned int lane_count
= 0;
17457 struct simd_immediate_info info
;
  /* This will return true to show const_vector is legal for use as either
     an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17461 It will also update INFO to show how the immediate should be generated.
17462 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17463 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
17464 gcc_assert (is_valid
);
17466 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
17467 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
17469 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
17471 gcc_assert (info
.insn
== simd_immediate_info::MOV
17472 && info
.u
.mov
.shift
== 0);
17473 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17474 move immediate path. */
17475 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
17476 info
.u
.mov
.value
= GEN_INT (0);
17479 const unsigned int buf_size
= 20;
17480 char float_buf
[buf_size
] = {'\0'};
17481 real_to_decimal_for_mode (float_buf
,
17482 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
17483 buf_size
, buf_size
, 1, info
.elt_mode
);
17485 if (lane_count
== 1)
17486 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
17488 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
17489 lane_count
, element_char
, float_buf
);
17494 gcc_assert (CONST_INT_P (info
.u
.mov
.value
));
17496 if (which
== AARCH64_CHECK_MOV
)
17498 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
17499 shift_op
= (info
.u
.mov
.modifier
== simd_immediate_info::MSL
17501 if (lane_count
== 1)
17502 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
17503 mnemonic
, UINTVAL (info
.u
.mov
.value
));
17504 else if (info
.u
.mov
.shift
)
17505 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
17506 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
17507 element_char
, UINTVAL (info
.u
.mov
.value
), shift_op
,
17510 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
17511 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
17512 element_char
, UINTVAL (info
.u
.mov
.value
));
17516 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17517 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
17518 if (info
.u
.mov
.shift
)
17519 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
17520 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
17521 element_char
, UINTVAL (info
.u
.mov
.value
), "lsl",
17524 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
17525 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
17526 element_char
, UINTVAL (info
.u
.mov
.value
));
17532 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
17535 /* If a floating point number was passed and we desire to use it in an
17536 integer mode do the conversion to integer. */
17537 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
17539 unsigned HOST_WIDE_INT ival
;
17540 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
17541 gcc_unreachable ();
17542 immediate
= gen_int_mode (ival
, mode
);
17545 machine_mode vmode
;
  /* Use a 64-bit mode for everything except for DI/DF mode, where we use
     a 128-bit vector mode.  */
17548 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
17550 vmode
= aarch64_simd_container_mode (mode
, width
);
17551 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
17552 return aarch64_output_simd_mov_immediate (v_op
, width
);
17555 /* Return the output string to use for moving immediate CONST_VECTOR
17556 into an SVE register. */
17559 aarch64_output_sve_mov_immediate (rtx const_vector
)
17561 static char templ
[40];
17562 struct simd_immediate_info info
;
17565 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
17566 gcc_assert (is_valid
);
17568 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
17570 machine_mode vec_mode
= GET_MODE (const_vector
);
17571 if (aarch64_sve_pred_mode_p (vec_mode
))
17573 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
17574 if (info
.insn
== simd_immediate_info::MOV
)
17576 gcc_assert (info
.u
.mov
.value
== const0_rtx
);
17577 snprintf (buf
, sizeof (buf
), "pfalse\t%%0.b");
17581 gcc_assert (info
.insn
== simd_immediate_info::PTRUE
);
17582 unsigned int total_bytes
;
17583 if (info
.u
.pattern
== AARCH64_SV_ALL
17584 && BYTES_PER_SVE_VECTOR
.is_constant (&total_bytes
))
17585 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", element_char
,
17586 total_bytes
/ GET_MODE_SIZE (info
.elt_mode
));
17588 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, %s", element_char
,
17589 svpattern_token (info
.u
.pattern
));
17594 if (info
.insn
== simd_immediate_info::INDEX
)
17596 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
17597 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
17598 element_char
, INTVAL (info
.u
.index
.base
),
17599 INTVAL (info
.u
.index
.step
));
17603 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
17605 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
17606 info
.u
.mov
.value
= GEN_INT (0);
17609 const int buf_size
= 20;
17610 char float_buf
[buf_size
] = {};
17611 real_to_decimal_for_mode (float_buf
,
17612 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
17613 buf_size
, buf_size
, 1, info
.elt_mode
);
17615 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
17616 element_char
, float_buf
);
17621 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
17622 element_char
, INTVAL (info
.u
.mov
.value
));
17626 /* Split operands into moves from op[1] + op[2] into op[0]. */
17629 aarch64_split_combinev16qi (rtx operands
[3])
17631 unsigned int dest
= REGNO (operands
[0]);
17632 unsigned int src1
= REGNO (operands
[1]);
17633 unsigned int src2
= REGNO (operands
[2]);
17634 machine_mode halfmode
= GET_MODE (operands
[1]);
17635 unsigned int halfregs
= REG_NREGS (operands
[1]);
17636 rtx destlo
, desthi
;
17638 gcc_assert (halfmode
== V16QImode
);
17640 if (src1
== dest
&& src2
== dest
+ halfregs
)
17642 /* No-op move. Can't split to nothing; emit something. */
17643 emit_note (NOTE_INSN_DELETED
);
17647 /* Preserve register attributes for variable tracking. */
17648 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
17649 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
17650 GET_MODE_SIZE (halfmode
));
17652 /* Special case of reversed high/low parts. */
17653 if (reg_overlap_mentioned_p (operands
[2], destlo
)
17654 && reg_overlap_mentioned_p (operands
[1], desthi
))
17656 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
17657 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
17658 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
17660 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
17662 /* Try to avoid unnecessary moves if part of the result
17663 is in the right place already. */
17665 emit_move_insn (destlo
, operands
[1]);
17666 if (src2
!= dest
+ halfregs
)
17667 emit_move_insn (desthi
, operands
[2]);
17671 if (src2
!= dest
+ halfregs
)
17672 emit_move_insn (desthi
, operands
[2]);
17674 emit_move_insn (destlo
, operands
[1]);
17678 /* vec_perm support. */
17680 struct expand_vec_perm_d
17682 rtx target
, op0
, op1
;
17683 vec_perm_indices perm
;
17684 machine_mode vmode
;
17685 unsigned int vec_flags
;
17690 /* Generate a variable permutation. */
17693 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
17695 machine_mode vmode
= GET_MODE (target
);
17696 bool one_vector_p
= rtx_equal_p (op0
, op1
);
17698 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
17699 gcc_checking_assert (GET_MODE (op0
) == vmode
);
17700 gcc_checking_assert (GET_MODE (op1
) == vmode
);
17701 gcc_checking_assert (GET_MODE (sel
) == vmode
);
17702 gcc_checking_assert (TARGET_SIMD
);
17706 if (vmode
== V8QImode
)
17708 /* Expand the argument to a V16QI mode by duplicating it. */
17709 rtx pair
= gen_reg_rtx (V16QImode
);
17710 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
17711 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
17715 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
17722 if (vmode
== V8QImode
)
17724 pair
= gen_reg_rtx (V16QImode
);
17725 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
17726 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
17730 pair
= gen_reg_rtx (OImode
);
17731 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
17732 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
17737 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17738 NELT is the number of elements in the vector. */
17741 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
17744 machine_mode vmode
= GET_MODE (target
);
17745 bool one_vector_p
= rtx_equal_p (op0
, op1
);
17748 /* The TBL instruction does not use a modulo index, so we must take care
17749 of that ourselves. */
17750 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
17751 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
17752 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
17754 /* For big-endian, we also need to reverse the index within the vector
17755 (but not which vector). */
17756 if (BYTES_BIG_ENDIAN
)
17758 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17760 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
17761 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
17762 NULL
, 0, OPTAB_LIB_WIDEN
);
17764 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
17767 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17770 emit_unspec2 (rtx target
, int code
, rtx op0
, rtx op1
)
17772 emit_insn (gen_rtx_SET (target
,
17773 gen_rtx_UNSPEC (GET_MODE (target
),
17774 gen_rtvec (2, op0
, op1
), code
)));
17777 /* Expand an SVE vec_perm with the given operands. */
17780 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
17782 machine_mode data_mode
= GET_MODE (target
);
17783 machine_mode sel_mode
= GET_MODE (sel
);
17784 /* Enforced by the pattern condition. */
17785 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
17787 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17788 size of the two value vectors, i.e. the upper bits of the indices
17789 are effectively ignored. SVE TBL instead produces 0 for any
17790 out-of-range indices, so we need to modulo all the vec_perm indices
17791 to ensure they are all in range. */
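  /* As an illustrative sketch (the element count is hypothetical): with
     8 elements per input vector, a vec_perm index of 17 should select
     element 17 % 16 = 1 of the first input, but a raw SVE TBL index of
     17 would produce 0; the AND/PLUS adjustments below keep every index
     within the range TBL handles.  */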
17792 rtx sel_reg
= force_reg (sel_mode
, sel
);
17794 /* Check if the sel only references the first values vector. */
17795 if (GET_CODE (sel
) == CONST_VECTOR
17796 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
17798 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
17802 /* Check if the two values vectors are the same. */
17803 if (rtx_equal_p (op0
, op1
))
17805 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
17806 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
17807 NULL
, 0, OPTAB_DIRECT
);
17808 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
  /* Run TBL on each value vector and combine the results.  */
17814 rtx res0
= gen_reg_rtx (data_mode
);
17815 rtx res1
= gen_reg_rtx (data_mode
);
17816 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
17817 if (GET_CODE (sel
) != CONST_VECTOR
17818 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
17820 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
17822 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
17823 NULL
, 0, OPTAB_DIRECT
);
17825 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
17826 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
17827 NULL
, 0, OPTAB_DIRECT
);
17828 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
17829 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
17830 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
17832 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
17835 /* Recognize patterns suitable for the TRN instructions. */
17837 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
17840 poly_uint64 nelt
= d
->perm
.length ();
17841 rtx out
, in0
, in1
, x
;
17842 machine_mode vmode
= d
->vmode
;
17844 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17847 /* Note that these are little-endian tests.
17848 We correct for big-endian later. */
17849 if (!d
->perm
[0].is_constant (&odd
)
17850 || (odd
!= 0 && odd
!= 1)
17851 || !d
->perm
.series_p (0, 2, odd
, 2)
17852 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
17861 /* We don't need a big-endian lane correction for SVE; see the comment
17862 at the head of aarch64-sve.md for details. */
17863 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17865 x
= in0
, in0
= in1
, in1
= x
;
17870 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17871 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
17875 /* Recognize patterns suitable for the UZP instructions. */
17877 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
17880 rtx out
, in0
, in1
, x
;
17881 machine_mode vmode
= d
->vmode
;
17883 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17886 /* Note that these are little-endian tests.
17887 We correct for big-endian later. */
17888 if (!d
->perm
[0].is_constant (&odd
)
17889 || (odd
!= 0 && odd
!= 1)
17890 || !d
->perm
.series_p (0, 1, odd
, 2))
17899 /* We don't need a big-endian lane correction for SVE; see the comment
17900 at the head of aarch64-sve.md for details. */
17901 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17903 x
= in0
, in0
= in1
, in1
= x
;
17908 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17909 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
17913 /* Recognize patterns suitable for the ZIP instructions. */
17915 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
17918 poly_uint64 nelt
= d
->perm
.length ();
17919 rtx out
, in0
, in1
, x
;
17920 machine_mode vmode
= d
->vmode
;
17922 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17925 /* Note that these are little-endian tests.
17926 We correct for big-endian later. */
17927 poly_uint64 first
= d
->perm
[0];
17928 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
17929 || !d
->perm
.series_p (0, 2, first
, 1)
17930 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
17932 high
= maybe_ne (first
, 0U);
17940 /* We don't need a big-endian lane correction for SVE; see the comment
17941 at the head of aarch64-sve.md for details. */
17942 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17944 x
= in0
, in0
= in1
, in1
= x
;
17949 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17950 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
17954 /* Recognize patterns for the EXT insn. */
17957 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
17959 HOST_WIDE_INT location
;
17962 /* The first element always refers to the first vector.
17963 Check if the extracted indices are increasing by one. */
17964 if (d
->vec_flags
== VEC_SVE_PRED
17965 || !d
->perm
[0].is_constant (&location
)
17966 || !d
->perm
.series_p (0, 1, location
, 1))
17973 /* The case where (location == 0) is a no-op for both big- and little-endian,
17974 and is removed by the mid-end at optimization levels -O1 and higher.
17976 We don't need a big-endian lane correction for SVE; see the comment
17977 at the head of aarch64-sve.md for details. */
17978 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
17980 /* After setup, we want the high elements of the first vector (stored
17981 at the LSB end of the register), and the low elements of the second
17982 vector (stored at the MSB end of the register). So swap. */
17983 std::swap (d
->op0
, d
->op1
);
      /* location != 0 (above), so safe to assume (nelt - location) < nelt.
	 to_constant () is safe since this is restricted to Advanced SIMD
	 vectors.  */
17987 location
= d
->perm
.length ().to_constant () - location
;
17990 offset
= GEN_INT (location
);
17991 emit_set_insn (d
->target
,
17992 gen_rtx_UNSPEC (d
->vmode
,
17993 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
17998 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17999 within each 64-bit, 32-bit or 16-bit granule. */
18002 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
18004 HOST_WIDE_INT diff
;
18005 unsigned int i
, size
, unspec
;
18006 machine_mode pred_mode
;
18008 if (d
->vec_flags
== VEC_SVE_PRED
18009 || !d
->one_vector_p
18010 || !d
->perm
[0].is_constant (&diff
))
18013 size
= (diff
+ 1) * GET_MODE_UNIT_SIZE (d
->vmode
);
18016 unspec
= UNSPEC_REV64
;
18017 pred_mode
= VNx2BImode
;
18019 else if (size
== 4)
18021 unspec
= UNSPEC_REV32
;
18022 pred_mode
= VNx4BImode
;
18024 else if (size
== 2)
18026 unspec
= UNSPEC_REV16
;
18027 pred_mode
= VNx8BImode
;
18032 unsigned int step
= diff
+ 1;
18033 for (i
= 0; i
< step
; ++i
)
18034 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
18041 if (d
->vec_flags
== VEC_SVE_DATA
)
18043 machine_mode int_mode
= aarch64_sve_int_mode (pred_mode
);
18044 rtx target
= gen_reg_rtx (int_mode
);
18045 if (BYTES_BIG_ENDIAN
)
18046 /* The act of taking a subreg between INT_MODE and d->vmode
18047 is itself a reversing operation on big-endian targets;
18048 see the comment at the head of aarch64-sve.md for details.
18049 First reinterpret OP0 as INT_MODE without using a subreg
18050 and without changing the contents. */
18051 emit_insn (gen_aarch64_sve_reinterpret (int_mode
, target
, d
->op0
));
18054 /* For SVE we use REV[BHW] unspecs derived from the element size
18055 of v->mode and vector modes whose elements have SIZE bytes.
18056 This ensures that the vector modes match the predicate modes. */
18057 int unspec
= aarch64_sve_rev_unspec (d
->vmode
);
18058 rtx pred
= aarch64_ptrue_reg (pred_mode
);
18059 emit_insn (gen_aarch64_pred (unspec
, int_mode
, target
, pred
,
18060 gen_lowpart (int_mode
, d
->op0
)));
18062 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
18065 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
18066 emit_set_insn (d
->target
, src
);
18070 /* Recognize patterns for the REV insn, which reverses elements within
18074 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
18076 poly_uint64 nelt
= d
->perm
.length ();
18078 if (!d
->one_vector_p
|| d
->vec_flags
!= VEC_SVE_DATA
)
18081 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
18088 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
18089 emit_set_insn (d
->target
, src
);
18094 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
18096 rtx out
= d
->target
;
18099 machine_mode vmode
= d
->vmode
;
18102 if (d
->vec_flags
== VEC_SVE_PRED
18103 || d
->perm
.encoding ().encoded_nelts () != 1
18104 || !d
->perm
[0].is_constant (&elt
))
18107 if (d
->vec_flags
== VEC_SVE_DATA
&& elt
>= 64 * GET_MODE_UNIT_SIZE (vmode
))
18114 /* The generic preparation in aarch64_expand_vec_perm_const_1
18115 swaps the operand order and the permute indices if it finds
18116 d->perm[0] to be in the second operand. Thus, we can always
18117 use d->op0 and need not do any extra arithmetic to get the
18118 correct lane number. */
18120 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
18122 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
18123 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
18124 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
18129 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
18131 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
18132 machine_mode vmode
= d
->vmode
;
18134 /* Make sure that the indices are constant. */
18135 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
18136 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
18137 if (!d
->perm
[i
].is_constant ())
18143 /* Generic code will try constant permutation twice. Once with the
18144 original mode and again with the elements lowered to QImode.
18145 So wait and don't do the selector expansion ourselves. */
18146 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
  /* to_constant is safe since this routine is specific to Advanced SIMD
     vectors.  */
18151 unsigned int nelt
= d
->perm
.length ().to_constant ();
18152 for (unsigned int i
= 0; i
< nelt
; ++i
)
18153 /* If big-endian and two vectors we end up with a weird mixed-endian
18154 mode on NEON. Reverse the index within each word but not the word
18155 itself. to_constant is safe because we checked is_constant above. */
18156 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
18157 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
18158 : d
->perm
[i
].to_constant ());
18160 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
18161 sel
= force_reg (vmode
, sel
);
18163 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
18167 /* Try to implement D using an SVE TBL instruction. */
18170 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
18172 unsigned HOST_WIDE_INT nelt
;
  /* Permuting two variable-length vectors could overflow the
     index range.  */
18176 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
18182 machine_mode sel_mode
= mode_for_int_vector (d
->vmode
).require ();
18183 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
18184 if (d
->one_vector_p
)
18185 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
18187 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
18191 /* Try to implement D using SVE SEL instruction. */
18194 aarch64_evpc_sel (struct expand_vec_perm_d
*d
)
18196 machine_mode vmode
= d
->vmode
;
18197 int unit_size
= GET_MODE_UNIT_SIZE (vmode
);
18199 if (d
->vec_flags
!= VEC_SVE_DATA
18203 int n_patterns
= d
->perm
.encoding ().npatterns ();
18204 poly_int64 vec_len
= d
->perm
.length ();
18206 for (int i
= 0; i
< n_patterns
; ++i
)
18207 if (!known_eq (d
->perm
[i
], i
)
18208 && !known_eq (d
->perm
[i
], vec_len
+ i
))
18211 for (int i
= n_patterns
; i
< n_patterns
* 2; i
++)
18212 if (!d
->perm
.series_p (i
, n_patterns
, i
, n_patterns
)
18213 && !d
->perm
.series_p (i
, n_patterns
, vec_len
+ i
, n_patterns
))
18219 machine_mode pred_mode
= aarch64_sve_pred_mode (unit_size
).require ();
18221 rtx_vector_builder
builder (pred_mode
, n_patterns
, 2);
18222 for (int i
= 0; i
< n_patterns
* 2; i
++)
18224 rtx elem
= known_eq (d
->perm
[i
], i
) ? CONST1_RTX (BImode
)
18225 : CONST0_RTX (BImode
);
18226 builder
.quick_push (elem
);
18229 rtx const_vec
= builder
.build ();
18230 rtx pred
= force_reg (pred_mode
, const_vec
);
18231 emit_insn (gen_vcond_mask (vmode
, vmode
, d
->target
, d
->op1
, d
->op0
, pred
));
18236 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
18238 /* The pattern matching functions above are written to look for a small
18239 number to begin the sequence (0, 1, N/2). If we begin with an index
18240 from the second operand, we can swap the operands. */
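  /* As an illustrative sketch (the mode is hypothetical): for two V4SI
     inputs and the permutation {4, 5, 0, 1}, the first index selects from
     the second operand, so the inputs are rotated and the operands
     swapped, leaving the equivalent permutation {0, 1, 4, 5} for the
     matchers below.  */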
18241 poly_int64 nelt
= d
->perm
.length ();
18242 if (known_ge (d
->perm
[0], nelt
))
18244 d
->perm
.rotate_inputs (1);
18245 std::swap (d
->op0
, d
->op1
);
18248 if ((d
->vec_flags
== VEC_ADVSIMD
18249 || d
->vec_flags
== VEC_SVE_DATA
18250 || d
->vec_flags
== VEC_SVE_PRED
)
18251 && known_gt (nelt
, 1))
18253 if (aarch64_evpc_rev_local (d
))
18255 else if (aarch64_evpc_rev_global (d
))
18257 else if (aarch64_evpc_ext (d
))
18259 else if (aarch64_evpc_dup (d
))
18261 else if (aarch64_evpc_zip (d
))
18263 else if (aarch64_evpc_uzp (d
))
18265 else if (aarch64_evpc_trn (d
))
18267 else if (aarch64_evpc_sel (d
))
18269 if (d
->vec_flags
== VEC_SVE_DATA
)
18270 return aarch64_evpc_sve_tbl (d
);
18271 else if (d
->vec_flags
== VEC_ADVSIMD
)
18272 return aarch64_evpc_tbl (d
);
18277 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
18280 aarch64_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
18281 rtx op1
, const vec_perm_indices
&sel
)
18283 struct expand_vec_perm_d d
;
18285 /* Check whether the mask can be applied to a single vector. */
18286 if (sel
.ninputs () == 1
18287 || (op0
&& rtx_equal_p (op0
, op1
)))
18288 d
.one_vector_p
= true;
18289 else if (sel
.all_from_input_p (0))
18291 d
.one_vector_p
= true;
18294 else if (sel
.all_from_input_p (1))
18296 d
.one_vector_p
= true;
18300 d
.one_vector_p
= false;
18302 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
18303 sel
.nelts_per_input ());
18305 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
18309 d
.testing_p
= !target
;
18312 return aarch64_expand_vec_perm_const_1 (&d
);
18314 rtx_insn
*last
= get_last_insn ();
18315 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
18316 gcc_assert (last
== get_last_insn ());
18321 /* Generate a byte permute mask for a register of mode MODE,
18322 which has NUNITS units. */
18325 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
  /* We have to reverse each vector because we don't have
     a permuted load that can reverse-load according to ABI rules.  */
18330 rtvec v
= rtvec_alloc (16);
18332 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
18334 gcc_assert (BYTES_BIG_ENDIAN
);
18335 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
18337 for (i
= 0; i
< nunits
; i
++)
18338 for (j
= 0; j
< usize
; j
++)
18339 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
18340 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
18341 return force_reg (V16QImode
, mask
);
18344 /* Expand an SVE integer comparison using the SVE equivalent of:
18346 (set TARGET (CODE OP0 OP1)). */
18349 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
18351 machine_mode pred_mode
= GET_MODE (target
);
18352 machine_mode data_mode
= GET_MODE (op0
);
18353 rtx res
= aarch64_sve_emit_int_cmp (target
, pred_mode
, code
, data_mode
,
18355 if (!rtx_equal_p (target
, res
))
18356 emit_move_insn (target
, res
);
18359 /* Return the UNSPEC_COND_* code for comparison CODE. */
18361 static unsigned int
18362 aarch64_unspec_cond_code (rtx_code code
)
18367 return UNSPEC_COND_FCMNE
;
18369 return UNSPEC_COND_FCMEQ
;
18371 return UNSPEC_COND_FCMLT
;
18373 return UNSPEC_COND_FCMGT
;
18375 return UNSPEC_COND_FCMLE
;
18377 return UNSPEC_COND_FCMGE
;
18379 return UNSPEC_COND_FCMUO
;
18381 gcc_unreachable ();

/* Emit the SVE equivalent of:

     (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))

   where <X> is the operation associated with comparison CODE.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
			  bool known_ptrue_p, rtx op0, rtx op1)
{
  rtx flag = gen_int_mode (known_ptrue_p, SImode);
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
			       gen_rtvec (4, pred, flag, op0, op1),
			       aarch64_unspec_cond_code (code));
  emit_set_insn (target, unspec);
}

/* Emit the SVE equivalent of:

     (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
     (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
     (set TARGET (ior:PRED_MODE TMP1 TMP2))

   where <Xi> is the operation associated with comparison CODEi.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
			      rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (pred);
  rtx tmp1 = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
  rtx tmp2 = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
  aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
}
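
/* aarch64_expand_sve_vec_cmp_float below uses this helper for comparisons
   with no single native SVE form, such as the (LT || GT) and
   (UNORDERED || EQ) expansions visible there.  */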

/* Emit the SVE equivalent of:

     (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
     (set TARGET (not TMP))

   where <X> is the operation associated with comparison CODE.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
				 bool known_ptrue_p, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (pred);
  rtx tmp = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
  aarch64_emit_unop (target, one_cmpl_optab, tmp);
}
18442 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18444 (set TARGET (CODE OP0 OP1))
18446 If CAN_INVERT_P is true, the caller can also handle inverted results;
18447 return true if the result is in fact inverted. */
18450 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
18451 rtx op0
, rtx op1
, bool can_invert_p
)
18453 machine_mode pred_mode
= GET_MODE (target
);
18454 machine_mode data_mode
= GET_MODE (op0
);
18456 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
18460 /* UNORDERED has no immediate form. */
18461 op1
= force_reg (data_mode
, op1
);
18470 /* There is native support for the comparison. */
18471 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
18476 /* This is a trapping operation (LT or GT). */
18477 aarch64_emit_sve_or_fp_conds (target
, LT
, GT
, ptrue
, true, op0
, op1
);
18481 if (!flag_trapping_math
)
18483 /* This would trap for signaling NaNs. */
18484 op1
= force_reg (data_mode
, op1
);
18485 aarch64_emit_sve_or_fp_conds (target
, UNORDERED
, EQ
,
18486 ptrue
, true, op0
, op1
);
18494 if (flag_trapping_math
)
18496 /* Work out which elements are ordered. */
18497 rtx ordered
= gen_reg_rtx (pred_mode
);
18498 op1
= force_reg (data_mode
, op1
);
18499 aarch64_emit_sve_invert_fp_cond (ordered
, UNORDERED
,
18500 ptrue
, true, op0
, op1
);
18502 /* Test the opposite condition for the ordered elements,
18503 then invert the result. */
18507 code
= reverse_condition_maybe_unordered (code
);
18510 aarch64_emit_sve_fp_cond (target
, code
,
18511 ordered
, false, op0
, op1
);
18514 aarch64_emit_sve_invert_fp_cond (target
, code
,
18515 ordered
, false, op0
, op1
);
18521 /* ORDERED has no immediate form. */
18522 op1
= force_reg (data_mode
, op1
);
18526 gcc_unreachable ();
18529 /* There is native support for the inverse comparison. */
18530 code
= reverse_condition_maybe_unordered (code
);
18533 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
18536 aarch64_emit_sve_invert_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
18540 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18541 of the data being selected and CMP_MODE is the mode of the values being
18545 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
18548 machine_mode pred_mode
18549 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode
),
18550 GET_MODE_SIZE (cmp_mode
)).require ();
18551 rtx pred
= gen_reg_rtx (pred_mode
);
18552 if (FLOAT_MODE_P (cmp_mode
))
18554 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
18555 ops
[4], ops
[5], true))
18556 std::swap (ops
[1], ops
[2]);
18559 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
18561 if (!aarch64_sve_reg_or_dup_imm (ops
[1], data_mode
))
18562 ops
[1] = force_reg (data_mode
, ops
[1]);
18563 /* The "false" value can only be zero if the "true" value is a constant. */
18564 if (register_operand (ops
[1], data_mode
)
18565 || !aarch64_simd_reg_or_zero (ops
[2], data_mode
))
18566 ops
[2] = force_reg (data_mode
, ops
[2]);
18568 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
18569 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
/* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
   true.  However, due to issues with register allocation it is preferable
   to avoid tying integer scalar and FP scalar modes.  Executing integer
   operations in general registers is better than treating them as scalar
   vector operations.  This reduces latency and avoids redundant int<->FP
   moves.  So tie modes if they are either the same class, or vector modes
   with other vector modes, vector structs or any scalar mode.  */
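/* For example, DImode and DFmode are not tied (different classes and
   neither is a vector mode), whereas V4SImode and V2DImode are, and so are
   DImode and V2DImode.  */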
18581 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
18583 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
18586 /* We specifically want to allow elements of "structure" modes to
18587 be tieable to the structure. This more general condition allows
18588 other rarer situations too. The reason we don't extend this to
18589 predicate modes is that there are no predicate structure modes
18590 nor any specific instructions for extracting part of a predicate
18592 if (aarch64_vector_data_mode_p (mode1
)
18593 && aarch64_vector_data_mode_p (mode2
))
18596 /* Also allow any scalar modes with vectors. */
18597 if (aarch64_vector_mode_supported_p (mode1
)
18598 || aarch64_vector_mode_supported_p (mode2
))
/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */

static rtx
aarch64_move_pointer (rtx pointer, poly_int64 amount)
{
  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);

  return adjust_automodify_address (pointer, GET_MODE (pointer),
				    next, amount);
}

/* Return a new RTX holding the result of moving POINTER forward by the
   size of the mode it points to.  */

static rtx
aarch64_progress_pointer (rtx pointer)
{
  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
}

/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
   the size of MODE.  */

static void
aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
					      machine_mode mode)
{
  rtx reg = gen_reg_rtx (mode);

  /* "Cast" the pointers to the correct mode.  */
  *src = adjust_address (*src, mode, 0);
  *dst = adjust_address (*dst, mode, 0);
  /* Emit the memcpy.  */
  emit_move_insn (reg, *src);
  emit_move_insn (*dst, reg);
  /* Move the pointers forward.  */
  *src = aarch64_progress_pointer (*src);
  *dst = aarch64_progress_pointer (*dst);
}
18645 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18646 we succeed, otherwise return false. */
18649 aarch64_expand_cpymem (rtx
*operands
)
18652 rtx dst
= operands
[0];
18653 rtx src
= operands
[1];
18655 machine_mode cur_mode
= BLKmode
, next_mode
;
18656 bool speed_p
= !optimize_function_for_size_p (cfun
);
  /* When optimizing for size, give a better estimate of the length of a
     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
     will always require an even number of instructions to do now.  And each
     operation requires both a load+store, so divide the max number by 2.  */
18662 int max_num_moves
= (speed_p
? 16 : AARCH64_CALL_RATIO
) / 2;
18664 /* We can't do anything smart if the amount to copy is not constant. */
18665 if (!CONST_INT_P (operands
[2]))
18668 n
= INTVAL (operands
[2]);
  /* Try to keep the number of instructions low.  For all cases we will do
     at most two moves for the residual amount, since we'll always overlap
     the remainder.  */
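  /* For example, a 40-byte copy counts as 40 / 16 == 2 full 16-byte moves
     plus 2 residual moves, so it is rejected whenever max_num_moves is
     below 4.  */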
18673 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_num_moves
)
18676 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
18677 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
18679 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
18680 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
18682 /* Convert n to bits to make the rest of the code simpler. */
18683 n
= n
* BITS_PER_UNIT
;
18685 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18686 larger than TImode, but we should not use them for loads/stores here. */
18687 const int copy_limit
= GET_MODE_BITSIZE (TImode
);
      /* Find the largest mode in which to do the copy without over reading
	 or writing.  */
18693 opt_scalar_int_mode mode_iter
;
18694 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
18695 if (GET_MODE_BITSIZE (mode_iter
.require ()) <= MIN (n
, copy_limit
))
18696 cur_mode
= mode_iter
.require ();
18698 gcc_assert (cur_mode
!= BLKmode
);
18700 mode_bits
= GET_MODE_BITSIZE (cur_mode
).to_constant ();
18701 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, cur_mode
);
  /* Do certain trailing copies as overlapping if it's going to be
     cheaper, i.e. fewer instructions to do so.  For instance, for a 15-byte
     copy it's more efficient to do two overlapping 8-byte copies than
     a sequence of smaller non-overlapping copies.  */
18709 if (n
> 0 && n
<= 8 * BITS_PER_UNIT
)
18711 next_mode
= smallest_mode_for_size (n
, MODE_INT
);
18712 int n_bits
= GET_MODE_BITSIZE (next_mode
).to_constant ();
18713 src
= aarch64_move_pointer (src
, (n
- n_bits
) / BITS_PER_UNIT
);
18714 dst
= aarch64_move_pointer (dst
, (n
- n_bits
) / BITS_PER_UNIT
);
18722 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18723 SImode stores. Handle the case when the constant has identical
18724 bottom and top halves. This is beneficial when the two stores can be
18725 merged into an STP and we avoid synthesising potentially expensive
18726 immediates twice. Return true if such a split is possible. */
18729 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
18731 rtx lo
= gen_lowpart (SImode
, src
);
18732 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
18734 bool size_p
= optimize_function_for_size_p (cfun
);
18736 if (!rtx_equal_p (lo
, hi
))
18739 unsigned int orig_cost
18740 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
18741 unsigned int lo_cost
18742 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
18744 /* We want to transform:
18746 MOVK x1, 0x140, lsl 16
18747 MOVK x1, 0xc0da, lsl 32
18748 MOVK x1, 0x140, lsl 48
18752 MOVK w1, 0x140, lsl 16
18754 So we want to perform this only when we save two instructions
18755 or more. When optimizing for size, however, accept any code size
18757 if (size_p
&& orig_cost
<= lo_cost
)
18761 && (orig_cost
<= lo_cost
+ 1))
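  /* That is, when not optimizing for size, only split when the DImode
     constant needs at least two more instructions to build than its
     SImode low half.  */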
18764 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
18765 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
18768 rtx tmp_reg
= gen_reg_rtx (SImode
);
18769 aarch64_expand_mov_immediate (tmp_reg
, lo
);
18770 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
18771 /* Don't emit an explicit store pair as this may not be always profitable.
18772 Let the sched-fusion logic decide whether to merge them. */
18773 emit_move_insn (mem_lo
, tmp_reg
);
18774 emit_move_insn (mem_hi
, tmp_reg
);
18779 /* Generate RTL for a conditional branch with rtx comparison CODE in
18780 mode CC_MODE. The destination of the unlikely conditional branch
18784 aarch64_gen_unlikely_cbranch (enum rtx_code code
, machine_mode cc_mode
,
18788 x
= gen_rtx_fmt_ee (code
, VOIDmode
,
18789 gen_rtx_REG (cc_mode
, CC_REGNUM
),
18792 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
18793 gen_rtx_LABEL_REF (VOIDmode
, label_ref
),
18795 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
18798 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18800 OP1 represents the TImode destination operand 1
18801 OP2 represents the TImode destination operand 2
18802 LOW_DEST represents the low half (DImode) of TImode operand 0
18803 LOW_IN1 represents the low half (DImode) of TImode operand 1
18804 LOW_IN2 represents the low half (DImode) of TImode operand 2
18805 HIGH_DEST represents the high half (DImode) of TImode operand 0
18806 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18807 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18810 aarch64_addti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
18811 rtx
*low_in1
, rtx
*low_in2
,
18812 rtx
*high_dest
, rtx
*high_in1
,
18815 *low_dest
= gen_reg_rtx (DImode
);
18816 *low_in1
= gen_lowpart (DImode
, op1
);
18817 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18818 subreg_lowpart_offset (DImode
, TImode
));
18819 *high_dest
= gen_reg_rtx (DImode
);
18820 *high_in1
= gen_highpart (DImode
, op1
);
18821 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18822 subreg_highpart_offset (DImode
, TImode
));
/* Generate DImode scratch registers for 128-bit (TImode) subtraction.

   This function differs from 'aarch64_addti_scratch_regs' in that
   OP1 can be an immediate constant (zero).  We must call
   subreg_highpart_offset with DImode and TImode arguments, otherwise
   VOIDmode will be used for the const_int, which generates an internal
   error from subreg_size_highpart_offset, which does not expect a size of
   zero.
18833 OP1 represents the TImode destination operand 1
18834 OP2 represents the TImode destination operand 2
18835 LOW_DEST represents the low half (DImode) of TImode operand 0
18836 LOW_IN1 represents the low half (DImode) of TImode operand 1
18837 LOW_IN2 represents the low half (DImode) of TImode operand 2
18838 HIGH_DEST represents the high half (DImode) of TImode operand 0
18839 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18840 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18844 aarch64_subvti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
18845 rtx
*low_in1
, rtx
*low_in2
,
18846 rtx
*high_dest
, rtx
*high_in1
,
18849 *low_dest
= gen_reg_rtx (DImode
);
18850 *low_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
18851 subreg_lowpart_offset (DImode
, TImode
));
18853 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18854 subreg_lowpart_offset (DImode
, TImode
));
18855 *high_dest
= gen_reg_rtx (DImode
);
18857 *high_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
18858 subreg_highpart_offset (DImode
, TImode
));
18859 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18860 subreg_highpart_offset (DImode
, TImode
));
18863 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18865 OP0 represents the TImode destination operand 0
18866 LOW_DEST represents the low half (DImode) of TImode operand 0
18867 LOW_IN1 represents the low half (DImode) of TImode operand 1
18868 LOW_IN2 represents the low half (DImode) of TImode operand 2
18869 HIGH_DEST represents the high half (DImode) of TImode operand 0
18870 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18871 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18872 UNSIGNED_P is true if the operation is being performed on unsigned
18875 aarch64_expand_subvti (rtx op0
, rtx low_dest
, rtx low_in1
,
18876 rtx low_in2
, rtx high_dest
, rtx high_in1
,
18877 rtx high_in2
, bool unsigned_p
)
18879 if (low_in2
== const0_rtx
)
18881 low_dest
= low_in1
;
18882 high_in2
= force_reg (DImode
, high_in2
);
18884 emit_insn (gen_subdi3_compare1 (high_dest
, high_in1
, high_in2
));
18886 emit_insn (gen_subvdi_insn (high_dest
, high_in1
, high_in2
));
18890 if (CONST_INT_P (low_in2
))
18892 high_in2
= force_reg (DImode
, high_in2
);
18893 emit_insn (gen_subdi3_compare1_imm (low_dest
, low_in1
, low_in2
,
18894 GEN_INT (-INTVAL (low_in2
))));
18897 emit_insn (gen_subdi3_compare1 (low_dest
, low_in1
, low_in2
));
18900 emit_insn (gen_usubdi3_carryinC (high_dest
, high_in1
, high_in2
));
18902 emit_insn (gen_subdi3_carryinV (high_dest
, high_in1
, high_in2
));
18905 emit_move_insn (gen_lowpart (DImode
, op0
), low_dest
);
18906 emit_move_insn (gen_highpart (DImode
, op0
), high_dest
);
18910 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
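/* With the default AddressSanitizer shadow scale of 3, a shadow byte
   address is computed as (address >> 3) + the offset returned here; the
   value differs between ILP32 and LP64.  */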
static unsigned HOST_WIDE_INT
aarch64_asan_shadow_offset (void)
{
  if (TARGET_ILP32)
    return (HOST_WIDE_INT_1 << 29);
  else
    return (HOST_WIDE_INT_1 << 36);
}
18922 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
18923 int code
, tree treeop0
, tree treeop1
)
18925 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
18927 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
18929 struct expand_operand ops
[4];
18932 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
18934 op_mode
= GET_MODE (op0
);
18935 if (op_mode
== VOIDmode
)
18936 op_mode
= GET_MODE (op1
);
18944 icode
= CODE_FOR_cmpsi
;
18949 icode
= CODE_FOR_cmpdi
;
18954 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
18955 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
18960 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
18961 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
18969 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
18970 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
18976 *prep_seq
= get_insns ();
18979 create_fixed_operand (&ops
[0], op0
);
18980 create_fixed_operand (&ops
[1], op1
);
18983 if (!maybe_expand_insn (icode
, 2, ops
))
18988 *gen_seq
= get_insns ();
18991 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
18992 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
18996 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
18997 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
18999 rtx op0
, op1
, target
;
19000 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
19001 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
19003 struct expand_operand ops
[6];
19006 push_to_sequence (*prep_seq
);
19007 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
19009 op_mode
= GET_MODE (op0
);
19010 if (op_mode
== VOIDmode
)
19011 op_mode
= GET_MODE (op1
);
19019 icode
= CODE_FOR_ccmpsi
;
19024 icode
= CODE_FOR_ccmpdi
;
19029 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
19030 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
19035 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
19036 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
19044 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
19045 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
19051 *prep_seq
= get_insns ();
19054 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
19055 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
19057 if (bit_code
!= AND
)
19059 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
19060 GET_MODE (XEXP (prev
, 0))),
19061 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
19062 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
19065 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
19066 create_fixed_operand (&ops
[1], target
);
19067 create_fixed_operand (&ops
[2], op0
);
19068 create_fixed_operand (&ops
[3], op1
);
19069 create_fixed_operand (&ops
[4], prev
);
19070 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
19072 push_to_sequence (*gen_seq
);
19073 if (!maybe_expand_insn (icode
, 6, ops
))
19079 *gen_seq
= get_insns ();
19082 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
19085 #undef TARGET_GEN_CCMP_FIRST
19086 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
19088 #undef TARGET_GEN_CCMP_NEXT
19089 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
19091 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
19092 instruction fusion of some sort. */
19095 aarch64_macro_fusion_p (void)
19097 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
19101 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
19102 should be kept together during scheduling. */
19105 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
19108 rtx prev_set
= single_set (prev
);
19109 rtx curr_set
= single_set (curr
);
19110 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
19111 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
19113 if (!aarch64_macro_fusion_p ())
19116 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
19118 /* We are trying to match:
19119 prev (mov) == (set (reg r0) (const_int imm16))
19120 curr (movk) == (set (zero_extract (reg r0)
19123 (const_int imm16_1)) */
19125 set_dest
= SET_DEST (curr_set
);
19127 if (GET_CODE (set_dest
) == ZERO_EXTRACT
19128 && CONST_INT_P (SET_SRC (curr_set
))
19129 && CONST_INT_P (SET_SRC (prev_set
))
19130 && CONST_INT_P (XEXP (set_dest
, 2))
19131 && INTVAL (XEXP (set_dest
, 2)) == 16
19132 && REG_P (XEXP (set_dest
, 0))
19133 && REG_P (SET_DEST (prev_set
))
19134 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
19140 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
19143 /* We're trying to match:
19144 prev (adrp) == (set (reg r1)
19145 (high (symbol_ref ("SYM"))))
19146 curr (add) == (set (reg r0)
19148 (symbol_ref ("SYM"))))
19149 Note that r0 need not necessarily be the same as r1, especially
19150 during pre-regalloc scheduling. */
19152 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
19153 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
19155 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
19156 && REG_P (XEXP (SET_SRC (curr_set
), 0))
19157 && REGNO (XEXP (SET_SRC (curr_set
), 0))
19158 == REGNO (SET_DEST (prev_set
))
19159 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
19160 XEXP (SET_SRC (curr_set
), 1)))
19165 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
19168 /* We're trying to match:
19169 prev (movk) == (set (zero_extract (reg r0)
19172 (const_int imm16_1))
19173 curr (movk) == (set (zero_extract (reg r0)
19176 (const_int imm16_2)) */
19178 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
19179 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
19180 && REG_P (XEXP (SET_DEST (prev_set
), 0))
19181 && REG_P (XEXP (SET_DEST (curr_set
), 0))
19182 && REGNO (XEXP (SET_DEST (prev_set
), 0))
19183 == REGNO (XEXP (SET_DEST (curr_set
), 0))
19184 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
19185 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
19186 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
19187 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
19188 && CONST_INT_P (SET_SRC (prev_set
))
19189 && CONST_INT_P (SET_SRC (curr_set
)))
19193 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
19195 /* We're trying to match:
19196 prev (adrp) == (set (reg r0)
19197 (high (symbol_ref ("SYM"))))
19198 curr (ldr) == (set (reg r1)
19199 (mem (lo_sum (reg r0)
19200 (symbol_ref ("SYM")))))
19202 curr (ldr) == (set (reg r1)
19205 (symbol_ref ("SYM")))))) */
19206 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
19207 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
19209 rtx curr_src
= SET_SRC (curr_set
);
19211 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
19212 curr_src
= XEXP (curr_src
, 0);
19214 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
19215 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
19216 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
19217 == REGNO (SET_DEST (prev_set
))
19218 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
19219 XEXP (SET_SRC (prev_set
), 0)))
19224 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
19225 && any_condjump_p (curr
))
19227 unsigned int condreg1
, condreg2
;
19229 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
19230 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
19232 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
19234 && modified_in_p (cc_reg_1
, prev
))
19236 enum attr_type prev_type
= get_attr_type (prev
);
	  /* FIXME: this misses some instructions that are considered simple
	     arithmetic for ThunderX.  Simple shifts are missed here.  */
19240 if (prev_type
== TYPE_ALUS_SREG
19241 || prev_type
== TYPE_ALUS_IMM
19242 || prev_type
== TYPE_LOGICS_REG
19243 || prev_type
== TYPE_LOGICS_IMM
)
19250 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
19251 && any_condjump_p (curr
))
19253 /* We're trying to match:
19254 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
19255 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
19257 (label_ref ("SYM"))
19259 if (SET_DEST (curr_set
) == (pc_rtx
)
19260 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
19261 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
19262 && REG_P (SET_DEST (prev_set
))
19263 && REGNO (SET_DEST (prev_set
))
19264 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
19266 /* Fuse ALU operations followed by conditional branch instruction. */
19267 switch (get_attr_type (prev
))
19270 case TYPE_ALU_SREG
:
19273 case TYPE_ADCS_REG
:
19274 case TYPE_ADCS_IMM
:
19275 case TYPE_LOGIC_REG
:
19276 case TYPE_LOGIC_IMM
:
19280 case TYPE_SHIFT_REG
:
19281 case TYPE_SHIFT_IMM
:
19296 /* Return true iff the instruction fusion described by OP is enabled. */
19299 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op
)
19301 return (aarch64_tune_params
.fusible_ops
& op
) != 0;
/* If MEM is in the form of [base+offset], extract the two parts of the
   address and store them in BASE and OFFSET; otherwise return false
   after clearing BASE and OFFSET.  */
19309 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
19313 gcc_assert (MEM_P (mem
));
19315 addr
= XEXP (mem
, 0);
19320 *offset
= const0_rtx
;
19324 if (GET_CODE (addr
) == PLUS
19325 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
19327 *base
= XEXP (addr
, 0);
19328 *offset
= XEXP (addr
, 1);
19333 *offset
= NULL_RTX
;
19338 /* Types for scheduling fusion. */
19339 enum sched_fusion_type
19341 SCHED_FUSION_NONE
= 0,
19342 SCHED_FUSION_LD_SIGN_EXTEND
,
19343 SCHED_FUSION_LD_ZERO_EXTEND
,
/* If INSN is a load or store of an address in the form of [base+offset],
   extract the two parts and store them in BASE and OFFSET.  Return the
   scheduling fusion type of this INSN.  */
19353 static enum sched_fusion_type
19354 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
19357 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
19359 gcc_assert (INSN_P (insn
));
19360 x
= PATTERN (insn
);
19361 if (GET_CODE (x
) != SET
)
19362 return SCHED_FUSION_NONE
;
19365 dest
= SET_DEST (x
);
19367 machine_mode dest_mode
= GET_MODE (dest
);
19369 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
19370 return SCHED_FUSION_NONE
;
19372 if (GET_CODE (src
) == SIGN_EXTEND
)
19374 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
19375 src
= XEXP (src
, 0);
19376 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
19377 return SCHED_FUSION_NONE
;
19379 else if (GET_CODE (src
) == ZERO_EXTEND
)
19381 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
19382 src
= XEXP (src
, 0);
19383 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
19384 return SCHED_FUSION_NONE
;
19387 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
19388 extract_base_offset_in_addr (src
, base
, offset
);
19389 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
19391 fusion
= SCHED_FUSION_ST
;
19392 extract_base_offset_in_addr (dest
, base
, offset
);
19395 return SCHED_FUSION_NONE
;
19397 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
19398 fusion
= SCHED_FUSION_NONE
;
/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.

   Currently we only support fusing ldr or str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, other kinds of instruction fusion can be supported by returning
   different priorities.

   It's important that irrelevant instructions get the largest FUSION_PRI.  */
19413 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
19414 int *fusion_pri
, int *pri
)
19418 enum sched_fusion_type fusion
;
19420 gcc_assert (INSN_P (insn
));
19423 fusion
= fusion_load_store (insn
, &base
, &offset
);
19424 if (fusion
== SCHED_FUSION_NONE
)
19431 /* Set FUSION_PRI according to fusion type and base register. */
19432 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
19434 /* Calculate PRI. */
19437 /* INSN with smaller offset goes first. */
19438 off_val
= (int)(INTVAL (offset
));
19440 tmp
-= (off_val
& 0xfffff);
19442 tmp
+= ((- off_val
) & 0xfffff);
19448 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19449 Adjust priority of sha1h instructions so they are scheduled before
19450 other SHA1 instructions. */
19453 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
19455 rtx x
= PATTERN (insn
);
19457 if (GET_CODE (x
) == SET
)
19461 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
19462 return priority
+ 10;
19468 /* Given OPERANDS of consecutive load/store, check if we can merge
19469 them into ldp/stp. LOAD is true if they are load instructions.
19470 MODE is the mode of memory operands. */
19473 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
19476 HOST_WIDE_INT offval_1
, offval_2
, msize
;
19477 enum reg_class rclass_1
, rclass_2
;
19478 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
19482 mem_1
= operands
[1];
19483 mem_2
= operands
[3];
19484 reg_1
= operands
[0];
19485 reg_2
= operands
[2];
19486 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
19487 if (REGNO (reg_1
) == REGNO (reg_2
))
19492 mem_1
= operands
[0];
19493 mem_2
= operands
[2];
19494 reg_1
= operands
[1];
19495 reg_2
= operands
[3];
19498 /* The mems cannot be volatile. */
19499 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
  /* If we have SImode and slow unaligned ldp,
     check that the alignment is at least 8 bytes.  */
19505 && (aarch64_tune_params
.extra_tuning_flags
19506 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
19508 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
19511 /* Check if the addresses are in the form of [base+offset]. */
19512 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
19513 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
19515 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
19516 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
19519 /* Check if the bases are same. */
19520 if (!rtx_equal_p (base_1
, base_2
))
19523 /* The operands must be of the same size. */
19524 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
19525 GET_MODE_SIZE (GET_MODE (mem_2
))));
19527 offval_1
= INTVAL (offset_1
);
19528 offval_2
= INTVAL (offset_2
);
19529 /* We should only be trying this for fixed-sized modes. There is no
19530 SVE LDP/STP instruction. */
19531 msize
= GET_MODE_SIZE (mode
).to_constant ();
19532 /* Check if the offsets are consecutive. */
19533 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
19536 /* Check if the addresses are clobbered by load. */
19539 if (reg_mentioned_p (reg_1
, mem_1
))
19542 /* In increasing order, the last load can clobber the address. */
19543 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
19547 /* One of the memory accesses must be a mempair operand.
19548 If it is not the first one, they need to be swapped by the
19550 if (!aarch64_mem_pair_operand (mem_1
, GET_MODE (mem_1
))
19551 && !aarch64_mem_pair_operand (mem_2
, GET_MODE (mem_2
)))
19554 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
19555 rclass_1
= FP_REGS
;
19557 rclass_1
= GENERAL_REGS
;
19559 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
19560 rclass_2
= FP_REGS
;
19562 rclass_2
= GENERAL_REGS
;
19564 /* Check if the registers are of same class. */
19565 if (rclass_1
!= rclass_2
)
19571 /* Given OPERANDS of consecutive load/store that can be merged,
19572 swap them if they are not in ascending order. */
19574 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
19576 rtx mem_1
, mem_2
, base_1
, base_2
, offset_1
, offset_2
;
19577 HOST_WIDE_INT offval_1
, offval_2
;
19581 mem_1
= operands
[1];
19582 mem_2
= operands
[3];
19586 mem_1
= operands
[0];
19587 mem_2
= operands
[2];
19590 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
19591 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
19593 offval_1
= INTVAL (offset_1
);
19594 offval_2
= INTVAL (offset_2
);
19596 if (offval_1
> offval_2
)
19598 /* Irrespective of whether this is a load or a store,
19599 we do the same swap. */
19600 std::swap (operands
[0], operands
[2]);
19601 std::swap (operands
[1], operands
[3]);
/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
   comparison between the two.  */

static int
aarch64_host_wide_int_compare (const void *x, const void *y)
{
  return wi::cmps (* ((const HOST_WIDE_INT *) x),
		   * ((const HOST_WIDE_INT *) y));
}
19614 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
19615 other pointing to a REG rtx containing an offset, compare the offsets
19620 1 iff offset (X) > offset (Y)
19621 0 iff offset (X) == offset (Y)
19622 -1 iff offset (X) < offset (Y) */
19624 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
19626 const rtx
* operands_1
= (const rtx
*) x
;
19627 const rtx
* operands_2
= (const rtx
*) y
;
19628 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
19630 if (MEM_P (operands_1
[0]))
19631 mem_1
= operands_1
[0];
19633 mem_1
= operands_1
[1];
19635 if (MEM_P (operands_2
[0]))
19636 mem_2
= operands_2
[0];
19638 mem_2
= operands_2
[1];
19640 /* Extract the offsets. */
19641 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
19642 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
19644 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
19646 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
19649 /* Given OPERANDS of consecutive load/store, check if we can merge
19650 them into ldp/stp by adjusting the offset. LOAD is true if they
19651 are load instructions. MODE is the mode of memory operands.
19653 Given below consecutive stores:
19655 str w1, [xb, 0x100]
19656 str w1, [xb, 0x104]
19657 str w1, [xb, 0x108]
19658 str w1, [xb, 0x10c]
19660 Though the offsets are out of the range supported by stp, we can
19661 still pair them after adjusting the offset, like:
19663 add scratch, xb, 0x100
19664 stp w1, w1, [scratch]
19665 stp w1, w1, [scratch, 0x8]
   The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */
19671 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
19674 const int num_insns
= 4;
19675 enum reg_class rclass
;
19676 HOST_WIDE_INT offvals
[num_insns
], msize
;
19677 rtx mem
[num_insns
], reg
[num_insns
], base
[num_insns
], offset
[num_insns
];
19681 for (int i
= 0; i
< num_insns
; i
++)
19683 reg
[i
] = operands
[2 * i
];
19684 mem
[i
] = operands
[2 * i
+ 1];
19686 gcc_assert (REG_P (reg
[i
]));
19689 /* Do not attempt to merge the loads if the loads clobber each other. */
19690 for (int i
= 0; i
< 8; i
+= 2)
19691 for (int j
= i
+ 2; j
< 8; j
+= 2)
19692 if (reg_overlap_mentioned_p (operands
[i
], operands
[j
]))
19696 for (int i
= 0; i
< num_insns
; i
++)
19698 mem
[i
] = operands
[2 * i
];
19699 reg
[i
] = operands
[2 * i
+ 1];
19702 /* Skip if memory operand is by itself valid for ldp/stp. */
19703 if (!MEM_P (mem
[0]) || aarch64_mem_pair_operand (mem
[0], mode
))
19706 for (int i
= 0; i
< num_insns
; i
++)
19708 /* The mems cannot be volatile. */
19709 if (MEM_VOLATILE_P (mem
[i
]))
19712 /* Check if the addresses are in the form of [base+offset]. */
19713 extract_base_offset_in_addr (mem
[i
], base
+ i
, offset
+ i
);
19714 if (base
[i
] == NULL_RTX
|| offset
[i
] == NULL_RTX
)
19718 /* Check if the registers are of same class. */
19719 rclass
= REG_P (reg
[0]) && FP_REGNUM_P (REGNO (reg
[0]))
19720 ? FP_REGS
: GENERAL_REGS
;
19722 for (int i
= 1; i
< num_insns
; i
++)
19723 if (REG_P (reg
[i
]) && FP_REGNUM_P (REGNO (reg
[i
])))
19725 if (rclass
!= FP_REGS
)
19730 if (rclass
!= GENERAL_REGS
)
19734 /* Only the last register in the order in which they occur
19735 may be clobbered by the load. */
19736 if (rclass
== GENERAL_REGS
&& load
)
19737 for (int i
= 0; i
< num_insns
- 1; i
++)
19738 if (reg_mentioned_p (reg
[i
], mem
[i
]))
19741 /* Check if the bases are same. */
19742 for (int i
= 0; i
< num_insns
- 1; i
++)
19743 if (!rtx_equal_p (base
[i
], base
[i
+ 1]))
19746 for (int i
= 0; i
< num_insns
; i
++)
19747 offvals
[i
] = INTVAL (offset
[i
]);
19749 msize
= GET_MODE_SIZE (mode
);
19751 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19752 qsort (offvals
, num_insns
, sizeof (HOST_WIDE_INT
),
19753 aarch64_host_wide_int_compare
);
19755 if (!(offvals
[1] == offvals
[0] + msize
19756 && offvals
[3] == offvals
[2] + msize
))
19759 /* Check that offsets are within range of each other. The ldp/stp
19760 instructions have 7 bit immediate offsets, so use 0x80. */
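  /* For example, with 4-byte accesses (msize == 4) the first offsets of the
     two pairs must lie within 4 * 0x80 == 512 bytes of each other.  */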
19761 if (offvals
[2] - offvals
[0] >= msize
* 0x80)
19764 /* The offsets must be aligned with respect to each other. */
19765 if (offvals
[0] % msize
!= offvals
[2] % msize
)
  /* If we have SImode and slow unaligned ldp,
     check that the alignment is at least 8 bytes.  */
19771 && (aarch64_tune_params
.extra_tuning_flags
19772 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
19774 && MEM_ALIGN (mem
[0]) < 8 * BITS_PER_UNIT
)
19780 /* Given OPERANDS of consecutive load/store, this function pairs them
19781 into LDP/STP after adjusting the offset. It depends on the fact
19782 that the operands can be sorted so the offsets are correct for STP.
19783 MODE is the mode of memory operands. CODE is the rtl operator
19784 which should be applied to all memory operands, it's SIGN_EXTEND,
19785 ZERO_EXTEND or UNKNOWN. */
19788 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
19789 scalar_mode mode
, RTX_CODE code
)
19791 rtx base
, offset_1
, offset_3
, t1
, t2
;
19792 rtx mem_1
, mem_2
, mem_3
, mem_4
;
19793 rtx temp_operands
[8];
19794 HOST_WIDE_INT off_val_1
, off_val_3
, base_off
, new_off_1
, new_off_3
,
19795 stp_off_upper_limit
, stp_off_lower_limit
, msize
;
19797 /* We make changes on a copy as we may still bail out. */
19798 for (int i
= 0; i
< 8; i
++)
19799 temp_operands
[i
] = operands
[i
];
19801 /* Sort the operands. */
19802 qsort (temp_operands
, 4, 2 * sizeof (rtx
*), aarch64_ldrstr_offset_compare
);
19804 /* Copy the memory operands so that if we have to bail for some
19805 reason the original addresses are unchanged. */
19808 mem_1
= copy_rtx (temp_operands
[1]);
19809 mem_2
= copy_rtx (temp_operands
[3]);
19810 mem_3
= copy_rtx (temp_operands
[5]);
19811 mem_4
= copy_rtx (temp_operands
[7]);
19815 mem_1
= copy_rtx (temp_operands
[0]);
19816 mem_2
= copy_rtx (temp_operands
[2]);
19817 mem_3
= copy_rtx (temp_operands
[4]);
19818 mem_4
= copy_rtx (temp_operands
[6]);
19819 gcc_assert (code
== UNKNOWN
);
19822 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
19823 extract_base_offset_in_addr (mem_3
, &base
, &offset_3
);
19824 gcc_assert (base
!= NULL_RTX
&& offset_1
!= NULL_RTX
19825 && offset_3
!= NULL_RTX
);
19827 /* Adjust offset so it can fit in LDP/STP instruction. */
19828 msize
= GET_MODE_SIZE (mode
);
19829 stp_off_upper_limit
= msize
* (0x40 - 1);
19830 stp_off_lower_limit
= - msize
* 0x40;
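  /* I.e. the LDP/STP immediate is a signed 7-bit multiple of the access
     size, so offsets from the adjusted base must lie in
     [-64 * msize, 63 * msize].  */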
19832 off_val_1
= INTVAL (offset_1
);
19833 off_val_3
= INTVAL (offset_3
);
19835 /* The base offset is optimally half way between the two STP/LDP offsets. */
19837 base_off
= (off_val_1
+ off_val_3
) / 2;
  /* However, due to issues with negative LDP/STP offset generation for
     larger modes such as DF, DI and vector modes, we must not use negative
     addresses smaller than 9 signed unadjusted bits can store.  This
     provides the most range in this case.  */
19843 base_off
= off_val_1
;
19845 /* Adjust the base so that it is aligned with the addresses but still
19847 if (base_off
% msize
!= off_val_1
% msize
)
19848 /* Fix the offset, bearing in mind we want to make it bigger not
19850 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19851 else if (msize
<= 4)
19852 /* The negative range of LDP/STP is one larger than the positive range. */
19855 /* Check if base offset is too big or too small. We can attempt to resolve
19856 this issue by setting it to the maximum value and seeing if the offsets
19858 if (base_off
>= 0x1000)
19860 base_off
= 0x1000 - 1;
      /* We must still make sure that the base offset is aligned with respect
	 to the address.  But it may not be made any bigger.  */
19863 base_off
-= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19866 /* Likewise for the case where the base is too small. */
19867 if (base_off
<= -0x1000)
19869 base_off
= -0x1000 + 1;
19870 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19873 /* Offset of the first STP/LDP. */
19874 new_off_1
= off_val_1
- base_off
;
19876 /* Offset of the second STP/LDP. */
19877 new_off_3
= off_val_3
- base_off
;
19879 /* The offsets must be within the range of the LDP/STP instructions. */
19880 if (new_off_1
> stp_off_upper_limit
|| new_off_1
< stp_off_lower_limit
19881 || new_off_3
> stp_off_upper_limit
|| new_off_3
< stp_off_lower_limit
)
19884 replace_equiv_address_nv (mem_1
, plus_constant (Pmode
, operands
[8],
19886 replace_equiv_address_nv (mem_2
, plus_constant (Pmode
, operands
[8],
19887 new_off_1
+ msize
), true);
19888 replace_equiv_address_nv (mem_3
, plus_constant (Pmode
, operands
[8],
19890 replace_equiv_address_nv (mem_4
, plus_constant (Pmode
, operands
[8],
19891 new_off_3
+ msize
), true);
19893 if (!aarch64_mem_pair_operand (mem_1
, mode
)
19894 || !aarch64_mem_pair_operand (mem_3
, mode
))
19897 if (code
== ZERO_EXTEND
)
19899 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
19900 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
19901 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
19902 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
19904 else if (code
== SIGN_EXTEND
)
19906 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
19907 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
19908 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
19909 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
19914 operands
[0] = temp_operands
[0];
19915 operands
[1] = mem_1
;
19916 operands
[2] = temp_operands
[2];
19917 operands
[3] = mem_2
;
19918 operands
[4] = temp_operands
[4];
19919 operands
[5] = mem_3
;
19920 operands
[6] = temp_operands
[6];
19921 operands
[7] = mem_4
;
19925 operands
[0] = mem_1
;
19926 operands
[1] = temp_operands
[1];
19927 operands
[2] = mem_2
;
19928 operands
[3] = temp_operands
[3];
19929 operands
[4] = mem_3
;
19930 operands
[5] = temp_operands
[5];
19931 operands
[6] = mem_4
;
19932 operands
[7] = temp_operands
[7];
19935 /* Emit adjusting instruction. */
19936 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, base_off
)));
19937 /* Emit ldp/stp instructions. */
19938 t1
= gen_rtx_SET (operands
[0], operands
[1]);
19939 t2
= gen_rtx_SET (operands
[2], operands
[3]);
19940 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
19941 t1
= gen_rtx_SET (operands
[4], operands
[5]);
19942 t2
= gen_rtx_SET (operands
[6], operands
[7]);
19943 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
   it isn't worth branching around empty masked ops (including masked
   stores).  */

static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}

/* Return 1 if a pseudo register should be created and used to hold the
   GOT address for PIC code.  */

static bool
aarch64_use_pseudo_pic_reg (void)
{
  return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
}
19966 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19969 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
19971 switch (XINT (x
, 1))
19973 case UNSPEC_GOTSMALLPIC
:
19974 case UNSPEC_GOTSMALLPIC28K
:
19975 case UNSPEC_GOTTINYPIC
:
19981 return default_unspec_may_trap_p (x
, flags
);
19985 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
19986 return the log2 of that value. Otherwise return -1. */
19989 aarch64_fpconst_pow_of_2 (rtx x
)
19991 const REAL_VALUE_TYPE
*r
;
19993 if (!CONST_DOUBLE_P (x
))
19996 r
= CONST_DOUBLE_REAL_VALUE (x
);
19998 if (REAL_VALUE_NEGATIVE (*r
)
19999 || REAL_VALUE_ISNAN (*r
)
20000 || REAL_VALUE_ISINF (*r
)
20001 || !real_isinteger (r
, DFmode
))
20004 return exact_log2 (real_to_integer (r
));
/* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
   power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for
   x == (1/2^n) return n.  Otherwise return -1.  */
20012 aarch64_fpconst_pow2_recip (rtx x
)
20014 REAL_VALUE_TYPE r0
;
20016 if (!CONST_DOUBLE_P (x
))
20019 r0
= *CONST_DOUBLE_REAL_VALUE (x
);
20020 if (exact_real_inverse (DFmode
, &r0
)
20021 && !REAL_VALUE_NEGATIVE (r0
))
20023 int ret
= exact_log2 (real_to_integer (&r0
));
20024 if (ret
>= 1 && ret
<= 32)
20030 /* If X is a vector of equal CONST_DOUBLE values and that value is
20031 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
20034 aarch64_vec_fpconst_pow_of_2 (rtx x
)
20037 if (GET_CODE (x
) != CONST_VECTOR
20038 || !CONST_VECTOR_NUNITS (x
).is_constant (&nelts
))
20041 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
20044 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
20048 for (int i
= 1; i
< nelts
; i
++)
20049 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
   to float.

   __fp16 always promotes through this hook.
   _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
   through the generic excess precision logic rather than here.  */

static tree
aarch64_promoted_type (const_tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t)
      && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
    return float_type_node;

  return NULL_TREE;
}
20072 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
20075 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
20076 optimization_type opt_type
)
20081 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
20088 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
20090 static unsigned int
20091 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
20094 /* Polynomial invariant 1 == (VG / 2) - 1. */
20095 gcc_assert (i
== 1);
20098 return AARCH64_DWARF_VG
;
20101 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
20102 if MODE is HFmode, and punt to the generic implementation otherwise. */
20105 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
20107 return (mode
== HFmode
20109 : default_libgcc_floating_mode_supported_p (mode
));
20112 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
20113 if MODE is HFmode, and punt to the generic implementation otherwise. */
20116 aarch64_scalar_mode_supported_p (scalar_mode mode
)
20118 return (mode
== HFmode
20120 : default_scalar_mode_supported_p (mode
));
20123 /* Set the value of FLT_EVAL_METHOD.
20124 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
20126 0: evaluate all operations and constants, whose semantic type has at
20127 most the range and precision of type float, to the range and
20128 precision of float; evaluate all other operations and constants to
20129 the range and precision of the semantic type;
20131 N, where _FloatN is a supported interchange floating type
20132 evaluate all operations and constants, whose semantic type has at
20133 most the range and precision of _FloatN type, to the range and
20134 precision of the _FloatN type; evaluate all other operations and
20135 constants to the range and precision of the semantic type;
20137 If we have the ARMv8.2-A extensions then we support _Float16 in native
20138 precision, so we should set this to 16. Otherwise, we support the type,
   but want to evaluate expressions in float precision, so set this to
   0.  */
20142 static enum flt_eval_method
20143 aarch64_excess_precision (enum excess_precision_type type
)
20147 case EXCESS_PRECISION_TYPE_FAST
:
20148 case EXCESS_PRECISION_TYPE_STANDARD
:
20149 /* We can calculate either in 16-bit range and precision or
20150 32-bit range and precision. Make that decision based on whether
20151 we have native support for the ARMv8.2-A 16-bit floating-point
20152 instructions or not. */
20153 return (TARGET_FP_F16INST
20154 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
20155 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
20156 case EXCESS_PRECISION_TYPE_IMPLICIT
:
20157 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
20159 gcc_unreachable ();
20161 return FLT_EVAL_METHOD_UNPREDICTABLE
;
20164 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
20165 scheduled for speculative execution. Reject the long-running division
20166 and square-root instructions. */
20169 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
20171 switch (get_attr_type (insn
))
20179 case TYPE_NEON_FP_SQRT_S
:
20180 case TYPE_NEON_FP_SQRT_D
:
20181 case TYPE_NEON_FP_SQRT_S_Q
:
20182 case TYPE_NEON_FP_SQRT_D_Q
:
20183 case TYPE_NEON_FP_DIV_S
:
20184 case TYPE_NEON_FP_DIV_D
:
20185 case TYPE_NEON_FP_DIV_S_Q
:
20186 case TYPE_NEON_FP_DIV_D_Q
:
20193 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
20196 aarch64_compute_pressure_classes (reg_class
*classes
)
20199 classes
[i
++] = GENERAL_REGS
;
20200 classes
[i
++] = FP_REGS
;
20201 /* PR_REGS isn't a useful pressure class because many predicate pseudo
20202 registers need to go in PR_LO_REGS at some point during their
20203 lifetime. Splitting it into two halves has the effect of making
20204 all predicates count against PR_LO_REGS, so that we try whenever
20205 possible to restrict the number of live predicates to 8. This
20206 greatly reduces the amount of spilling in certain loops. */
20207 classes
[i
++] = PR_LO_REGS
;
20208 classes
[i
++] = PR_HI_REGS
;
20212 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
20215 aarch64_can_change_mode_class (machine_mode from
,
20216 machine_mode to
, reg_class_t
)
20218 if (BYTES_BIG_ENDIAN
)
20220 bool from_sve_p
= aarch64_sve_data_mode_p (from
);
20221 bool to_sve_p
= aarch64_sve_data_mode_p (to
);
20223 /* Don't allow changes between SVE data modes and non-SVE modes.
20224 See the comment at the head of aarch64-sve.md for details. */
20225 if (from_sve_p
!= to_sve_p
)
20228 /* Don't allow changes in element size: lane 0 of the new vector
20229 would not then be lane 0 of the old vector. See the comment
20230 above aarch64_maybe_expand_sve_subreg_move for a more detailed
20233 In the worst case, this forces a register to be spilled in
20234 one mode and reloaded in the other, which handles the
20235 endianness correctly. */
20236 if (from_sve_p
&& GET_MODE_UNIT_SIZE (from
) != GET_MODE_UNIT_SIZE (to
))
20242 /* Implement TARGET_EARLY_REMAT_MODES. */
20245 aarch64_select_early_remat_modes (sbitmap modes
)
20247 /* SVE values are not normally live across a call, so it should be
20248 worth doing early rematerialization even in VL-specific mode. */
20249 for (int i
= 0; i
< NUM_MACHINE_MODES
; ++i
)
20250 if (aarch64_sve_mode_p ((machine_mode
) i
))
20251 bitmap_set_bit (modes
, i
);
20254 /* Override the default target speculation_safe_value. */
20256 aarch64_speculation_safe_value (machine_mode mode
,
20257 rtx result
, rtx val
, rtx failval
)
  /* Maybe we should warn if falling back to hard barriers.  They are
     likely to be noticeably more expensive than the alternative below.  */
20261 if (!aarch64_track_speculation
)
20262 return default_speculation_safe_value (mode
, result
, val
, failval
);
20265 val
= copy_to_mode_reg (mode
, val
);
20267 if (!aarch64_reg_or_zero (failval
, mode
))
20268 failval
= copy_to_mode_reg (mode
, failval
);
20270 emit_insn (gen_despeculate_copy (mode
, result
, val
, failval
));
20274 /* Implement TARGET_ESTIMATED_POLY_VALUE.
20275 Look into the tuning structure for an estimate.
20276 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
20277 Advanced SIMD 128 bits. */
20279 static HOST_WIDE_INT
20280 aarch64_estimated_poly_value (poly_int64 val
)
20282 enum aarch64_sve_vector_bits_enum width_source
20283 = aarch64_tune_params
.sve_width
;
20285 /* If we still don't have an estimate, use the default. */
20286 if (width_source
== SVE_SCALABLE
)
20287 return default_estimated_poly_value (val
);
20289 HOST_WIDE_INT over_128
= width_source
- 128;
20290 return val
.coeffs
[0] + val
.coeffs
[1] * over_128
/ 128;
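  /* For example, with -msve-vector-bits=256 we have over_128 == 128, so a
     poly_int size of 16 + 16x is estimated as 16 + 16 * 128 / 128 == 32
     bytes.  */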
20294 /* Return true for types that could be supported as SIMD return or
20298 supported_simd_type (tree t
)
20300 if (SCALAR_FLOAT_TYPE_P (t
) || INTEGRAL_TYPE_P (t
) || POINTER_TYPE_P (t
))
20302 HOST_WIDE_INT s
= tree_to_shwi (TYPE_SIZE_UNIT (t
));
20303 return s
== 1 || s
== 2 || s
== 4 || s
== 8;
20308 /* Return true for types that currently are supported as SIMD return
20309 or argument types. */
20312 currently_supported_simd_type (tree t
, tree b
)
20314 if (COMPLEX_FLOAT_TYPE_P (t
))
20317 if (TYPE_SIZE (t
) != TYPE_SIZE (b
))
20320 return supported_simd_type (t
);
20323 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
20326 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node
*node
,
20327 struct cgraph_simd_clone
*clonei
,
20328 tree base_type
, int num
)
20330 tree t
, ret_type
, arg_type
;
20331 unsigned int elt_bits
, vec_bits
, count
;
20336 if (clonei
->simdlen
20337 && (clonei
->simdlen
< 2
20338 || clonei
->simdlen
> 1024
20339 || (clonei
->simdlen
& (clonei
->simdlen
- 1)) != 0))
20341 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20342 "unsupported simdlen %d", clonei
->simdlen
);
20346 ret_type
= TREE_TYPE (TREE_TYPE (node
->decl
));
20347 if (TREE_CODE (ret_type
) != VOID_TYPE
20348 && !currently_supported_simd_type (ret_type
, base_type
))
20350 if (TYPE_SIZE (ret_type
) != TYPE_SIZE (base_type
))
20351 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20352 "GCC does not currently support mixed size types "
20353 "for %<simd%> functions");
20354 else if (supported_simd_type (ret_type
))
20355 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20356 "GCC does not currently support return type %qT "
20357 "for %<simd%> functions", ret_type
);
20359 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20360 "unsupported return type %qT for %<simd%> functions",
20365 for (t
= DECL_ARGUMENTS (node
->decl
); t
; t
= DECL_CHAIN (t
))
20367 arg_type
= TREE_TYPE (t
);
20369 if (!currently_supported_simd_type (arg_type
, base_type
))
20371 if (TYPE_SIZE (arg_type
) != TYPE_SIZE (base_type
))
20372 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20373 "GCC does not currently support mixed size types "
20374 "for %<simd%> functions");
20376 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20377 "GCC does not currently support argument type %qT "
20378 "for %<simd%> functions", arg_type
);
20383 clonei
->vecsize_mangle
= 'n';
20384 clonei
->mask_mode
= VOIDmode
;
20385 elt_bits
= GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type
));
20386 if (clonei
->simdlen
== 0)
20389 vec_bits
= (num
== 0 ? 64 : 128);
20390 clonei
->simdlen
= vec_bits
/ elt_bits
;
20395 vec_bits
= clonei
->simdlen
* elt_bits
;
20396 if (vec_bits
!= 64 && vec_bits
!= 128)
20398 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20399 "GCC does not currently support simdlen %d for type %qT",
20400 clonei
->simdlen
, base_type
);
20404 clonei
->vecsize_int
= vec_bits
;
20405 clonei
->vecsize_float
= vec_bits
;
20409 /* Implement TARGET_SIMD_CLONE_ADJUST. */
20412 aarch64_simd_clone_adjust (struct cgraph_node
*node
)
20414 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
20415 use the correct ABI. */
20417 tree t
= TREE_TYPE (node
->decl
);
20418 TYPE_ATTRIBUTES (t
) = make_attribute ("aarch64_vector_pcs", "default",
20419 TYPE_ATTRIBUTES (t
));
20422 /* Implement TARGET_SIMD_CLONE_USABLE. */
20425 aarch64_simd_clone_usable (struct cgraph_node
*node
)
20427 switch (node
->simdclone
->vecsize_mangle
)
20434 gcc_unreachable ();
20438 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
20441 aarch64_comp_type_attributes (const_tree type1
, const_tree type2
)
20443 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1
))
20444 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2
)))

/* Implement TARGET_GET_MULTILIB_ABI_NAME */

static const char *
aarch64_get_multilib_abi_name (void)
{
  if (TARGET_BIG_END)
    return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
  return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
}

/* Implement TARGET_STACK_PROTECT_GUARD.  In the case of a
   global-variable-based guard, use the default; otherwise
   return a null tree.  */
static tree
aarch64_stack_protect_guard (void)
{
  if (aarch64_stack_protector_guard == SSP_GLOBAL)
    return default_stack_protect_guard ();

  return NULL_TREE;
}
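
/* Usage sketch (a behavioural summary, not code from this file): with
   the default -mstack-protector-guard=global the hook returns the
   __stack_chk_guard declaration created by default_stack_protect_guard.
   With options such as

	 -mstack-protector-guard=sysreg
	 -mstack-protector-guard-reg=sp_el0
	 -mstack-protector-guard-offset=1024

   it returns NULL_TREE instead, and the guard value is loaded from the
   named system register plus offset by the target-specific expanders.  */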

/* Implement TARGET_ASM_FILE_END for AArch64.  This adds the AArch64 GNU NOTE
   section at the end if needed.  */
#define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
void
aarch64_file_end_indicate_exec_stack ()
{
  file_end_indicate_exec_stack ();

  unsigned feature_1_and = 0;
  if (aarch64_bti_enabled ())
    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;

  if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;

  if (feature_1_and)
    {
      /* Generate .note.gnu.property section.  */
      switch_to_section (get_section (".note.gnu.property",
				      SECTION_NOTYPE, NULL));

      /* PT_NOTE header: namesz, descsz, type.
	 namesz = 4 ("GNU\0")
	 descsz = 16 (Size of the program property array)
		  [(12 + padding) * Number of array elements]
	 type   = 5 (NT_GNU_PROPERTY_TYPE_0).  */
      assemble_align (POINTER_SIZE);
      assemble_integer (GEN_INT (4), 4, 32, 1);
      assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
      assemble_integer (GEN_INT (5), 4, 32, 1);

      /* PT_NOTE name.  */
      assemble_string ("GNU", 4);

      /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
	 type   = GNU_PROPERTY_AARCH64_FEATURE_1_AND
	 datasz = 4
	 data   = feature_1_and.  */
      assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
      assemble_integer (GEN_INT (4), 4, 32, 1);
      assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);

      /* Pad the size of the note to the required alignment.  */
      assemble_align (POINTER_SIZE);
    }
}
#undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
#undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
#undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
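
/* For reference, when both BTI and PAC are in use (e.g. via
   -mbranch-protection=standard) the code above emits approximately the
   following; the exact directives are a sketch and depend on
   POINTER_SIZE and the assembler:

	 .section .note.gnu.property,"a"
	 .align	3
	 .word	4		// namesz ("GNU\0")
	 .word	16		// descsz
	 .word	5		// NT_GNU_PROPERTY_TYPE_0
	 .string "GNU"
	 .word	0xc0000000	// GNU_PROPERTY_AARCH64_FEATURE_1_AND
	 .word	4		// datasz
	 .word	3		// BTI | PAC
	 .align	3		// pad to POINTER_SIZE  */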

/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */

#undef TARGET_STACK_PROTECT_GUARD
#define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_VARIABLE_ISSUE
#define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095
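
/* Illustrative (assumed) effect of these limits: with section anchors
   enabled, a global within this window of an anchor can be accessed
   with a single register-plus-immediate load once the anchor address
   is available, e.g.

	 adrp	x0, .LANCHOR0
	 add	x0, x0, :lo12:.LANCHOR0
	 ldr	w1, [x0, 2048]

   whereas objects outside the window need the offset materialised
   separately.  */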

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_INSN_CALLEE_ABI
#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#undef TARGET_FNTYPE_ABI
#define TARGET_FNTYPE_ABI aarch64_fntype_abi

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"