1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2020 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
24 #define INCLUDE_STRING
26 #include "coretypes.h"
37 #include "stringpool.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
56 #include "langhooks.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
76 #include "function-abi.h"
78 /* This file should be included last. */
79 #include "target-def.h"
81 /* Defined for convenience. */
/* Size of a pointer in bytes (POINTER_SIZE is measured in bits).  */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
87 enum insn_type
/* MOV: move the immediate value directly into each element.  */
{ MOV
/* MVN: NOTE(review): presumably move the bitwise inverse of the
   immediate, per the mnemonic — confirm against users of this enum.  */
, MVN
/* INDEX: a linear series BASE + I * STEP; see the (base, step)
   constructor below.  */
, INDEX
/* PTRUE: an SVE predicate constant described by an aarch64_svpattern;
   see the (mode, pattern) constructor below.  */
, PTRUE
};
/* The kind of shift (if any) applied to a MOV/MVN immediate.  */
88 enum modifier_type
/* LSL: logical shift left; (LSL, 0) means no shift at all.  */
{ LSL
/* MSL: NOTE(review): presumably the "shifting ones" SIMD immediate
   modifier — TODO confirm at the use sites.  */
, MSL
};
/* Default constructor: leaves all fields uninitialized.  */
90 simd_immediate_info () {}
/* Floating-point immediate: element mode and value.  */
91 simd_immediate_info (scalar_float_mode
, rtx
);
/* Integer immediate: element mode, value, and optional insn/modifier
   (defaulting to a plain unshifted MOV).  */
92 simd_immediate_info (scalar_int_mode
, unsigned HOST_WIDE_INT
,
93 insn_type
= MOV
, modifier_type
= LSL
,
/* Index immediate: element mode, first element, and per-element step.  */
95 simd_immediate_info (scalar_mode
, rtx
, rtx
);
/* Predicate immediate: element mode and SVE PTRUE pattern.  */
96 simd_immediate_info (scalar_int_mode
, aarch64_svpattern
);
98 /* The mode of the elements. */
101 /* The instruction to use to move the immediate into a vector. */
106 /* For MOV and MVN. */
109 /* The value of each element. */
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier
;
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
/* For PTRUE: the predicate pattern to use.  */
127 aarch64_svpattern pattern
;
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in
, rtx value_in
)
135 : elt_mode (elt_mode_in
), insn (MOV
)
/* A floating-point immediate is always a plain MOV with no shift.  */
137 u
.mov
.value
= value_in
;
138 u
.mov
.modifier
= LSL
;
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
147 unsigned HOST_WIDE_INT value_in
,
148 insn_type insn_in
, modifier_type modifier_in
,
149 unsigned int shift_in
)
150 : elt_mode (elt_mode_in
), insn (insn_in
)
/* Canonicalize VALUE_IN as a CONST_INT of mode ELT_MODE_IN.  */
152 u
.mov
.value
= gen_int_mode (value_in
, elt_mode_in
);
153 u
.mov
.modifier
= modifier_in
;
154 u
.mov
.shift
= shift_in
;
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in
, rtx base_in
, rtx step_in
)
161 : elt_mode (elt_mode_in
), insn (INDEX
)
/* Record the first element and the per-element increment.  */
163 u
.index
.base
= base_in
;
164 u
.index
.step
= step_in
;
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
171 aarch64_svpattern pattern_in
)
172 : elt_mode (elt_mode_in
), insn (PTRUE
)
/* Record the SVE predicate pattern for the PTRUE.  */
174 u
.pattern
= pattern_in
;
177 /* The current code model. */
178 enum aarch64_code_model aarch64_cmodel
;
180 /* The number of 64-bit elements in an SVE vector. */
181 poly_uint16 aarch64_sve_vg
;
/* AArch64 supports thread-local storage.  */
184 #undef TARGET_HAVE_TLS
185 #define TARGET_HAVE_TLS 1
/* Forward declarations of static helpers defined later in this file.  */
188 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
189 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
191 machine_mode
*, int *,
193 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
194 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
195 static void aarch64_override_options_after_change (void);
196 static bool aarch64_vector_mode_supported_p (machine_mode
);
197 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
198 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
202 static machine_mode
aarch64_simd_container_mode (scalar_mode
, poly_int64
);
203 static bool aarch64_print_address_internal (FILE*, machine_mode
, rtx
,
204 aarch64_addr_query_type
);
205 static HOST_WIDE_INT
aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
);
207 /* Major revision number of the ARM Architecture implemented by the target. */
208 unsigned aarch64_architecture_version
;
210 /* The processor for which instructions should be scheduled. */
211 enum aarch64_processor aarch64_tune
= cortexa53
;
213 /* Mask to specify which instruction scheduling options should be used. */
214 uint64_t aarch64_tune_flags
= 0;
216 /* Global flag for PC relative loads. */
217 bool aarch64_pcrelative_literal_loads
;
219 /* Global flag for whether frame pointer is enabled. */
220 bool aarch64_use_frame_pointer
;
/* NOTE(review): presumably bounds the accepted -mbranch-protection=
   string length — confirm at the parsing site.  */
222 #define BRANCH_PROTECT_STR_MAX 255
/* The -mbranch-protection= string accepted so far, or NULL.  */
223 char *accepted_branch_protection_string
= NULL
;
225 static enum aarch64_parse_opt_result
226 aarch64_parse_branch_protection (const char*, char**);
228 /* Support for command line parsing of boolean flags in the tuning
230 struct aarch64_flag_desc
/* Table mapping fusion-pair names to their AARCH64_FUSE_* flags.
   Entries come from aarch64-fusion-pairs.def, bracketed by "none"
   and "all", and the table ends with a NULL sentinel.  */
236 #define AARCH64_FUSION_PAIR(name, internal_name) \
237 { name, AARCH64_FUSE_##internal_name },
238 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
240 { "none", AARCH64_FUSE_NOTHING
},
241 #include "aarch64-fusion-pairs.def"
242 { "all", AARCH64_FUSE_ALL
},
243 { NULL
, AARCH64_FUSE_NOTHING
}
/* Table mapping extra-tuning-flag names to AARCH64_EXTRA_TUNE_* flags,
   built the same way from aarch64-tuning-flags.def, NULL-terminated.  */
246 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
247 { name, AARCH64_EXTRA_TUNE_##internal_name },
248 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
250 { "none", AARCH64_EXTRA_TUNE_NONE
},
251 #include "aarch64-tuning-flags.def"
252 { "all", AARCH64_EXTRA_TUNE_ALL
},
253 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
256 /* Tuning parameters. */
/* Address-mode costs used when no CPU-specific table applies.  */
258 static const struct cpu_addrcost_table generic_addrcost_table
=
268 0, /* register_offset */
269 0, /* register_sextend */
270 0, /* register_zextend */
/* Address-mode costs for Exynos M1.  */
274 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
284 1, /* register_offset */
285 1, /* register_sextend */
286 2, /* register_zextend */
/* Address-mode costs for X-Gene 1.  */
290 static const struct cpu_addrcost_table xgene1_addrcost_table
=
300 0, /* register_offset */
301 1, /* register_sextend */
302 1, /* register_zextend */
/* Address-mode costs for ThunderX2 T99.  */
306 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
316 2, /* register_offset */
317 3, /* register_sextend */
318 3, /* register_zextend */
/* Address-mode costs for TSV110.  */
322 static const struct cpu_addrcost_table tsv110_addrcost_table
=
332 0, /* register_offset */
333 1, /* register_sextend */
334 1, /* register_zextend */
/* Address-mode costs for QDF24xx.  */
338 static const struct cpu_addrcost_table qdf24xx_addrcost_table
=
348 3, /* register_offset */
349 3, /* register_sextend */
350 3, /* register_zextend */
/* Register-move costs used when no CPU-specific table applies.  */
354 static const struct cpu_regmove_cost generic_regmove_cost
=
357 /* Avoid the use of slow int<->fp moves for spilling by setting
358 their cost higher than memmov_cost. */
/* Register-move costs for Cortex-A57.  */
364 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
367 /* Avoid the use of slow int<->fp moves for spilling by setting
368 their cost higher than memmov_cost. */
/* Register-move costs for Cortex-A53.  */
374 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
377 /* Avoid the use of slow int<->fp moves for spilling by setting
378 their cost higher than memmov_cost. */
/* Register-move costs for Exynos M1.  */
384 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost (actual, 4 and 9). */
/* Register-move costs for ThunderX.  */
394 static const struct cpu_regmove_cost thunderx_regmove_cost
=
/* Register-move costs for X-Gene 1.  */
402 static const struct cpu_regmove_cost xgene1_regmove_cost
=
405 /* Avoid the use of slow int<->fp moves for spilling by setting
406 their cost higher than memmov_cost. */
/* Register-move costs for QDF24xx.  */
412 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
415 /* Avoid the use of int<->fp moves for spilling. */
/* Register-move costs for ThunderX2 T99.  */
421 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
424 /* Avoid the use of int<->fp moves for spilling. */
/* Register-move costs for TSV110.  */
430 static const struct cpu_regmove_cost tsv110_regmove_cost
=
433 /* Avoid the use of slow int<->fp moves for spilling by setting
434 their cost higher than memmov_cost. */
440 /* Generic costs for vector insn classes. */
441 static const struct cpu_vector_cost generic_vector_cost
=
443 1, /* scalar_int_stmt_cost */
444 1, /* scalar_fp_stmt_cost */
445 1, /* scalar_load_cost */
446 1, /* scalar_store_cost */
447 1, /* vec_int_stmt_cost */
448 1, /* vec_fp_stmt_cost */
449 2, /* vec_permute_cost */
450 2, /* vec_to_scalar_cost */
451 1, /* scalar_to_vec_cost */
452 1, /* vec_align_load_cost */
453 1, /* vec_unalign_load_cost */
454 1, /* vec_unalign_store_cost */
455 1, /* vec_store_cost */
456 3, /* cond_taken_branch_cost */
457 1 /* cond_not_taken_branch_cost */
460 /* QDF24XX costs for vector insn classes. */
461 static const struct cpu_vector_cost qdf24xx_vector_cost
=
463 1, /* scalar_int_stmt_cost */
464 1, /* scalar_fp_stmt_cost */
465 1, /* scalar_load_cost */
466 1, /* scalar_store_cost */
467 1, /* vec_int_stmt_cost */
468 3, /* vec_fp_stmt_cost */
469 2, /* vec_permute_cost */
470 1, /* vec_to_scalar_cost */
471 1, /* scalar_to_vec_cost */
472 1, /* vec_align_load_cost */
473 1, /* vec_unalign_load_cost */
474 1, /* vec_unalign_store_cost */
475 1, /* vec_store_cost */
476 3, /* cond_taken_branch_cost */
477 1 /* cond_not_taken_branch_cost */
480 /* ThunderX costs for vector insn classes. */
481 static const struct cpu_vector_cost thunderx_vector_cost
=
483 1, /* scalar_int_stmt_cost */
484 1, /* scalar_fp_stmt_cost */
485 3, /* scalar_load_cost */
486 1, /* scalar_store_cost */
487 4, /* vec_int_stmt_cost */
488 1, /* vec_fp_stmt_cost */
489 4, /* vec_permute_cost */
490 2, /* vec_to_scalar_cost */
491 2, /* scalar_to_vec_cost */
492 3, /* vec_align_load_cost */
493 5, /* vec_unalign_load_cost */
494 5, /* vec_unalign_store_cost */
495 1, /* vec_store_cost */
496 3, /* cond_taken_branch_cost */
497 3 /* cond_not_taken_branch_cost */
/* TSV110 costs for vector insn classes.  */
500 static const struct cpu_vector_cost tsv110_vector_cost
=
502 1, /* scalar_int_stmt_cost */
503 1, /* scalar_fp_stmt_cost */
504 5, /* scalar_load_cost */
505 1, /* scalar_store_cost */
506 2, /* vec_int_stmt_cost */
507 2, /* vec_fp_stmt_cost */
508 2, /* vec_permute_cost */
509 3, /* vec_to_scalar_cost */
510 2, /* scalar_to_vec_cost */
511 5, /* vec_align_load_cost */
512 5, /* vec_unalign_load_cost */
513 1, /* vec_unalign_store_cost */
514 1, /* vec_store_cost */
515 1, /* cond_taken_branch_cost */
516 1 /* cond_not_taken_branch_cost */
519 /* Cortex-A57 costs for vector insn classes. */
520 static const struct cpu_vector_cost cortexa57_vector_cost
=
522 1, /* scalar_int_stmt_cost */
523 1, /* scalar_fp_stmt_cost */
524 4, /* scalar_load_cost */
525 1, /* scalar_store_cost */
526 2, /* vec_int_stmt_cost */
527 2, /* vec_fp_stmt_cost */
528 3, /* vec_permute_cost */
529 8, /* vec_to_scalar_cost */
530 8, /* scalar_to_vec_cost */
531 4, /* vec_align_load_cost */
532 4, /* vec_unalign_load_cost */
533 1, /* vec_unalign_store_cost */
534 1, /* vec_store_cost */
535 1, /* cond_taken_branch_cost */
536 1 /* cond_not_taken_branch_cost */
/* Exynos M1 costs for vector insn classes.  */
539 static const struct cpu_vector_cost exynosm1_vector_cost
=
541 1, /* scalar_int_stmt_cost */
542 1, /* scalar_fp_stmt_cost */
543 5, /* scalar_load_cost */
544 1, /* scalar_store_cost */
545 3, /* vec_int_stmt_cost */
546 3, /* vec_fp_stmt_cost */
547 3, /* vec_permute_cost */
548 3, /* vec_to_scalar_cost */
549 3, /* scalar_to_vec_cost */
550 5, /* vec_align_load_cost */
551 5, /* vec_unalign_load_cost */
552 1, /* vec_unalign_store_cost */
553 1, /* vec_store_cost */
554 1, /* cond_taken_branch_cost */
555 1 /* cond_not_taken_branch_cost */
558 /* X-Gene 1 costs for vector insn classes. */
559 static const struct cpu_vector_cost xgene1_vector_cost
=
561 1, /* scalar_int_stmt_cost */
562 1, /* scalar_fp_stmt_cost */
563 5, /* scalar_load_cost */
564 1, /* scalar_store_cost */
565 2, /* vec_int_stmt_cost */
566 2, /* vec_fp_stmt_cost */
567 2, /* vec_permute_cost */
568 4, /* vec_to_scalar_cost */
569 4, /* scalar_to_vec_cost */
570 10, /* vec_align_load_cost */
571 10, /* vec_unalign_load_cost */
572 2, /* vec_unalign_store_cost */
573 2, /* vec_store_cost */
574 2, /* cond_taken_branch_cost */
575 1 /* cond_not_taken_branch_cost */
578 /* Costs for vector insn classes for Vulcan. */
579 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
581 1, /* scalar_int_stmt_cost */
582 6, /* scalar_fp_stmt_cost */
583 4, /* scalar_load_cost */
584 1, /* scalar_store_cost */
585 5, /* vec_int_stmt_cost */
586 6, /* vec_fp_stmt_cost */
587 10, /* vec_permute_cost */
588 6, /* vec_to_scalar_cost */
589 5, /* scalar_to_vec_cost */
590 8, /* vec_align_load_cost */
591 8, /* vec_unalign_load_cost */
592 4, /* vec_unalign_store_cost */
593 4, /* vec_store_cost */
594 2, /* cond_taken_branch_cost */
595 1 /* cond_not_taken_branch_cost */
598 /* Generic costs for branch instructions. */
599 static const struct cpu_branch_cost generic_branch_cost
=
601 1, /* Predictable. */
602 3 /* Unpredictable. */
605 /* Generic approximation modes. */
606 static const cpu_approx_modes generic_approx_modes
=
608 AARCH64_APPROX_NONE
, /* division */
609 AARCH64_APPROX_NONE
, /* sqrt */
610 AARCH64_APPROX_NONE
/* recip_sqrt */
613 /* Approximation modes for Exynos M1. */
614 static const cpu_approx_modes exynosm1_approx_modes
=
616 AARCH64_APPROX_NONE
, /* division */
617 AARCH64_APPROX_ALL
, /* sqrt */
618 AARCH64_APPROX_ALL
/* recip_sqrt */
621 /* Approximation modes for X-Gene 1. */
622 static const cpu_approx_modes xgene1_approx_modes
=
624 AARCH64_APPROX_NONE
, /* division */
625 AARCH64_APPROX_NONE
, /* sqrt */
626 AARCH64_APPROX_ALL
/* recip_sqrt */
629 /* Generic prefetch settings (which disable prefetch). */
/* In these tables a value of -1 leaves the corresponding parameter
   unset (see the generic table above, which disables prefetching).  */
630 static const cpu_prefetch_tune generic_prefetch_tune
=
633 -1, /* l1_cache_size */
634 -1, /* l1_cache_line_size */
635 -1, /* l2_cache_size */
636 true, /* prefetch_dynamic_strides */
637 -1, /* minimum_stride */
638 -1 /* default_opt_level */
/* Prefetch settings for Exynos M1.  */
641 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
644 -1, /* l1_cache_size */
645 64, /* l1_cache_line_size */
646 -1, /* l2_cache_size */
647 true, /* prefetch_dynamic_strides */
648 -1, /* minimum_stride */
649 -1 /* default_opt_level */
/* Prefetch settings for QDF24xx.  */
652 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
655 32, /* l1_cache_size */
656 64, /* l1_cache_line_size */
657 512, /* l2_cache_size */
658 false, /* prefetch_dynamic_strides */
659 2048, /* minimum_stride */
660 3 /* default_opt_level */
/* Prefetch settings for ThunderX T88.  */
663 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
666 32, /* l1_cache_size */
667 128, /* l1_cache_line_size */
668 16*1024, /* l2_cache_size */
669 true, /* prefetch_dynamic_strides */
670 -1, /* minimum_stride */
671 3 /* default_opt_level */
/* Prefetch settings for ThunderX.  */
674 static const cpu_prefetch_tune thunderx_prefetch_tune
=
677 32, /* l1_cache_size */
678 128, /* l1_cache_line_size */
679 -1, /* l2_cache_size */
680 true, /* prefetch_dynamic_strides */
681 -1, /* minimum_stride */
682 -1 /* default_opt_level */
/* Prefetch settings for ThunderX2 T99.  */
685 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
688 32, /* l1_cache_size */
689 64, /* l1_cache_line_size */
690 256, /* l2_cache_size */
691 true, /* prefetch_dynamic_strides */
692 -1, /* minimum_stride */
693 -1 /* default_opt_level */
/* Prefetch settings for TSV110.  */
696 static const cpu_prefetch_tune tsv110_prefetch_tune
=
699 64, /* l1_cache_size */
700 64, /* l1_cache_line_size */
701 512, /* l2_cache_size */
702 true, /* prefetch_dynamic_strides */
703 -1, /* minimum_stride */
704 -1 /* default_opt_level */
/* Prefetch settings for X-Gene 1.  */
707 static const cpu_prefetch_tune xgene1_prefetch_tune
=
710 32, /* l1_cache_size */
711 64, /* l1_cache_line_size */
712 256, /* l2_cache_size */
713 true, /* prefetch_dynamic_strides */
714 -1, /* minimum_stride */
715 -1 /* default_opt_level */
/* Default tuning, used when no CPU-specific tuning is selected.  */
718 static const struct tune_params generic_tunings
=
720 &cortexa57_extra_costs
,
721 &generic_addrcost_table
,
722 &generic_regmove_cost
,
723 &generic_vector_cost
,
724 &generic_branch_cost
,
725 &generic_approx_modes
,
726 SVE_NOT_IMPLEMENTED
, /* sve_width */
729 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
730 "16:12", /* function_align. */
731 "4", /* jump_align. */
732 "8", /* loop_align. */
733 2, /* int_reassoc_width. */
734 4, /* fp_reassoc_width. */
735 1, /* vec_reassoc_width. */
736 2, /* min_div_recip_mul_sf. */
737 2, /* min_div_recip_mul_df. */
738 0, /* max_case_values. */
739 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
741 &generic_prefetch_tune
/* Tuning for Cortex-A35.  */
744 static const struct tune_params cortexa35_tunings
=
746 &cortexa53_extra_costs
,
747 &generic_addrcost_table
,
748 &cortexa53_regmove_cost
,
749 &generic_vector_cost
,
750 &generic_branch_cost
,
751 &generic_approx_modes
,
752 SVE_NOT_IMPLEMENTED
, /* sve_width */
755 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
756 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
757 "16", /* function_align. */
758 "4", /* jump_align. */
759 "8", /* loop_align. */
760 2, /* int_reassoc_width. */
761 4, /* fp_reassoc_width. */
762 1, /* vec_reassoc_width. */
763 2, /* min_div_recip_mul_sf. */
764 2, /* min_div_recip_mul_df. */
765 0, /* max_case_values. */
766 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
767 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
768 &generic_prefetch_tune
/* Tuning for Cortex-A53.  */
771 static const struct tune_params cortexa53_tunings
=
773 &cortexa53_extra_costs
,
774 &generic_addrcost_table
,
775 &cortexa53_regmove_cost
,
776 &generic_vector_cost
,
777 &generic_branch_cost
,
778 &generic_approx_modes
,
779 SVE_NOT_IMPLEMENTED
, /* sve_width */
782 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
783 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
784 "16", /* function_align. */
785 "4", /* jump_align. */
786 "8", /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
795 &generic_prefetch_tune
/* Tuning for Cortex-A57.  */
798 static const struct tune_params cortexa57_tunings
=
800 &cortexa57_extra_costs
,
801 &generic_addrcost_table
,
802 &cortexa57_regmove_cost
,
803 &cortexa57_vector_cost
,
804 &generic_branch_cost
,
805 &generic_approx_modes
,
806 SVE_NOT_IMPLEMENTED
, /* sve_width */
809 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
811 "16", /* function_align. */
812 "4", /* jump_align. */
813 "8", /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
822 &generic_prefetch_tune
/* Tuning for Cortex-A72.  */
825 static const struct tune_params cortexa72_tunings
=
827 &cortexa57_extra_costs
,
828 &generic_addrcost_table
,
829 &cortexa57_regmove_cost
,
830 &cortexa57_vector_cost
,
831 &generic_branch_cost
,
832 &generic_approx_modes
,
833 SVE_NOT_IMPLEMENTED
, /* sve_width */
836 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
837 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
838 "16", /* function_align. */
839 "4", /* jump_align. */
840 "8", /* loop_align. */
841 2, /* int_reassoc_width. */
842 4, /* fp_reassoc_width. */
843 1, /* vec_reassoc_width. */
844 2, /* min_div_recip_mul_sf. */
845 2, /* min_div_recip_mul_df. */
846 0, /* max_case_values. */
847 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
848 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
849 &generic_prefetch_tune
/* Tuning for Cortex-A73.  */
852 static const struct tune_params cortexa73_tunings
=
854 &cortexa57_extra_costs
,
855 &generic_addrcost_table
,
856 &cortexa57_regmove_cost
,
857 &cortexa57_vector_cost
,
858 &generic_branch_cost
,
859 &generic_approx_modes
,
860 SVE_NOT_IMPLEMENTED
, /* sve_width */
861 4, /* memmov_cost. */
863 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
864 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
865 "16", /* function_align. */
866 "4", /* jump_align. */
867 "8", /* loop_align. */
868 2, /* int_reassoc_width. */
869 4, /* fp_reassoc_width. */
870 1, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
876 &generic_prefetch_tune
/* Tuning for Exynos M1.  */
881 static const struct tune_params exynosm1_tunings
=
883 &exynosm1_extra_costs
,
884 &exynosm1_addrcost_table
,
885 &exynosm1_regmove_cost
,
886 &exynosm1_vector_cost
,
887 &generic_branch_cost
,
888 &exynosm1_approx_modes
,
889 SVE_NOT_IMPLEMENTED
, /* sve_width */
892 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
893 "4", /* function_align. */
894 "4", /* jump_align. */
895 "4", /* loop_align. */
896 2, /* int_reassoc_width. */
897 4, /* fp_reassoc_width. */
898 1, /* vec_reassoc_width. */
899 2, /* min_div_recip_mul_sf. */
900 2, /* min_div_recip_mul_df. */
901 48, /* max_case_values. */
902 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
903 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
904 &exynosm1_prefetch_tune
/* Tuning for ThunderX T88.  */
907 static const struct tune_params thunderxt88_tunings
=
909 &thunderx_extra_costs
,
910 &generic_addrcost_table
,
911 &thunderx_regmove_cost
,
912 &thunderx_vector_cost
,
913 &generic_branch_cost
,
914 &generic_approx_modes
,
915 SVE_NOT_IMPLEMENTED
, /* sve_width */
918 AARCH64_FUSE_ALU_BRANCH
, /* fusible_ops */
919 "8", /* function_align. */
920 "8", /* jump_align. */
921 "8", /* loop_align. */
922 2, /* int_reassoc_width. */
923 4, /* fp_reassoc_width. */
924 1, /* vec_reassoc_width. */
925 2, /* min_div_recip_mul_sf. */
926 2, /* min_div_recip_mul_df. */
927 0, /* max_case_values. */
928 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
929 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
930 &thunderxt88_prefetch_tune
/* Tuning for ThunderX.  */
933 static const struct tune_params thunderx_tunings
=
935 &thunderx_extra_costs
,
936 &generic_addrcost_table
,
937 &thunderx_regmove_cost
,
938 &thunderx_vector_cost
,
939 &generic_branch_cost
,
940 &generic_approx_modes
,
941 SVE_NOT_IMPLEMENTED
, /* sve_width */
944 AARCH64_FUSE_ALU_BRANCH
, /* fusible_ops */
945 "8", /* function_align. */
946 "8", /* jump_align. */
947 "8", /* loop_align. */
948 2, /* int_reassoc_width. */
949 4, /* fp_reassoc_width. */
950 1, /* vec_reassoc_width. */
951 2, /* min_div_recip_mul_sf. */
952 2, /* min_div_recip_mul_df. */
953 0, /* max_case_values. */
954 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
955 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
956 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
957 &thunderx_prefetch_tune
/* Tuning for TSV110.  */
960 static const struct tune_params tsv110_tunings
=
963 &tsv110_addrcost_table
,
964 &tsv110_regmove_cost
,
966 &generic_branch_cost
,
967 &generic_approx_modes
,
968 SVE_NOT_IMPLEMENTED
, /* sve_width */
971 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_ALU_BRANCH
972 | AARCH64_FUSE_ALU_CBZ
), /* fusible_ops */
973 "16", /* function_align. */
974 "4", /* jump_align. */
975 "8", /* loop_align. */
976 2, /* int_reassoc_width. */
977 4, /* fp_reassoc_width. */
978 1, /* vec_reassoc_width. */
979 2, /* min_div_recip_mul_sf. */
980 2, /* min_div_recip_mul_df. */
981 0, /* max_case_values. */
982 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
983 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
984 &tsv110_prefetch_tune
/* Tuning for X-Gene 1.  */
987 static const struct tune_params xgene1_tunings
=
990 &xgene1_addrcost_table
,
991 &xgene1_regmove_cost
,
993 &generic_branch_cost
,
994 &xgene1_approx_modes
,
995 SVE_NOT_IMPLEMENTED
, /* sve_width */
998 AARCH64_FUSE_NOTHING
, /* fusible_ops */
999 "16", /* function_align. */
1000 "16", /* jump_align. */
1001 "16", /* loop_align. */
1002 2, /* int_reassoc_width. */
1003 4, /* fp_reassoc_width. */
1004 1, /* vec_reassoc_width. */
1005 2, /* min_div_recip_mul_sf. */
1006 2, /* min_div_recip_mul_df. */
1007 17, /* max_case_values. */
1008 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1009 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1010 &xgene1_prefetch_tune
/* Tuning for eMAG (shares most tables with X-Gene 1).  */
1013 static const struct tune_params emag_tunings
=
1015 &xgene1_extra_costs
,
1016 &xgene1_addrcost_table
,
1017 &xgene1_regmove_cost
,
1018 &xgene1_vector_cost
,
1019 &generic_branch_cost
,
1020 &xgene1_approx_modes
,
1021 SVE_NOT_IMPLEMENTED
,
1022 6, /* memmov_cost */
1024 AARCH64_FUSE_NOTHING
, /* fusible_ops */
1025 "16", /* function_align. */
1026 "16", /* jump_align. */
1027 "16", /* loop_align. */
1028 2, /* int_reassoc_width. */
1029 4, /* fp_reassoc_width. */
1030 1, /* vec_reassoc_width. */
1031 2, /* min_div_recip_mul_sf. */
1032 2, /* min_div_recip_mul_df. */
1033 17, /* max_case_values. */
1034 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1035 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1036 &xgene1_prefetch_tune
/* Tuning for QDF24xx.  */
1039 static const struct tune_params qdf24xx_tunings
=
1041 &qdf24xx_extra_costs
,
1042 &qdf24xx_addrcost_table
,
1043 &qdf24xx_regmove_cost
,
1044 &qdf24xx_vector_cost
,
1045 &generic_branch_cost
,
1046 &generic_approx_modes
,
1047 SVE_NOT_IMPLEMENTED
, /* sve_width */
1048 4, /* memmov_cost */
1050 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1051 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
1052 "16", /* function_align. */
1053 "8", /* jump_align. */
1054 "16", /* loop_align. */
1055 2, /* int_reassoc_width. */
1056 4, /* fp_reassoc_width. */
1057 1, /* vec_reassoc_width. */
1058 2, /* min_div_recip_mul_sf. */
1059 2, /* min_div_recip_mul_df. */
1060 0, /* max_case_values. */
1061 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1062 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS
, /* tune_flags. */
1063 &qdf24xx_prefetch_tune
1066 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1068 static const struct tune_params saphira_tunings
=
1070 &generic_extra_costs
,
1071 &generic_addrcost_table
,
1072 &generic_regmove_cost
,
1073 &generic_vector_cost
,
1074 &generic_branch_cost
,
1075 &generic_approx_modes
,
1076 SVE_NOT_IMPLEMENTED
, /* sve_width */
1077 4, /* memmov_cost */
1079 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1080 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
1081 "16", /* function_align. */
1082 "8", /* jump_align. */
1083 "16", /* loop_align. */
1084 2, /* int_reassoc_width. */
1085 4, /* fp_reassoc_width. */
1086 1, /* vec_reassoc_width. */
1087 2, /* min_div_recip_mul_sf. */
1088 2, /* min_div_recip_mul_df. */
1089 0, /* max_case_values. */
1090 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1091 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1092 &generic_prefetch_tune
/* Tuning for ThunderX2 T99.  */
1095 static const struct tune_params thunderx2t99_tunings
=
1097 &thunderx2t99_extra_costs
,
1098 &thunderx2t99_addrcost_table
,
1099 &thunderx2t99_regmove_cost
,
1100 &thunderx2t99_vector_cost
,
1101 &generic_branch_cost
,
1102 &generic_approx_modes
,
1103 SVE_NOT_IMPLEMENTED
, /* sve_width */
1104 4, /* memmov_cost. */
1105 4, /* issue_rate. */
1106 (AARCH64_FUSE_ALU_BRANCH
| AARCH64_FUSE_AES_AESMC
1107 | AARCH64_FUSE_ALU_CBZ
), /* fusible_ops */
1108 "16", /* function_align. */
1109 "8", /* jump_align. */
1110 "16", /* loop_align. */
1111 3, /* int_reassoc_width. */
1112 2, /* fp_reassoc_width. */
1113 2, /* vec_reassoc_width. */
1114 2, /* min_div_recip_mul_sf. */
1115 2, /* min_div_recip_mul_df. */
1116 0, /* max_case_values. */
1117 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1118 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1119 &thunderx2t99_prefetch_tune
/* Tuning for Neoverse N1.  */
1122 static const struct tune_params neoversen1_tunings
=
1124 &cortexa57_extra_costs
,
1125 &generic_addrcost_table
,
1126 &generic_regmove_cost
,
1127 &cortexa57_vector_cost
,
1128 &generic_branch_cost
,
1129 &generic_approx_modes
,
1130 SVE_NOT_IMPLEMENTED
, /* sve_width */
1131 4, /* memmov_cost */
1133 AARCH64_FUSE_AES_AESMC
, /* fusible_ops */
1134 "32:16", /* function_align. */
1135 "32:16", /* jump_align. */
1136 "32:16", /* loop_align. */
1137 2, /* int_reassoc_width. */
1138 4, /* fp_reassoc_width. */
1139 2, /* vec_reassoc_width. */
1140 2, /* min_div_recip_mul_sf. */
1141 2, /* min_div_recip_mul_df. */
1142 0, /* max_case_values. */
1143 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1144 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1145 &generic_prefetch_tune
1148 /* Support for fine-grained override of the tuning structures. */
1149 struct aarch64_tuning_override_function
/* Parser callback: given the option's value string, update the
   tune_params in place.  */
1152 void (*parse_override
)(const char*, struct tune_params
*);
1155 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
1156 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
1157 static void aarch64_parse_sve_width_string (const char*, struct tune_params
*);
/* Dispatch table mapping override names to their parser callbacks.  */
1159 static const struct aarch64_tuning_override_function
1160 aarch64_tuning_override_functions
[] =
1162 { "fuse", aarch64_parse_fuse_string
},
1163 { "tune", aarch64_parse_tune_string
},
1164 { "sve_width", aarch64_parse_sve_width_string
},
1168 /* A processor implementing AArch64. */
/* Command-line name of the processor.  */
1171 const char *const name
;
/* Internal identifier for the processor itself.  */
1172 enum aarch64_processor ident
;
/* Processor whose scheduling description should be used.  */
1173 enum aarch64_processor sched_core
;
/* Architecture the processor implements.  */
1174 enum aarch64_arch arch
;
1175 unsigned architecture_version
;
/* ISA feature flags (AARCH64_FL_*) the processor provides.  */
1176 const uint64_t flags
;
/* Tuning parameters, or NULL for architecture entries.  */
1177 const struct tune_params
*const tune
;
1180 /* Architectures implementing AArch64. */
/* Built from aarch64-arches.def; terminated by an aarch64_none
   sentinel entry.  */
1181 static const struct processor all_architectures
[] =
1183 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1184 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1185 #include "aarch64-arches.def"
1186 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1189 /* Processor cores implementing AArch64. */
/* Built from aarch64-cores.def, plus an explicit "generic" entry
   (scheduled as cortexa53); terminated by an aarch64_none sentinel.  */
1190 static const struct processor all_cores
[] =
1192 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1193 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1194 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1195 FLAGS, &COSTS##_tunings},
1196 #include "aarch64-cores.def"
1197 {"generic", generic
, cortexa53
, AARCH64_ARCH_8A
, 8,
1198 AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
1199 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1203 /* Target specification. These are populated by the -march, -mtune, -mcpu
1204 handling code or by target attributes. */
1205 static const struct processor
*selected_arch
;
1206 static const struct processor
*selected_cpu
;
1207 static const struct processor
*selected_tune
;
/* Pointer-authentication key used for return-address signing;
   defaults to key A.  */
1209 enum aarch64_key_type aarch64_ra_sign_key
= AARCH64_KEY_A
;
1211 /* The current tuning set. */
1212 struct tune_params aarch64_tune_params
= generic_tunings
;
1214 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
/* NODE is the type the attribute is applied to, NAME the attribute
   name; *NO_ADD_ATTRS is set to true to reject the attribute.  */
1217 handle_aarch64_vector_pcs_attribute (tree
*node
, tree name
, tree
,
1218 int, bool *no_add_attrs
)
1220 /* Since we set fn_type_req to true, the caller should have checked
1222 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node
));
/* Dispatch on the ABI already associated with the function type.  */
1223 switch ((arm_pcs
) fntype_abi (*node
).id ())
1225 case ARM_PCS_AAPCS64
:
/* The attribute is incompatible with SVE function types.  */
1230 error ("the %qE attribute cannot be applied to an SVE function type",
1232 *no_add_attrs
= true;
1235 case ARM_PCS_TLSDESC
:
1236 case ARM_PCS_UNKNOWN
:
1242 /* Table of machine attributes. */
1243 static const struct attribute_spec aarch64_attribute_table
[] =
1245 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1246 affects_type_identity, handler, exclude } */
1247 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
1248 handle_aarch64_vector_pcs_attribute
, NULL
},
1249 { NULL
, 0, 0, false, false, false, false, NULL
, NULL
}
1252 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
{
  const char *const name;		/* Extension name, e.g. "sve".  */
  const unsigned long flags_on;		/* Flags set when enabling it.  */
  const unsigned long flags_off;	/* Flags cleared when disabling it.  */
};
/* AArch64 condition codes, in the encoding order used by the hardware.
   Adjacent pairs are inverses of one another, which the macro below
   exploits by flipping the low bit.  */
typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1272 struct aarch64_branch_protect_type
1274 /* The type's name that the user passes to the branch-protection option
1277 /* Function to handle the protection type and set global variables.
1278 First argument is the string token corresponding with this type and the
1279 second argument is the next token in the option string.
1281 * AARCH64_PARSE_OK: Handling was sucessful.
1282 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1283 should print an error.
1284 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1286 enum aarch64_parse_opt_result (*handler
)(char*, char*);
1287 /* A list of types that can follow this type in the option string. */
1288 const aarch64_branch_protect_type
* subtypes
;
1289 unsigned int num_subtypes
;
1292 static enum aarch64_parse_opt_result
1293 aarch64_handle_no_branch_protection (char* str
, char* rest
)
1295 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
1296 aarch64_enable_bti
= 0;
1299 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1300 return AARCH64_PARSE_INVALID_FEATURE
;
1302 return AARCH64_PARSE_OK
;
1305 static enum aarch64_parse_opt_result
1306 aarch64_handle_standard_branch_protection (char* str
, char* rest
)
1308 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1309 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1310 aarch64_enable_bti
= 1;
1313 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1314 return AARCH64_PARSE_INVALID_FEATURE
;
1316 return AARCH64_PARSE_OK
;
1319 static enum aarch64_parse_opt_result
1320 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED
,
1321 char* rest ATTRIBUTE_UNUSED
)
1323 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1324 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1325 return AARCH64_PARSE_OK
;
1328 static enum aarch64_parse_opt_result
1329 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED
,
1330 char* rest ATTRIBUTE_UNUSED
)
1332 aarch64_ra_sign_scope
= AARCH64_FUNCTION_ALL
;
1333 return AARCH64_PARSE_OK
;
1336 static enum aarch64_parse_opt_result
1337 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED
,
1338 char* rest ATTRIBUTE_UNUSED
)
1340 aarch64_ra_sign_key
= AARCH64_KEY_B
;
1341 return AARCH64_PARSE_OK
;
1344 static enum aarch64_parse_opt_result
1345 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED
,
1346 char* rest ATTRIBUTE_UNUSED
)
1348 aarch64_enable_bti
= 1;
1349 return AARCH64_PARSE_OK
;
1352 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes
[] = {
1353 { "leaf", aarch64_handle_pac_ret_leaf
, NULL
, 0 },
1354 { "b-key", aarch64_handle_pac_ret_b_key
, NULL
, 0 },
1355 { NULL
, NULL
, NULL
, 0 }
1358 static const struct aarch64_branch_protect_type aarch64_branch_protect_types
[] = {
1359 { "none", aarch64_handle_no_branch_protection
, NULL
, 0 },
1360 { "standard", aarch64_handle_standard_branch_protection
, NULL
, 0 },
1361 { "pac-ret", aarch64_handle_pac_ret_protection
, aarch64_pac_ret_subtypes
,
1362 ARRAY_SIZE (aarch64_pac_ret_subtypes
) },
1363 { "bti", aarch64_handle_bti_protection
, NULL
, 0 },
1364 { NULL
, NULL
, NULL
, 0 }
/* The condition codes of the processor, and the inverse function.
   Indexed by aarch64_cond_code; inverses are adjacent pairs.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
/* The preferred condition codes for SVE conditions.  Same encoding order
   as aarch64_condition_codes, using the SVE-flavoured mnemonics.  */
static const char *const aarch64_sve_condition_codes[] =
{
  "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
  "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
};
1381 /* Return the assembly token for svpattern value VALUE. */
1384 svpattern_token (enum aarch64_svpattern pattern
)
1388 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1389 AARCH64_FOR_SVPATTERN (CASE
)
1391 case AARCH64_NUM_SVPATTERNS
:
1397 /* Return the descriptor of the SIMD ABI. */
1399 static const predefined_function_abi
&
1400 aarch64_simd_abi (void)
1402 predefined_function_abi
&simd_abi
= function_abis
[ARM_PCS_SIMD
];
1403 if (!simd_abi
.initialized_p ())
1405 HARD_REG_SET full_reg_clobbers
1406 = default_function_abi
.full_reg_clobbers ();
1407 for (int regno
= 0; regno
< FIRST_PSEUDO_REGISTER
; regno
++)
1408 if (FP_SIMD_SAVED_REGNUM_P (regno
))
1409 CLEAR_HARD_REG_BIT (full_reg_clobbers
, regno
);
1410 simd_abi
.initialize (ARM_PCS_SIMD
, full_reg_clobbers
);
1415 /* Return the descriptor of the SVE PCS. */
1417 static const predefined_function_abi
&
1418 aarch64_sve_abi (void)
1420 predefined_function_abi
&sve_abi
= function_abis
[ARM_PCS_SVE
];
1421 if (!sve_abi
.initialized_p ())
1423 HARD_REG_SET full_reg_clobbers
1424 = default_function_abi
.full_reg_clobbers ();
1425 for (int regno
= V8_REGNUM
; regno
<= V23_REGNUM
; ++regno
)
1426 CLEAR_HARD_REG_BIT (full_reg_clobbers
, regno
);
1427 for (int regno
= P4_REGNUM
; regno
<= P11_REGNUM
; ++regno
)
1428 CLEAR_HARD_REG_BIT (full_reg_clobbers
, regno
);
1429 sve_abi
.initialize (ARM_PCS_SVE
, full_reg_clobbers
);
1434 /* Generate code to enable conditional branches in functions over 1 MiB. */
1436 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
1437 const char * branch_format
)
1439 rtx_code_label
* tmp_label
= gen_label_rtx ();
1440 char label_buf
[256];
1442 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
1443 CODE_LABEL_NUMBER (tmp_label
));
1444 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
1445 rtx dest_label
= operands
[pos_label
];
1446 operands
[pos_label
] = tmp_label
;
1448 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
1449 output_asm_insn (buffer
, operands
);
1451 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
1452 operands
[pos_label
] = dest_label
;
1453 output_asm_insn (buffer
, operands
);
1458 aarch64_err_no_fpadvsimd (machine_mode mode
)
1460 if (TARGET_GENERAL_REGS_ONLY
)
1461 if (FLOAT_MODE_P (mode
))
1462 error ("%qs is incompatible with the use of floating-point types",
1463 "-mgeneral-regs-only");
1465 error ("%qs is incompatible with the use of vector types",
1466 "-mgeneral-regs-only");
1468 if (FLOAT_MODE_P (mode
))
1469 error ("%qs feature modifier is incompatible with the use of"
1470 " floating-point types", "+nofp");
1472 error ("%qs feature modifier is incompatible with the use of"
1473 " vector types", "+nofp");
1476 /* Report when we try to do something that requires SVE when SVE is disabled.
1477 This is an error of last resort and isn't very high-quality. It usually
1478 involves attempts to measure the vector length in some way. */
1480 aarch64_report_sve_required (void)
1482 static bool reported_p
= false;
1484 /* Avoid reporting a slew of messages for a single oversight. */
1488 error ("this operation requires the SVE ISA extension");
1489 inform (input_location
, "you can enable SVE using the command-line"
1490 " option %<-march%>, or by using the %<target%>"
1491 " attribute or pragma");
1495 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1498 pr_or_ffr_regnum_p (unsigned int regno
)
1500 return PR_REGNUM_P (regno
) || regno
== FFR_REGNUM
|| regno
== FFRT_REGNUM
;
1503 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1504 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1505 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1506 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1507 and GENERAL_REGS is lower than the memory cost (in this case the best class
1508 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1509 cost results in bad allocations with many redundant int<->FP moves which
1510 are expensive on various cores.
1511 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1512 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1513 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1514 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1515 The result of this is that it is no longer inefficient to have a higher
1516 memory move cost than the register move cost.
1520 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
1521 reg_class_t best_class
)
1525 if (!reg_class_subset_p (GENERAL_REGS
, allocno_class
)
1526 || !reg_class_subset_p (FP_REGS
, allocno_class
))
1527 return allocno_class
;
1529 if (!reg_class_subset_p (GENERAL_REGS
, best_class
)
1530 || !reg_class_subset_p (FP_REGS
, best_class
))
1533 mode
= PSEUDO_REGNO_MODE (regno
);
1534 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
1538 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1540 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1541 return aarch64_tune_params
.min_div_recip_mul_sf
;
1542 return aarch64_tune_params
.min_div_recip_mul_df
;
1545 /* Return the reassociation width of treeop OPC with mode MODE. */
1547 aarch64_reassociation_width (unsigned opc
, machine_mode mode
)
1549 if (VECTOR_MODE_P (mode
))
1550 return aarch64_tune_params
.vec_reassoc_width
;
1551 if (INTEGRAL_MODE_P (mode
))
1552 return aarch64_tune_params
.int_reassoc_width
;
1553 /* Avoid reassociating floating point addition so we emit more FMAs. */
1554 if (FLOAT_MODE_P (mode
) && opc
!= PLUS_EXPR
)
1555 return aarch64_tune_params
.fp_reassoc_width
;
1559 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1561 aarch64_dbx_register_number (unsigned regno
)
1563 if (GP_REGNUM_P (regno
))
1564 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1565 else if (regno
== SP_REGNUM
)
1566 return AARCH64_DWARF_SP
;
1567 else if (FP_REGNUM_P (regno
))
1568 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1569 else if (PR_REGNUM_P (regno
))
1570 return AARCH64_DWARF_P0
+ regno
- P0_REGNUM
;
1571 else if (regno
== VG_REGNUM
)
1572 return AARCH64_DWARF_VG
;
1574 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1575 equivalent DWARF register. */
1576 return DWARF_FRAME_REGISTERS
;
1579 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1580 integer, otherwise return X unmodified. */
1582 aarch64_bit_representation (rtx x
)
1584 if (CONST_DOUBLE_P (x
))
1585 x
= gen_lowpart (int_mode_for_mode (GET_MODE (x
)).require (), x
);
1589 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1591 aarch64_advsimd_struct_mode_p (machine_mode mode
)
1594 && (mode
== OImode
|| mode
== CImode
|| mode
== XImode
));
1597 /* Return true if MODE is an SVE predicate mode. */
1599 aarch64_sve_pred_mode_p (machine_mode mode
)
1602 && (mode
== VNx16BImode
1603 || mode
== VNx8BImode
1604 || mode
== VNx4BImode
1605 || mode
== VNx2BImode
));
/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
   a structure of 2, 3 or 4 vectors.  */
const unsigned int VEC_STRUCT   = 8;
/* Can be used in combination with VEC_SVE_DATA to indicate that the
   vector has fewer significant bytes than a full SVE vector.  */
const unsigned int VEC_PARTIAL  = 16;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1622 /* Return a set of flags describing the vector properties of mode MODE.
1623 Ignore modes that are not supported by the current target. */
/* NOTE(review): this extract is missing the switch statement's case labels
   (the embedded original line numbers jump from 1635 to 1638, 1652 to 1661,
   etc.), so only the comments and return expressions of each case group are
   visible.  The groups, in order, classify: partial SVE vectors
   (VEC_SVE_DATA | VEC_PARTIAL), full single SVE vectors (VEC_SVE_DATA),
   x2/x3/x4 SVE tuples (VEC_SVE_DATA | VEC_STRUCT) and 64/128-bit Advanced
   SIMD vectors (VEC_ADVSIMD).  Restore the case labels from the upstream
   source before compiling.  */
1625 aarch64_classify_vector_mode (machine_mode mode
)
1627 if (aarch64_advsimd_struct_mode_p (mode
))
1628 return VEC_ADVSIMD
| VEC_STRUCT
;
1630 if (aarch64_sve_pred_mode_p (mode
))
1631 return VEC_SVE_PRED
;
1633 /* Make the decision based on the mode's enum value rather than its
1634 properties, so that we keep the correct classification regardless
1635 of -msve-vector-bits. */
1638 /* Partial SVE QI vectors. */
1642 /* Partial SVE HI vectors. */
1645 /* Partial SVE SI vector. */
1647 /* Partial SVE HF vectors. */
1650 /* Partial SVE SF vector. */
1652 return TARGET_SVE
? VEC_SVE_DATA
| VEC_PARTIAL
: 0;
1661 return TARGET_SVE
? VEC_SVE_DATA
: 0;
1663 /* x2 SVE vectors. */
1671 /* x3 SVE vectors. */
1679 /* x4 SVE vectors. */
1687 return TARGET_SVE
? VEC_SVE_DATA
| VEC_STRUCT
: 0;
1689 /* 64-bit Advanced SIMD vectors. */
1693 /* ...E_V1DImode doesn't exist. */
1697 /* 128-bit Advanced SIMD vectors. */
1705 return TARGET_SIMD
? VEC_ADVSIMD
: 0;
1712 /* Return true if MODE is any of the data vector modes, including
1715 aarch64_vector_data_mode_p (machine_mode mode
)
1717 return aarch64_classify_vector_mode (mode
) & VEC_ANY_DATA
;
1720 /* Return true if MODE is any form of SVE mode, including predicates,
1721 vectors and structures. */
1723 aarch64_sve_mode_p (machine_mode mode
)
1725 return aarch64_classify_vector_mode (mode
) & VEC_ANY_SVE
;
1728 /* Return true if MODE is an SVE data vector mode; either a single vector
1729 or a structure of vectors. */
1731 aarch64_sve_data_mode_p (machine_mode mode
)
1733 return aarch64_classify_vector_mode (mode
) & VEC_SVE_DATA
;
1736 /* Return the number of defined bytes in one constituent vector of
1737 SVE mode MODE, which has vector flags VEC_FLAGS. */
1739 aarch64_vl_bytes (machine_mode mode
, unsigned int vec_flags
)
1741 if (vec_flags
& VEC_PARTIAL
)
1742 /* A single partial vector. */
1743 return GET_MODE_SIZE (mode
);
1745 if (vec_flags
& VEC_SVE_DATA
)
1746 /* A single vector or a tuple. */
1747 return BYTES_PER_SVE_VECTOR
;
1749 /* A single predicate. */
1750 gcc_assert (vec_flags
& VEC_SVE_PRED
);
1751 return BYTES_PER_SVE_PRED
;
1754 /* Implement target hook TARGET_ARRAY_MODE. */
1755 static opt_machine_mode
1756 aarch64_array_mode (machine_mode mode
, unsigned HOST_WIDE_INT nelems
)
1758 if (aarch64_classify_vector_mode (mode
) == VEC_SVE_DATA
1759 && IN_RANGE (nelems
, 2, 4))
1760 return mode_for_vector (GET_MODE_INNER (mode
),
1761 GET_MODE_NUNITS (mode
) * nelems
);
1763 return opt_machine_mode ();
1766 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1768 aarch64_array_mode_supported_p (machine_mode mode
,
1769 unsigned HOST_WIDE_INT nelems
)
1772 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1773 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1774 && (nelems
>= 2 && nelems
<= 4))
1780 /* MODE is some form of SVE vector mode. For data modes, return the number
1781 of vector register bits that each element of MODE occupies, such as 64
1782 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
1783 in a 64-bit container). For predicate modes, return the number of
1784 data bits controlled by each significant predicate bit. */
1787 aarch64_sve_container_bits (machine_mode mode
)
1789 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1790 poly_uint64 vector_bits
= (vec_flags
& (VEC_PARTIAL
| VEC_SVE_PRED
)
1791 ? BITS_PER_SVE_VECTOR
1792 : GET_MODE_BITSIZE (mode
));
1793 return vector_element_size (vector_bits
, GET_MODE_NUNITS (mode
));
1796 /* Return the SVE predicate mode to use for elements that have
1797 ELEM_NBYTES bytes, if such a mode exists. */
1800 aarch64_sve_pred_mode (unsigned int elem_nbytes
)
1804 if (elem_nbytes
== 1)
1806 if (elem_nbytes
== 2)
1808 if (elem_nbytes
== 4)
1810 if (elem_nbytes
== 8)
1813 return opt_machine_mode ();
1816 /* Return the SVE predicate mode that should be used to control
1820 aarch64_sve_pred_mode (machine_mode mode
)
1822 unsigned int bits
= aarch64_sve_container_bits (mode
);
1823 return aarch64_sve_pred_mode (bits
/ BITS_PER_UNIT
).require ();
1826 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1828 static opt_machine_mode
1829 aarch64_get_mask_mode (machine_mode mode
)
1831 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1832 if (vec_flags
& VEC_SVE_DATA
)
1833 return aarch64_sve_pred_mode (mode
);
1835 return default_get_mask_mode (mode
);
1838 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1841 aarch64_sve_data_mode (scalar_mode inner_mode
, poly_uint64 nunits
)
1843 enum mode_class mclass
= (is_a
<scalar_float_mode
> (inner_mode
)
1844 ? MODE_VECTOR_FLOAT
: MODE_VECTOR_INT
);
1846 FOR_EACH_MODE_IN_CLASS (mode
, mclass
)
1847 if (inner_mode
== GET_MODE_INNER (mode
)
1848 && known_eq (nunits
, GET_MODE_NUNITS (mode
))
1849 && aarch64_sve_data_mode_p (mode
))
1851 return opt_machine_mode ();
1854 /* Return the integer element mode associated with SVE mode MODE. */
1856 static scalar_int_mode
1857 aarch64_sve_element_int_mode (machine_mode mode
)
1859 poly_uint64 vector_bits
= (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
1860 ? BITS_PER_SVE_VECTOR
1861 : GET_MODE_BITSIZE (mode
));
1862 unsigned int elt_bits
= vector_element_size (vector_bits
,
1863 GET_MODE_NUNITS (mode
));
1864 return int_mode_for_size (elt_bits
, 0).require ();
1867 /* Return an integer element mode that contains exactly
1868 aarch64_sve_container_bits (MODE) bits. This is wider than
1869 aarch64_sve_element_int_mode if MODE is a partial vector,
1870 otherwise it's the same. */
1872 static scalar_int_mode
1873 aarch64_sve_container_int_mode (machine_mode mode
)
1875 return int_mode_for_size (aarch64_sve_container_bits (mode
), 0).require ();
1878 /* Return the integer vector mode associated with SVE mode MODE.
1879 Unlike related_int_vector_mode, this can handle the case in which
1880 MODE is a predicate (and thus has a different total size). */
1883 aarch64_sve_int_mode (machine_mode mode
)
1885 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
1886 return aarch64_sve_data_mode (int_mode
, GET_MODE_NUNITS (mode
)).require ();
1889 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
1891 static opt_machine_mode
1892 aarch64_vectorize_related_mode (machine_mode vector_mode
,
1893 scalar_mode element_mode
,
1896 unsigned int vec_flags
= aarch64_classify_vector_mode (vector_mode
);
1898 /* If we're operating on SVE vectors, try to return an SVE mode. */
1899 poly_uint64 sve_nunits
;
1900 if ((vec_flags
& VEC_SVE_DATA
)
1901 && multiple_p (BYTES_PER_SVE_VECTOR
,
1902 GET_MODE_SIZE (element_mode
), &sve_nunits
))
1904 machine_mode sve_mode
;
1905 if (maybe_ne (nunits
, 0U))
1907 /* Try to find a full or partial SVE mode with exactly
1909 if (multiple_p (sve_nunits
, nunits
)
1910 && aarch64_sve_data_mode (element_mode
,
1911 nunits
).exists (&sve_mode
))
1916 /* Take the preferred number of units from the number of bytes
1917 that fit in VECTOR_MODE. We always start by "autodetecting"
1918 a full vector mode with preferred_simd_mode, so vectors
1919 chosen here will also be full vector modes. Then
1920 autovectorize_vector_modes tries smaller starting modes
1921 and thus smaller preferred numbers of units. */
1922 sve_nunits
= ordered_min (sve_nunits
, GET_MODE_SIZE (vector_mode
));
1923 if (aarch64_sve_data_mode (element_mode
,
1924 sve_nunits
).exists (&sve_mode
))
1929 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
1930 if ((vec_flags
& VEC_ADVSIMD
)
1931 && known_eq (nunits
, 0U)
1932 && known_eq (GET_MODE_BITSIZE (vector_mode
), 64U)
1933 && maybe_ge (GET_MODE_BITSIZE (element_mode
)
1934 * GET_MODE_NUNITS (vector_mode
), 128U))
1936 machine_mode res
= aarch64_simd_container_mode (element_mode
, 128);
1937 if (VECTOR_MODE_P (res
))
1941 return default_vectorize_related_mode (vector_mode
, element_mode
, nunits
);
1944 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1945 prefer to use the first arithmetic operand as the else value if
1946 the else value doesn't matter, since that exactly matches the SVE
1947 destructive merging form. For ternary operations we could either
1948 pick the first operand and use FMAD-like instructions or the last
1949 operand and use FMLA-like instructions; the latter seems more
1953 aarch64_preferred_else_value (unsigned, tree
, unsigned int nops
, tree
*ops
)
1955 return nops
== 3 ? ops
[2] : ops
[0];
1958 /* Implement TARGET_HARD_REGNO_NREGS. */
/* NOTE(review): this extract is missing the switch's case labels and
   braces (embedded original line numbers jump 1968->1974 and 1978->1984),
   so only one FP-class body, the PR_AND_FFR_REGS label and the default
   return are visible.  Restore the register-class case labels from the
   upstream source before compiling.  */
1961 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1963 /* ??? Logically we should only need to provide a value when
1964 HARD_REGNO_MODE_OK says that the combination is valid,
1965 but at the moment we need to handle all modes. Just ignore
1966 any runtime parts for registers that can't store them. */
1967 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
1968 switch (aarch64_regno_regclass (regno
))
1974 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1975 if (vec_flags
& VEC_SVE_DATA
)
1976 return exact_div (GET_MODE_SIZE (mode
),
1977 aarch64_vl_bytes (mode
, vec_flags
)).to_constant ();
1978 return CEIL (lowest_size
, UNITS_PER_VREG
);
1984 case PR_AND_FFR_REGS
:
1987 return CEIL (lowest_size
, UNITS_PER_WORD
);
1992 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1995 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
1997 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1998 return regno
== CC_REGNUM
;
2000 if (regno
== VG_REGNUM
)
2001 /* This must have the same size as _Unwind_Word. */
2002 return mode
== DImode
;
2004 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
2005 if (vec_flags
& VEC_SVE_PRED
)
2006 return pr_or_ffr_regnum_p (regno
);
2008 if (pr_or_ffr_regnum_p (regno
))
2011 if (regno
== SP_REGNUM
)
2012 /* The purpose of comparing with ptr_mode is to support the
2013 global register variable associated with the stack pointer
2014 register via the syntax of asm ("wsp") in ILP32. */
2015 return mode
== Pmode
|| mode
== ptr_mode
;
2017 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
2018 return mode
== Pmode
;
2020 if (GP_REGNUM_P (regno
))
2022 if (vec_flags
& VEC_ANY_SVE
)
2024 if (known_le (GET_MODE_SIZE (mode
), 8))
2026 if (known_le (GET_MODE_SIZE (mode
), 16))
2027 return (regno
& 1) == 0;
2029 else if (FP_REGNUM_P (regno
))
2031 if (vec_flags
& VEC_STRUCT
)
2032 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
2034 return !VECTOR_MODE_P (mode
) || vec_flags
!= 0;
2040 /* Return true if TYPE is a type that should be passed or returned in
2041 SVE registers, assuming enough registers are available. When returning
2042 true, set *NUM_ZR and *NUM_PR to the number of required Z and P registers
/* NOTE(review): the bodies that assign *NUM_ZR and *NUM_PR and the
   return statements are missing from this extract (embedded original line
   numbers jump 2049->2056->2066).  Restore them from the upstream source
   before compiling.  */
2046 aarch64_sve_argument_p (const_tree type
, unsigned int *num_zr
,
2047 unsigned int *num_pr
)
2049 if (aarch64_sve::svbool_type_p (type
))
2056 if (unsigned int nvectors
= aarch64_sve::nvectors_if_data_type (type
))
2066 /* Return true if a function with type FNTYPE returns its value in
2067 SVE vector or predicate registers. */
2070 aarch64_returns_value_in_sve_regs_p (const_tree fntype
)
2072 unsigned int num_zr
, num_pr
;
2073 tree return_type
= TREE_TYPE (fntype
);
2074 return (return_type
!= error_mark_node
2075 && aarch64_sve_argument_p (return_type
, &num_zr
, &num_pr
));
2078 /* Return true if a function with type FNTYPE takes arguments in
2079 SVE vector or predicate registers. */
2082 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype
)
2084 CUMULATIVE_ARGS args_so_far_v
;
2085 aarch64_init_cumulative_args (&args_so_far_v
, NULL_TREE
, NULL_RTX
,
2086 NULL_TREE
, 0, true);
2087 cumulative_args_t args_so_far
= pack_cumulative_args (&args_so_far_v
);
2089 for (tree chain
= TYPE_ARG_TYPES (fntype
);
2090 chain
&& chain
!= void_list_node
;
2091 chain
= TREE_CHAIN (chain
))
2093 tree arg_type
= TREE_VALUE (chain
);
2094 if (arg_type
== error_mark_node
)
2097 function_arg_info
arg (arg_type
, /*named=*/true);
2098 apply_pass_by_reference_rules (&args_so_far_v
, arg
);
2099 unsigned int num_zr
, num_pr
;
2100 if (aarch64_sve_argument_p (arg
.type
, &num_zr
, &num_pr
))
2103 targetm
.calls
.function_arg_advance (args_so_far
, arg
);
2108 /* Implement TARGET_FNTYPE_ABI. */
2110 static const predefined_function_abi
&
2111 aarch64_fntype_abi (const_tree fntype
)
2113 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype
)))
2114 return aarch64_simd_abi ();
2116 if (aarch64_returns_value_in_sve_regs_p (fntype
)
2117 || aarch64_takes_arguments_in_sve_regs_p (fntype
))
2118 return aarch64_sve_abi ();
2120 return default_function_abi
;
2123 /* Return true if we should emit CFI for register REGNO. */
2126 aarch64_emit_cfi_for_reg_p (unsigned int regno
)
2128 return (GP_REGNUM_P (regno
)
2129 || !default_function_abi
.clobbers_full_reg_p (regno
));
2132 /* Return the mode we should use to save and restore register REGNO. */
2135 aarch64_reg_save_mode (unsigned int regno
)
2137 if (GP_REGNUM_P (regno
))
2140 if (FP_REGNUM_P (regno
))
2141 switch (crtl
->abi
->id ())
2143 case ARM_PCS_AAPCS64
:
2144 /* Only the low 64 bits are saved by the base PCS. */
2148 /* The vector PCS saves the low 128 bits (which is the full
2149 register on non-SVE targets). */
2153 /* Use vectors of DImode for registers that need frame
2154 information, so that the first 64 bytes of the save slot
2155 are always the equivalent of what storing D<n> would give. */
2156 if (aarch64_emit_cfi_for_reg_p (regno
))
2159 /* Use vectors of bytes otherwise, so that the layout is
2160 endian-agnostic, and so that we can use LDR and STR for
2161 big-endian targets. */
2164 case ARM_PCS_TLSDESC
:
2165 case ARM_PCS_UNKNOWN
:
2169 if (PR_REGNUM_P (regno
))
2170 /* Save the full predicate register. */
2176 /* Implement TARGET_INSN_CALLEE_ABI. */
2178 const predefined_function_abi
&
2179 aarch64_insn_callee_abi (const rtx_insn
*insn
)
2181 rtx pat
= PATTERN (insn
);
2182 gcc_assert (GET_CODE (pat
) == PARALLEL
);
2183 rtx unspec
= XVECEXP (pat
, 0, 1);
2184 gcc_assert (GET_CODE (unspec
) == UNSPEC
2185 && XINT (unspec
, 1) == UNSPEC_CALLEE_ABI
);
2186 return function_abis
[INTVAL (XVECEXP (unspec
, 0, 0))];
2189 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2190 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2191 clobbers the top 64 bits when restoring the bottom 64 bits. */
2194 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id
,
2198 if (FP_REGNUM_P (regno
) && abi_id
!= ARM_PCS_SVE
)
2200 poly_int64 per_register_size
= GET_MODE_SIZE (mode
);
2201 unsigned int nregs
= hard_regno_nregs (regno
, mode
);
2203 per_register_size
= exact_div (per_register_size
, nregs
);
2204 if (abi_id
== ARM_PCS_SIMD
|| abi_id
== ARM_PCS_TLSDESC
)
2205 return maybe_gt (per_register_size
, 16);
2206 return maybe_gt (per_register_size
, 8);
2211 /* Implement REGMODE_NATURAL_SIZE. */
2213 aarch64_regmode_natural_size (machine_mode mode
)
2215 /* The natural size for SVE data modes is one SVE data vector,
2216 and similarly for predicates. We can't independently modify
2217 anything smaller than that. */
2218 /* ??? For now, only do this for variable-width SVE registers.
2219 Doing it for constant-sized registers breaks lower-subreg.c. */
2220 /* ??? And once that's fixed, we should probably have similar
2221 code for Advanced SIMD. */
2222 if (!aarch64_sve_vg
.is_constant ())
2224 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
2225 if (vec_flags
& VEC_SVE_PRED
)
2226 return BYTES_PER_SVE_PRED
;
2227 if (vec_flags
& VEC_SVE_DATA
)
2228 return BYTES_PER_SVE_VECTOR
;
2230 return UNITS_PER_WORD
;
2233 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2235 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned,
2238 /* The predicate mode determines which bits are significant and
2239 which are "don't care". Decreasing the number of lanes would
2240 lose data while increasing the number of lanes would make bits
2241 unnecessarily significant. */
2242 if (PR_REGNUM_P (regno
))
2244 if (known_ge (GET_MODE_SIZE (mode
), 4))
2250 /* Return true if I's bits are consecutive ones from the MSB. */
2252 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i
)
2254 return exact_log2 (-i
) != HOST_WIDE_INT_M1
;
2257 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2258 that strcpy from constants will be faster. */
2260 static HOST_WIDE_INT
2261 aarch64_constant_alignment (const_tree exp
, HOST_WIDE_INT align
)
2263 if (TREE_CODE (exp
) == STRING_CST
&& !optimize_size
)
2264 return MAX (align
, BITS_PER_WORD
);
2268 /* Return true if calls to DECL should be treated as
2269 long-calls (ie called via a register). */
2271 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
2276 /* Return true if calls to symbol-ref SYM should be treated as
2277 long-calls (ie called via a register). */
2279 aarch64_is_long_call_p (rtx sym
)
2281 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
2284 /* Return true if calls to symbol-ref SYM should not go through
2288 aarch64_is_noplt_call_p (rtx sym
)
2290 const_tree decl
= SYMBOL_REF_DECL (sym
);
2295 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
2296 && !targetm
.binds_local_p (decl
))
2302 /* Return true if the offsets to a zero/sign-extract operation
2303 represent an expression that matches an extend operation. The
2304 operands represent the paramters from
2306 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2308 aarch64_is_extend_from_extract (scalar_int_mode mode
, rtx mult_imm
,
2311 HOST_WIDE_INT mult_val
, extract_val
;
2313 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
2316 mult_val
= INTVAL (mult_imm
);
2317 extract_val
= INTVAL (extract_imm
);
2320 && extract_val
< GET_MODE_BITSIZE (mode
)
2321 && exact_log2 (extract_val
& ~7) > 0
2322 && (extract_val
& 7) <= 4
2323 && mult_val
== (1 << (extract_val
& 7)))
2329 /* Emit an insn that's a simple single-set. Both the operands must be
2330 known to be valid. */
2331 inline static rtx_insn
*
2332 emit_set_insn (rtx x
, rtx y
)
2334 return emit_insn (gen_rtx_SET (x
, y
));
2337 /* X and Y are two things to compare using CODE. Emit the compare insn and
2338 return the rtx for register 0 in the proper mode. */
2340 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
2342 machine_mode cmp_mode
= GET_MODE (x
);
2343 machine_mode cc_mode
;
2346 if (cmp_mode
== TImode
)
2348 gcc_assert (code
== NE
);
2351 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2353 rtx x_lo
= operand_subword (x
, 0, 0, TImode
);
2354 rtx y_lo
= operand_subword (y
, 0, 0, TImode
);
2355 emit_set_insn (cc_reg
, gen_rtx_COMPARE (cc_mode
, x_lo
, y_lo
));
2357 rtx x_hi
= operand_subword (x
, 1, 0, TImode
);
2358 rtx y_hi
= operand_subword (y
, 1, 0, TImode
);
2359 emit_insn (gen_ccmpdi (cc_reg
, cc_reg
, x_hi
, y_hi
,
2360 gen_rtx_EQ (cc_mode
, cc_reg
, const0_rtx
),
2361 GEN_INT (AARCH64_EQ
)));
2365 cc_mode
= SELECT_CC_MODE (code
, x
, y
);
2366 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2367 emit_set_insn (cc_reg
, gen_rtx_COMPARE (cc_mode
, x
, y
));
/* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
static rtx
aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
				  machine_mode y_mode)
{
  if (y_mode == E_QImode || y_mode == E_HImode)
    {
      if (CONST_INT_P (y))
	{
	  /* A constant can be zero-extended at compile time; widen the
	     comparison to SImode.  */
	  y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
	  y_mode = SImode;
	}
      else
	{
	  /* A register operand needs an explicit zero-extension.  The
	     operands are swapped, hence the swapped-comparison CC mode.  */
	  rtx t, cc_reg;
	  machine_mode cc_mode;

	  t = gen_rtx_ZERO_EXTEND (SImode, y);
	  t = gen_rtx_COMPARE (CC_SWPmode, t, x);
	  cc_mode = CC_SWPmode;
	  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
	  emit_set_insn (cc_reg, t);
	  return cc_reg;
	}
    }

  /* Force Y into a register if it is not a valid plus-operand.  */
  if (!aarch64_plus_operand (y, y_mode))
    y = force_reg (y_mode, y);

  return aarch64_gen_compare_reg (code, x, y);
}
/* Build the SYMBOL_REF for __tls_get_addr.  */

/* Cached libfunc symbol; GTY-marked so the GC does not collect it.  */
static GTY(()) rtx tls_get_addr_libfunc;

/* Return the (lazily created) SYMBOL_REF for __tls_get_addr.  */
rtx
aarch64_tls_get_addr (void)
{
  if (!tls_get_addr_libfunc)
    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
  return tls_get_addr_libfunc;
}
/* Return the TLS model to use for ADDR.  */
static enum tls_model
tls_symbolic_operand_type (rtx addr)
{
  enum tls_model tls_kind = TLS_MODEL_NONE;
  /* ADDR may be (const (plus (symbol_ref) (const_int))); look through
     the offset to find the underlying symbol.  */
  if (GET_CODE (addr) == CONST)
    {
      poly_int64 addend;
      rtx sym = strip_offset (addr, &addend);
      if (GET_CODE (sym) == SYMBOL_REF)
	tls_kind = SYMBOL_REF_TLS_MODEL (sym);
    }
  else if (GET_CODE (addr) == SYMBOL_REF)
    tls_kind = SYMBOL_REF_TLS_MODEL (addr);

  return tls_kind;
}
/* We'll allow lo_sum's in addresses in our legitimate addresses
   so that combine would take care of combining addresses where
   necessary, but for generation purposes, we'll generate the address
   as :
   RTL                               Absolute
   tmp = hi (symbol_ref);            adrp  x1, foo
   dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo

   PIC                               TLS
   adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
   ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
                                     bl   __tls_get_addr

   Load TLS symbol, depending on TLS mechanism and TLS access model.

   Global Dynamic - Traditional TLS:
   adrp tmp, :tlsgd:imm
   add  dest, tmp, #:tlsgd_lo12:imm
   bl   __tls_get_addr

   Global Dynamic - TLS Descriptors:
   adrp dest, :tlsdesc:imm
   ldr  tmp, [dest, #:tlsdesc_lo12:imm]
   add  dest, dest, #:tlsdesc_lo12:imm
   blr  tmp

   Initial Exec:
   mrs  tp, tpidr_el0
   adrp tmp, :gottprel:imm
   ldr  dest, [tmp, #:gottprel_lo12:imm]

   Local Exec:
   mrs  tp, tpidr_el0
   add  t0, tp, #:tprel_hi12:imm, lsl #12
   add  t0, t0, #:tprel_lo12_nc:imm

   NOTE(review): this body was reconstructed from a lossy extraction;
   confirm the dropped control-flow details against upstream GCC.  */
static void
aarch64_load_symref_appropriately (rtx dest, rtx imm,
				   enum aarch64_symbol_type type)
{
  switch (type)
    {
    case SYMBOL_SMALL_ABSOLUTE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode.  */
	rtx tmp_reg = dest;
	machine_mode mode = GET_MODE (dest);

	gcc_assert (mode == Pmode || mode == ptr_mode);

	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	emit_insn (gen_add_losym (dest, tmp_reg, imm));
	return;
      }

    case SYMBOL_TINY_ABSOLUTE:
      emit_insn (gen_rtx_SET (dest, imm));
      return;

    case SYMBOL_SMALL_GOT_28K:
      {
	machine_mode mode = GET_MODE (dest);
	rtx gp_rtx = pic_offset_table_rtx;
	rtx insn;
	rtx mem;

	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
	   here before rtl expand.  Tree IVOPT will generate rtl pattern to
	   decide rtx costs, in which case pic_offset_table_rtx is not
	   initialized.  For that case no need to generate the first adrp
	   instruction as the final cost for global variable access is
	   one instruction.  */
	if (gp_rtx != NULL)
	  {
	    /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
	       using the page base as GOT base, the first page may be wasted,
	       in the worst scenario, there is only 28K space for GOT).

	       The generate instruction sequence for accessing global variable
	       is:

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]

	       Only one instruction needed. But we must initialize
	       pic_offset_table_rtx properly.  We generate initialize insn for
	       every global access, and allow CSE to remove all redundant.

	       The final instruction sequences will look like the following
	       for multiply global variables access.

		 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
		 ...  */

	    rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
	    crtl->uses_pic_offset_table = 1;
	    emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));

	    if (mode != GET_MODE (gp_rtx))
	      gp_rtx = gen_lowpart (mode, gp_rtx);
	  }

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
	    else
	      insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);

	    mem = XVECEXP (SET_SRC (insn), 0, 0);
	  }
	else
	  {
	    gcc_assert (mode == Pmode);

	    insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
	  }

	/* The operand is expected to be MEM.  Whenever the related insn
	   pattern changed, above code which calculate mem should be
	   updated.  */
	gcc_assert (GET_CODE (mem) == MEM);
	MEM_READONLY_P (mem) = 1;
	MEM_NOTRAP_P (mem) = 1;
	emit_insn (insn);
	return;
      }

    case SYMBOL_SMALL_GOT_4G:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the got entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memeory.
	   This is why we have to handle three different ldr_got_small
	   patterns here (two patterns for ILP32).  */

	rtx insn;
	rtx mem;
	rtx tmp_reg = dest;
	machine_mode mode = GET_MODE (dest);

	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
	    else
	      insn = gen_ldr_got_small_si (dest, tmp_reg, imm);

	    mem = XVECEXP (SET_SRC (insn), 0, 0);
	  }
	else
	  {
	    gcc_assert (mode == Pmode);

	    insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
	  }

	gcc_assert (GET_CODE (mem) == MEM);
	MEM_READONLY_P (mem) = 1;
	MEM_NOTRAP_P (mem) = 1;
	emit_insn (insn);
	return;
      }

    case SYMBOL_SMALL_TLSGD:
      {
	rtx_insn *insns;
	machine_mode mode = GET_MODE (dest);
	rtx result = gen_rtx_REG (mode, R0_REGNUM);

	/* The __tls_get_addr call returns in r0; collect the whole call
	   sequence so it can be wrapped in a libcall block.  */
	start_sequence ();
	if (TARGET_ILP32)
	  aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
	else
	  aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
	insns = get_insns ();
	end_sequence ();

	RTL_CONST_CALL_P (insns) = 1;
	emit_libcall_block (insns, dest, result, imm);
	return;
      }

    case SYMBOL_SMALL_TLSDESC:
      {
	machine_mode mode = GET_MODE (dest);
	rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
	rtx tp;

	gcc_assert (mode == Pmode || mode == ptr_mode);

	/* In ILP32, the got entry is always of SImode size.  Unlike
	   small GOT, the dest is fixed at reg 0.  */
	if (TARGET_ILP32)
	  emit_insn (gen_tlsdesc_small_si (imm));
	else
	  emit_insn (gen_tlsdesc_small_di (imm));
	tp = aarch64_load_tp (NULL);

	if (mode != Pmode)
	  tp = gen_lowpart (mode, tp);

	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_SMALL_TLSIE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the got entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memeory.
	   This is why we have to handle three different tlsie_small
	   patterns here (two patterns for ILP32).  */
	machine_mode mode = GET_MODE (dest);
	rtx tmp_reg = gen_reg_rtx (mode);
	rtx tp = aarch64_load_tp (NULL);

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      emit_insn (gen_tlsie_small_di (tmp_reg, imm));
	    else
	      {
		emit_insn (gen_tlsie_small_si (tmp_reg, imm));
		tp = gen_lowpart (mode, tp);
	      }
	  }
	else
	  {
	    gcc_assert (mode == Pmode);
	    emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
	  }

	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_TLSLE12:
    case SYMBOL_TLSLE24:
    case SYMBOL_TLSLE32:
    case SYMBOL_TLSLE48:
      {
	machine_mode mode = GET_MODE (dest);
	rtx tp = aarch64_load_tp (NULL);

	if (mode != Pmode)
	  tp = gen_lowpart (mode, tp);

	switch (type)
	  {
	  case SYMBOL_TLSLE12:
	    emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
		       (dest, tp, imm));
	    break;
	  case SYMBOL_TLSLE24:
	    emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
		       (dest, tp, imm));
	    break;
	  case SYMBOL_TLSLE32:
	    /* 32- and 48-bit offsets materialize the offset first, then
	       add the thread pointer.  */
	    emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
		       (dest, imm));
	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
		       (dest, dest, tp));
	    break;
	  case SYMBOL_TLSLE48:
	    emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
		       (dest, imm));
	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
		       (dest, dest, tp));
	    break;
	  default:
	    gcc_unreachable ();
	  }

	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_TINY_GOT:
      emit_insn (gen_ldr_got_tiny (dest, imm));
      return;

    case SYMBOL_TINY_TLSIE:
      {
	machine_mode mode = GET_MODE (dest);
	rtx tp = aarch64_load_tp (NULL);

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
	    else
	      {
		tp = gen_lowpart (mode, tp);
		emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
	      }
	  }
	else
	  {
	    gcc_assert (mode == Pmode);
	    emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
	  }

	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    default:
      gcc_unreachable ();
    }
}
/* Emit a move from SRC to DEST.  Assume that the move expanders can
   handle all moves if !can_create_pseudo_p ().  The distinction is
   important because, unlike emit_move_insn, the move expanders know
   how to force Pmode objects into the constant pool even when the
   constant pool address is not itself legitimate.  */
static rtx_insn *
aarch64_emit_move (rtx dest, rtx src)
{
  return (can_create_pseudo_p ()
	  ? emit_move_insn (dest, src)
	  : emit_move_insn_1 (dest, src));
}
/* Apply UNOPTAB to OP and store the result in DEST.  */
static void
aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
{
  /* expand_unop may pick a register other than DEST; copy back if so.  */
  rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}
/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */
static void
aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
{
  /* expand_binop may pick a register other than DEST; copy back if so.  */
  rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
			  OPTAB_DIRECT);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}
/* Split a 128-bit move operation into two 64-bit move operations,
   taking care to handle partial overlap of register to register
   copies.  Special cases are needed when moving between GP regs and
   FP regs.  SRC can be a register, constant or memory; DST a register
   or memory.  If either operand is memory it must not have any side
   effects.  */
void
aarch64_split_128bit_move (rtx dst, rtx src)
{
  rtx dst_lo, dst_hi;
  rtx src_lo, src_hi;

  machine_mode mode = GET_MODE (dst);

  gcc_assert (mode == TImode || mode == TFmode);
  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);

  if (REG_P (dst) && REG_P (src))
    {
      int src_regno = REGNO (src);
      int dst_regno = REGNO (dst);

      /* Handle FP <-> GP regs.  */
      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
	{
	  src_lo = gen_lowpart (word_mode, src);
	  src_hi = gen_highpart (word_mode, src);

	  emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
	  emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
	  return;
	}
      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
	{
	  dst_lo = gen_lowpart (word_mode, dst);
	  dst_hi = gen_highpart (word_mode, dst);

	  emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
	  emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
	  return;
	}
    }

  dst_lo = gen_lowpart (word_mode, dst);
  dst_hi = gen_highpart (word_mode, dst);
  src_lo = gen_lowpart (word_mode, src);
  /* gen_highpart_mode handles VOIDmode (constant) sources.  */
  src_hi = gen_highpart_mode (word_mode, mode, src);

  /* At most one pairing may overlap.  */
  if (reg_overlap_mentioned_p (dst_lo, src_hi))
    {
      aarch64_emit_move (dst_hi, src_hi);
      aarch64_emit_move (dst_lo, src_lo);
    }
  else
    {
      aarch64_emit_move (dst_lo, src_lo);
      aarch64_emit_move (dst_hi, src_hi);
    }
}
/* Return true if a 128-bit move from SRC to DST needs to be split into
   two 64-bit moves; an FP-reg to FP-reg copy can stay whole.  */
bool
aarch64_split_128bit_move_p (rtx dst, rtx src)
{
  return (! REG_P (src)
	  || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
}
/* Split a complex SIMD combine.  */
void
aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
{
  machine_mode src_mode = GET_MODE (src1);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));
  gcc_assert (register_operand (dst, dst_mode)
	      && register_operand (src1, src_mode)
	      && register_operand (src2, src_mode));

  /* Concatenate SRC1 and SRC2 into the double-width DST.  */
  emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
  return;
}
/* Split a complex SIMD move.  */
void
aarch64_split_simd_move (rtx dst, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  /* Only register-to-register moves are split here.  */
  if (REG_P (dst) && REG_P (src))
    {
      gcc_assert (VECTOR_MODE_P (src_mode));
      emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
    }
}
/* Return true if constant X equals the constant Y zero-extended from
   YMODE to XMODE.  */
bool
aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
			      machine_mode ymode, rtx y)
{
  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
  /* Y must be a constant that the simplifier can zero-extend.  */
  gcc_assert (r != NULL);
  return rtx_equal_p (x, r);
}
/* Return TARGET if it is nonnull and a register of mode MODE.
   Otherwise, return a fresh register of mode MODE if we can,
   or TARGET reinterpreted as MODE if we can't.  */
static rtx
aarch64_target_reg (rtx target, machine_mode mode)
{
  if (target && REG_P (target) && GET_MODE (target) == mode)
    return target;
  if (!can_create_pseudo_p ())
    {
      /* No pseudos after reload; TARGET must exist so we can reuse it.  */
      gcc_assert (target);
      return gen_lowpart (mode, target);
    }
  return gen_reg_rtx (mode);
}
/* Return a register that contains the constant in BUILDER, given that
   the constant is a legitimate move operand.  Use TARGET as the register
   if it is nonnull and convenient.  */
static rtx
aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
{
  rtx src = builder.build ();
  target = aarch64_target_reg (target, GET_MODE (src));
  emit_insn (gen_rtx_SET (target, src));
  return target;
}
/* Force VALUE into a register of mode MODE and return that register.
   If pseudos are not available, use X as scratch instead.  */
static rtx
aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
{
  if (can_create_pseudo_p ())
    return force_reg (mode, value);
  else
    {
      gcc_assert (x);
      aarch64_emit_move (x, value);
      return x;
    }
}
/* Return true if predicate value X is a constant in which every element
   is a CONST_INT.  When returning true, describe X in BUILDER as a VNx16BI
   value, i.e. as a predicate in which all bits are significant.  */
static bool
aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
{
  if (GET_CODE (x) != CONST_VECTOR)
    return false;

  /* FACTOR is the number of VNx16BI bits that correspond to one element
     of X's mode.  */
  unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
					     GET_MODE_NUNITS (GET_MODE (x)));
  unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
  unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
  builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);

  unsigned int nelts = const_vector_encoded_nelts (x);
  for (unsigned int i = 0; i < nelts; ++i)
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
      if (!CONST_INT_P (elt))
	return false;

      /* Each source element becomes one significant bit followed by
	 FACTOR - 1 zero bits.  */
      builder.quick_push (elt);
      for (unsigned int j = 1; j < factor; ++j)
	builder.quick_push (const0_rtx);
    }
  builder.finalize ();
  return true;
}
/* BUILDER contains a predicate constant of mode VNx16BI.  Return the
   widest predicate element size it can have (that is, the largest size
   for which each element would still be 0 or 1).  */
static unsigned int
aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
{
  /* Start with the most optimistic assumption: that we only need
     one bit per pattern.  This is what we will use if only the first
     bit in each pattern is ever set.  */
  unsigned int mask = GET_MODE_SIZE (DImode);
  mask |= builder.npatterns ();

  /* Look for set bits.  */
  unsigned int nelts = builder.encoded_nelts ();
  for (unsigned int i = 1; i < nelts; ++i)
    if (INTVAL (builder.elt (i)) != 0)
      {
	/* NOTE(review): this loop body was dropped by the extraction and
	   is reconstructed — confirm against upstream.  A set bit at an
	   odd index forces byte granularity; otherwise the index bounds
	   the element size.  */
	if (i & 1)
	  return 1;
	mask |= i;
      }
  /* The lowest set bit of MASK is the widest usable element size.  */
  return mask & -mask;
}
/* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
   return that predicate mode, otherwise return opt_machine_mode ().  */
opt_machine_mode
aarch64_ptrue_all_mode (rtx x)
{
  gcc_assert (GET_MODE (x) == VNx16BImode);
  /* A canonical PTRUE is a duplicated pattern whose first encoded
     element is a nonzero CONST_INT.  */
  if (GET_CODE (x) != CONST_VECTOR
      || !CONST_VECTOR_DUPLICATE_P (x)
      || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
      || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
    return opt_machine_mode ();

  /* All remaining encoded elements must be zero (the padding bits of
     each predicate element).  */
  unsigned int nelts = const_vector_encoded_nelts (x);
  for (unsigned int i = 1; i < nelts; ++i)
    if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
      return opt_machine_mode ();

  return aarch64_sve_pred_mode (nelts);
}
/* BUILDER is a predicate constant of mode VNx16BI.  Consider the value
   that the constant would have with predicate element size ELT_SIZE
   (ignoring the upper bits in each element) and return:

   * -1 if all bits are set
   * N if the predicate has N leading set bits followed by all clear bits
   * 0 if the predicate does not have any of these forms.  */
static int
aarch64_partial_ptrue_length (rtx_vector_builder &builder,
			      unsigned int elt_size)
{
  /* If nelts_per_pattern is 3, we have set bits followed by clear bits
     followed by set bits.  */
  if (builder.nelts_per_pattern () == 3)
    return 0;

  /* Skip over leading set bits.  */
  unsigned int nelts = builder.encoded_nelts ();
  unsigned int i = 0;
  for (; i < nelts; i += elt_size)
    if (INTVAL (builder.elt (i)) == 0)
      break;
  unsigned int vl = i / elt_size;

  /* Check for the all-true case.  */
  if (i == nelts)
    return -1;

  /* If nelts_per_pattern is 1, then either VL is zero, or we have a
     repeating pattern of set bits followed by clear bits.  */
  if (builder.nelts_per_pattern () != 2)
    return vl;

  /* We have a "foreground" value and a duplicated "background" value.
     If the background might repeat and the last set bit belongs to it,
     we might have set bits followed by clear bits followed by set bits.  */
  if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
    return 0;

  /* Make sure that the rest are all clear.  */
  for (; i < nelts; i += elt_size)
    if (INTVAL (builder.elt (i)) != 0)
      return 0;

  return vl;
}
/* See if there is an svpattern that encodes an SVE predicate of mode
   PRED_MODE in which the first VL bits are set and the rest are clear.
   Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
   A VL of -1 indicates an all-true vector.  */
static aarch64_svpattern
aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
{
  if (vl < 0)
    return AARCH64_SV_ALL;

  /* Cannot request more elements than the mode can hold.  */
  if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
    return AARCH64_NUM_SVPATTERNS;

  /* VL1..VL8 encode the lengths 1-8 directly.  */
  if (vl >= 1 && vl <= 8)
    return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));

  /* VL16..VL256 encode the powers of two 16-256.  */
  if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
    return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));

  int max_vl;
  if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
    {
      if (vl == (max_vl / 3) * 3)
	return AARCH64_SV_MUL3;
      /* These would only trigger for non-power-of-2 lengths.  */
      if (vl == (max_vl & -4))
	return AARCH64_SV_MUL4;
      if (vl == (1 << floor_log2 (max_vl)))
	return AARCH64_SV_POW2;
      if (vl == max_vl)
	return AARCH64_SV_ALL;
    }
  return AARCH64_NUM_SVPATTERNS;
}
/* Return a VNx16BImode constant in which every sequence of ELT_SIZE
   bits has the lowest bit set and the upper bits clear.  This is the
   VNx16BImode equivalent of a PTRUE for controlling elements of
   ELT_SIZE bytes.  However, because the constant is VNx16BImode,
   all bits are significant, even the upper zeros.  */
rtx
aarch64_ptrue_all (unsigned int elt_size)
{
  /* One pattern of ELT_SIZE bits: a single 1 followed by zeros.  */
  rtx_vector_builder builder (VNx16BImode, elt_size, 1);
  builder.quick_push (const1_rtx);
  for (unsigned int i = 1; i < elt_size; ++i)
    builder.quick_push (const0_rtx);
  return builder.build ();
}
3140 /* Return an all-true predicate register of mode MODE. */
3143 aarch64_ptrue_reg (machine_mode mode
)
3145 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
);
3146 rtx reg
= force_reg (VNx16BImode
, CONSTM1_RTX (VNx16BImode
));
3147 return gen_lowpart (mode
, reg
);
3150 /* Return an all-false predicate register of mode MODE. */
3153 aarch64_pfalse_reg (machine_mode mode
)
3155 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
);
3156 rtx reg
= force_reg (VNx16BImode
, CONST0_RTX (VNx16BImode
));
3157 return gen_lowpart (mode
, reg
);
/* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
   true, or alternatively if we know that the operation predicated by
   PRED1[0] is safe to perform whenever PRED2 is true.  PRED1[1] is a
   aarch64_sve_gp_strictness operand that describes the operation
   predicated by PRED1[0].  */
bool
aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
{
  machine_mode mode = GET_MODE (pred2);
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
	      && mode == GET_MODE (pred1[0])
	      && aarch64_sve_gp_strictness (pred1[1], SImode));
  /* Dominates if PRED1[0] is all-true, if the governing predicate is
     only a relaxed hint, or if the two predicates are identical.  */
  return (pred1[0] == CONSTM1_RTX (mode)
	  || INTVAL (pred1[1]) == SVE_RELAXED_GP
	  || rtx_equal_p (pred1[0], pred2));
}
/* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
   for it.  PRED2[0] is the predicate for the instruction whose result
   is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
   for it.  Return true if we can prove that the two predicates are
   equivalent for PTEST purposes; that is, if we can replace PRED2[0]
   with PRED1[0] without changing behavior.  */
bool
aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
{
  machine_mode mode = GET_MODE (pred1[0]);
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
	      && mode == GET_MODE (pred2[0])
	      && aarch64_sve_ptrue_flag (pred1[1], SImode)
	      && aarch64_sve_ptrue_flag (pred2[1], SImode));

  /* A predicate counts as all-true either structurally or via its
     known-PTRUE flag.  */
  bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
		   || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
  bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
		   || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
  return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
}
/* Emit a comparison CMP between OP1 and OP2, both of which have mode
   DATA_MODE, and return the result in a predicate of mode PRED_MODE.
   Use TARGET as the target register if nonnull and convenient.  */
static rtx
aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
			  machine_mode data_mode, rtx op1, rtx op2)
{
  insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
  expand_operand ops[5];
  create_output_operand (&ops[0], target, pred_mode);
  /* Govern the comparison with an all-true predicate, flagged as a
     known PTRUE.  */
  create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
  create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
  create_input_operand (&ops[3], op1, data_mode);
  create_input_operand (&ops[4], op2, data_mode);
  expand_insn (icode, 5, ops);
  return ops[0].value;
}
/* Use a comparison to convert integer vector SRC into MODE, which is
   the corresponding SVE predicate mode.  Use TARGET for the result
   if it's nonnull and convenient.  */
rtx
aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  /* Nonzero data lanes become set predicate bits (SRC != 0).  */
  return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
				   src, CONST0_RTX (src_mode));
}
/* Return the assembly token for svprfop value PRFOP.  */
static const char *
svprfop_token (enum aarch64_svprfop prfop)
{
  switch (prfop)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPRFOP (CASE)
#undef CASE
    case AARCH64_NUM_SVPRFOPS:
      break;
    }
  gcc_unreachable ();
}
/* Return the assembly string for an SVE prefetch operation with
   mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
   and that SUFFIX is the format for the remaining operands.  */
static const char *
aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
			     const char *suffix)
{
  /* Static buffer: the returned string is only live until the next call.  */
  static char buffer[128];
  aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
  unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
				   mnemonic, svprfop_token (prfop), suffix);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}
/* Check whether we can calculate the number of elements in PATTERN
   at compile time, given that there are NELTS_PER_VQ elements per
   128-bit block.  Return the value if so, otherwise return -1.  */
HOST_WIDE_INT
aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
{
  unsigned int vl, const_vg;
  if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
    vl = 1 + (pattern - AARCH64_SV_VL1);
  else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
    vl = 16 << (pattern - AARCH64_SV_VL16);
  else if (aarch64_sve_vg.is_constant (&const_vg))
    {
      /* There are two vector granules per quadword.  */
      unsigned int nelts = (const_vg / 2) * nelts_per_vq;
      switch (pattern)
	{
	case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
	case AARCH64_SV_MUL4: return nelts & -4;
	case AARCH64_SV_MUL3: return (nelts / 3) * 3;
	case AARCH64_SV_ALL: return nelts;
	default: gcc_unreachable ();
	}
    }
  else
    /* Vector length unknown at compile time.  */
    return -1;

  /* There are two vector granules per quadword.  */
  poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
  if (known_le (vl, nelts_all))
    return vl;

  /* Requesting more elements than are available results in a PFALSE.  */
  if (known_gt (vl, nelts_all))
    return 0;

  return -1;
}
3304 /* Return true if we can move VALUE into a register using a single
3305 CNT[BHWD] instruction. */
3308 aarch64_sve_cnt_immediate_p (poly_int64 value
)
3310 HOST_WIDE_INT factor
= value
.coeffs
[0];
3311 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3312 return (value
.coeffs
[1] == factor
3313 && IN_RANGE (factor
, 2, 16 * 16)
3314 && (factor
& 1) == 0
3315 && factor
<= 16 * (factor
& -factor
));
/* Likewise for rtx X.  */
bool
aarch64_sve_cnt_immediate_p (rtx x)
{
  poly_int64 value;
  return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
}
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  PATTERN is the pattern to use.  FACTOR is the
   number of quadwords.  NELTS_PER_VQ, if nonzero, is the number of elements
   in each quadword.  If it is zero, we can use any element size.  */
static char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  aarch64_svpattern pattern,
				  unsigned int factor,
				  unsigned int nelts_per_vq)
{
  static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];

  if (nelts_per_vq == 0)
    /* There is some overlap in the ranges of the four CNT instructions.
       Here we always use the smallest possible element size, so that the
       multiplier is 1 whereever possible.  */
    nelts_per_vq = factor & -factor;
  int shift = std::min (exact_log2 (nelts_per_vq), 4);
  gcc_assert (IN_RANGE (shift, 1, 4));
  /* Map element counts 2/4/8/16 per quadword to d/w/h/b suffixes.  */
  char suffix = "dwhb"[shift - 1];

  /* Scale the quadword factor down to the chosen element size.  */
  factor >>= shift;
  unsigned int written;
  if (pattern == AARCH64_SV_ALL && factor == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
			prefix, suffix, operands);
  else if (factor == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
			prefix, suffix, operands, svpattern_token (pattern));
  else
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
			prefix, suffix, operands, svpattern_token (pattern),
			factor);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  X is the value of the vector size operand,
   as a polynomial integer rtx; we need to convert this into an "all"
   pattern with a multiplier.  */
char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  rtx x)
{
  poly_int64 value = rtx_to_poly_int64 (x);
  gcc_assert (aarch64_sve_cnt_immediate_p (value));
  return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
					   value.coeffs[1], 0);
}
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  CNT_PAT[0..2] are the operands of the
   UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details.  */
char *
aarch64_output_sve_cnt_pat_immediate (const char *prefix,
				      const char *operands, rtx *cnt_pat)
{
  aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
  unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
  /* FACTOR is expressed in quadwords for aarch64_output_sve_cnt_immediate.  */
  unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
  return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
					   factor, nelts_per_vq);
}
/* Return true if we can add X using a single SVE INC or DEC instruction.  */
bool
aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
{
  poly_int64 value;
  /* INC handles positive counts, DEC negative ones.  */
  return (poly_int_rtx_p (x, &value)
	  && (aarch64_sve_cnt_immediate_p (value)
	      || aarch64_sve_cnt_immediate_p (-value)));
}
/* Return the asm string for adding SVE INC/DEC immediate OFFSET to
   register operand 0.  */
char *
aarch64_output_sve_scalar_inc_dec (rtx offset)
{
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  /* OFFSET must be a pure multiple of VG.  */
  gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
  if (offset_value.coeffs[1] > 0)
    return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
					     offset_value.coeffs[1], 0);
  else
    return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
					     -offset_value.coeffs[1], 0);
}
/* Return true if we can add VALUE to a register using a single ADDVL
   or ADDPL instruction.  */
bool
aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  if (factor == 0 || value.coeffs[1] != factor)
    return false;
  /* FACTOR counts VG / 2, so a value of 2 is one predicate width
     and a value of 16 is one vector width.  */
  return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
	  || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
}
/* Likewise for rtx X.  */
bool
aarch64_sve_addvl_addpl_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && aarch64_sve_addvl_addpl_immediate_p (value));
}
/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
   to operand 1 and storing the result in operand 0.  */
char *
aarch64_output_sve_addvl_addpl (rtx offset)
{
  static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));

  int factor = offset_value.coeffs[1];
  /* Prefer ADDVL (vector-length units, factor of 16) when possible,
     otherwise fall back to ADDPL (predicate-length units, factor of 2).  */
  if ((factor & 15) == 0)
    snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
  else
    snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
  return buffer;
}
/* Return true if X is a valid immediate for an SVE vector INC or DEC
   instruction.  If it is, store the number of elements in each vector
   quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
   factor in *FACTOR_OUT (if nonnull).  */
bool
aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
					unsigned int *nelts_per_vq_out)
{
  rtx elt;
  poly_int64 value;

  if (!const_vec_duplicate_p (x, &elt)
      || !poly_int_rtx_p (elt, &value))
    return false;

  unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
  if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
    /* There's no vector INCB.  */
    return false;

  HOST_WIDE_INT factor = value.coeffs[0];
  if (value.coeffs[1] != factor)
    return false;

  /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
  if ((factor % nelts_per_vq) != 0
      || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
    return false;

  if (factor_out)
    *factor_out = factor;
  if (nelts_per_vq_out)
    *nelts_per_vq_out = nelts_per_vq;
  return true;
}
3511 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3515 aarch64_sve_vector_inc_dec_immediate_p (rtx x
)
3517 return aarch64_sve_vector_inc_dec_immediate_p (x
, NULL
, NULL
);
3520 /* Return the asm template for an SVE vector INC or DEC instruction.
3521 OPERANDS gives the operands before the vector count and X is the
3522 value of the vector count operand itself. */
3525 aarch64_output_sve_vector_inc_dec (const char *operands
, rtx x
)
3528 unsigned int nelts_per_vq
;
3529 if (!aarch64_sve_vector_inc_dec_immediate_p (x
, &factor
, &nelts_per_vq
))
3532 return aarch64_output_sve_cnt_immediate ("dec", operands
, AARCH64_SV_ALL
,
3533 -factor
, nelts_per_vq
);
3535 return aarch64_output_sve_cnt_immediate ("inc", operands
, AARCH64_SV_ALL
,
3536 factor
, nelts_per_vq
);
3540 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
3541 scalar_int_mode mode
)
3544 unsigned HOST_WIDE_INT val
, val2
, mask
;
3545 int one_match
, zero_match
;
3550 if (aarch64_move_imm (val
, mode
))
3553 emit_insn (gen_rtx_SET (dest
, imm
));
3557 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3558 (with XXXX non-zero). In that case check to see if the move can be done in
3560 val2
= val
& 0xffffffff;
3562 && aarch64_move_imm (val2
, SImode
)
3563 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
3566 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
3568 /* Check if we have to emit a second instruction by checking to see
3569 if any of the upper 32 bits of the original DI mode value is set. */
3573 i
= (val
>> 48) ? 48 : 32;
3576 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3577 GEN_INT ((val
>> i
) & 0xffff)));
3582 if ((val
>> 32) == 0 || mode
== SImode
)
3586 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
3588 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
3589 GEN_INT ((val
>> 16) & 0xffff)));
3591 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
3592 GEN_INT ((val
>> 16) & 0xffff)));
3597 /* Remaining cases are all for DImode. */
3600 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
3601 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
3602 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
3603 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
3605 if (zero_match
!= 2 && one_match
!= 2)
3607 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3608 For a 64-bit bitmask try whether changing 16 bits to all ones or
3609 zeroes creates a valid bitmask. To check any repeated bitmask,
3610 try using 16 bits from the other 32-bit half of val. */
3612 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
3615 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3618 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3620 val2
= val2
& ~mask
;
3621 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
3622 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3629 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
3630 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3631 GEN_INT ((val
>> i
) & 0xffff)));
3637 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3638 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3639 otherwise skip zero bits. */
3643 val2
= one_match
> zero_match
? ~val
: val
;
3644 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
3647 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
3648 ? (val
| ~(mask
<< i
))
3649 : (val
& (mask
<< i
)))));
3650 for (i
+= 16; i
< 64; i
+= 16)
3652 if ((val2
& (mask
<< i
)) == 0)
3655 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3656 GEN_INT ((val
>> i
) & 0xffff)));
3663 /* Return whether imm is a 128-bit immediate which is simple enough to
3666 aarch64_mov128_immediate (rtx imm
)
3668 if (GET_CODE (imm
) == CONST_INT
)
3671 gcc_assert (CONST_WIDE_INT_NUNITS (imm
) == 2);
3673 rtx lo
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 0));
3674 rtx hi
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 1));
3676 return aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, DImode
)
3677 + aarch64_internal_mov_immediate (NULL_RTX
, hi
, false, DImode
) <= 4;
3681 /* Return the number of temporary registers that aarch64_add_offset_1
3682 would need to add OFFSET to a register. */
3685 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset
)
3687 return abs_hwi (offset
) < 0x1000000 ? 0 : 1;
3690 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3691 a non-polynomial OFFSET. MODE is the mode of the addition.
3692 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3693 be set and CFA adjustments added to the generated instructions.
3695 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3696 temporary if register allocation is already complete. This temporary
3697 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3698 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3699 the immediate again.
3701 Since this function may be used to adjust the stack pointer, we must
3702 ensure that it cannot cause transient stack deallocation (for example
3703 by first incrementing SP and then decrementing when adjusting by a
3704 large immediate). */
3707 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
3708 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
3709 bool frame_related_p
, bool emit_move_imm
)
3711 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3712 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
3714 HOST_WIDE_INT moffset
= abs_hwi (offset
);
3719 if (!rtx_equal_p (dest
, src
))
3721 insn
= emit_insn (gen_rtx_SET (dest
, src
));
3722 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3727 /* Single instruction adjustment. */
3728 if (aarch64_uimm12_shift (moffset
))
3730 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
3731 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3735 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3738 a) the offset cannot be loaded by a 16-bit move or
3739 b) there is no spare register into which we can move it. */
3740 if (moffset
< 0x1000000
3741 && ((!temp1
&& !can_create_pseudo_p ())
3742 || !aarch64_move_imm (moffset
, mode
)))
3744 HOST_WIDE_INT low_off
= moffset
& 0xfff;
3746 low_off
= offset
< 0 ? -low_off
: low_off
;
3747 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
3748 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3749 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
3750 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3754 /* Emit a move immediate if required and an addition/subtraction. */
3757 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
3758 temp1
= aarch64_force_temporary (mode
, temp1
, GEN_INT (moffset
));
3760 insn
= emit_insn (offset
< 0
3761 ? gen_sub3_insn (dest
, src
, temp1
)
3762 : gen_add3_insn (dest
, src
, temp1
));
3763 if (frame_related_p
)
3765 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3766 rtx adj
= plus_constant (mode
, src
, offset
);
3767 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
3771 /* Return the number of temporary registers that aarch64_add_offset
3772 would need to move OFFSET into a register or add OFFSET to a register;
3773 ADD_P is true if we want the latter rather than the former. */
3776 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
3778 /* This follows the same structure as aarch64_add_offset. */
3779 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
3782 unsigned int count
= 0;
3783 HOST_WIDE_INT factor
= offset
.coeffs
[1];
3784 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
3785 poly_int64
poly_offset (factor
, factor
);
3786 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
3787 /* Need one register for the ADDVL/ADDPL result. */
3789 else if (factor
!= 0)
3791 factor
= abs (factor
);
3792 if (factor
> 16 * (factor
& -factor
))
3793 /* Need one register for the CNT result and one for the multiplication
3794 factor. If necessary, the second temporary can be reused for the
3795 constant part of the offset. */
3797 /* Need one register for the CNT result (which might then
3801 return count
+ aarch64_add_offset_1_temporaries (constant
);
3804 /* If X can be represented as a poly_int64, return the number
3805 of temporaries that are required to add it to a register.
3806 Return -1 otherwise. */
3809 aarch64_add_offset_temporaries (rtx x
)
3812 if (!poly_int_rtx_p (x
, &offset
))
3814 return aarch64_offset_temporaries (true, offset
);
3817 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3818 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3819 be set and CFA adjustments added to the generated instructions.
3821 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3822 temporary if register allocation is already complete. This temporary
3823 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3824 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3825 false to avoid emitting the immediate again.
3827 TEMP2, if nonnull, is a second temporary register that doesn't
3828 overlap either DEST or REG.
3830 Since this function may be used to adjust the stack pointer, we must
3831 ensure that it cannot cause transient stack deallocation (for example
3832 by first incrementing SP and then decrementing when adjusting by a
3833 large immediate). */
3836 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
3837 poly_int64 offset
, rtx temp1
, rtx temp2
,
3838 bool frame_related_p
, bool emit_move_imm
= true)
3840 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3841 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
3842 gcc_assert (temp1
== NULL_RTX
3844 || !reg_overlap_mentioned_p (temp1
, dest
));
3845 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
3847 /* Try using ADDVL or ADDPL to add the whole value. */
3848 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
3850 rtx offset_rtx
= gen_int_mode (offset
, mode
);
3851 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3852 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3856 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3857 SVE vector register, over and above the minimum size of 128 bits.
3858 This is equivalent to half the value returned by CNTD with a
3859 vector shape of ALL. */
3860 HOST_WIDE_INT factor
= offset
.coeffs
[1];
3861 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
3863 /* Try using ADDVL or ADDPL to add the VG-based part. */
3864 poly_int64
poly_offset (factor
, factor
);
3865 if (src
!= const0_rtx
3866 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
3868 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
3869 if (frame_related_p
)
3871 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3872 RTX_FRAME_RELATED_P (insn
) = true;
3877 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
3878 src
= aarch64_force_temporary (mode
, temp1
, addr
);
3883 /* Otherwise use a CNT-based sequence. */
3884 else if (factor
!= 0)
3886 /* Use a subtraction if we have a negative factor. */
3887 rtx_code code
= PLUS
;
3894 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3895 into the multiplication. */
3899 /* Use a right shift by 1. */
3903 HOST_WIDE_INT low_bit
= factor
& -factor
;
3904 if (factor
<= 16 * low_bit
)
3906 if (factor
> 16 * 8)
3908 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3909 the value with the minimum multiplier and shift it into
3911 int extra_shift
= exact_log2 (low_bit
);
3912 shift
+= extra_shift
;
3913 factor
>>= extra_shift
;
3915 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
3919 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3920 directly, since that should increase the chances of being
3921 able to use a shift and add sequence. If LOW_BIT itself
3922 is out of range, just use CNTD. */
3923 if (low_bit
<= 16 * 8)
3928 val
= gen_int_mode (poly_int64 (low_bit
* 2, low_bit
* 2), mode
);
3929 val
= aarch64_force_temporary (mode
, temp1
, val
);
3931 if (can_create_pseudo_p ())
3933 rtx coeff1
= gen_int_mode (factor
, mode
);
3934 val
= expand_mult (mode
, val
, coeff1
, NULL_RTX
, false, true);
3938 /* Go back to using a negative multiplication factor if we have
3939 no register from which to subtract. */
3940 if (code
== MINUS
&& src
== const0_rtx
)
3945 rtx coeff1
= gen_int_mode (factor
, mode
);
3946 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
3947 val
= gen_rtx_MULT (mode
, val
, coeff1
);
3953 /* Multiply by 1 << SHIFT. */
3954 val
= aarch64_force_temporary (mode
, temp1
, val
);
3955 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
3957 else if (shift
== -1)
3960 val
= aarch64_force_temporary (mode
, temp1
, val
);
3961 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
3964 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3965 if (src
!= const0_rtx
)
3967 val
= aarch64_force_temporary (mode
, temp1
, val
);
3968 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
3970 else if (code
== MINUS
)
3972 val
= aarch64_force_temporary (mode
, temp1
, val
);
3973 val
= gen_rtx_NEG (mode
, val
);
3976 if (constant
== 0 || frame_related_p
)
3978 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
3979 if (frame_related_p
)
3981 RTX_FRAME_RELATED_P (insn
) = true;
3982 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
3983 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
3992 src
= aarch64_force_temporary (mode
, temp1
, val
);
3997 emit_move_imm
= true;
4000 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
4001 frame_related_p
, emit_move_imm
);
4004 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4005 than a poly_int64. */
4008 aarch64_split_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
4009 rtx offset_rtx
, rtx temp1
, rtx temp2
)
4011 aarch64_add_offset (mode
, dest
, src
, rtx_to_poly_int64 (offset_rtx
),
4012 temp1
, temp2
, false);
4015 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4016 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
4017 if TEMP1 already contains abs (DELTA). */
4020 aarch64_add_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool emit_move_imm
)
4022 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, delta
,
4023 temp1
, temp2
, true, emit_move_imm
);
4026 /* Subtract DELTA from the stack pointer, marking the instructions
4027 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
4031 aarch64_sub_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool frame_related_p
,
4032 bool emit_move_imm
= true)
4034 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, -delta
,
4035 temp1
, temp2
, frame_related_p
, emit_move_imm
);
4038 /* Set DEST to (vec_series BASE STEP). */
4041 aarch64_expand_vec_series (rtx dest
, rtx base
, rtx step
)
4043 machine_mode mode
= GET_MODE (dest
);
4044 scalar_mode inner
= GET_MODE_INNER (mode
);
4046 /* Each operand can be a register or an immediate in the range [-16, 15]. */
4047 if (!aarch64_sve_index_immediate_p (base
))
4048 base
= force_reg (inner
, base
);
4049 if (!aarch64_sve_index_immediate_p (step
))
4050 step
= force_reg (inner
, step
);
4052 emit_set_insn (dest
, gen_rtx_VEC_SERIES (mode
, base
, step
));
4055 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4056 register of mode MODE. Use TARGET for the result if it's nonnull
4059 The two vector modes must have the same element mode. The behavior
4060 is to duplicate architectural lane N of SRC into architectural lanes
4061 N + I * STEP of the result. On big-endian targets, architectural
4062 lane 0 of an Advanced SIMD vector is the last element of the vector
4063 in memory layout, so for big-endian targets this operation has the
4064 effect of reversing SRC before duplicating it. Callers need to
4065 account for this. */
4068 aarch64_expand_sve_dupq (rtx target
, machine_mode mode
, rtx src
)
4070 machine_mode src_mode
= GET_MODE (src
);
4071 gcc_assert (GET_MODE_INNER (mode
) == GET_MODE_INNER (src_mode
));
4072 insn_code icode
= (BYTES_BIG_ENDIAN
4073 ? code_for_aarch64_vec_duplicate_vq_be (mode
)
4074 : code_for_aarch64_vec_duplicate_vq_le (mode
));
4077 expand_operand ops
[3];
4078 create_output_operand (&ops
[i
++], target
, mode
);
4079 create_output_operand (&ops
[i
++], src
, src_mode
);
4080 if (BYTES_BIG_ENDIAN
)
4082 /* Create a PARALLEL describing the reversal of SRC. */
4083 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (mode
);
4084 rtx sel
= aarch64_gen_stepped_int_parallel (nelts_per_vq
,
4085 nelts_per_vq
- 1, -1);
4086 create_fixed_operand (&ops
[i
++], sel
);
4088 expand_insn (icode
, i
, ops
);
4089 return ops
[0].value
;
4092 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4093 the memory image into DEST. Return true on success. */
4096 aarch64_expand_sve_ld1rq (rtx dest
, rtx src
)
4098 src
= force_const_mem (GET_MODE (src
), src
);
4102 /* Make sure that the address is legitimate. */
4103 if (!aarch64_sve_ld1rq_operand_p (src
))
4105 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
4106 src
= replace_equiv_address (src
, addr
);
4109 machine_mode mode
= GET_MODE (dest
);
4110 machine_mode pred_mode
= aarch64_sve_pred_mode (mode
);
4111 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
4112 emit_insn (gen_aarch64_sve_ld1rq (mode
, dest
, src
, ptrue
));
4116 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4117 SVE data mode and isn't a legitimate constant. Use TARGET for the
4118 result if convenient.
4120 The returned register can have whatever mode seems most natural
4121 given the contents of SRC. */
4124 aarch64_expand_sve_const_vector (rtx target
, rtx src
)
4126 machine_mode mode
= GET_MODE (src
);
4127 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
4128 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
4129 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
4130 unsigned int elt_bits
= GET_MODE_BITSIZE (elt_mode
);
4131 unsigned int container_bits
= aarch64_sve_container_bits (mode
);
4132 unsigned int encoded_bits
= npatterns
* nelts_per_pattern
* container_bits
;
4134 if (nelts_per_pattern
== 1
4135 && encoded_bits
<= 128
4136 && container_bits
!= elt_bits
)
4138 /* We have a partial vector mode and a constant whose full-vector
4139 equivalent would occupy a repeating 128-bit sequence. Build that
4140 full-vector equivalent instead, so that we have the option of
4141 using LD1RQ and Advanced SIMD operations. */
4142 unsigned int repeat
= container_bits
/ elt_bits
;
4143 machine_mode full_mode
= aarch64_full_sve_mode (elt_mode
).require ();
4144 rtx_vector_builder
builder (full_mode
, npatterns
* repeat
, 1);
4145 for (unsigned int i
= 0; i
< npatterns
; ++i
)
4146 for (unsigned int j
= 0; j
< repeat
; ++j
)
4147 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
));
4148 target
= aarch64_target_reg (target
, full_mode
);
4149 return aarch64_expand_sve_const_vector (target
, builder
.build ());
4152 if (nelts_per_pattern
== 1 && encoded_bits
== 128)
4154 /* The constant is a duplicated quadword but can't be narrowed
4155 beyond a quadword. Get the memory image of the first quadword
4156 as a 128-bit vector and try using LD1RQ to load it from memory.
4158 The effect for both endiannesses is to load memory lane N into
4159 architectural lanes N + I * STEP of the result. On big-endian
4160 targets, the layout of the 128-bit vector in an Advanced SIMD
4161 register would be different from its layout in an SVE register,
4162 but this 128-bit vector is a memory value only. */
4163 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
4164 rtx vq_value
= simplify_gen_subreg (vq_mode
, src
, mode
, 0);
4165 if (vq_value
&& aarch64_expand_sve_ld1rq (target
, vq_value
))
4169 if (nelts_per_pattern
== 1 && encoded_bits
< 128)
4171 /* The vector is a repeating sequence of 64 bits or fewer.
4172 See if we can load them using an Advanced SIMD move and then
4173 duplicate it to fill a vector. This is better than using a GPR
4174 move because it keeps everything in the same register file. */
4175 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
4176 rtx_vector_builder
builder (vq_mode
, npatterns
, 1);
4177 for (unsigned int i
= 0; i
< npatterns
; ++i
)
4179 /* We want memory lane N to go into architectural lane N,
4180 so reverse for big-endian targets. The DUP .Q pattern
4181 has a compensating reverse built-in. */
4182 unsigned int srci
= BYTES_BIG_ENDIAN
? npatterns
- i
- 1 : i
;
4183 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, srci
));
4185 rtx vq_src
= builder
.build ();
4186 if (aarch64_simd_valid_immediate (vq_src
, NULL
))
4188 vq_src
= force_reg (vq_mode
, vq_src
);
4189 return aarch64_expand_sve_dupq (target
, mode
, vq_src
);
4192 /* Get an integer representation of the repeating part of Advanced
4193 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4194 which for big-endian targets is lane-swapped wrt a normal
4195 Advanced SIMD vector. This means that for both endiannesses,
4196 memory lane N of SVE vector SRC corresponds to architectural
4197 lane N of a register holding VQ_SRC. This in turn means that
4198 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4199 as a single 128-bit value) and thus that memory lane 0 of SRC is
4200 in the lsb of the integer. Duplicating the integer therefore
4201 ensures that memory lane N of SRC goes into architectural lane
4202 N + I * INDEX of the SVE register. */
4203 scalar_mode int_mode
= int_mode_for_size (encoded_bits
, 0).require ();
4204 rtx elt_value
= simplify_gen_subreg (int_mode
, vq_src
, vq_mode
, 0);
4207 /* Pretend that we had a vector of INT_MODE to start with. */
4208 elt_mode
= int_mode
;
4209 mode
= aarch64_full_sve_mode (int_mode
).require ();
4211 /* If the integer can be moved into a general register by a
4212 single instruction, do that and duplicate the result. */
4213 if (CONST_INT_P (elt_value
)
4214 && aarch64_move_imm (INTVAL (elt_value
), elt_mode
))
4216 elt_value
= force_reg (elt_mode
, elt_value
);
4217 return expand_vector_broadcast (mode
, elt_value
);
4220 else if (npatterns
== 1)
4221 /* We're duplicating a single value, but can't do better than
4222 force it to memory and load from there. This handles things
4223 like symbolic constants. */
4224 elt_value
= CONST_VECTOR_ENCODED_ELT (src
, 0);
4228 /* Load the element from memory if we can, otherwise move it into
4229 a register and use a DUP. */
4230 rtx op
= force_const_mem (elt_mode
, elt_value
);
4232 op
= force_reg (elt_mode
, elt_value
);
4233 return expand_vector_broadcast (mode
, op
);
4237 /* Try using INDEX. */
4239 if (const_vec_series_p (src
, &base
, &step
))
4241 aarch64_expand_vec_series (target
, base
, step
);
4245 /* From here on, it's better to force the whole constant to memory
4247 if (GET_MODE_NUNITS (mode
).is_constant ())
4250 /* Expand each pattern individually. */
4251 gcc_assert (npatterns
> 1);
4252 rtx_vector_builder builder
;
4253 auto_vec
<rtx
, 16> vectors (npatterns
);
4254 for (unsigned int i
= 0; i
< npatterns
; ++i
)
4256 builder
.new_vector (mode
, 1, nelts_per_pattern
);
4257 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
4258 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
4259 vectors
.quick_push (force_reg (mode
, builder
.build ()));
4262 /* Use permutes to interleave the separate vectors. */
4263 while (npatterns
> 1)
4266 for (unsigned int i
= 0; i
< npatterns
; ++i
)
4268 rtx tmp
= (npatterns
== 1 ? target
: gen_reg_rtx (mode
));
4269 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
4270 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
4274 gcc_assert (vectors
[0] == target
);
4278 /* Use WHILE to set a predicate register of mode MODE in which the first
4279 VL bits are set and the rest are clear. Use TARGET for the register
4280 if it's nonnull and convenient. */
4283 aarch64_sve_move_pred_via_while (rtx target
, machine_mode mode
,
4286 rtx limit
= force_reg (DImode
, gen_int_mode (vl
, DImode
));
4287 target
= aarch64_target_reg (target
, mode
);
4288 emit_insn (gen_while (UNSPEC_WHILE_LO
, DImode
, mode
,
4289 target
, const0_rtx
, limit
));
4294 aarch64_expand_sve_const_pred_1 (rtx
, rtx_vector_builder
&, bool);
4296 /* BUILDER is a constant predicate in which the index of every set bit
4297 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4298 by inverting every element at a multiple of ELT_SIZE and EORing the
4299 result with an ELT_SIZE PTRUE.
4301 Return a register that contains the constant on success, otherwise
4302 return null. Use TARGET as the register if it is nonnull and
4306 aarch64_expand_sve_const_pred_eor (rtx target
, rtx_vector_builder
&builder
,
4307 unsigned int elt_size
)
4309 /* Invert every element at a multiple of ELT_SIZE, keeping the
4311 rtx_vector_builder
inv_builder (VNx16BImode
, builder
.npatterns (),
4312 builder
.nelts_per_pattern ());
4313 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
4314 if ((i
& (elt_size
- 1)) == 0 && INTVAL (builder
.elt (i
)) == 0)
4315 inv_builder
.quick_push (const1_rtx
);
4317 inv_builder
.quick_push (const0_rtx
);
4318 inv_builder
.finalize ();
4320 /* See if we can load the constant cheaply. */
4321 rtx inv
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, inv_builder
, false);
4325 /* EOR the result with an ELT_SIZE PTRUE. */
4326 rtx mask
= aarch64_ptrue_all (elt_size
);
4327 mask
= force_reg (VNx16BImode
, mask
);
4328 target
= aarch64_target_reg (target
, VNx16BImode
);
4329 emit_insn (gen_aarch64_pred_z (XOR
, VNx16BImode
, target
, mask
, inv
, mask
));
4333 /* BUILDER is a constant predicate in which the index of every set bit
4334 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4335 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4336 register on success, otherwise return null. Use TARGET as the register
4337 if nonnull and convenient. */
4340 aarch64_expand_sve_const_pred_trn (rtx target
, rtx_vector_builder
&builder
,
4341 unsigned int elt_size
,
4342 unsigned int permute_size
)
4344 /* We're going to split the constant into two new constants A and B,
4345 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4346 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4348 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4349 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4351 where _ indicates elements that will be discarded by the permute.
4353 First calculate the ELT_SIZEs for A and B. */
4354 unsigned int a_elt_size
= GET_MODE_SIZE (DImode
);
4355 unsigned int b_elt_size
= GET_MODE_SIZE (DImode
);
4356 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); i
+= elt_size
)
4357 if (INTVAL (builder
.elt (i
)) != 0)
4359 if (i
& permute_size
)
4360 b_elt_size
|= i
- permute_size
;
4364 a_elt_size
&= -a_elt_size
;
4365 b_elt_size
&= -b_elt_size
;
4367 /* Now construct the vectors themselves. */
4368 rtx_vector_builder
a_builder (VNx16BImode
, builder
.npatterns (),
4369 builder
.nelts_per_pattern ());
4370 rtx_vector_builder
b_builder (VNx16BImode
, builder
.npatterns (),
4371 builder
.nelts_per_pattern ());
4372 unsigned int nelts
= builder
.encoded_nelts ();
4373 for (unsigned int i
= 0; i
< nelts
; ++i
)
4374 if (i
& (elt_size
- 1))
4376 a_builder
.quick_push (const0_rtx
);
4377 b_builder
.quick_push (const0_rtx
);
4379 else if ((i
& permute_size
) == 0)
4381 /* The A and B elements are significant. */
4382 a_builder
.quick_push (builder
.elt (i
));
4383 b_builder
.quick_push (builder
.elt (i
+ permute_size
));
4387 /* The A and B elements are going to be discarded, so pick whatever
4388 is likely to give a nice constant. We are targeting element
4389 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
4390 with the aim of each being a sequence of ones followed by
4391 a sequence of zeros. So:
4393 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
4394 duplicate the last X_ELT_SIZE element, to extend the
4395 current sequence of ones or zeros.
4397 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
4398 zero, so that the constant really does have X_ELT_SIZE and
4399 not a smaller size. */
4400 if (a_elt_size
> permute_size
)
4401 a_builder
.quick_push (const0_rtx
);
4403 a_builder
.quick_push (a_builder
.elt (i
- a_elt_size
));
4404 if (b_elt_size
> permute_size
)
4405 b_builder
.quick_push (const0_rtx
);
4407 b_builder
.quick_push (b_builder
.elt (i
- b_elt_size
));
4409 a_builder
.finalize ();
4410 b_builder
.finalize ();
4412 /* Try loading A into a register. */
4413 rtx_insn
*last
= get_last_insn ();
4414 rtx a
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, a_builder
, false);
4418 /* Try loading B into a register. */
4420 if (a_builder
!= b_builder
)
4422 b
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, b_builder
, false);
4425 delete_insns_since (last
);
4430 /* Emit the TRN1 itself. */
4431 machine_mode mode
= aarch64_sve_pred_mode (permute_size
).require ();
4432 target
= aarch64_target_reg (target
, mode
);
4433 emit_insn (gen_aarch64_sve (UNSPEC_TRN1
, mode
, target
,
4434 gen_lowpart (mode
, a
),
4435 gen_lowpart (mode
, b
)));
4439 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4440 constant in BUILDER into an SVE predicate register. Return the register
4441 on success, otherwise return null. Use TARGET for the register if
4442 nonnull and convenient.
4444 ALLOW_RECURSE_P is true if we can use methods that would call this
4445 function recursively. */
4448 aarch64_expand_sve_const_pred_1 (rtx target
, rtx_vector_builder
&builder
,
4449 bool allow_recurse_p
)
4451 if (builder
.encoded_nelts () == 1)
4452 /* A PFALSE or a PTRUE .B ALL. */
4453 return aarch64_emit_set_immediate (target
, builder
);
4455 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
4456 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
4458 /* If we can load the constant using PTRUE, use it as-is. */
4459 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
4460 if (aarch64_svpattern_for_vl (mode
, vl
) != AARCH64_NUM_SVPATTERNS
)
4461 return aarch64_emit_set_immediate (target
, builder
);
4463 /* Otherwise use WHILE to set the first VL bits. */
4464 return aarch64_sve_move_pred_via_while (target
, mode
, vl
);
4467 if (!allow_recurse_p
)
4470 /* Try inverting the vector in element size ELT_SIZE and then EORing
4471 the result with an ELT_SIZE PTRUE. */
4472 if (INTVAL (builder
.elt (0)) == 0)
4473 if (rtx res
= aarch64_expand_sve_const_pred_eor (target
, builder
,
4477 /* Try using TRN1 to permute two simpler constants. */
4478 for (unsigned int i
= elt_size
; i
<= 8; i
*= 2)
4479 if (rtx res
= aarch64_expand_sve_const_pred_trn (target
, builder
,
4486 /* Return an SVE predicate register that contains the VNx16BImode
4487 constant in BUILDER, without going through the move expanders.
4489 The returned register can have whatever mode seems most natural
4490 given the contents of BUILDER. Use TARGET for the result if
4494 aarch64_expand_sve_const_pred (rtx target
, rtx_vector_builder
&builder
)
4496 /* Try loading the constant using pure predicate operations. */
4497 if (rtx res
= aarch64_expand_sve_const_pred_1 (target
, builder
, true))
4500 /* Try forcing the constant to memory. */
4501 if (builder
.full_nelts ().is_constant ())
4502 if (rtx mem
= force_const_mem (VNx16BImode
, builder
.build ()))
4504 target
= aarch64_target_reg (target
, VNx16BImode
);
4505 emit_move_insn (target
, mem
);
4509 /* The last resort is to load the constant as an integer and then
4510 compare it against zero. Use -1 for set bits in order to increase
4511 the changes of using SVE DUPM or an Advanced SIMD byte mask. */
4512 rtx_vector_builder
int_builder (VNx16QImode
, builder
.npatterns (),
4513 builder
.nelts_per_pattern ());
4514 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
4515 int_builder
.quick_push (INTVAL (builder
.elt (i
))
4516 ? constm1_rtx
: const0_rtx
);
4517 return aarch64_convert_sve_data_to_pred (target
, VNx16BImode
,
4518 int_builder
.build ());
4521 /* Set DEST to immediate IMM. */
4524 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
4526 machine_mode mode
= GET_MODE (dest
);
4528 /* Check on what type of symbol it is. */
4529 scalar_int_mode int_mode
;
4530 if ((GET_CODE (imm
) == SYMBOL_REF
4531 || GET_CODE (imm
) == LABEL_REF
4532 || GET_CODE (imm
) == CONST
4533 || GET_CODE (imm
) == CONST_POLY_INT
)
4534 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
4538 HOST_WIDE_INT const_offset
;
4539 enum aarch64_symbol_type sty
;
4541 /* If we have (const (plus symbol offset)), separate out the offset
4542 before we start classifying the symbol. */
4543 rtx base
= strip_offset (imm
, &offset
);
4545 /* We must always add an offset involving VL separately, rather than
4546 folding it into the relocation. */
4547 if (!offset
.is_constant (&const_offset
))
4551 aarch64_report_sve_required ();
4554 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
4555 emit_insn (gen_rtx_SET (dest
, imm
));
4558 /* Do arithmetic on 32-bit values if the result is smaller
4560 if (partial_subreg_p (int_mode
, SImode
))
4562 /* It is invalid to do symbol calculations in modes
4563 narrower than SImode. */
4564 gcc_assert (base
== const0_rtx
);
4565 dest
= gen_lowpart (SImode
, dest
);
4568 if (base
!= const0_rtx
)
4570 base
= aarch64_force_temporary (int_mode
, dest
, base
);
4571 aarch64_add_offset (int_mode
, dest
, base
, offset
,
4572 NULL_RTX
, NULL_RTX
, false);
4575 aarch64_add_offset (int_mode
, dest
, base
, offset
,
4576 dest
, NULL_RTX
, false);
4581 sty
= aarch64_classify_symbol (base
, const_offset
);
4584 case SYMBOL_FORCE_TO_MEM
:
4585 if (const_offset
!= 0
4586 && targetm
.cannot_force_const_mem (int_mode
, imm
))
4588 gcc_assert (can_create_pseudo_p ());
4589 base
= aarch64_force_temporary (int_mode
, dest
, base
);
4590 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
4591 NULL_RTX
, NULL_RTX
, false);
4595 mem
= force_const_mem (ptr_mode
, imm
);
4598 /* If we aren't generating PC relative literals, then
4599 we need to expand the literal pool access carefully.
4600 This is something that needs to be done in a number
4601 of places, so could well live as a separate function. */
4602 if (!aarch64_pcrelative_literal_loads
)
4604 gcc_assert (can_create_pseudo_p ());
4605 base
= gen_reg_rtx (ptr_mode
);
4606 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
4607 if (ptr_mode
!= Pmode
)
4608 base
= convert_memory_address (Pmode
, base
);
4609 mem
= gen_rtx_MEM (ptr_mode
, base
);
4612 if (int_mode
!= ptr_mode
)
4613 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
4615 emit_insn (gen_rtx_SET (dest
, mem
));
4619 case SYMBOL_SMALL_TLSGD
:
4620 case SYMBOL_SMALL_TLSDESC
:
4621 case SYMBOL_SMALL_TLSIE
:
4622 case SYMBOL_SMALL_GOT_28K
:
4623 case SYMBOL_SMALL_GOT_4G
:
4624 case SYMBOL_TINY_GOT
:
4625 case SYMBOL_TINY_TLSIE
:
4626 if (const_offset
!= 0)
4628 gcc_assert(can_create_pseudo_p ());
4629 base
= aarch64_force_temporary (int_mode
, dest
, base
);
4630 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
4631 NULL_RTX
, NULL_RTX
, false);
4636 case SYMBOL_SMALL_ABSOLUTE
:
4637 case SYMBOL_TINY_ABSOLUTE
:
4638 case SYMBOL_TLSLE12
:
4639 case SYMBOL_TLSLE24
:
4640 case SYMBOL_TLSLE32
:
4641 case SYMBOL_TLSLE48
:
4642 aarch64_load_symref_appropriately (dest
, imm
, sty
);
4650 if (!CONST_INT_P (imm
))
4652 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
)
4654 /* Only the low bit of each .H, .S and .D element is defined,
4655 so we can set the upper bits to whatever we like. If the
4656 predicate is all-true in MODE, prefer to set all the undefined
4657 bits as well, so that we can share a single .B predicate for
4659 if (imm
== CONSTM1_RTX (mode
))
4660 imm
= CONSTM1_RTX (VNx16BImode
);
4662 /* All methods for constructing predicate modes wider than VNx16BI
4663 will set the upper bits of each element to zero. Expose this
4664 by moving such constants as a VNx16BI, so that all bits are
4665 significant and so that constants for different modes can be
4666 shared. The wider constant will still be available as a
4668 rtx_vector_builder builder
;
4669 if (aarch64_get_sve_pred_bits (builder
, imm
))
4671 rtx res
= aarch64_expand_sve_const_pred (dest
, builder
);
4673 emit_move_insn (dest
, gen_lowpart (mode
, res
));
4678 if (GET_CODE (imm
) == HIGH
4679 || aarch64_simd_valid_immediate (imm
, NULL
))
4681 emit_insn (gen_rtx_SET (dest
, imm
));
4685 if (GET_CODE (imm
) == CONST_VECTOR
&& aarch64_sve_data_mode_p (mode
))
4686 if (rtx res
= aarch64_expand_sve_const_vector (dest
, imm
))
4689 emit_insn (gen_aarch64_sve_reinterpret (mode
, dest
, res
));
4693 rtx mem
= force_const_mem (mode
, imm
);
4695 emit_move_insn (dest
, mem
);
4699 aarch64_internal_mov_immediate (dest
, imm
, true,
4700 as_a
<scalar_int_mode
> (mode
));
4703 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4704 that is known to contain PTRUE. */
4707 aarch64_emit_sve_pred_move (rtx dest
, rtx pred
, rtx src
)
4709 expand_operand ops
[3];
4710 machine_mode mode
= GET_MODE (dest
);
4711 create_output_operand (&ops
[0], dest
, mode
);
4712 create_input_operand (&ops
[1], pred
, GET_MODE(pred
));
4713 create_input_operand (&ops
[2], src
, mode
);
4714 temporary_volatile_ok
v (true);
4715 expand_insn (code_for_aarch64_pred_mov (mode
), 3, ops
);
4718 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4719 operand is in memory. In this case we need to use the predicated LD1
4720 and ST1 instead of LDR and STR, both for correctness on big-endian
4721 targets and because LD1 and ST1 support a wider range of addressing modes.
4722 PRED_MODE is the mode of the predicate.
4724 See the comment at the head of aarch64-sve.md for details about the
4725 big-endian handling. */
4728 aarch64_expand_sve_mem_move (rtx dest
, rtx src
, machine_mode pred_mode
)
4730 machine_mode mode
= GET_MODE (dest
);
4731 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
4732 if (!register_operand (src
, mode
)
4733 && !register_operand (dest
, mode
))
4735 rtx tmp
= gen_reg_rtx (mode
);
4737 aarch64_emit_sve_pred_move (tmp
, ptrue
, src
);
4739 emit_move_insn (tmp
, src
);
4742 aarch64_emit_sve_pred_move (dest
, ptrue
, src
);
4745 /* Called only on big-endian targets. See whether an SVE vector move
4746 from SRC to DEST is effectively a REV[BHW] instruction, because at
4747 least one operand is a subreg of an SVE vector that has wider or
4748 narrower elements. Return true and emit the instruction if so.
4752 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4754 represents a VIEW_CONVERT between the following vectors, viewed
4757 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4758 R1: { [0], [1], [2], [3], ... }
4760 The high part of lane X in R2 should therefore correspond to lane X*2
4761 of R1, but the register representations are:
4764 R2: ...... [1].high [1].low [0].high [0].low
4765 R1: ...... [3] [2] [1] [0]
4767 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4768 We therefore need a reverse operation to swap the high and low values
4771 This is purely an optimization. Without it we would spill the
4772 subreg operand to the stack in one mode and reload it in the
4773 other mode, which has the same effect as the REV. */
4776 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
4778 gcc_assert (BYTES_BIG_ENDIAN
);
4779 if (GET_CODE (dest
) == SUBREG
)
4780 dest
= SUBREG_REG (dest
);
4781 if (GET_CODE (src
) == SUBREG
)
4782 src
= SUBREG_REG (src
);
4784 /* The optimization handles two single SVE REGs with different element
4788 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
4789 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
4790 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
4791 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
4794 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4795 rtx ptrue
= aarch64_ptrue_reg (VNx16BImode
);
4796 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
4798 emit_insn (gen_rtx_SET (dest
, unspec
));
4802 /* Return a copy of X with mode MODE, without changing its other
4803 attributes. Unlike gen_lowpart, this doesn't care whether the
4804 mode change is valid. */
4807 aarch64_replace_reg_mode (rtx x
, machine_mode mode
)
4809 if (GET_MODE (x
) == mode
)
4812 x
= shallow_copy_rtx (x
);
4813 set_mode_and_regno (x
, mode
, REGNO (x
));
4817 /* Return the SVE REV[BHW] unspec for reversing quantites of mode MODE
4818 stored in wider integer containers. */
4821 aarch64_sve_rev_unspec (machine_mode mode
)
4823 switch (GET_MODE_UNIT_SIZE (mode
))
4825 case 1: return UNSPEC_REVB
;
4826 case 2: return UNSPEC_REVH
;
4827 case 4: return UNSPEC_REVW
;
4832 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4836 aarch64_split_sve_subreg_move (rtx dest
, rtx ptrue
, rtx src
)
4838 /* Decide which REV operation we need. The mode with wider elements
4839 determines the mode of the operands and the mode with the narrower
4840 elements determines the reverse width. */
4841 machine_mode mode_with_wider_elts
= GET_MODE (dest
);
4842 machine_mode mode_with_narrower_elts
= GET_MODE (src
);
4843 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts
)
4844 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts
))
4845 std::swap (mode_with_wider_elts
, mode_with_narrower_elts
);
4847 unsigned int unspec
= aarch64_sve_rev_unspec (mode_with_narrower_elts
);
4848 machine_mode pred_mode
= aarch64_sve_pred_mode (mode_with_wider_elts
);
4850 /* Get the operands in the appropriate modes and emit the instruction. */
4851 ptrue
= gen_lowpart (pred_mode
, ptrue
);
4852 dest
= aarch64_replace_reg_mode (dest
, mode_with_wider_elts
);
4853 src
= aarch64_replace_reg_mode (src
, mode_with_wider_elts
);
4854 emit_insn (gen_aarch64_pred (unspec
, mode_with_wider_elts
,
4859 aarch64_function_ok_for_sibcall (tree
, tree exp
)
4861 if (crtl
->abi
->id () != expr_callee_abi (exp
).id ())
4867 /* Implement TARGET_PASS_BY_REFERENCE. */
4870 aarch64_pass_by_reference (cumulative_args_t pcum_v
,
4871 const function_arg_info
&arg
)
4873 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4875 machine_mode dummymode
;
4878 unsigned int num_zr
, num_pr
;
4879 if (arg
.type
&& aarch64_sve_argument_p (arg
.type
, &num_zr
, &num_pr
))
4881 if (pcum
&& !pcum
->silent_p
&& !TARGET_SVE
)
4882 /* We can't gracefully recover at this point, so make this a
4884 fatal_error (input_location
, "arguments of type %qT require"
4885 " the SVE ISA extension", arg
.type
);
4887 /* Variadic SVE types are passed by reference. Normal non-variadic
4888 arguments are too if we've run out of registers. */
4890 || pcum
->aapcs_nvrn
+ num_zr
> NUM_FP_ARG_REGS
4891 || pcum
->aapcs_nprn
+ num_pr
> NUM_PR_ARG_REGS
);
4894 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4895 if (arg
.mode
== BLKmode
&& arg
.type
)
4896 size
= int_size_in_bytes (arg
.type
);
4898 /* No frontends can create types with variable-sized modes, so we
4899 shouldn't be asked to pass or return them. */
4900 size
= GET_MODE_SIZE (arg
.mode
).to_constant ();
4902 /* Aggregates are passed by reference based on their size. */
4903 if (arg
.aggregate_type_p ())
4904 size
= int_size_in_bytes (arg
.type
);
4906 /* Variable sized arguments are always returned by reference. */
4910 /* Can this be a candidate to be passed in fp/simd register(s)? */
4911 if (aarch64_vfp_is_call_or_return_candidate (arg
.mode
, arg
.type
,
4916 /* Arguments which are variable sized or larger than 2 registers are
4917 passed by reference unless they are a homogenous floating point
4919 return size
> 2 * UNITS_PER_WORD
;
4922 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4924 aarch64_return_in_msb (const_tree valtype
)
4926 machine_mode dummy_mode
;
4929 /* Never happens in little-endian mode. */
4930 if (!BYTES_BIG_ENDIAN
)
4933 /* Only composite types smaller than or equal to 16 bytes can
4934 be potentially returned in registers. */
4935 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
4936 || int_size_in_bytes (valtype
) <= 0
4937 || int_size_in_bytes (valtype
) > 16)
4940 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4941 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4942 is always passed/returned in the least significant bits of fp/simd
4944 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
4945 &dummy_mode
, &dummy_int
, NULL
))
4951 /* Subroutine of aarch64_function_value. MODE is the mode of the argument
4952 after promotion, and after partial SVE types have been replaced by
4953 their integer equivalents. */
4955 aarch64_function_value_1 (const_tree type
, machine_mode mode
)
4957 unsigned int num_zr
, num_pr
;
4958 if (type
&& aarch64_sve_argument_p (type
, &num_zr
, &num_pr
))
4960 /* Don't raise an error here if we're called when SVE is disabled,
4961 since this is really just a query function. Other code must
4962 do that where appropriate. */
4963 mode
= TYPE_MODE_RAW (type
);
4964 gcc_assert (VECTOR_MODE_P (mode
)
4965 && (!TARGET_SVE
|| aarch64_sve_mode_p (mode
)));
4967 if (num_zr
> 0 && num_pr
== 0)
4968 return gen_rtx_REG (mode
, V0_REGNUM
);
4970 if (num_zr
== 0 && num_pr
== 1)
4971 return gen_rtx_REG (mode
, P0_REGNUM
);
4976 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
4977 returned in memory, not by value. */
4978 gcc_assert (!aarch64_sve_mode_p (mode
));
4980 if (aarch64_return_in_msb (type
))
4982 HOST_WIDE_INT size
= int_size_in_bytes (type
);
4984 if (size
% UNITS_PER_WORD
!= 0)
4986 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
4987 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
4992 machine_mode ag_mode
;
4993 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
4994 &ag_mode
, &count
, NULL
))
4996 if (!aarch64_composite_type_p (type
, mode
))
4998 gcc_assert (count
== 1 && mode
== ag_mode
);
4999 return gen_rtx_REG (mode
, V0_REGNUM
);
5006 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
5007 for (i
= 0; i
< count
; i
++)
5009 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
5010 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
5011 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
5012 XVECEXP (par
, 0, i
) = tmp
;
5018 return gen_rtx_REG (mode
, R0_REGNUM
);
5021 /* Implement TARGET_FUNCTION_VALUE.
5022 Define how to find the value returned by a function. */
5025 aarch64_function_value (const_tree type
, const_tree func
,
5026 bool outgoing ATTRIBUTE_UNUSED
)
5031 mode
= TYPE_MODE (type
);
5032 if (INTEGRAL_TYPE_P (type
))
5033 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
5035 /* Vector types can acquire a partial SVE mode using things like
5036 __attribute__((vector_size(N))), and this is potentially useful.
5037 However, the choice of mode doesn't affect the type's ABI identity,
5038 so we should treat the types as though they had the associated
5039 integer mode, just like they did before SVE was introduced.
5041 We know that the vector must be 128 bits or smaller, otherwise we'd
5042 have returned it in memory instead. */
5043 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
5044 if ((vec_flags
& VEC_ANY_SVE
) && (vec_flags
& VEC_PARTIAL
))
5046 scalar_int_mode int_mode
= int_mode_for_mode (mode
).require ();
5047 rtx reg
= aarch64_function_value_1 (type
, int_mode
);
5048 /* Vector types are never returned in the MSB and are never split. */
5049 gcc_assert (REG_P (reg
) && GET_MODE (reg
) == int_mode
);
5050 rtx pair
= gen_rtx_EXPR_LIST (VOIDmode
, reg
, const0_rtx
);
5051 return gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, pair
));
5054 return aarch64_function_value_1 (type
, mode
);
5057 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
5058 Return true if REGNO is the number of a hard register in which the values
5059 of called function may come back. */
5062 aarch64_function_value_regno_p (const unsigned int regno
)
5064 /* Maximum of 16 bytes can be returned in the general registers. Examples
5065 of 16-byte return values are: 128-bit integers and 16-byte small
5066 structures (excluding homogeneous floating-point aggregates). */
5067 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
5070 /* Up to four fp/simd registers can return a function value, e.g. a
5071 homogeneous floating-point aggregate having four members. */
5072 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
5073 return TARGET_FLOAT
;
5078 /* Implement TARGET_RETURN_IN_MEMORY.
5080 If the type T of the result of a function is such that
5082 would require that arg be passed as a value in a register (or set of
5083 registers) according to the parameter passing rules, then the result
5084 is returned in the same registers as would be used for such an
5088 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
5091 machine_mode ag_mode
;
5094 if (!AGGREGATE_TYPE_P (type
)
5095 && TREE_CODE (type
) != COMPLEX_TYPE
5096 && TREE_CODE (type
) != VECTOR_TYPE
)
5097 /* Simple scalar types always returned in registers. */
5100 unsigned int num_zr
, num_pr
;
5101 if (type
&& aarch64_sve_argument_p (type
, &num_zr
, &num_pr
))
5103 /* All SVE types we support fit in registers. For example, it isn't
5104 yet possible to define an aggregate of 9+ SVE vectors or 5+ SVE
5106 gcc_assert (num_zr
<= NUM_FP_ARG_REGS
&& num_pr
<= NUM_PR_ARG_REGS
);
5110 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
5117 /* Types larger than 2 registers returned in memory. */
5118 size
= int_size_in_bytes (type
);
5119 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
5123 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
5124 const_tree type
, int *nregs
)
5126 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
5127 return aarch64_vfp_is_call_or_return_candidate (mode
,
5129 &pcum
->aapcs_vfp_rmode
,
5134 /* Given MODE and TYPE of a function argument, return the alignment in
5135 bits. The idea is to suppress any stronger alignment requested by
5136 the user and opt for the natural alignment (specified in AAPCS64 \S
5137 4.1). ABI_BREAK is set to true if the alignment was incorrectly
5138 calculated in versions of GCC prior to GCC-9. This is a helper
5139 function for local use only. */
5142 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
,
5147 return GET_MODE_ALIGNMENT (mode
);
5149 if (integer_zerop (TYPE_SIZE (type
)))
5152 gcc_assert (TYPE_MODE (type
) == mode
);
5154 if (!AGGREGATE_TYPE_P (type
))
5155 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
5157 if (TREE_CODE (type
) == ARRAY_TYPE
)
5158 return TYPE_ALIGN (TREE_TYPE (type
));
5160 unsigned int alignment
= 0;
5161 unsigned int bitfield_alignment
= 0;
5162 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
5163 if (TREE_CODE (field
) == FIELD_DECL
)
5165 alignment
= std::max (alignment
, DECL_ALIGN (field
));
5166 if (DECL_BIT_FIELD_TYPE (field
))
5168 = std::max (bitfield_alignment
,
5169 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field
)));
5172 if (bitfield_alignment
> alignment
)
5175 return bitfield_alignment
;
5181 /* Layout a function argument according to the AAPCS64 rules. The rule
5182 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
5183 mode that was originally given to us by the target hook, whereas the
5184 mode in ARG might be the result of replacing partial SVE modes with
5185 the equivalent integer mode. */
5188 aarch64_layout_arg (cumulative_args_t pcum_v
, const function_arg_info
&arg
,
5189 machine_mode orig_mode
)
5191 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
5192 tree type
= arg
.type
;
5193 machine_mode mode
= arg
.mode
;
5194 int ncrn
, nvrn
, nregs
;
5195 bool allocate_ncrn
, allocate_nvrn
;
5199 /* We need to do this once per argument. */
5200 if (pcum
->aapcs_arg_processed
)
5203 /* Vector types can acquire a partial SVE mode using things like
5204 __attribute__((vector_size(N))), and this is potentially useful.
5205 However, the choice of mode doesn't affect the type's ABI identity,
5206 so we should treat the types as though they had the associated
5207 integer mode, just like they did before SVE was introduced.
5209 We know that the vector must be 128 bits or smaller, otherwise we'd
5210 have passed it by reference instead. */
5211 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
5212 if ((vec_flags
& VEC_ANY_SVE
) && (vec_flags
& VEC_PARTIAL
))
5214 function_arg_info tmp_arg
= arg
;
5215 tmp_arg
.mode
= int_mode_for_mode (mode
).require ();
5216 aarch64_layout_arg (pcum_v
, tmp_arg
, orig_mode
);
5217 if (rtx reg
= pcum
->aapcs_reg
)
5219 gcc_assert (REG_P (reg
) && GET_MODE (reg
) == tmp_arg
.mode
);
5220 rtx pair
= gen_rtx_EXPR_LIST (VOIDmode
, reg
, const0_rtx
);
5221 pcum
->aapcs_reg
= gen_rtx_PARALLEL (mode
, gen_rtvec (1, pair
));
5226 pcum
->aapcs_arg_processed
= true;
5228 unsigned int num_zr
, num_pr
;
5229 if (type
&& aarch64_sve_argument_p (type
, &num_zr
, &num_pr
))
5231 /* The PCS says that it is invalid to pass an SVE value to an
5232 unprototyped function. There is no ABI-defined location we
5233 can return in this case, so we have no real choice but to raise
5234 an error immediately, even though this is only a query function. */
5235 if (arg
.named
&& pcum
->pcs_variant
!= ARM_PCS_SVE
)
5237 gcc_assert (!pcum
->silent_p
);
5238 error ("SVE type %qT cannot be passed to an unprototyped function",
5240 /* Avoid repeating the message, and avoid tripping the assert
5242 pcum
->pcs_variant
= ARM_PCS_SVE
;
5245 /* We would have converted the argument into pass-by-reference
5246 form if it didn't fit in registers. */
5247 pcum
->aapcs_nextnvrn
= pcum
->aapcs_nvrn
+ num_zr
;
5248 pcum
->aapcs_nextnprn
= pcum
->aapcs_nprn
+ num_pr
;
5249 gcc_assert (arg
.named
5250 && pcum
->pcs_variant
== ARM_PCS_SVE
5251 && aarch64_sve_mode_p (mode
)
5252 && pcum
->aapcs_nextnvrn
<= NUM_FP_ARG_REGS
5253 && pcum
->aapcs_nextnprn
<= NUM_PR_ARG_REGS
);
5255 if (num_zr
> 0 && num_pr
== 0)
5256 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ pcum
->aapcs_nvrn
);
5257 else if (num_zr
== 0 && num_pr
== 1)
5258 pcum
->aapcs_reg
= gen_rtx_REG (mode
, P0_REGNUM
+ pcum
->aapcs_nprn
);
5264 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
5265 passed by reference, not by value. */
5266 gcc_assert (!aarch64_sve_mode_p (mode
));
5268 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
5270 size
= int_size_in_bytes (type
);
5272 /* No frontends can create types with variable-sized modes, so we
5273 shouldn't be asked to pass or return them. */
5274 size
= GET_MODE_SIZE (mode
).to_constant ();
5275 size
= ROUND_UP (size
, UNITS_PER_WORD
);
5277 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
5278 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
5283 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
5284 The following code thus handles passing by SIMD/FP registers first. */
5286 nvrn
= pcum
->aapcs_nvrn
;
5288 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
5289 and homogenous short-vector aggregates (HVA). */
5292 if (!pcum
->silent_p
&& !TARGET_FLOAT
)
5293 aarch64_err_no_fpadvsimd (mode
);
5295 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
5297 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
5298 if (!aarch64_composite_type_p (type
, mode
))
5300 gcc_assert (nregs
== 1);
5301 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
5307 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
5308 for (i
= 0; i
< nregs
; i
++)
5310 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
5311 V0_REGNUM
+ nvrn
+ i
);
5312 rtx offset
= gen_int_mode
5313 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
5314 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
5315 XVECEXP (par
, 0, i
) = tmp
;
5317 pcum
->aapcs_reg
= par
;
5323 /* C.3 NSRN is set to 8. */
5324 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
5329 ncrn
= pcum
->aapcs_ncrn
;
5330 nregs
= size
/ UNITS_PER_WORD
;
5332 /* C6 - C9. though the sign and zero extension semantics are
5333 handled elsewhere. This is the case where the argument fits
5334 entirely general registers. */
5335 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
5337 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
5339 /* C.8 if the argument has an alignment of 16 then the NGRN is
5340 rounded up to the next even number. */
5343 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
5344 comparison is there because for > 16 * BITS_PER_UNIT
5345 alignment nregs should be > 2 and therefore it should be
5346 passed by reference rather than value. */
5347 && (aarch64_function_arg_alignment (orig_mode
, type
, &abi_break
)
5348 == 16 * BITS_PER_UNIT
))
5350 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
5351 inform (input_location
, "parameter passing for argument of type "
5352 "%qT changed in GCC 9.1", type
);
5354 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
5357 /* NREGS can be 0 when e.g. an empty structure is to be passed.
5358 A reg is still generated for it, but the caller should be smart
5359 enough not to use it. */
5360 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
5361 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
5367 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
5368 for (i
= 0; i
< nregs
; i
++)
5370 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
5371 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
5372 GEN_INT (i
* UNITS_PER_WORD
));
5373 XVECEXP (par
, 0, i
) = tmp
;
5375 pcum
->aapcs_reg
= par
;
5378 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
5383 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
5385 /* The argument is passed on stack; record the needed number of words for
5386 this argument and align the total size if necessary. */
5388 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
5390 if (aarch64_function_arg_alignment (orig_mode
, type
, &abi_break
)
5391 == 16 * BITS_PER_UNIT
)
5393 int new_size
= ROUND_UP (pcum
->aapcs_stack_size
, 16 / UNITS_PER_WORD
);
5394 if (pcum
->aapcs_stack_size
!= new_size
)
5396 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
5397 inform (input_location
, "parameter passing for argument of type "
5398 "%qT changed in GCC 9.1", type
);
5399 pcum
->aapcs_stack_size
= new_size
;
5405 /* Implement TARGET_FUNCTION_ARG. */
5408 aarch64_function_arg (cumulative_args_t pcum_v
, const function_arg_info
&arg
)
5410 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
5411 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
5412 || pcum
->pcs_variant
== ARM_PCS_SIMD
5413 || pcum
->pcs_variant
== ARM_PCS_SVE
);
5415 if (arg
.end_marker_p ())
5416 return gen_int_mode (pcum
->pcs_variant
, DImode
);
5418 aarch64_layout_arg (pcum_v
, arg
, arg
.mode
);
5419 return pcum
->aapcs_reg
;
5423 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
5425 rtx libname ATTRIBUTE_UNUSED
,
5426 const_tree fndecl ATTRIBUTE_UNUSED
,
5427 unsigned n_named ATTRIBUTE_UNUSED
,
5430 pcum
->aapcs_ncrn
= 0;
5431 pcum
->aapcs_nvrn
= 0;
5432 pcum
->aapcs_nprn
= 0;
5433 pcum
->aapcs_nextncrn
= 0;
5434 pcum
->aapcs_nextnvrn
= 0;
5435 pcum
->aapcs_nextnprn
= 0;
5437 pcum
->pcs_variant
= (arm_pcs
) fntype_abi (fntype
).id ();
5439 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
5440 pcum
->aapcs_reg
= NULL_RTX
;
5441 pcum
->aapcs_arg_processed
= false;
5442 pcum
->aapcs_stack_words
= 0;
5443 pcum
->aapcs_stack_size
= 0;
5444 pcum
->silent_p
= silent_p
;
5448 && fndecl
&& TREE_PUBLIC (fndecl
)
5449 && fntype
&& fntype
!= error_mark_node
)
5451 const_tree type
= TREE_TYPE (fntype
);
5452 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
5453 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
5454 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
5455 &mode
, &nregs
, NULL
))
5456 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
5461 && pcum
->pcs_variant
== ARM_PCS_SVE
)
5463 /* We can't gracefully recover at this point, so make this a
5466 fatal_error (input_location
, "%qE requires the SVE ISA extension",
5469 fatal_error (input_location
, "calls to functions of type %qT require"
5470 " the SVE ISA extension", fntype
);
5475 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
5476 const function_arg_info
&arg
)
5478 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
5479 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
5480 || pcum
->pcs_variant
== ARM_PCS_SIMD
5481 || pcum
->pcs_variant
== ARM_PCS_SVE
)
5483 aarch64_layout_arg (pcum_v
, arg
, arg
.mode
);
5484 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
5485 != (pcum
->aapcs_stack_words
!= 0));
5486 pcum
->aapcs_arg_processed
= false;
5487 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
5488 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
5489 pcum
->aapcs_nprn
= pcum
->aapcs_nextnprn
;
5490 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
5491 pcum
->aapcs_stack_words
= 0;
5492 pcum
->aapcs_reg
= NULL_RTX
;
5497 aarch64_function_arg_regno_p (unsigned regno
)
5499 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
5500 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
5503 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
5504 PARM_BOUNDARY bits of alignment, but will be given anything up
5505 to STACK_BOUNDARY bits if the type requires it. This makes sure
5506 that both before and after the layout of each argument, the Next
5507 Stacked Argument Address (NSAA) will have a minimum alignment of
5511 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
5514 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
,
5516 if (abi_break
& warn_psabi
)
5517 inform (input_location
, "parameter passing for argument of type "
5518 "%qT changed in GCC 9.1", type
);
5520 return MIN (MAX (alignment
, PARM_BOUNDARY
), STACK_BOUNDARY
);
5523 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
5525 static fixed_size_mode
5526 aarch64_get_reg_raw_mode (int regno
)
5528 if (TARGET_SVE
&& FP_REGNUM_P (regno
))
5529 /* Don't use the SVE part of the register for __builtin_apply and
5530 __builtin_return. The SVE registers aren't used by the normal PCS,
5531 so using them there would be a waste of time. The PCS extensions
5532 for SVE types are fundamentally incompatible with the
5533 __builtin_return/__builtin_apply interface. */
5534 return as_a
<fixed_size_mode
> (V16QImode
);
5535 return default_get_reg_raw_mode (regno
);
5538 /* Implement TARGET_FUNCTION_ARG_PADDING.
5540 Small aggregate types are placed in the lowest memory address.
5542 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
5544 static pad_direction
5545 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
5547 /* On little-endian targets, the least significant byte of every stack
5548 argument is passed at the lowest byte address of the stack slot. */
5549 if (!BYTES_BIG_ENDIAN
)
5552 /* Otherwise, integral, floating-point and pointer types are padded downward:
5553 the least significant byte of a stack argument is passed at the highest
5554 byte address of the stack slot. */
5556 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
5557 || POINTER_TYPE_P (type
))
5558 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
5559 return PAD_DOWNWARD
;
5561 /* Everything else padded upward, i.e. data in first byte of stack slot. */
5565 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
5567 It specifies padding for the last (may also be the only)
5568 element of a block move between registers and memory. If
5569 assuming the block is in the memory, padding upward means that
5570 the last element is padded after its highest significant byte,
5571 while in downward padding, the last element is padded at the
5572 its least significant byte side.
5574 Small aggregates and small complex types are always padded
5577 We don't need to worry about homogeneous floating-point or
5578 short-vector aggregates; their move is not affected by the
5579 padding direction determined here. Regardless of endianness,
5580 each element of such an aggregate is put in the least
5581 significant bits of a fp/simd register.
5583 Return !BYTES_BIG_ENDIAN if the least significant byte of the
5584 register has useful data, and return the opposite if the most
5585 significant byte does. */
5588 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
5589 bool first ATTRIBUTE_UNUSED
)
5592 /* Small composite types are always padded upward. */
5593 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
5597 size
= int_size_in_bytes (type
);
5599 /* No frontends can create types with variable-sized modes, so we
5600 shouldn't be asked to pass or return them. */
5601 size
= GET_MODE_SIZE (mode
).to_constant ();
5602 if (size
< 2 * UNITS_PER_WORD
)
5606 /* Otherwise, use the default padding. */
5607 return !BYTES_BIG_ENDIAN
;
5610 static scalar_int_mode
5611 aarch64_libgcc_cmp_return_mode (void)
5616 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5618 /* We use the 12-bit shifted immediate arithmetic instructions so values
5619 must be multiple of (1 << 12), i.e. 4096. */
5620 #define ARITH_FACTOR 4096
5622 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5623 #error Cannot use simple address calculation for stack probing
5626 /* The pair of scratch registers used for stack probing. */
5627 #define PROBE_STACK_FIRST_REG R9_REGNUM
5628 #define PROBE_STACK_SECOND_REG R10_REGNUM
5630 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5631 inclusive. These are offsets from the current stack pointer. */
5634 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
5637 if (!poly_size
.is_constant (&size
))
5639 sorry ("stack probes for SVE frames");
5643 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
5645 /* See the same assertion on PROBE_INTERVAL above. */
5646 gcc_assert ((first
% ARITH_FACTOR
) == 0);
5648 /* See if we have a constant small number of probes to generate. If so,
5649 that's the easy case. */
5650 if (size
<= PROBE_INTERVAL
)
5652 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
5654 emit_set_insn (reg1
,
5655 plus_constant (Pmode
,
5656 stack_pointer_rtx
, -(first
+ base
)));
5657 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
5660 /* The run-time loop is made up of 8 insns in the generic case while the
5661 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
5662 else if (size
<= 4 * PROBE_INTERVAL
)
5664 HOST_WIDE_INT i
, rem
;
5666 emit_set_insn (reg1
,
5667 plus_constant (Pmode
,
5669 -(first
+ PROBE_INTERVAL
)));
5670 emit_stack_probe (reg1
);
5672 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5673 it exceeds SIZE. If only two probes are needed, this will not
5674 generate any code. Then probe at FIRST + SIZE. */
5675 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
5677 emit_set_insn (reg1
,
5678 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
5679 emit_stack_probe (reg1
);
5682 rem
= size
- (i
- PROBE_INTERVAL
);
5685 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
5687 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
5688 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
5691 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
5694 /* Otherwise, do the same as above, but in a loop. Note that we must be
5695 extra careful with variables wrapping around because we might be at
5696 the very top (or the very bottom) of the address space and we have
5697 to be able to handle this case properly; in particular, we use an
5698 equality test for the loop condition. */
5701 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
5703 /* Step 1: round SIZE to the previous multiple of the interval. */
5705 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
5708 /* Step 2: compute initial and final value of the loop counter. */
5710 /* TEST_ADDR = SP + FIRST. */
5711 emit_set_insn (reg1
,
5712 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
5714 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5715 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
5716 if (! aarch64_uimm12_shift (adjustment
))
5718 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
5720 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
5723 emit_set_insn (reg2
,
5724 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
5730 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5733 while (TEST_ADDR != LAST_ADDR)
5735 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5736 until it is equal to ROUNDED_SIZE. */
5738 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
5741 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5742 that SIZE is equal to ROUNDED_SIZE. */
5744 if (size
!= rounded_size
)
5746 HOST_WIDE_INT rem
= size
- rounded_size
;
5750 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
5752 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
5753 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
5756 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
5760 /* Make sure nothing is scheduled before we are done. */
5761 emit_insn (gen_blockage ());
5764 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5765 absolute addresses. */
5768 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
5770 static int labelno
= 0;
5774 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
5777 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
5779 HOST_WIDE_INT stack_clash_probe_interval
5780 = 1 << param_stack_clash_protection_guard_size
;
5782 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5784 HOST_WIDE_INT interval
;
5785 if (flag_stack_clash_protection
)
5786 interval
= stack_clash_probe_interval
;
5788 interval
= PROBE_INTERVAL
;
5790 gcc_assert (aarch64_uimm12_shift (interval
));
5791 xops
[1] = GEN_INT (interval
);
5793 output_asm_insn ("sub\t%0, %0, %1", xops
);
5795 /* If doing stack clash protection then we probe up by the ABI specified
5796 amount. We do this because we're dropping full pages at a time in the
5797 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5798 if (flag_stack_clash_protection
)
5799 xops
[1] = GEN_INT (STACK_CLASH_CALLER_GUARD
);
5801 xops
[1] = CONST0_RTX (GET_MODE (xops
[1]));
5803 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5804 by this amount for each iteration. */
5805 output_asm_insn ("str\txzr, [%0, %1]", xops
);
5807 /* Test if TEST_ADDR == LAST_ADDR. */
5809 output_asm_insn ("cmp\t%0, %1", xops
);
5812 fputs ("\tb.ne\t", asm_out_file
);
5813 assemble_name_raw (asm_out_file
, loop_lab
);
5814 fputc ('\n', asm_out_file
);
5819 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5820 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5821 of GUARD_SIZE. When a probe is emitted it is done at most
5822 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5823 at most MIN_PROBE_THRESHOLD. By the end of this function
5824 BASE = BASE - ADJUSTMENT. */
5827 aarch64_output_probe_sve_stack_clash (rtx base
, rtx adjustment
,
5828 rtx min_probe_threshold
, rtx guard_size
)
5830 /* This function is not allowed to use any instruction generation function
5831 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5832 so instead emit the code you want using output_asm_insn. */
5833 gcc_assert (flag_stack_clash_protection
);
5834 gcc_assert (CONST_INT_P (min_probe_threshold
) && CONST_INT_P (guard_size
));
5835 gcc_assert (INTVAL (guard_size
) > INTVAL (min_probe_threshold
));
5837 /* The minimum required allocation before the residual requires probing. */
5838 HOST_WIDE_INT residual_probe_guard
= INTVAL (min_probe_threshold
);
5840 /* Clamp the value down to the nearest value that can be used with a cmp. */
5841 residual_probe_guard
= aarch64_clamp_to_uimm12_shift (residual_probe_guard
);
5842 rtx probe_offset_value_rtx
= gen_int_mode (residual_probe_guard
, Pmode
);
5844 gcc_assert (INTVAL (min_probe_threshold
) >= residual_probe_guard
);
5845 gcc_assert (aarch64_uimm12_shift (residual_probe_guard
));
5847 static int labelno
= 0;
5848 char loop_start_lab
[32];
5849 char loop_end_lab
[32];
5852 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab
, "SVLPSPL", labelno
);
5853 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab
, "SVLPEND", labelno
++);
5855 /* Emit loop start label. */
5856 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_start_lab
);
5858 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5859 xops
[0] = adjustment
;
5860 xops
[1] = probe_offset_value_rtx
;
5861 output_asm_insn ("cmp\t%0, %1", xops
);
5863 /* Branch to end if not enough adjustment to probe. */
5864 fputs ("\tb.lt\t", asm_out_file
);
5865 assemble_name_raw (asm_out_file
, loop_end_lab
);
5866 fputc ('\n', asm_out_file
);
5868 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5870 xops
[1] = probe_offset_value_rtx
;
5871 output_asm_insn ("sub\t%0, %0, %1", xops
);
5873 /* Probe at BASE. */
5874 xops
[1] = const0_rtx
;
5875 output_asm_insn ("str\txzr, [%0, %1]", xops
);
5877 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5878 xops
[0] = adjustment
;
5879 xops
[1] = probe_offset_value_rtx
;
5880 output_asm_insn ("sub\t%0, %0, %1", xops
);
5882 /* Branch to start if still more bytes to allocate. */
5883 fputs ("\tb\t", asm_out_file
);
5884 assemble_name_raw (asm_out_file
, loop_start_lab
);
5885 fputc ('\n', asm_out_file
);
5887 /* No probe leave. */
5888 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_end_lab
);
5890 /* BASE = BASE - ADJUSTMENT. */
5892 xops
[1] = adjustment
;
5893 output_asm_insn ("sub\t%0, %0, %1", xops
);
5897 /* Determine whether a frame chain needs to be generated. */
5899 aarch64_needs_frame_chain (void)
5901 /* Force a frame chain for EH returns so the return address is at FP+8. */
5902 if (frame_pointer_needed
|| crtl
->calls_eh_return
)
5905 /* A leaf function cannot have calls or write LR. */
5906 bool is_leaf
= crtl
->is_leaf
&& !df_regs_ever_live_p (LR_REGNUM
);
5908 /* Don't use a frame chain in leaf functions if leaf frame pointers
5910 if (flag_omit_leaf_frame_pointer
&& is_leaf
)
5913 return aarch64_use_frame_pointer
;
5916 /* Mark the registers that need to be saved by the callee and calculate
5917 the size of the callee-saved registers area and frame record (both FP
5918 and LR may be omitted). */
5920 aarch64_layout_frame (void)
5922 poly_int64 offset
= 0;
5923 int regno
, last_fp_reg
= INVALID_REGNUM
;
5924 machine_mode vector_save_mode
= aarch64_reg_save_mode (V8_REGNUM
);
5925 poly_int64 vector_save_size
= GET_MODE_SIZE (vector_save_mode
);
5926 bool frame_related_fp_reg_p
= false;
5927 aarch64_frame
&frame
= cfun
->machine
->frame
;
5929 frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
5931 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5932 the mid-end is doing. */
5933 crtl
->outgoing_args_size
= STACK_DYNAMIC_OFFSET (cfun
);
5935 #define SLOT_NOT_REQUIRED (-2)
5936 #define SLOT_REQUIRED (-1)
5938 frame
.wb_candidate1
= INVALID_REGNUM
;
5939 frame
.wb_candidate2
= INVALID_REGNUM
;
5940 frame
.spare_pred_reg
= INVALID_REGNUM
;
5942 /* First mark all the registers that really need to be saved... */
5943 for (regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
5944 frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
5946 /* ... that includes the eh data registers (if needed)... */
5947 if (crtl
->calls_eh_return
)
5948 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
5949 frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)] = SLOT_REQUIRED
;
5951 /* ... and any callee saved register that dataflow says is live. */
5952 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
5953 if (df_regs_ever_live_p (regno
)
5954 && !fixed_regs
[regno
]
5955 && (regno
== R30_REGNUM
5956 || !crtl
->abi
->clobbers_full_reg_p (regno
)))
5957 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
5959 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
5960 if (df_regs_ever_live_p (regno
)
5961 && !fixed_regs
[regno
]
5962 && !crtl
->abi
->clobbers_full_reg_p (regno
))
5964 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
5965 last_fp_reg
= regno
;
5966 if (aarch64_emit_cfi_for_reg_p (regno
))
5967 frame_related_fp_reg_p
= true;
5970 /* Big-endian SVE frames need a spare predicate register in order
5971 to save Z8-Z15. Decide which register they should use. Prefer
5972 an unused argument register if possible, so that we don't force P4
5973 to be saved unnecessarily. */
5974 if (frame_related_fp_reg_p
5975 && crtl
->abi
->id () == ARM_PCS_SVE
5976 && BYTES_BIG_ENDIAN
)
5978 bitmap live1
= df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun
));
5979 bitmap live2
= df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun
));
5980 for (regno
= P0_REGNUM
; regno
<= P7_REGNUM
; regno
++)
5981 if (!bitmap_bit_p (live1
, regno
) && !bitmap_bit_p (live2
, regno
))
5983 gcc_assert (regno
<= P7_REGNUM
);
5984 frame
.spare_pred_reg
= regno
;
5985 df_set_regs_ever_live (regno
, true);
5988 for (regno
= P0_REGNUM
; regno
<= P15_REGNUM
; regno
++)
5989 if (df_regs_ever_live_p (regno
)
5990 && !fixed_regs
[regno
]
5991 && !crtl
->abi
->clobbers_full_reg_p (regno
))
5992 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
5994 /* With stack-clash, LR must be saved in non-leaf functions. */
5995 gcc_assert (crtl
->is_leaf
5996 || maybe_ne (frame
.reg_offset
[R30_REGNUM
], SLOT_NOT_REQUIRED
));
5998 /* Now assign stack slots for the registers. Start with the predicate
5999 registers, since predicate LDR and STR have a relatively small
6000 offset range. These saves happen below the hard frame pointer. */
6001 for (regno
= P0_REGNUM
; regno
<= P15_REGNUM
; regno
++)
6002 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
6004 frame
.reg_offset
[regno
] = offset
;
6005 offset
+= BYTES_PER_SVE_PRED
;
6008 /* We save a maximum of 8 predicate registers, and since vector
6009 registers are 8 times the size of a predicate register, all the
6010 saved predicates fit within a single vector. Doing this also
6011 rounds the offset to a 128-bit boundary. */
6012 if (maybe_ne (offset
, 0))
6014 gcc_assert (known_le (offset
, vector_save_size
));
6015 offset
= vector_save_size
;
6018 /* If we need to save any SVE vector registers, add them next. */
6019 if (last_fp_reg
!= (int) INVALID_REGNUM
&& crtl
->abi
->id () == ARM_PCS_SVE
)
6020 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
6021 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
6023 frame
.reg_offset
[regno
] = offset
;
6024 offset
+= vector_save_size
;
6027 /* OFFSET is now the offset of the hard frame pointer from the bottom
6028 of the callee save area. */
6029 bool saves_below_hard_fp_p
= maybe_ne (offset
, 0);
6030 frame
.below_hard_fp_saved_regs_size
= offset
;
6031 if (frame
.emit_frame_chain
)
6033 /* FP and LR are placed in the linkage record. */
6034 frame
.reg_offset
[R29_REGNUM
] = offset
;
6035 frame
.wb_candidate1
= R29_REGNUM
;
6036 frame
.reg_offset
[R30_REGNUM
] = offset
+ UNITS_PER_WORD
;
6037 frame
.wb_candidate2
= R30_REGNUM
;
6038 offset
+= 2 * UNITS_PER_WORD
;
6041 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
6042 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
6044 frame
.reg_offset
[regno
] = offset
;
6045 if (frame
.wb_candidate1
== INVALID_REGNUM
)
6046 frame
.wb_candidate1
= regno
;
6047 else if (frame
.wb_candidate2
== INVALID_REGNUM
)
6048 frame
.wb_candidate2
= regno
;
6049 offset
+= UNITS_PER_WORD
;
6052 poly_int64 max_int_offset
= offset
;
6053 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
6054 bool has_align_gap
= maybe_ne (offset
, max_int_offset
);
6056 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
6057 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
6059 /* If there is an alignment gap between integer and fp callee-saves,
6060 allocate the last fp register to it if possible. */
6061 if (regno
== last_fp_reg
6063 && known_eq (vector_save_size
, 8)
6064 && multiple_p (offset
, 16))
6066 frame
.reg_offset
[regno
] = max_int_offset
;
6070 frame
.reg_offset
[regno
] = offset
;
6071 if (frame
.wb_candidate1
== INVALID_REGNUM
)
6072 frame
.wb_candidate1
= regno
;
6073 else if (frame
.wb_candidate2
== INVALID_REGNUM
6074 && frame
.wb_candidate1
>= V0_REGNUM
)
6075 frame
.wb_candidate2
= regno
;
6076 offset
+= vector_save_size
;
6079 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
6081 frame
.saved_regs_size
= offset
;
6083 poly_int64 varargs_and_saved_regs_size
= offset
+ frame
.saved_varargs_size
;
6085 poly_int64 above_outgoing_args
6086 = aligned_upper_bound (varargs_and_saved_regs_size
6087 + get_frame_size (),
6088 STACK_BOUNDARY
/ BITS_PER_UNIT
);
6090 frame
.hard_fp_offset
6091 = above_outgoing_args
- frame
.below_hard_fp_saved_regs_size
;
6093 /* Both these values are already aligned. */
6094 gcc_assert (multiple_p (crtl
->outgoing_args_size
,
6095 STACK_BOUNDARY
/ BITS_PER_UNIT
));
6096 frame
.frame_size
= above_outgoing_args
+ crtl
->outgoing_args_size
;
6098 frame
.locals_offset
= frame
.saved_varargs_size
;
6100 frame
.initial_adjust
= 0;
6101 frame
.final_adjust
= 0;
6102 frame
.callee_adjust
= 0;
6103 frame
.sve_callee_adjust
= 0;
6104 frame
.callee_offset
= 0;
6106 HOST_WIDE_INT max_push_offset
= 0;
6107 if (frame
.wb_candidate2
!= INVALID_REGNUM
)
6108 max_push_offset
= 512;
6109 else if (frame
.wb_candidate1
!= INVALID_REGNUM
)
6110 max_push_offset
= 256;
6112 HOST_WIDE_INT const_size
, const_outgoing_args_size
, const_fp_offset
;
6113 HOST_WIDE_INT const_saved_regs_size
;
6114 if (frame
.frame_size
.is_constant (&const_size
)
6115 && const_size
< max_push_offset
6116 && known_eq (frame
.hard_fp_offset
, const_size
))
6118 /* Simple, small frame with no outgoing arguments:
6120 stp reg1, reg2, [sp, -frame_size]!
6121 stp reg3, reg4, [sp, 16] */
6122 frame
.callee_adjust
= const_size
;
6124 else if (crtl
->outgoing_args_size
.is_constant (&const_outgoing_args_size
)
6125 && frame
.saved_regs_size
.is_constant (&const_saved_regs_size
)
6126 && const_outgoing_args_size
+ const_saved_regs_size
< 512
6127 /* We could handle this case even with outgoing args, provided
6128 that the number of args left us with valid offsets for all
6129 predicate and vector save slots. It's such a rare case that
6130 it hardly seems worth the effort though. */
6131 && (!saves_below_hard_fp_p
|| const_outgoing_args_size
== 0)
6132 && !(cfun
->calls_alloca
6133 && frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
6134 && const_fp_offset
< max_push_offset
))
6136 /* Frame with small outgoing arguments:
6138 sub sp, sp, frame_size
6139 stp reg1, reg2, [sp, outgoing_args_size]
6140 stp reg3, reg4, [sp, outgoing_args_size + 16] */
6141 frame
.initial_adjust
= frame
.frame_size
;
6142 frame
.callee_offset
= const_outgoing_args_size
;
6144 else if (saves_below_hard_fp_p
6145 && known_eq (frame
.saved_regs_size
,
6146 frame
.below_hard_fp_saved_regs_size
))
6148 /* Frame in which all saves are SVE saves:
6150 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6151 save SVE registers relative to SP
6152 sub sp, sp, outgoing_args_size */
6153 frame
.initial_adjust
= (frame
.hard_fp_offset
6154 + frame
.below_hard_fp_saved_regs_size
);
6155 frame
.final_adjust
= crtl
->outgoing_args_size
;
6157 else if (frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
6158 && const_fp_offset
< max_push_offset
)
6160 /* Frame with large outgoing arguments or SVE saves, but with
6163 stp reg1, reg2, [sp, -hard_fp_offset]!
6164 stp reg3, reg4, [sp, 16]
6165 [sub sp, sp, below_hard_fp_saved_regs_size]
6166 [save SVE registers relative to SP]
6167 sub sp, sp, outgoing_args_size */
6168 frame
.callee_adjust
= const_fp_offset
;
6169 frame
.sve_callee_adjust
= frame
.below_hard_fp_saved_regs_size
;
6170 frame
.final_adjust
= crtl
->outgoing_args_size
;
6174 /* Frame with large local area and outgoing arguments or SVE saves,
6175 using frame pointer:
6177 sub sp, sp, hard_fp_offset
6178 stp x29, x30, [sp, 0]
6180 stp reg3, reg4, [sp, 16]
6181 [sub sp, sp, below_hard_fp_saved_regs_size]
6182 [save SVE registers relative to SP]
6183 sub sp, sp, outgoing_args_size */
6184 frame
.initial_adjust
= frame
.hard_fp_offset
;
6185 frame
.sve_callee_adjust
= frame
.below_hard_fp_saved_regs_size
;
6186 frame
.final_adjust
= crtl
->outgoing_args_size
;
6189 /* Make sure the individual adjustments add up to the full frame size. */
6190 gcc_assert (known_eq (frame
.initial_adjust
6191 + frame
.callee_adjust
6192 + frame
.sve_callee_adjust
6193 + frame
.final_adjust
, frame
.frame_size
));
6195 frame
.laid_out
= true;
6198 /* Return true if the register REGNO is saved on entry to
6199 the current function. */
6202 aarch64_register_saved_on_entry (int regno
)
6204 return known_ge (cfun
->machine
->frame
.reg_offset
[regno
], 0);
/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  */

static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  /* Skip registers that have no save slot; may return LIMIT + 1 when
     nothing further needs saving.  */
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}
6218 /* Push the register number REGNO of mode MODE to the stack with write-back
6219 adjusting the stack by ADJUSTMENT. */
6222 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
6223 HOST_WIDE_INT adjustment
)
6225 rtx base_rtx
= stack_pointer_rtx
;
6228 reg
= gen_rtx_REG (mode
, regno
);
6229 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
6230 plus_constant (Pmode
, base_rtx
, -adjustment
));
6231 mem
= gen_frame_mem (mode
, mem
);
6233 insn
= emit_move_insn (mem
, reg
);
6234 RTX_FRAME_RELATED_P (insn
) = 1;
6237 /* Generate and return an instruction to store the pair of registers
6238 REG and REG2 of mode MODE to location BASE with write-back adjusting
6239 the stack location BASE by ADJUSTMENT. */
6242 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
6243 HOST_WIDE_INT adjustment
)
6248 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
6249 GEN_INT (-adjustment
),
6250 GEN_INT (UNITS_PER_WORD
- adjustment
));
6252 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
6253 GEN_INT (-adjustment
),
6254 GEN_INT (UNITS_PER_WORD
- adjustment
));
6256 return gen_storewb_pairtf_di (base
, base
, reg
, reg2
,
6257 GEN_INT (-adjustment
),
6258 GEN_INT (UNITS_PER_VREG
- adjustment
));
6264 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
6265 stack pointer by ADJUSTMENT. */
6268 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
6271 machine_mode mode
= aarch64_reg_save_mode (regno1
);
6273 if (regno2
== INVALID_REGNUM
)
6274 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
6276 rtx reg1
= gen_rtx_REG (mode
, regno1
);
6277 rtx reg2
= gen_rtx_REG (mode
, regno2
);
6279 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
6281 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
6282 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
6283 RTX_FRAME_RELATED_P (insn
) = 1;
6286 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
6287 adjusting it by ADJUSTMENT afterwards. */
6290 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
6291 HOST_WIDE_INT adjustment
)
6296 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
6297 GEN_INT (UNITS_PER_WORD
));
6299 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
6300 GEN_INT (UNITS_PER_WORD
));
6302 return gen_loadwb_pairtf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
6303 GEN_INT (UNITS_PER_VREG
));
6309 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
6310 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
6314 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
6317 machine_mode mode
= aarch64_reg_save_mode (regno1
);
6318 rtx reg1
= gen_rtx_REG (mode
, regno1
);
6320 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
6322 if (regno2
== INVALID_REGNUM
)
6324 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
6325 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
6326 emit_move_insn (reg1
, gen_frame_mem (mode
, mem
));
6330 rtx reg2
= gen_rtx_REG (mode
, regno2
);
6331 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
6332 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
6337 /* Generate and return a store pair instruction of mode MODE to store
6338 register REG1 to MEM1 and register REG2 to MEM2. */
6341 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
6347 return gen_store_pair_dw_didi (mem1
, reg1
, mem2
, reg2
);
6350 return gen_store_pair_dw_dfdf (mem1
, reg1
, mem2
, reg2
);
6353 return gen_store_pair_dw_tftf (mem1
, reg1
, mem2
, reg2
);
6360 /* Generate and regurn a load pair isntruction of mode MODE to load register
6361 REG1 from MEM1 and register REG2 from MEM2. */
6364 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
6370 return gen_load_pair_dw_didi (reg1
, mem1
, reg2
, mem2
);
6373 return gen_load_pair_dw_dfdf (reg1
, mem1
, reg2
, mem2
);
6376 return gen_load_pair_dw_tftf (reg1
, mem1
, reg2
, mem2
);
6383 /* Return TRUE if return address signing should be enabled for the current
6384 function, otherwise return FALSE. */
6387 aarch64_return_address_signing_enabled (void)
6389 /* This function should only be called after frame laid out. */
6390 gcc_assert (cfun
->machine
->frame
.laid_out
);
6392 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
6393 if its LR is pushed onto stack. */
6394 return (aarch64_ra_sign_scope
== AARCH64_FUNCTION_ALL
6395 || (aarch64_ra_sign_scope
== AARCH64_FUNCTION_NON_LEAF
6396 && known_ge (cfun
->machine
->frame
.reg_offset
[LR_REGNUM
], 0)));
6399 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
6401 aarch64_bti_enabled (void)
6403 return (aarch64_enable_bti
== 1);
6406 /* The caller is going to use ST1D or LD1D to save or restore an SVE
6407 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
6408 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
6410 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
6413 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
6414 if the variable isn't already nonnull
6416 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
6417 Handle this case using a temporary base register that is suitable for
6418 all offsets in that range. Use ANCHOR_REG as this base register if it
6419 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
6422 aarch64_adjust_sve_callee_save_base (machine_mode mode
, rtx
&base_rtx
,
6423 rtx
&anchor_reg
, poly_int64
&offset
,
6426 if (maybe_ge (offset
, 8 * GET_MODE_SIZE (mode
)))
6428 /* This is the maximum valid offset of the anchor from the base.
6429 Lower values would be valid too. */
6430 poly_int64 anchor_offset
= 16 * GET_MODE_SIZE (mode
);
6433 anchor_reg
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
6434 emit_insn (gen_add3_insn (anchor_reg
, base_rtx
,
6435 gen_int_mode (anchor_offset
, Pmode
)));
6437 base_rtx
= anchor_reg
;
6438 offset
-= anchor_offset
;
6442 int pred_reg
= cfun
->machine
->frame
.spare_pred_reg
;
6443 emit_move_insn (gen_rtx_REG (VNx16BImode
, pred_reg
),
6444 CONSTM1_RTX (VNx16BImode
));
6445 ptrue
= gen_rtx_REG (VNx2BImode
, pred_reg
);
6449 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6450 is saved at BASE + OFFSET. */
6453 aarch64_add_cfa_expression (rtx_insn
*insn
, rtx reg
,
6454 rtx base
, poly_int64 offset
)
6456 rtx mem
= gen_frame_mem (GET_MODE (reg
),
6457 plus_constant (Pmode
, base
, offset
));
6458 add_reg_note (insn
, REG_CFA_EXPRESSION
, gen_rtx_SET (mem
, reg
));
6461 /* Emit code to save the callee-saved registers from register number START
6462 to LIMIT to the stack at the location starting at offset START_OFFSET,
6463 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
6464 is true if the hard frame pointer has been set up. */
6467 aarch64_save_callee_saves (poly_int64 start_offset
,
6468 unsigned start
, unsigned limit
, bool skip_wb
,
6469 bool hard_fp_valid_p
)
6474 rtx anchor_reg
= NULL_RTX
, ptrue
= NULL_RTX
;
6476 for (regno
= aarch64_next_callee_save (start
, limit
);
6478 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
6482 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
6485 && (regno
== cfun
->machine
->frame
.wb_candidate1
6486 || regno
== cfun
->machine
->frame
.wb_candidate2
))
6489 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
6492 machine_mode mode
= aarch64_reg_save_mode (regno
);
6493 reg
= gen_rtx_REG (mode
, regno
);
6494 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
6495 rtx base_rtx
= stack_pointer_rtx
;
6496 poly_int64 sp_offset
= offset
;
6498 HOST_WIDE_INT const_offset
;
6499 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
6500 aarch64_adjust_sve_callee_save_base (mode
, base_rtx
, anchor_reg
,
6502 else if (GP_REGNUM_P (regno
)
6503 && (!offset
.is_constant (&const_offset
) || const_offset
>= 512))
6505 gcc_assert (known_eq (start_offset
, 0));
6506 poly_int64 fp_offset
6507 = cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
6508 if (hard_fp_valid_p
)
6509 base_rtx
= hard_frame_pointer_rtx
;
6514 anchor_reg
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
6515 emit_insn (gen_add3_insn (anchor_reg
, base_rtx
,
6516 gen_int_mode (fp_offset
, Pmode
)));
6518 base_rtx
= anchor_reg
;
6520 offset
-= fp_offset
;
6522 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
6523 bool need_cfa_note_p
= (base_rtx
!= stack_pointer_rtx
);
6525 if (!aarch64_sve_mode_p (mode
)
6526 && (regno2
= aarch64_next_callee_save (regno
+ 1, limit
)) <= limit
6527 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
6528 && known_eq (GET_MODE_SIZE (mode
),
6529 cfun
->machine
->frame
.reg_offset
[regno2
]
6530 - cfun
->machine
->frame
.reg_offset
[regno
]))
6532 rtx reg2
= gen_rtx_REG (mode
, regno2
);
6535 offset
+= GET_MODE_SIZE (mode
);
6536 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
6537 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
6540 /* The first part of a frame-related parallel insn is
6541 always assumed to be relevant to the frame
6542 calculations; subsequent parts, are only
6543 frame-related if explicitly marked. */
6544 if (aarch64_emit_cfi_for_reg_p (regno2
))
6546 if (need_cfa_note_p
)
6547 aarch64_add_cfa_expression (insn
, reg2
, stack_pointer_rtx
,
6548 sp_offset
+ GET_MODE_SIZE (mode
));
6550 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
6555 else if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
6557 insn
= emit_insn (gen_aarch64_pred_mov (mode
, mem
, ptrue
, reg
));
6558 need_cfa_note_p
= true;
6560 else if (aarch64_sve_mode_p (mode
))
6561 insn
= emit_insn (gen_rtx_SET (mem
, reg
));
6563 insn
= emit_move_insn (mem
, reg
);
6565 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
6566 if (frame_related_p
&& need_cfa_note_p
)
6567 aarch64_add_cfa_expression (insn
, reg
, stack_pointer_rtx
, sp_offset
);
6571 /* Emit code to restore the callee registers from register number START
6572 up to and including LIMIT. Restore from the stack offset START_OFFSET,
6573 skipping any write-back candidates if SKIP_WB is true. Write the
6574 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
6577 aarch64_restore_callee_saves (poly_int64 start_offset
, unsigned start
,
6578 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
6583 rtx anchor_reg
= NULL_RTX
, ptrue
= NULL_RTX
;
6585 for (regno
= aarch64_next_callee_save (start
, limit
);
6587 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
6589 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
6590 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
6596 && (regno
== cfun
->machine
->frame
.wb_candidate1
6597 || regno
== cfun
->machine
->frame
.wb_candidate2
))
6600 machine_mode mode
= aarch64_reg_save_mode (regno
);
6601 reg
= gen_rtx_REG (mode
, regno
);
6602 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
6603 rtx base_rtx
= stack_pointer_rtx
;
6604 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
6605 aarch64_adjust_sve_callee_save_base (mode
, base_rtx
, anchor_reg
,
6607 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
6609 if (!aarch64_sve_mode_p (mode
)
6610 && (regno2
= aarch64_next_callee_save (regno
+ 1, limit
)) <= limit
6611 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
6612 && known_eq (GET_MODE_SIZE (mode
),
6613 cfun
->machine
->frame
.reg_offset
[regno2
]
6614 - cfun
->machine
->frame
.reg_offset
[regno
]))
6616 rtx reg2
= gen_rtx_REG (mode
, regno2
);
6619 offset
+= GET_MODE_SIZE (mode
);
6620 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
6621 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
6623 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
6626 else if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
6627 emit_insn (gen_aarch64_pred_mov (mode
, reg
, ptrue
, mem
));
6628 else if (aarch64_sve_mode_p (mode
))
6629 emit_insn (gen_rtx_SET (reg
, mem
));
6631 emit_move_insn (reg
, mem
);
6632 if (frame_related_p
)
6633 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
6637 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
6641 offset_4bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
6643 HOST_WIDE_INT multiple
;
6644 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
6645 && IN_RANGE (multiple
, -8, 7));
6648 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
6652 offset_6bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
6654 HOST_WIDE_INT multiple
;
6655 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
6656 && IN_RANGE (multiple
, 0, 63));
6659 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
6663 aarch64_offset_7bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
6665 HOST_WIDE_INT multiple
;
6666 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
6667 && IN_RANGE (multiple
, -64, 63));
6670 /* Return true if OFFSET is a signed 9-bit value. */
6673 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED
,
6676 HOST_WIDE_INT const_offset
;
6677 return (offset
.is_constant (&const_offset
)
6678 && IN_RANGE (const_offset
, -256, 255));
6681 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
6685 offset_9bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
6687 HOST_WIDE_INT multiple
;
6688 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
6689 && IN_RANGE (multiple
, -256, 255));
6692 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
6696 offset_12bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
6698 HOST_WIDE_INT multiple
;
6699 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
6700 && IN_RANGE (multiple
, 0, 4095));
6703 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
6706 aarch64_get_separate_components (void)
6708 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
6709 bitmap_clear (components
);
6711 /* The registers we need saved to the frame. */
6712 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
6713 if (aarch64_register_saved_on_entry (regno
))
6715 /* Punt on saves and restores that use ST1D and LD1D. We could
6716 try to be smarter, but it would involve making sure that the
6717 spare predicate register itself is safe to use at the save
6718 and restore points. Also, when a frame pointer is being used,
6719 the slots are often out of reach of ST1D and LD1D anyway. */
6720 machine_mode mode
= aarch64_reg_save_mode (regno
);
6721 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
6724 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
6726 /* If the register is saved in the first SVE save slot, we use
6727 it as a stack probe for -fstack-clash-protection. */
6728 if (flag_stack_clash_protection
6729 && maybe_ne (cfun
->machine
->frame
.below_hard_fp_saved_regs_size
, 0)
6730 && known_eq (offset
, 0))
6733 /* Get the offset relative to the register we'll use. */
6734 if (frame_pointer_needed
)
6735 offset
-= cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
6737 offset
+= crtl
->outgoing_args_size
;
6739 /* Check that we can access the stack slot of the register with one
6740 direct load with no adjustments needed. */
6741 if (aarch64_sve_mode_p (mode
)
6742 ? offset_9bit_signed_scaled_p (mode
, offset
)
6743 : offset_12bit_unsigned_scaled_p (mode
, offset
))
6744 bitmap_set_bit (components
, regno
);
6747 /* Don't mess with the hard frame pointer. */
6748 if (frame_pointer_needed
)
6749 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
6751 /* If the spare predicate register used by big-endian SVE code
6752 is call-preserved, it must be saved in the main prologue
6753 before any saves that use it. */
6754 if (cfun
->machine
->frame
.spare_pred_reg
!= INVALID_REGNUM
)
6755 bitmap_clear_bit (components
, cfun
->machine
->frame
.spare_pred_reg
);
6757 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
6758 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
6759 /* If registers have been chosen to be stored/restored with
6760 writeback don't interfere with them to avoid having to output explicit
6761 stack adjustment instructions. */
6762 if (reg2
!= INVALID_REGNUM
)
6763 bitmap_clear_bit (components
, reg2
);
6764 if (reg1
!= INVALID_REGNUM
)
6765 bitmap_clear_bit (components
, reg1
);
6767 bitmap_clear_bit (components
, LR_REGNUM
);
6768 bitmap_clear_bit (components
, SP_REGNUM
);
6773 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
6776 aarch64_components_for_bb (basic_block bb
)
6778 bitmap in
= DF_LIVE_IN (bb
);
6779 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
6780 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
6782 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
6783 bitmap_clear (components
);
6785 /* Clobbered registers don't generate values in any meaningful sense,
6786 since nothing after the clobber can rely on their value. And we can't
6787 say that partially-clobbered registers are unconditionally killed,
6788 because whether they're killed or not depends on the mode of the
6789 value they're holding. Thus partially call-clobbered registers
6790 appear in neither the kill set nor the gen set.
6792 Check manually for any calls that clobber more of a register than the
6793 current function can. */
6794 function_abi_aggregator callee_abis
;
6796 FOR_BB_INSNS (bb
, insn
)
6798 callee_abis
.note_callee_abi (insn_callee_abi (insn
));
6799 HARD_REG_SET extra_caller_saves
= callee_abis
.caller_save_regs (*crtl
->abi
);
6801 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
6802 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
6803 if (!fixed_regs
[regno
]
6804 && !crtl
->abi
->clobbers_full_reg_p (regno
)
6805 && (TEST_HARD_REG_BIT (extra_caller_saves
, regno
)
6806 || bitmap_bit_p (in
, regno
)
6807 || bitmap_bit_p (gen
, regno
)
6808 || bitmap_bit_p (kill
, regno
)))
6810 bitmap_set_bit (components
, regno
);
6812 /* If there is a callee-save at an adjacent offset, add it too
6813 to increase the use of LDP/STP. */
6814 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
6815 unsigned regno2
= multiple_p (offset
, 16) ? regno
+ 1 : regno
- 1;
6817 if (regno2
<= LAST_SAVED_REGNUM
)
6819 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
6821 ? known_eq (offset
+ 8, offset2
)
6822 : multiple_p (offset2
, 16) && known_eq (offset2
+ 8, offset
))
6823 bitmap_set_bit (components
, regno2
);
6830 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
6831 Nothing to do for aarch64. */
6834 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
6838 /* Return the next set bit in BMP from START onwards. Return the total number
6839 of bits in BMP if no set bit is found at or after START. */
6842 aarch64_get_next_set_bit (sbitmap bmp
, unsigned int start
)
6844 unsigned int nbits
= SBITMAP_SIZE (bmp
);
6848 gcc_assert (start
< nbits
);
6849 for (unsigned int i
= start
; i
< nbits
; i
++)
6850 if (bitmap_bit_p (bmp
, i
))
6856 /* Do the work for aarch64_emit_prologue_components and
6857 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
6858 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
6859 for these components or the epilogue sequence. That is, it determines
6860 whether we should emit stores or loads and what kind of CFA notes to attach
6861 to the insns. Otherwise the logic for the two sequences is very
6865 aarch64_process_components (sbitmap components
, bool prologue_p
)
6867 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
6868 ? HARD_FRAME_POINTER_REGNUM
6869 : STACK_POINTER_REGNUM
);
6871 unsigned last_regno
= SBITMAP_SIZE (components
);
6872 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
6873 rtx_insn
*insn
= NULL
;
6875 while (regno
!= last_regno
)
6877 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
6878 machine_mode mode
= aarch64_reg_save_mode (regno
);
6880 rtx reg
= gen_rtx_REG (mode
, regno
);
6881 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
6882 if (frame_pointer_needed
)
6883 offset
-= cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
6885 offset
+= crtl
->outgoing_args_size
;
6887 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
6888 rtx mem
= gen_frame_mem (mode
, addr
);
6890 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
6891 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
6892 /* No more registers to handle after REGNO.
6893 Emit a single save/restore and exit. */
6894 if (regno2
== last_regno
)
6896 insn
= emit_insn (set
);
6897 if (frame_related_p
)
6899 RTX_FRAME_RELATED_P (insn
) = 1;
6901 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
6903 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
6908 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
6909 /* The next register is not of the same class or its offset is not
6910 mergeable with the current one into a pair. */
6911 if (aarch64_sve_mode_p (mode
)
6912 || !satisfies_constraint_Ump (mem
)
6913 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
6914 || (crtl
->abi
->id () == ARM_PCS_SIMD
&& FP_REGNUM_P (regno
))
6915 || maybe_ne ((offset2
- cfun
->machine
->frame
.reg_offset
[regno
]),
6916 GET_MODE_SIZE (mode
)))
6918 insn
= emit_insn (set
);
6919 if (frame_related_p
)
6921 RTX_FRAME_RELATED_P (insn
) = 1;
6923 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
6925 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
6932 bool frame_related2_p
= aarch64_emit_cfi_for_reg_p (regno2
);
6934 /* REGNO2 can be saved/restored in a pair with REGNO. */
6935 rtx reg2
= gen_rtx_REG (mode
, regno2
);
6936 if (frame_pointer_needed
)
6937 offset2
-= cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
6939 offset2
+= crtl
->outgoing_args_size
;
6940 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
6941 rtx mem2
= gen_frame_mem (mode
, addr2
);
6942 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
6943 : gen_rtx_SET (reg2
, mem2
);
6946 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
, reg2
));
6948 insn
= emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
6950 if (frame_related_p
|| frame_related2_p
)
6952 RTX_FRAME_RELATED_P (insn
) = 1;
6955 if (frame_related_p
)
6956 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
6957 if (frame_related2_p
)
6958 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
6962 if (frame_related_p
)
6963 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
6964 if (frame_related2_p
)
6965 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
6969 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
6973 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6976 aarch64_emit_prologue_components (sbitmap components
)
6978 aarch64_process_components (components
, true);
6981 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6984 aarch64_emit_epilogue_components (sbitmap components
)
6986 aarch64_process_components (components
, false);
6989 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6992 aarch64_set_handled_components (sbitmap components
)
6994 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
6995 if (bitmap_bit_p (components
, regno
))
6996 cfun
->machine
->reg_is_wrapped_separately
[regno
] = true;
6999 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
7000 determining the probe offset for alloca. */
7002 static HOST_WIDE_INT
7003 aarch64_stack_clash_protection_alloca_probe_range (void)
7005 return STACK_CLASH_CALLER_GUARD
;
7009 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
7010 registers. If POLY_SIZE is not large enough to require a probe this function
7011 will only adjust the stack. When allocating the stack space
7012 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
7013 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
7014 arguments. If we are then we ensure that any allocation larger than the ABI
7015 defined buffer needs a probe so that the invariant of having a 1KB buffer is
7018 We emit barriers after each stack adjustment to prevent optimizations from
7019 breaking the invariant that we never drop the stack more than a page. This
7020 invariant is needed to make it easier to correctly handle asynchronous
7021 events, e.g. if we were to allow the stack to be dropped by more than a page
7022 and then have multiple probes up and we take a signal somewhere in between
7023 then the signal handler doesn't know the state of the stack and can make no
7024 assumptions about which pages have been probed. */
7027 aarch64_allocate_and_probe_stack_space (rtx temp1
, rtx temp2
,
7028 poly_int64 poly_size
,
7029 bool frame_related_p
,
7030 bool final_adjustment_p
)
7032 HOST_WIDE_INT guard_size
7033 = 1 << param_stack_clash_protection_guard_size
;
7034 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
7035 HOST_WIDE_INT min_probe_threshold
7036 = (final_adjustment_p
7037 ? guard_used_by_caller
7038 : guard_size
- guard_used_by_caller
);
7039 /* When doing the final adjustment for the outgoing arguments, take into
7040 account any unprobed space there is above the current SP. There are
7043 - When saving SVE registers below the hard frame pointer, we force
7044 the lowest save to take place in the prologue before doing the final
7045 adjustment (i.e. we don't allow the save to be shrink-wrapped).
7046 This acts as a probe at SP, so there is no unprobed space.
7048 - When there are no SVE register saves, we use the store of the link
7049 register as a probe. We can't assume that LR was saved at position 0
7050 though, so treat any space below it as unprobed. */
7051 if (final_adjustment_p
7052 && known_eq (cfun
->machine
->frame
.below_hard_fp_saved_regs_size
, 0))
7054 poly_int64 lr_offset
= cfun
->machine
->frame
.reg_offset
[LR_REGNUM
];
7055 if (known_ge (lr_offset
, 0))
7056 min_probe_threshold
-= lr_offset
.to_constant ();
7058 gcc_assert (!flag_stack_clash_protection
|| known_eq (poly_size
, 0));
7061 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
7063 /* We should always have a positive probe threshold. */
7064 gcc_assert (min_probe_threshold
> 0);
7066 if (flag_stack_clash_protection
&& !final_adjustment_p
)
7068 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
7069 poly_int64 sve_callee_adjust
= cfun
->machine
->frame
.sve_callee_adjust
;
7070 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
7072 if (known_eq (frame_size
, 0))
7074 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME
, false);
7076 else if (known_lt (initial_adjust
+ sve_callee_adjust
,
7077 guard_size
- guard_used_by_caller
)
7078 && known_lt (final_adjust
, guard_used_by_caller
))
7080 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME
, true);
7084 /* If SIZE is not large enough to require probing, just adjust the stack and
7086 if (known_lt (poly_size
, min_probe_threshold
)
7087 || !flag_stack_clash_protection
)
7089 aarch64_sub_sp (temp1
, temp2
, poly_size
, frame_related_p
);
7094 /* Handle the SVE non-constant case first. */
7095 if (!poly_size
.is_constant (&size
))
7099 fprintf (dump_file
, "Stack clash SVE prologue: ");
7100 print_dec (poly_size
, dump_file
);
7101 fprintf (dump_file
, " bytes, dynamic probing will be required.\n");
7104 /* First calculate the amount of bytes we're actually spilling. */
7105 aarch64_add_offset (Pmode
, temp1
, CONST0_RTX (Pmode
),
7106 poly_size
, temp1
, temp2
, false, true);
7108 rtx_insn
*insn
= get_last_insn ();
7110 if (frame_related_p
)
7112 /* This is done to provide unwinding information for the stack
7113 adjustments we're about to do, however to prevent the optimizers
7114 from removing the R11 move and leaving the CFA note (which would be
7115 very wrong) we tie the old and new stack pointer together.
7116 The tie will expand to nothing but the optimizers will not touch
7118 rtx stack_ptr_copy
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
7119 emit_move_insn (stack_ptr_copy
, stack_pointer_rtx
);
7120 emit_insn (gen_stack_tie (stack_ptr_copy
, stack_pointer_rtx
));
7122 /* We want the CFA independent of the stack pointer for the
7123 duration of the loop. */
7124 add_reg_note (insn
, REG_CFA_DEF_CFA
, stack_ptr_copy
);
7125 RTX_FRAME_RELATED_P (insn
) = 1;
7128 rtx probe_const
= gen_int_mode (min_probe_threshold
, Pmode
);
7129 rtx guard_const
= gen_int_mode (guard_size
, Pmode
);
7131 insn
= emit_insn (gen_probe_sve_stack_clash (Pmode
, stack_pointer_rtx
,
7132 stack_pointer_rtx
, temp1
,
7133 probe_const
, guard_const
));
7135 /* Now reset the CFA register if needed. */
7136 if (frame_related_p
)
7138 add_reg_note (insn
, REG_CFA_DEF_CFA
,
7139 gen_rtx_PLUS (Pmode
, stack_pointer_rtx
,
7140 gen_int_mode (poly_size
, Pmode
)));
7141 RTX_FRAME_RELATED_P (insn
) = 1;
7149 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7150 " bytes, probing will be required.\n", size
);
7152 /* Round size to the nearest multiple of guard_size, and calculate the
7153 residual as the difference between the original size and the rounded
7155 HOST_WIDE_INT rounded_size
= ROUND_DOWN (size
, guard_size
);
7156 HOST_WIDE_INT residual
= size
- rounded_size
;
7158 /* We can handle a small number of allocations/probes inline. Otherwise
7160 if (rounded_size
<= STACK_CLASH_MAX_UNROLL_PAGES
* guard_size
)
7162 for (HOST_WIDE_INT i
= 0; i
< rounded_size
; i
+= guard_size
)
7164 aarch64_sub_sp (NULL
, temp2
, guard_size
, true);
7165 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
7166 guard_used_by_caller
));
7167 emit_insn (gen_blockage ());
7169 dump_stack_clash_frame_info (PROBE_INLINE
, size
!= rounded_size
);
7173 /* Compute the ending address. */
7174 aarch64_add_offset (Pmode
, temp1
, stack_pointer_rtx
, -rounded_size
,
7175 temp1
, NULL
, false, true);
7176 rtx_insn
*insn
= get_last_insn ();
7178 /* For the initial allocation, we don't have a frame pointer
7179 set up, so we always need CFI notes. If we're doing the
7180 final allocation, then we may have a frame pointer, in which
7181 case it is the CFA, otherwise we need CFI notes.
7183 We can determine which allocation we are doing by looking at
7184 the value of FRAME_RELATED_P since the final allocations are not
7186 if (frame_related_p
)
7188 /* We want the CFA independent of the stack pointer for the
7189 duration of the loop. */
7190 add_reg_note (insn
, REG_CFA_DEF_CFA
,
7191 plus_constant (Pmode
, temp1
, rounded_size
));
7192 RTX_FRAME_RELATED_P (insn
) = 1;
7195 /* This allocates and probes the stack. Note that this re-uses some of
7196 the existing Ada stack protection code. However we are guaranteed not
7197 to enter the non loop or residual branches of that code.
7199 The non-loop part won't be entered because if our allocation amount
7200 doesn't require a loop, the case above would handle it.
7202 The residual amount won't be entered because TEMP1 is a mutliple of
7203 the allocation size. The residual will always be 0. As such, the only
7204 part we are actually using from that code is the loop setup. The
7205 actual probing is done in aarch64_output_probe_stack_range. */
7206 insn
= emit_insn (gen_probe_stack_range (stack_pointer_rtx
,
7207 stack_pointer_rtx
, temp1
));
7209 /* Now reset the CFA register if needed. */
7210 if (frame_related_p
)
7212 add_reg_note (insn
, REG_CFA_DEF_CFA
,
7213 plus_constant (Pmode
, stack_pointer_rtx
, rounded_size
));
7214 RTX_FRAME_RELATED_P (insn
) = 1;
7217 emit_insn (gen_blockage ());
7218 dump_stack_clash_frame_info (PROBE_LOOP
, size
!= rounded_size
);
7221 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
7222 be probed. This maintains the requirement that each page is probed at
7223 least once. For initial probing we probe only if the allocation is
7224 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
7225 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
7226 GUARD_SIZE. This works that for any allocation that is large enough to
7227 trigger a probe here, we'll have at least one, and if they're not large
7228 enough for this code to emit anything for them, The page would have been
7229 probed by the saving of FP/LR either by this function or any callees. If
7230 we don't have any callees then we won't have more stack adjustments and so
7234 HOST_WIDE_INT residual_probe_offset
= guard_used_by_caller
;
7235 /* If we're doing final adjustments, and we've done any full page
7236 allocations then any residual needs to be probed. */
7237 if (final_adjustment_p
&& rounded_size
!= 0)
7238 min_probe_threshold
= 0;
7239 /* If doing a small final adjustment, we always probe at offset 0.
7240 This is done to avoid issues when LR is not at position 0 or when
7241 the final adjustment is smaller than the probing offset. */
7242 else if (final_adjustment_p
&& rounded_size
== 0)
7243 residual_probe_offset
= 0;
7245 aarch64_sub_sp (temp1
, temp2
, residual
, frame_related_p
);
7246 if (residual
>= min_probe_threshold
)
7250 "Stack clash AArch64 prologue residuals: "
7251 HOST_WIDE_INT_PRINT_DEC
" bytes, probing will be required."
7254 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
7255 residual_probe_offset
));
7256 emit_insn (gen_blockage ());
7261 /* Return 1 if the register is used by the epilogue. We need to say the
7262 return register is used, but only after epilogue generation is complete.
7263 Note that in the case of sibcalls, the values "used by the epilogue" are
7264 considered live at the start of the called function.
7266 For SIMD functions we need to return 1 for FP registers that are saved and
7267 restored by a function but are not zero in call_used_regs. If we do not do
7268 this optimizations may remove the restore of the register. */
7271 aarch64_epilogue_uses (int regno
)
7273 if (epilogue_completed
)
7275 if (regno
== LR_REGNUM
)
7281 /* AArch64 stack frames generated by this compiler look like:
7283 +-------------------------------+
7285 | incoming stack arguments |
7287 +-------------------------------+
7288 | | <-- incoming stack pointer (aligned)
7289 | callee-allocated save area |
7290 | for register varargs |
7292 +-------------------------------+
7293 | local variables | <-- frame_pointer_rtx
7295 +-------------------------------+
7297 +-------------------------------+ |
7298 | callee-saved registers | | frame.saved_regs_size
7299 +-------------------------------+ |
7301 +-------------------------------+ |
7303 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
7304 | SVE vector registers | | \
7305 +-------------------------------+ | | below_hard_fp_saved_regs_size
7306 | SVE predicate registers | / /
7307 +-------------------------------+
7308 | dynamic allocation |
7309 +-------------------------------+
7311 +-------------------------------+
7312 | outgoing stack arguments | <-- arg_pointer
7314 +-------------------------------+
7315 | | <-- stack_pointer_rtx (aligned)
7317 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
7318 but leave frame_pointer_rtx and hard_frame_pointer_rtx
7321 By default for stack-clash we assume the guard is at least 64KB, but this
7322 value is configurable to either 4KB or 64KB. We also force the guard size to
7323 be the same as the probing interval and both values are kept in sync.
7325 With those assumptions the callee can allocate up to 63KB (or 3KB depending
7326 on the guard size) of stack space without probing.
7328 When probing is needed, we emit a probe at the start of the prologue
7329 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
7331 We have to track how much space has been allocated and the only stores
7332 to the stack we track as implicit probes are the FP/LR stores.
7334 For outgoing arguments we probe if the size is larger than 1KB, such that
7335 the ABI specified buffer is maintained for the next callee.
7337 The following registers are reserved during frame layout and should not be
7338 used for any other purpose:
7340 - r11: Used by stack clash protection when SVE is enabled, and also
7341 as an anchor register when saving and restoring registers
7342 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
7343 - r14 and r15: Used for speculation tracking.
7344 - r16(IP0), r17(IP1): Used by indirect tailcalls.
7345 - r30(LR), r29(FP): Used by standard frame layout.
7347 These registers must be avoided in frame layout related code unless the
7348 explicit intention is to interact with one of the features listed above. */
7350 /* Generate the prologue instructions for entry into a function.
7351 Establish the stack frame by decreasing the stack pointer with a
7352 properly calculated size and, if necessary, create a frame record
7353 filled with the values of LR and previous frame pointer. The
7354 current FP is also set up if it is in use. */
7357 aarch64_expand_prologue (void)
7359 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
7360 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
7361 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
7362 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
7363 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
7364 poly_int64 sve_callee_adjust
= cfun
->machine
->frame
.sve_callee_adjust
;
7365 poly_int64 below_hard_fp_saved_regs_size
7366 = cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
7367 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
7368 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
7369 bool emit_frame_chain
= cfun
->machine
->frame
.emit_frame_chain
;
7372 if (flag_stack_clash_protection
&& known_eq (callee_adjust
, 0))
7374 /* Fold the SVE allocation into the initial allocation.
7375 We don't do this in aarch64_layout_arg to avoid pessimizing
7376 the epilogue code. */
7377 initial_adjust
+= sve_callee_adjust
;
7378 sve_callee_adjust
= 0;
7381 /* Sign return address for functions. */
7382 if (aarch64_return_address_signing_enabled ())
7384 switch (aarch64_ra_sign_key
)
7387 insn
= emit_insn (gen_paciasp ());
7390 insn
= emit_insn (gen_pacibsp ());
7395 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
7396 RTX_FRAME_RELATED_P (insn
) = 1;
7399 if (flag_stack_usage_info
)
7400 current_function_static_stack_size
= constant_lower_bound (frame_size
);
7402 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
7404 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
7406 if (maybe_gt (frame_size
, PROBE_INTERVAL
)
7407 && maybe_gt (frame_size
, get_stack_check_protect ()))
7408 aarch64_emit_probe_stack_range (get_stack_check_protect (),
7410 - get_stack_check_protect ()));
7412 else if (maybe_gt (frame_size
, 0))
7413 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size
);
7416 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
7417 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
7419 /* In theory we should never have both an initial adjustment
7420 and a callee save adjustment. Verify that is the case since the
7421 code below does not handle it for -fstack-clash-protection. */
7422 gcc_assert (known_eq (initial_adjust
, 0) || callee_adjust
== 0);
7424 /* Will only probe if the initial adjustment is larger than the guard
7425 less the amount of the guard reserved for use by the caller's
7427 aarch64_allocate_and_probe_stack_space (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
7430 if (callee_adjust
!= 0)
7431 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
7433 /* The offset of the frame chain record (if any) from the current SP. */
7434 poly_int64 chain_offset
= (initial_adjust
+ callee_adjust
7435 - cfun
->machine
->frame
.hard_fp_offset
);
7436 gcc_assert (known_ge (chain_offset
, 0));
7438 /* The offset of the bottom of the save area from the current SP. */
7439 poly_int64 saved_regs_offset
= chain_offset
- below_hard_fp_saved_regs_size
;
7441 if (emit_frame_chain
)
7443 if (callee_adjust
== 0)
7447 aarch64_save_callee_saves (saved_regs_offset
, reg1
, reg2
,
7451 gcc_assert (known_eq (chain_offset
, 0));
7452 aarch64_add_offset (Pmode
, hard_frame_pointer_rtx
,
7453 stack_pointer_rtx
, chain_offset
,
7454 tmp1_rtx
, tmp0_rtx
, frame_pointer_needed
);
7455 if (frame_pointer_needed
&& !frame_size
.is_constant ())
7457 /* Variable-sized frames need to describe the save slot
7458 address using DW_CFA_expression rather than DW_CFA_offset.
7459 This means that, without taking further action, the
7460 locations of the registers that we've already saved would
7461 remain based on the stack pointer even after we redefine
7462 the CFA based on the frame pointer. We therefore need new
7463 DW_CFA_expressions to re-express the save slots with addresses
7464 based on the frame pointer. */
7465 rtx_insn
*insn
= get_last_insn ();
7466 gcc_assert (RTX_FRAME_RELATED_P (insn
));
7468 /* Add an explicit CFA definition if this was previously
7470 if (!find_reg_note (insn
, REG_CFA_ADJUST_CFA
, NULL_RTX
))
7472 rtx src
= plus_constant (Pmode
, stack_pointer_rtx
,
7474 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
7475 gen_rtx_SET (hard_frame_pointer_rtx
, src
));
7478 /* Change the save slot expressions for the registers that
7479 we've already saved. */
7480 aarch64_add_cfa_expression (insn
, regno_reg_rtx
[reg2
],
7481 hard_frame_pointer_rtx
, UNITS_PER_WORD
);
7482 aarch64_add_cfa_expression (insn
, regno_reg_rtx
[reg1
],
7483 hard_frame_pointer_rtx
, 0);
7485 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
7488 aarch64_save_callee_saves (saved_regs_offset
, R0_REGNUM
, R30_REGNUM
,
7489 callee_adjust
!= 0 || emit_frame_chain
,
7491 if (maybe_ne (sve_callee_adjust
, 0))
7493 gcc_assert (!flag_stack_clash_protection
7494 || known_eq (initial_adjust
, 0));
7495 aarch64_allocate_and_probe_stack_space (tmp1_rtx
, tmp0_rtx
,
7497 !frame_pointer_needed
, false);
7498 saved_regs_offset
+= sve_callee_adjust
;
7500 aarch64_save_callee_saves (saved_regs_offset
, P0_REGNUM
, P15_REGNUM
,
7501 false, emit_frame_chain
);
7502 aarch64_save_callee_saves (saved_regs_offset
, V0_REGNUM
, V31_REGNUM
,
7503 callee_adjust
!= 0 || emit_frame_chain
,
7506 /* We may need to probe the final adjustment if it is larger than the guard
7507 that is assumed by the called. */
7508 aarch64_allocate_and_probe_stack_space (tmp1_rtx
, tmp0_rtx
, final_adjust
,
7509 !frame_pointer_needed
, true);
7512 /* Return TRUE if we can use a simple_return insn.
7514 This function checks whether the callee saved stack is empty, which
7515 means no restore actions are need. The pro_and_epilogue will use
7516 this to check whether shrink-wrapping opt is feasible. */
7519 aarch64_use_return_insn_p (void)
7521 if (!reload_completed
)
7527 return known_eq (cfun
->machine
->frame
.frame_size
, 0);
7530 /* Generate the epilogue instructions for returning from a function.
7531 This is almost exactly the reverse of the prolog sequence, except
7532 that we need to insert barriers to avoid scheduling loads that read
7533 from a deallocated stack, and we optimize the unwind records by
7534 emitting them all together if possible. */
7536 aarch64_expand_epilogue (bool for_sibcall
)
7538 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
7539 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
7540 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
7541 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
7542 poly_int64 sve_callee_adjust
= cfun
->machine
->frame
.sve_callee_adjust
;
7543 poly_int64 below_hard_fp_saved_regs_size
7544 = cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
7545 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
7546 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
7549 /* A stack clash protection prologue may not have left EP0_REGNUM or
7550 EP1_REGNUM in a usable state. The same is true for allocations
7551 with an SVE component, since we then need both temporary registers
7552 for each allocation. For stack clash we are in a usable state if
7553 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
7554 HOST_WIDE_INT guard_size
7555 = 1 << param_stack_clash_protection_guard_size
;
7556 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
7558 /* We can re-use the registers when:
7560 (a) the deallocation amount is the same as the corresponding
7561 allocation amount (which is false if we combine the initial
7562 and SVE callee save allocations in the prologue); and
7564 (b) the allocation amount doesn't need a probe (which is false
7565 if the amount is guard_size - guard_used_by_caller or greater).
7567 In such situations the register should remain live with the correct
7569 bool can_inherit_p
= (initial_adjust
.is_constant ()
7570 && final_adjust
.is_constant ()
7571 && (!flag_stack_clash_protection
7572 || (known_lt (initial_adjust
,
7573 guard_size
- guard_used_by_caller
)
7574 && known_eq (sve_callee_adjust
, 0))));
7576 /* We need to add memory barrier to prevent read from deallocated stack. */
7578 = maybe_ne (get_frame_size ()
7579 + cfun
->machine
->frame
.saved_varargs_size
, 0);
7581 /* Emit a barrier to prevent loads from a deallocated stack. */
7582 if (maybe_gt (final_adjust
, crtl
->outgoing_args_size
)
7583 || cfun
->calls_alloca
7584 || crtl
->calls_eh_return
)
7586 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
7587 need_barrier_p
= false;
7590 /* Restore the stack pointer from the frame pointer if it may not
7591 be the same as the stack pointer. */
7592 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
7593 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
7594 if (frame_pointer_needed
7595 && (maybe_ne (final_adjust
, 0) || cfun
->calls_alloca
))
7596 /* If writeback is used when restoring callee-saves, the CFA
7597 is restored on the instruction doing the writeback. */
7598 aarch64_add_offset (Pmode
, stack_pointer_rtx
,
7599 hard_frame_pointer_rtx
,
7600 -callee_offset
- below_hard_fp_saved_regs_size
,
7601 tmp1_rtx
, tmp0_rtx
, callee_adjust
== 0);
7603 /* The case where we need to re-use the register here is very rare, so
7604 avoid the complicated condition and just always emit a move if the
7605 immediate doesn't fit. */
7606 aarch64_add_sp (tmp1_rtx
, tmp0_rtx
, final_adjust
, true);
7608 /* Restore the vector registers before the predicate registers,
7609 so that we can use P4 as a temporary for big-endian SVE frames. */
7610 aarch64_restore_callee_saves (callee_offset
, V0_REGNUM
, V31_REGNUM
,
7611 callee_adjust
!= 0, &cfi_ops
);
7612 aarch64_restore_callee_saves (callee_offset
, P0_REGNUM
, P15_REGNUM
,
7614 if (maybe_ne (sve_callee_adjust
, 0))
7615 aarch64_add_sp (NULL_RTX
, NULL_RTX
, sve_callee_adjust
, true);
7616 aarch64_restore_callee_saves (callee_offset
- sve_callee_adjust
,
7617 R0_REGNUM
, R30_REGNUM
,
7618 callee_adjust
!= 0, &cfi_ops
);
7621 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
7623 if (callee_adjust
!= 0)
7624 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
7626 if (callee_adjust
!= 0 || maybe_gt (initial_adjust
, 65536))
7628 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
7629 insn
= get_last_insn ();
7630 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
7631 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
7632 RTX_FRAME_RELATED_P (insn
) = 1;
7636 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
7637 add restriction on emit_move optimization to leaf functions. */
7638 aarch64_add_sp (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
7639 (!can_inherit_p
|| !crtl
->is_leaf
7640 || df_regs_ever_live_p (EP0_REGNUM
)));
7644 /* Emit delayed restores and reset the CFA to be SP. */
7645 insn
= get_last_insn ();
7646 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
7647 REG_NOTES (insn
) = cfi_ops
;
7648 RTX_FRAME_RELATED_P (insn
) = 1;
7651 /* We prefer to emit the combined return/authenticate instruction RETAA,
7652 however there are three cases in which we must instead emit an explicit
7653 authentication instruction.
7655 1) Sibcalls don't return in a normal way, so if we're about to call one
7656 we must authenticate.
7658 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
7659 generating code for !TARGET_ARMV8_3 we can't use it and must
7660 explicitly authenticate.
7662 3) On an eh_return path we make extra stack adjustments to update the
7663 canonical frame address to be the exception handler's CFA. We want
7664 to authenticate using the CFA of the function which calls eh_return.
7666 if (aarch64_return_address_signing_enabled ()
7667 && (for_sibcall
|| !TARGET_ARMV8_3
|| crtl
->calls_eh_return
))
7669 switch (aarch64_ra_sign_key
)
7672 insn
= emit_insn (gen_autiasp ());
7675 insn
= emit_insn (gen_autibsp ());
7680 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
7681 RTX_FRAME_RELATED_P (insn
) = 1;
7684 /* Stack adjustment for exception handler. */
7685 if (crtl
->calls_eh_return
&& !for_sibcall
)
7687 /* We need to unwind the stack by the offset computed by
7688 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
7689 to be SP; letting the CFA move during this adjustment
7690 is just as correct as retaining the CFA from the body
7691 of the function. Therefore, do nothing special. */
7692 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
7695 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
7697 emit_jump_insn (ret_rtx
);
7700 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
7701 normally or return to a previous frame after unwinding.
7703 An EH return uses a single shared return sequence. The epilogue is
7704 exactly like a normal epilogue except that it has an extra input
7705 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
7706 that must be applied after the frame has been destroyed. An extra label
7707 is inserted before the epilogue which initializes this register to zero,
7708 and this is the entry point for a normal return.
7710 An actual EH return updates the return address, initializes the stack
7711 adjustment and jumps directly into the epilogue (bypassing the zeroing
7712 of the adjustment). Since the return address is typically saved on the
7713 stack when a function makes a call, the saved LR must be updated outside
7716 This poses problems as the store is generated well before the epilogue,
7717 so the offset of LR is not known yet. Also optimizations will remove the
7718 store as it appears dead, even after the epilogue is generated (as the
7719 base or offset for loading LR is different in many cases).
7721 To avoid these problems this implementation forces the frame pointer
7722 in eh_return functions so that the location of LR is fixed and known early.
7723 It also marks the store volatile, so no optimization is permitted to
7724 remove the store. */
7726 aarch64_eh_return_handler_rtx (void)
7728 rtx tmp
= gen_frame_mem (Pmode
,
7729 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
7731 /* Mark the store volatile, so no optimization is permitted to remove it. */
7732 MEM_VOLATILE_P (tmp
) = true;
7736 /* Output code to add DELTA to the first argument, and then jump
7737 to FUNCTION. Used for C++ multiple inheritance. */
7739 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
7740 HOST_WIDE_INT delta
,
7741 HOST_WIDE_INT vcall_offset
,
7744 /* The this pointer is always in x0. Note that this differs from
7745 Arm where the this pointer maybe bumped to r1 if r0 is required
7746 to return a pointer to an aggregate. On AArch64 a result value
7747 pointer will be in x8. */
7748 int this_regno
= R0_REGNUM
;
7749 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
7751 const char *fnname
= IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk
));
7753 if (aarch64_bti_enabled ())
7754 emit_insn (gen_bti_c());
7756 reload_completed
= 1;
7757 emit_note (NOTE_INSN_PROLOGUE_END
);
7759 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
7760 temp0
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
7761 temp1
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
7763 if (vcall_offset
== 0)
7764 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
, temp1
, temp0
, false);
7767 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
7772 if (delta
>= -256 && delta
< 256)
7773 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
7774 plus_constant (Pmode
, this_rtx
, delta
));
7776 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
,
7777 temp1
, temp0
, false);
7780 if (Pmode
== ptr_mode
)
7781 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
7783 aarch64_emit_move (temp0
,
7784 gen_rtx_ZERO_EXTEND (Pmode
,
7785 gen_rtx_MEM (ptr_mode
, addr
)));
7787 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
7788 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
7791 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
7793 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
7796 if (Pmode
== ptr_mode
)
7797 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
7799 aarch64_emit_move (temp1
,
7800 gen_rtx_SIGN_EXTEND (Pmode
,
7801 gen_rtx_MEM (ptr_mode
, addr
)));
7803 emit_insn (gen_add2_insn (this_rtx
, temp1
));
7806 /* Generate a tail call to the target function. */
7807 if (!TREE_USED (function
))
7809 assemble_external (function
);
7810 TREE_USED (function
) = 1;
7812 funexp
= XEXP (DECL_RTL (function
), 0);
7813 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
7814 rtx callee_abi
= gen_int_mode (fndecl_abi (function
).id (), DImode
);
7815 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, callee_abi
));
7816 SIBLING_CALL_P (insn
) = 1;
7818 insn
= get_insns ();
7819 shorten_branches (insn
);
7821 assemble_start_function (thunk
, fnname
);
7822 final_start_function (insn
, file
, 1);
7823 final (insn
, file
, 1);
7824 final_end_function ();
7825 assemble_end_function (thunk
, fnname
);
7827 /* Stop pretending to be a post-reload pass. */
7828 reload_completed
= 0;
7832 aarch64_tls_referenced_p (rtx x
)
7834 if (!TARGET_HAVE_TLS
)
7836 subrtx_iterator::array_type array
;
7837 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
7839 const_rtx x
= *iter
;
7840 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
7842 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
7843 TLS offsets, not real symbol references. */
7844 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
7845 iter
.skip_subrtxes ();
7851 /* Return true if val can be encoded as a 12-bit unsigned immediate with
7852 a left shift of 0 or 12 bits. */
7854 aarch64_uimm12_shift (HOST_WIDE_INT val
)
7856 return ((val
& (((HOST_WIDE_INT
) 0xfff) << 0)) == val
7857 || (val
& (((HOST_WIDE_INT
) 0xfff) << 12)) == val
7861 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
7862 that can be created with a left shift of 0 or 12. */
7863 static HOST_WIDE_INT
7864 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
)
7866 /* Check to see if the value fits in 24 bits, as that is the maximum we can
7867 handle correctly. */
7868 gcc_assert ((val
& 0xffffff) == val
);
7870 if (((val
& 0xfff) << 0) == val
)
7873 return val
& (0xfff << 12);
7876 /* Return true if val is an immediate that can be loaded into a
7877 register by a MOVZ instruction. */
7879 aarch64_movw_imm (HOST_WIDE_INT val
, scalar_int_mode mode
)
7881 if (GET_MODE_SIZE (mode
) > 4)
7883 if ((val
& (((HOST_WIDE_INT
) 0xffff) << 32)) == val
7884 || (val
& (((HOST_WIDE_INT
) 0xffff) << 48)) == val
)
7889 /* Ignore sign extension. */
7890 val
&= (HOST_WIDE_INT
) 0xffffffff;
7892 return ((val
& (((HOST_WIDE_INT
) 0xffff) << 0)) == val
7893 || (val
& (((HOST_WIDE_INT
) 0xffff) << 16)) == val
);
7896 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
7897 64-bit (DImode) integer. */
7899 static unsigned HOST_WIDE_INT
7900 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val
, machine_mode mode
)
7902 unsigned int size
= GET_MODE_UNIT_PRECISION (mode
);
7905 val
&= (HOST_WIDE_INT_1U
<< size
) - 1;
7912 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
7914 static const unsigned HOST_WIDE_INT bitmask_imm_mul
[] =
7916 0x0000000100000001ull
,
7917 0x0001000100010001ull
,
7918 0x0101010101010101ull
,
7919 0x1111111111111111ull
,
7920 0x5555555555555555ull
,
7924 /* Return true if val is a valid bitmask immediate. */
7927 aarch64_bitmask_imm (HOST_WIDE_INT val_in
, machine_mode mode
)
7929 unsigned HOST_WIDE_INT val
, tmp
, mask
, first_one
, next_one
;
7932 /* Check for a single sequence of one bits and return quickly if so.
7933 The special cases of all ones and all zeroes returns false. */
7934 val
= aarch64_replicate_bitmask_imm (val_in
, mode
);
7935 tmp
= val
+ (val
& -val
);
7937 if (tmp
== (tmp
& -tmp
))
7938 return (val
+ 1) > 1;
7940 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7942 val
= (val
<< 32) | (val
& 0xffffffff);
7944 /* Invert if the immediate doesn't start with a zero bit - this means we
7945 only need to search for sequences of one bits. */
7949 /* Find the first set bit and set tmp to val with the first sequence of one
7950 bits removed. Return success if there is a single sequence of ones. */
7951 first_one
= val
& -val
;
7952 tmp
= val
& (val
+ first_one
);
7957 /* Find the next set bit and compute the difference in bit position. */
7958 next_one
= tmp
& -tmp
;
7959 bits
= clz_hwi (first_one
) - clz_hwi (next_one
);
7962 /* Check the bit position difference is a power of 2, and that the first
7963 sequence of one bits fits within 'bits' bits. */
7964 if ((mask
>> bits
) != 0 || bits
!= (bits
& -bits
))
7967 /* Check the sequence of one bits is repeated 64/bits times. */
7968 return val
== mask
* bitmask_imm_mul
[__builtin_clz (bits
) - 26];
7971 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
7972 Assumed precondition: VAL_IN Is not zero. */
7974 unsigned HOST_WIDE_INT
7975 aarch64_and_split_imm1 (HOST_WIDE_INT val_in
)
7977 int lowest_bit_set
= ctz_hwi (val_in
);
7978 int highest_bit_set
= floor_log2 (val_in
);
7979 gcc_assert (val_in
!= 0);
7981 return ((HOST_WIDE_INT_UC (2) << highest_bit_set
) -
7982 (HOST_WIDE_INT_1U
<< lowest_bit_set
));
7985 /* Create constant where bits outside of lowest bit set to highest bit set
7988 unsigned HOST_WIDE_INT
7989 aarch64_and_split_imm2 (HOST_WIDE_INT val_in
)
7991 return val_in
| ~aarch64_and_split_imm1 (val_in
);
7994 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7997 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in
, machine_mode mode
)
7999 scalar_int_mode int_mode
;
8000 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
8003 if (aarch64_bitmask_imm (val_in
, int_mode
))
8006 if (aarch64_move_imm (val_in
, int_mode
))
8009 unsigned HOST_WIDE_INT imm2
= aarch64_and_split_imm2 (val_in
);
8011 return aarch64_bitmask_imm (imm2
, int_mode
);
8014 /* Return true if val is an immediate that can be loaded into a
8015 register in a single instruction. */
8017 aarch64_move_imm (HOST_WIDE_INT val
, machine_mode mode
)
8019 scalar_int_mode int_mode
;
8020 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
8023 if (aarch64_movw_imm (val
, int_mode
) || aarch64_movw_imm (~val
, int_mode
))
8025 return aarch64_bitmask_imm (val
, int_mode
);
8029 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
8033 if (GET_CODE (x
) == HIGH
)
8036 /* There's no way to calculate VL-based values using relocations. */
8037 subrtx_iterator::array_type array
;
8038 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
8039 if (GET_CODE (*iter
) == CONST_POLY_INT
)
8042 split_const (x
, &base
, &offset
);
8043 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
8045 if (aarch64_classify_symbol (base
, INTVAL (offset
))
8046 != SYMBOL_FORCE_TO_MEM
)
8049 /* Avoid generating a 64-bit relocation in ILP32; leave
8050 to aarch64_expand_mov_immediate to handle it properly. */
8051 return mode
!= ptr_mode
;
8054 return aarch64_tls_referenced_p (x
);
8057 /* Implement TARGET_CASE_VALUES_THRESHOLD.
8058 The expansion for a table switch is quite expensive due to the number
8059 of instructions, the table lookup and hard to predict indirect jump.
8060 When optimizing for speed, and -O3 enabled, use the per-core tuning if
8061 set, otherwise use tables for > 16 cases as a tradeoff between size and
8062 performance. When optimizing for size, use the default setting. */
8065 aarch64_case_values_threshold (void)
8067 /* Use the specified limit for the number of cases before using jump
8068 tables at higher optimization levels. */
8070 && selected_cpu
->tune
->max_case_values
!= 0)
8071 return selected_cpu
->tune
->max_case_values
;
8073 return optimize_size
? default_case_values_threshold () : 17;
8076 /* Return true if register REGNO is a valid index register.
8077 STRICT_P is true if REG_OK_STRICT is in effect. */
8080 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
8082 if (!HARD_REGISTER_NUM_P (regno
))
8090 regno
= reg_renumber
[regno
];
8092 return GP_REGNUM_P (regno
);
8095 /* Return true if register REGNO is a valid base register for mode MODE.
8096 STRICT_P is true if REG_OK_STRICT is in effect. */
8099 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
8101 if (!HARD_REGISTER_NUM_P (regno
))
8109 regno
= reg_renumber
[regno
];
8112 /* The fake registers will be eliminated to either the stack or
8113 hard frame pointer, both of which are usually valid base registers.
8114 Reload deals with the cases where the eliminated form isn't valid. */
8115 return (GP_REGNUM_P (regno
)
8116 || regno
== SP_REGNUM
8117 || regno
== FRAME_POINTER_REGNUM
8118 || regno
== ARG_POINTER_REGNUM
);
8121 /* Return true if X is a valid base register for mode MODE.
8122 STRICT_P is true if REG_OK_STRICT is in effect. */
8125 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
8128 && GET_CODE (x
) == SUBREG
8129 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (x
))])
8132 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
8135 /* Return true if address offset is a valid index. If it is, fill in INFO
8136 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
8139 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
8140 machine_mode mode
, bool strict_p
)
8142 enum aarch64_address_type type
;
8147 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
8148 && GET_MODE (x
) == Pmode
)
8150 type
= ADDRESS_REG_REG
;
8154 /* (sign_extend:DI (reg:SI)) */
8155 else if ((GET_CODE (x
) == SIGN_EXTEND
8156 || GET_CODE (x
) == ZERO_EXTEND
)
8157 && GET_MODE (x
) == DImode
8158 && GET_MODE (XEXP (x
, 0)) == SImode
)
8160 type
= (GET_CODE (x
) == SIGN_EXTEND
)
8161 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
8162 index
= XEXP (x
, 0);
8165 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8166 else if (GET_CODE (x
) == MULT
8167 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
8168 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
8169 && GET_MODE (XEXP (x
, 0)) == DImode
8170 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
8171 && CONST_INT_P (XEXP (x
, 1)))
8173 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
8174 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
8175 index
= XEXP (XEXP (x
, 0), 0);
8176 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
8178 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8179 else if (GET_CODE (x
) == ASHIFT
8180 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
8181 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
8182 && GET_MODE (XEXP (x
, 0)) == DImode
8183 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
8184 && CONST_INT_P (XEXP (x
, 1)))
8186 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
8187 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
8188 index
= XEXP (XEXP (x
, 0), 0);
8189 shift
= INTVAL (XEXP (x
, 1));
8191 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
8192 else if ((GET_CODE (x
) == SIGN_EXTRACT
8193 || GET_CODE (x
) == ZERO_EXTRACT
)
8194 && GET_MODE (x
) == DImode
8195 && GET_CODE (XEXP (x
, 0)) == MULT
8196 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
8197 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
8199 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
8200 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
8201 index
= XEXP (XEXP (x
, 0), 0);
8202 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
8203 if (INTVAL (XEXP (x
, 1)) != 32 + shift
8204 || INTVAL (XEXP (x
, 2)) != 0)
8207 /* (and:DI (mult:DI (reg:DI) (const_int scale))
8208 (const_int 0xffffffff<<shift)) */
8209 else if (GET_CODE (x
) == AND
8210 && GET_MODE (x
) == DImode
8211 && GET_CODE (XEXP (x
, 0)) == MULT
8212 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
8213 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
8214 && CONST_INT_P (XEXP (x
, 1)))
8216 type
= ADDRESS_REG_UXTW
;
8217 index
= XEXP (XEXP (x
, 0), 0);
8218 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
8219 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
8222 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
8223 else if ((GET_CODE (x
) == SIGN_EXTRACT
8224 || GET_CODE (x
) == ZERO_EXTRACT
)
8225 && GET_MODE (x
) == DImode
8226 && GET_CODE (XEXP (x
, 0)) == ASHIFT
8227 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
8228 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
8230 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
8231 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
8232 index
= XEXP (XEXP (x
, 0), 0);
8233 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
8234 if (INTVAL (XEXP (x
, 1)) != 32 + shift
8235 || INTVAL (XEXP (x
, 2)) != 0)
8238 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
8239 (const_int 0xffffffff<<shift)) */
8240 else if (GET_CODE (x
) == AND
8241 && GET_MODE (x
) == DImode
8242 && GET_CODE (XEXP (x
, 0)) == ASHIFT
8243 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
8244 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
8245 && CONST_INT_P (XEXP (x
, 1)))
8247 type
= ADDRESS_REG_UXTW
;
8248 index
= XEXP (XEXP (x
, 0), 0);
8249 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
8250 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
8253 /* (mult:P (reg:P) (const_int scale)) */
8254 else if (GET_CODE (x
) == MULT
8255 && GET_MODE (x
) == Pmode
8256 && GET_MODE (XEXP (x
, 0)) == Pmode
8257 && CONST_INT_P (XEXP (x
, 1)))
8259 type
= ADDRESS_REG_REG
;
8260 index
= XEXP (x
, 0);
8261 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
8263 /* (ashift:P (reg:P) (const_int shift)) */
8264 else if (GET_CODE (x
) == ASHIFT
8265 && GET_MODE (x
) == Pmode
8266 && GET_MODE (XEXP (x
, 0)) == Pmode
8267 && CONST_INT_P (XEXP (x
, 1)))
8269 type
= ADDRESS_REG_REG
;
8270 index
= XEXP (x
, 0);
8271 shift
= INTVAL (XEXP (x
, 1));
8277 && GET_CODE (index
) == SUBREG
8278 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
8279 index
= SUBREG_REG (index
);
8281 if (aarch64_sve_data_mode_p (mode
))
8283 if (type
!= ADDRESS_REG_REG
8284 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
8290 && !(IN_RANGE (shift
, 1, 3)
8291 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
8296 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
8299 info
->offset
= index
;
8300 info
->shift
= shift
;
8307 /* Return true if MODE is one of the modes for which we
8308 support LDP/STP operations. */
8311 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
8313 return mode
== SImode
|| mode
== DImode
8314 || mode
== SFmode
|| mode
== DFmode
8315 || (aarch64_vector_mode_supported_p (mode
)
8316 && (known_eq (GET_MODE_SIZE (mode
), 8)
8317 || (known_eq (GET_MODE_SIZE (mode
), 16)
8318 && (aarch64_tune_params
.extra_tuning_flags
8319 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
) == 0)));
8322 /* Return true if REGNO is a virtual pointer register, or an eliminable
8323 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
8324 include stack_pointer or hard_frame_pointer. */
8326 virt_or_elim_regno_p (unsigned regno
)
8328 return ((regno
>= FIRST_VIRTUAL_REGISTER
8329 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
8330 || regno
== FRAME_POINTER_REGNUM
8331 || regno
== ARG_POINTER_REGNUM
);
8334 /* Return true if X is a valid address of type TYPE for machine mode MODE.
8335 If it is, fill in INFO appropriately. STRICT_P is true if
8336 REG_OK_STRICT is in effect. */
8339 aarch64_classify_address (struct aarch64_address_info
*info
,
8340 rtx x
, machine_mode mode
, bool strict_p
,
8341 aarch64_addr_query_type type
)
8343 enum rtx_code code
= GET_CODE (x
);
8347 HOST_WIDE_INT const_size
;
8349 /* Whether a vector mode is partial doesn't affect address legitimacy.
8350 Partial vectors like VNx8QImode allow the same indexed addressing
8351 mode and MUL VL addressing mode as full vectors like VNx16QImode;
8352 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
8353 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
8354 vec_flags
&= ~VEC_PARTIAL
;
8356 /* On BE, we use load/store pair for all large int mode load/stores.
8357 TI/TFmode may also use a load/store pair. */
8358 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
8359 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
8360 || type
== ADDR_QUERY_LDP_STP_N
8363 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
8365 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
8366 corresponds to the actual size of the memory being loaded/stored and the
8367 mode of the corresponding addressing mode is half of that. */
8368 if (type
== ADDR_QUERY_LDP_STP_N
8369 && known_eq (GET_MODE_SIZE (mode
), 16))
8372 bool allow_reg_index_p
= (!load_store_pair_p
8373 && (known_lt (GET_MODE_SIZE (mode
), 16)
8374 || vec_flags
== VEC_ADVSIMD
8375 || vec_flags
& VEC_SVE_DATA
));
8377 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
8378 [Rn, #offset, MUL VL]. */
8379 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
8380 && (code
!= REG
&& code
!= PLUS
))
8383 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
8385 if (advsimd_struct_p
8386 && !BYTES_BIG_ENDIAN
8387 && (code
!= POST_INC
&& code
!= REG
))
8390 gcc_checking_assert (GET_MODE (x
) == VOIDmode
8391 || SCALAR_INT_MODE_P (GET_MODE (x
)));
8397 info
->type
= ADDRESS_REG_IMM
;
8399 info
->offset
= const0_rtx
;
8400 info
->const_offset
= 0;
8401 return aarch64_base_register_rtx_p (x
, strict_p
);
8409 && virt_or_elim_regno_p (REGNO (op0
))
8410 && poly_int_rtx_p (op1
, &offset
))
8412 info
->type
= ADDRESS_REG_IMM
;
8415 info
->const_offset
= offset
;
8420 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
8421 && aarch64_base_register_rtx_p (op0
, strict_p
)
8422 && poly_int_rtx_p (op1
, &offset
))
8424 info
->type
= ADDRESS_REG_IMM
;
8427 info
->const_offset
= offset
;
8429 /* TImode and TFmode values are allowed in both pairs of X
8430 registers and individual Q registers. The available
8432 X,X: 7-bit signed scaled offset
8433 Q: 9-bit signed offset
8434 We conservatively require an offset representable in either mode.
8435 When performing the check for pairs of X registers i.e. LDP/STP
8436 pass down DImode since that is the natural size of the LDP/STP
8437 instruction memory accesses. */
8438 if (mode
== TImode
|| mode
== TFmode
)
8439 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
8440 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
8441 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
8443 /* A 7bit offset check because OImode will emit a ldp/stp
8444 instruction (only big endian will get here).
8445 For ldp/stp instructions, the offset is scaled for the size of a
8446 single element of the pair. */
8448 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
8450 /* Three 9/12 bit offsets checks because CImode will emit three
8451 ldr/str instructions (only big endian will get here). */
8453 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
8454 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode
,
8456 || offset_12bit_unsigned_scaled_p (V16QImode
,
8459 /* Two 7bit offsets checks because XImode will emit two ldp/stp
8460 instructions (only big endian will get here). */
8462 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
8463 && aarch64_offset_7bit_signed_scaled_p (TImode
,
8466 /* Make "m" use the LD1 offset range for SVE data modes, so
8467 that pre-RTL optimizers like ivopts will work to that
8468 instead of the wider LDR/STR range. */
8469 if (vec_flags
== VEC_SVE_DATA
)
8470 return (type
== ADDR_QUERY_M
8471 ? offset_4bit_signed_scaled_p (mode
, offset
)
8472 : offset_9bit_signed_scaled_p (mode
, offset
));
8474 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
8476 poly_int64 end_offset
= (offset
8477 + GET_MODE_SIZE (mode
)
8478 - BYTES_PER_SVE_VECTOR
);
8479 return (type
== ADDR_QUERY_M
8480 ? offset_4bit_signed_scaled_p (mode
, offset
)
8481 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
8482 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
8486 if (vec_flags
== VEC_SVE_PRED
)
8487 return offset_9bit_signed_scaled_p (mode
, offset
);
8489 if (load_store_pair_p
)
8490 return ((known_eq (GET_MODE_SIZE (mode
), 4)
8491 || known_eq (GET_MODE_SIZE (mode
), 8)
8492 || known_eq (GET_MODE_SIZE (mode
), 16))
8493 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
8495 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
8496 || offset_12bit_unsigned_scaled_p (mode
, offset
));
8499 if (allow_reg_index_p
)
8501 /* Look for base + (scaled/extended) index register. */
8502 if (aarch64_base_register_rtx_p (op0
, strict_p
)
8503 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
8508 if (aarch64_base_register_rtx_p (op1
, strict_p
)
8509 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
8522 info
->type
= ADDRESS_REG_WB
;
8523 info
->base
= XEXP (x
, 0);
8524 info
->offset
= NULL_RTX
;
8525 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
8529 info
->type
= ADDRESS_REG_WB
;
8530 info
->base
= XEXP (x
, 0);
8531 if (GET_CODE (XEXP (x
, 1)) == PLUS
8532 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
8533 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
8534 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
8536 info
->offset
= XEXP (XEXP (x
, 1), 1);
8537 info
->const_offset
= offset
;
8539 /* TImode and TFmode values are allowed in both pairs of X
8540 registers and individual Q registers. The available
8542 X,X: 7-bit signed scaled offset
8543 Q: 9-bit signed offset
8544 We conservatively require an offset representable in either mode.
8546 if (mode
== TImode
|| mode
== TFmode
)
8547 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
8548 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
8550 if (load_store_pair_p
)
8551 return ((known_eq (GET_MODE_SIZE (mode
), 4)
8552 || known_eq (GET_MODE_SIZE (mode
), 8)
8553 || known_eq (GET_MODE_SIZE (mode
), 16))
8554 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
8556 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
8563 /* load literal: pc-relative constant pool entry. Only supported
8564 for SI mode or larger. */
8565 info
->type
= ADDRESS_SYMBOLIC
;
8567 if (!load_store_pair_p
8568 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
8573 split_const (x
, &sym
, &addend
);
8574 return ((GET_CODE (sym
) == LABEL_REF
8575 || (GET_CODE (sym
) == SYMBOL_REF
8576 && CONSTANT_POOL_ADDRESS_P (sym
)
8577 && aarch64_pcrelative_literal_loads
)));
8582 info
->type
= ADDRESS_LO_SUM
;
8583 info
->base
= XEXP (x
, 0);
8584 info
->offset
= XEXP (x
, 1);
8585 if (allow_reg_index_p
8586 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
8589 split_const (info
->offset
, &sym
, &offs
);
8590 if (GET_CODE (sym
) == SYMBOL_REF
8591 && (aarch64_classify_symbol (sym
, INTVAL (offs
))
8592 == SYMBOL_SMALL_ABSOLUTE
))
8594 /* The symbol and offset must be aligned to the access size. */
8597 if (CONSTANT_POOL_ADDRESS_P (sym
))
8598 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
8599 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
8601 tree exp
= SYMBOL_REF_DECL (sym
);
8602 align
= TYPE_ALIGN (TREE_TYPE (exp
));
8603 align
= aarch64_constant_alignment (exp
, align
);
8605 else if (SYMBOL_REF_DECL (sym
))
8606 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
8607 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
8608 && SYMBOL_REF_BLOCK (sym
) != NULL
)
8609 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
8611 align
= BITS_PER_UNIT
;
8613 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
8614 if (known_eq (ref_size
, 0))
8615 ref_size
= GET_MODE_SIZE (DImode
);
8617 return (multiple_p (INTVAL (offs
), ref_size
)
8618 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
/* Return true if the address X is valid for a PRFM instruction.
   STRICT_P is true if we should do strict checking with
   aarch64_classify_address.  */

bool
aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  /* PRFM accepts the same addresses as DImode...  */
  bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
  if (!res)
    return false;

  /* ... except writeback forms.  */
  return addr.type != ADDRESS_REG_WB;
}
8647 aarch64_symbolic_address_p (rtx x
)
8651 split_const (x
, &x
, &offset
);
8652 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
/* Classify the base of symbolic expression X.  */

enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx x)
{
  rtx offset;

  /* Separate the symbol from any constant addend; classification is done
     on the bare symbol together with the integer offset value.  */
  split_const (x, &x, &offset);
  return aarch64_classify_symbol (x, INTVAL (offset));
}
8667 /* Return TRUE if X is a legitimate address for accessing memory in
8670 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
8672 struct aarch64_address_info addr
;
8674 return aarch64_classify_address (&addr
, x
, mode
, strict_p
);
/* Return TRUE if X is a legitimate address of type TYPE for accessing
   memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.
   TYPE selects the address-query context (normal "m" constraint,
   LDP/STP, etc.) — see aarch64_addr_query_type.  */

bool
aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
                              aarch64_addr_query_type type)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p, type);
}
/* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  Given an
   out-of-range displacement ORIG_OFFSET for an access of mode MODE,
   split it into an anchor part (*OFFSET1) and an in-range part
   (*OFFSET2) such that ORIG_OFFSET == *OFFSET1 + *OFFSET2.
   Return true on success, false if no useful split exists.  */

static bool
aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
                                         poly_int64 orig_offset,
                                         machine_mode mode)
{
  HOST_WIDE_INT size;
  if (GET_MODE_SIZE (mode).is_constant (&size))
    {
      HOST_WIDE_INT const_offset, second_offset;

      /* A general SVE offset is A * VQ + B.  Remove the A component from
         coefficient 0 in order to get the constant B.  */
      const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];

      /* Split an out-of-range address displacement into a base and
         offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
         range otherwise to increase opportunities for sharing the base
         address of different sizes.  Unaligned accesses use the signed
         9-bit range, TImode/TFmode use the intersection of signed
         scaled 7-bit and signed 9-bit offset.  */
      if (mode == TImode || mode == TFmode)
        second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
      else if ((const_offset & (size - 1)) != 0)
        second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
      else
        second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);

      if (second_offset == 0 || known_eq (orig_offset, second_offset))
        return false;

      /* Split the offset into second_offset and the rest.  */
      *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
      *offset2 = gen_int_mode (second_offset, Pmode);
      return true;
    }
  else
    {
      /* Get the mode we should use as the basis of the range.  For structure
         modes this is the mode of one vector.  */
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      machine_mode step_mode
        = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;

      /* Get the "mul vl" multiplier we'd like to use.  */
      HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
      HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
      if (vec_flags & VEC_SVE_DATA)
        /* LDR supports a 9-bit range, but the move patterns for
           structure modes require all vectors to be in range of the
           same base.  The simplest way of accommodating that while still
           promoting reuse of anchor points between different modes is
           to use an 8-bit range unconditionally.  */
        vnum = ((vnum + 128) & 255) - 128;
      else
        /* Predicates are only handled singly, so we might as well use
           the full range.  */
        vnum = ((vnum + 256) & 511) - 256;
      if (vnum == 0)
        return false;

      /* Convert the "mul vl" multiplier into a byte offset.  */
      poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
      if (known_eq (second_offset, orig_offset))
        return false;

      /* Split the offset into second_offset and the rest.  */
      *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
      *offset2 = gen_int_mode (second_offset, Pmode);
      return true;
    }
}
/* Return the binary representation of floating point constant VALUE in INTVAL.
   If the value cannot be converted, return false without setting INTVAL.
   The conversion is done in the given MODE.  */

static bool
aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
{
  /* We make a general exception for 0.  */
  if (aarch64_float_const_zero_rtx_p (value))
    {
      *intval = 0;
      return true;
    }

  scalar_float_mode mode;
  if (GET_CODE (value) != CONST_DOUBLE
      || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
      /* Only support up to DF mode.  */
      || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
    return false;

  unsigned HOST_WIDE_INT ival = 0;

  long res[2];
  real_to_target (res,
                  CONST_DOUBLE_REAL_VALUE (value),
                  REAL_MODE_FORMAT (mode));

  if (mode == DFmode)
    {
      /* real_to_target stores DFmode as two 32-bit halves; assemble them
         in memory order for the host endianness.  */
      int order = BYTES_BIG_ENDIAN ? 1 : 0;
      ival = zext_hwi (res[order], 32);
      ival |= (zext_hwi (res[1 - order], 32) << 32);
    }
  else
    /* SFmode/HFmode fit entirely in the first 32-bit element.  */
    ival = zext_hwi (res[0], 32);

  *intval = ival;
  return true;
}
/* Return TRUE if rtx X is an immediate constant that can be moved using a
   single MOV(+MOVK) followed by an FMOV.  */

bool
aarch64_float_const_rtx_p (rtx x)
{
  machine_mode mode = GET_MODE (x);
  if (mode == VOIDmode)
    return false;

  /* Determine whether it's cheaper to write float constants as
     mov/movk pairs over ldr/adrp pairs.  */
  unsigned HOST_WIDE_INT ival;

  if (GET_CODE (x) == CONST_DOUBLE
      && SCALAR_FLOAT_MODE_P (mode)
      && aarch64_reinterpret_float_as_int (x, &ival))
    {
      /* HFmode has no direct integer equivalent here; widen to SImode.  */
      scalar_int_mode imode = (mode == HFmode
                               ? SImode
                               : int_mode_for_mode (mode).require ());
      int num_instr = aarch64_internal_mov_immediate
                        (NULL_RTX, gen_int_mode (ival, imode), false, imode);
      /* MOV+MOVK+FMOV beats ADRP+LDR only below three integer moves.  */
      return num_instr < 3;
    }

  return false;
}
/* Return TRUE if rtx X is immediate constant 0.0 (but not -0.0 when
   signed zeros matter).  */

bool
aarch64_float_const_zero_rtx_p (rtx x)
{
  /* 0.0 in Decimal Floating Point cannot be represented by FMOV zero.  */
  if (GET_MODE (x) == VOIDmode)
    return false;

  if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
    /* -0.0 only counts as zero when -0.0 and +0.0 are interchangeable.  */
    return !HONOR_SIGNED_ZEROS (GET_MODE (x));
  return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
}
/* Return TRUE if rtx X is immediate constant that fits in a single
   MOVI immediate operation.  */

bool
aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
{
  if (!TARGET_SIMD)
    return false;

  machine_mode vmode;
  scalar_int_mode imode;
  unsigned HOST_WIDE_INT ival;

  if (GET_CODE (x) == CONST_DOUBLE
      && SCALAR_FLOAT_MODE_P (mode))
    {
      if (!aarch64_reinterpret_float_as_int (x, &ival))
        return false;

      /* We make a general exception for 0.  */
      if (aarch64_float_const_zero_rtx_p (x))
        return true;

      imode = int_mode_for_mode (mode).require ();
    }
  else if (GET_CODE (x) == CONST_INT
           && is_a <scalar_int_mode> (mode, &imode))
    ival = INTVAL (x);
  else
    return false;

   /* use a 64 bit mode for everything except for DI/DF mode, where we use
     a 128 bit vector mode.  */
  int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;

  /* Test by duplicating the candidate across a vector and asking the
     generic SIMD-immediate validator.  */
  vmode = aarch64_simd_container_mode (imode, width);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);

  return aarch64_simd_valid_immediate (v_op, NULL);
}
/* Return the fixed registers used for condition codes.
   Implements TARGET_FIXED_CONDITION_CODE_REGS: AArch64 has a single
   flags register, so *P2 is unused.  */

static bool
aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
{
  *p1 = CC_REGNUM;
  *p2 = INVALID_REGNUM;
  return true;
}
/* This function is used by the call expanders of the machine description.
   RESULT is the register in which the result is returned.  It's NULL for
   "call" and "sibcall".
   MEM is the location of the function call.
   CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
   SIBCALL indicates whether this function call is normal call or sibling call.
   It will generate different pattern accordingly.  */

void
aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
{
  rtx call, callee, tmp;
  rtvec vec;
  machine_mode mode;

  gcc_assert (MEM_P (mem));
  callee = XEXP (mem, 0);
  mode = GET_MODE (callee);
  gcc_assert (mode == Pmode);

  /* Decide if we should generate indirect calls by loading the
     address of the callee into a register before performing
     the branch-and-link.  */
  if (SYMBOL_REF_P (callee)
      ? (aarch64_is_long_call_p (callee)
         || aarch64_is_noplt_call_p (callee))
      : !REG_P (callee))
    XEXP (mem, 0) = force_reg (mode, callee);

  call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);

  if (result != NULL_RTX)
    call = gen_rtx_SET (result, call);

  /* Sibling calls return directly; normal calls clobber the link
     register.  */
  if (sibcall)
    tmp = ret_rtx;
  else
    tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));

  gcc_assert (CONST_INT_P (callee_abi));
  /* Record the callee's PCS variant in the pattern so later passes can
     compute the correct call-clobber set.  */
  callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
                               UNSPEC_CALLEE_ABI);

  vec = gen_rtvec (3, call, callee_abi, tmp);
  call = gen_rtx_PARALLEL (VOIDmode, vec);

  aarch64_emit_call_insn (call);
}
/* Emit call insn with PAT and do aarch64-specific handling.  */

void
aarch64_emit_call_insn (rtx pat)
{
  rtx insn = emit_call_insn (pat);

  /* The intra-procedure-call scratch registers (IP0/IP1, i.e. x16/x17)
     may be clobbered by veneers the linker inserts, so record them as
     clobbered by every call.  */
  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
}
/* Choose the CC mode needed to compare X with Y using comparison CODE.
   The checks are ordered from most to least specialized; the final
   fallback is plain CCmode.  */

machine_mode
aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
{
  machine_mode mode_x = GET_MODE (x);
  rtx_code code_x = GET_CODE (x);

  /* All floating point compares return CCFP if it is an equality
     comparison, and CCFPE otherwise.  */
  if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
    {
      switch (code)
        {
        case EQ:
        case NE:
        case UNORDERED:
        case ORDERED:
        case UNLT:
        case UNLE:
        case UNGT:
        case UNGE:
        case UNEQ:
          return CCFPmode;

        case LT:
        case LE:
        case GT:
        case GE:
        case LTGT:
          return CCFPEmode;

        default:
          gcc_unreachable ();
        }
    }

  /* Equality comparisons of short modes against zero can be performed
     using the TST instruction with the appropriate bitmask.  */
  if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
      && (code == EQ || code == NE)
      && (mode_x == HImode || mode_x == QImode))
    return CC_NZmode;

  /* Similarly, comparisons of zero_extends from shorter modes can
     be performed using an ANDS with an immediate mask.  */
  if (y == const0_rtx && code_x == ZERO_EXTEND
      && (mode_x == SImode || mode_x == DImode)
      && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
      && (code == EQ || code == NE))
    return CC_NZmode;

  if ((mode_x == SImode || mode_x == DImode)
      && y == const0_rtx
      && (code == EQ || code == NE || code == LT || code == GE)
      && (code_x == PLUS || code_x == MINUS || code_x == AND
          || code_x == NEG
          || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
              && CONST_INT_P (XEXP (x, 2)))))
    return CC_NZmode;

  /* A compare with a shifted operand.  Because of canonicalization,
     the comparison will have to be swapped when we emit the assembly
     code.  */
  if ((mode_x == SImode || mode_x == DImode)
      && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
      && (code_x == ASHIFT || code_x == ASHIFTRT
          || code_x == LSHIFTRT
          || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
    return CC_SWPmode;

  /* Similarly for a negated operand, but we can only do this for
     equalities.  */
  if ((mode_x == SImode || mode_x == DImode)
      && (REG_P (y) || GET_CODE (y) == SUBREG)
      && (code == EQ || code == NE)
      && code_x == NEG)
    return CC_Zmode;

  /* A test for unsigned overflow from an addition.  */
  if ((mode_x == DImode || mode_x == TImode)
      && (code == LTU || code == GEU)
      && code_x == PLUS
      && rtx_equal_p (XEXP (x, 0), y))
    return CC_Cmode;

  /* A test for unsigned overflow from an add with carry.  */
  if ((mode_x == DImode || mode_x == TImode)
      && (code == LTU || code == GEU)
      && code_x == PLUS
      && CONST_SCALAR_INT_P (y)
      && (rtx_mode_t (y, mode_x)
          == (wi::shwi (1, mode_x)
              << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
    return CC_ADCmode;

  /* A test for signed overflow.  */
  if ((mode_x == DImode || mode_x == TImode)
      && code == NE
      && code_x == PLUS
      && GET_CODE (y) == SIGN_EXTEND)
    return CC_Vmode;

  /* For everything else, return CCmode.  */
  return CCmode;
}
static int
aarch64_get_condition_code_1 (machine_mode, enum rtx_code);

/* Return the AArch64 condition code (aarch64_cond_code) for comparison
   rtx X, or -1 if the combination is not representable.  */

int
aarch64_get_condition_code (rtx x)
{
  machine_mode mode = GET_MODE (XEXP (x, 0));
  enum rtx_code comp_code = GET_CODE (x);

  /* If the operand is not already a CC value, compute which CC mode the
     comparison would use.  */
  if (GET_MODE_CLASS (mode) != MODE_CC)
    mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
  return aarch64_get_condition_code_1 (mode, comp_code);
}
/* Map (CC mode, comparison code) to an AArch64 condition code, or -1
   if the comparison is not expressible in that CC mode.  */

static int
aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
{
  switch (mode)
    {
    case E_CCFPmode:
    case E_CCFPEmode:
      switch (comp_code)
        {
        case GE: return AARCH64_GE;
        case GT: return AARCH64_GT;
        case LE: return AARCH64_LS;
        case LT: return AARCH64_MI;
        case NE: return AARCH64_NE;
        case EQ: return AARCH64_EQ;
        case ORDERED: return AARCH64_VC;
        case UNORDERED: return AARCH64_VS;
        case UNLT: return AARCH64_LT;
        case UNLE: return AARCH64_LE;
        case UNGT: return AARCH64_HI;
        case UNGE: return AARCH64_PL;
        default: return -1;
        }
      break;

    case E_CCmode:
      switch (comp_code)
        {
        case NE: return AARCH64_NE;
        case EQ: return AARCH64_EQ;
        case GE: return AARCH64_GE;
        case GT: return AARCH64_GT;
        case LE: return AARCH64_LE;
        case LT: return AARCH64_LT;
        case GEU: return AARCH64_CS;
        case GTU: return AARCH64_HI;
        case LEU: return AARCH64_LS;
        case LTU: return AARCH64_CC;
        default: return -1;
        }
      break;

    /* The comparison was canonicalized with swapped operands, so the
       condition codes are mirrored.  */
    case E_CC_SWPmode:
      switch (comp_code)
        {
        case NE: return AARCH64_NE;
        case EQ: return AARCH64_EQ;
        case GE: return AARCH64_LE;
        case GT: return AARCH64_LT;
        case LE: return AARCH64_GE;
        case LT: return AARCH64_GT;
        case GEU: return AARCH64_LS;
        case GTU: return AARCH64_CC;
        case LEU: return AARCH64_CS;
        case LTU: return AARCH64_HI;
        default: return -1;
        }
      break;

    /* SVE predicate tests; the comments give the SVE condition alias.  */
    case E_CC_NZCmode:
      switch (comp_code)
        {
        case NE: return AARCH64_NE; /* = any */
        case EQ: return AARCH64_EQ; /* = none */
        case GE: return AARCH64_PL; /* = nfrst */
        case LT: return AARCH64_MI; /* = first */
        case GEU: return AARCH64_CS; /* = nlast */
        case GTU: return AARCH64_HI; /* = pmore */
        case LEU: return AARCH64_LS; /* = plast */
        case LTU: return AARCH64_CC; /* = last */
        default: return -1;
        }
      break;

    case E_CC_NZmode:
      switch (comp_code)
        {
        case NE: return AARCH64_NE;
        case EQ: return AARCH64_EQ;
        case GE: return AARCH64_PL;
        case LT: return AARCH64_MI;
        default: return -1;
        }
      break;

    case E_CC_Zmode:
      switch (comp_code)
        {
        case NE: return AARCH64_NE;
        case EQ: return AARCH64_EQ;
        default: return -1;
        }
      break;

    /* Unsigned overflow from an addition: carry flag.  */
    case E_CC_Cmode:
      switch (comp_code)
        {
        case LTU: return AARCH64_CS;
        case GEU: return AARCH64_CC;
        default: return -1;
        }
      break;

    /* Unsigned overflow from an add-with-carry: carry flag, inverted
       sense relative to CC_Cmode.  */
    case E_CC_ADCmode:
      switch (comp_code)
        {
        case GEU: return AARCH64_CS;
        case LTU: return AARCH64_CC;
        default: return -1;
        }
      break;

    /* Signed overflow: overflow flag.  */
    case E_CC_Vmode:
      switch (comp_code)
        {
        case NE: return AARCH64_VS;
        case EQ: return AARCH64_VC;
        default: return -1;
        }
      break;

    default:
      return -1;
    }
}
/* Return true if X is a CONST_VECTOR duplicating a single CONST_INT
   element whose value lies in [MINVAL, MAXVAL].  */

static bool
aarch64_const_vec_all_same_in_range_p (rtx x,
                                       HOST_WIDE_INT minval,
                                       HOST_WIDE_INT maxval)
{
  rtx elt;
  return (const_vec_duplicate_p (x, &elt)
          && CONST_INT_P (elt)
          && IN_RANGE (INTVAL (elt), minval, maxval));
}
9215 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
9217 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
/* Return true if VEC is a constant in which every element is in the range
   [MINVAL, MAXVAL].  The elements do not need to have the same value.  */

static bool
aarch64_const_vec_all_in_range_p (rtx vec,
                                  HOST_WIDE_INT minval,
                                  HOST_WIDE_INT maxval)
{
  if (GET_CODE (vec) != CONST_VECTOR
      || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
    return false;

  int nunits;
  /* For non-stepped encodings it is enough to check the encoded
     elements; a stepped variable-length vector has no bounded set of
     elements to check unless its length is known.  */
  if (!CONST_VECTOR_STEPPED_P (vec))
    nunits = const_vector_encoded_nelts (vec);
  else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
    return false;

  for (int i = 0; i < nunits; i++)
    {
      rtx vec_elem = CONST_VECTOR_ELT (vec, i);
      if (!CONST_INT_P (vec_elem)
          || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
        return false;
    }
  return true;
}
/* Individual NZCV flag bits as used in the immediate field of CCMP.  */
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.
   Each entry is the NZCV immediate whose flag settings make the
   corresponding condition FALSE, as required by the CCMP idiom.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N, /* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V, /* VC, V == 0.  */
  0,		/* HI, C == 1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
  0,		/* LE, !(Z == 0 && N == V).  */
  0,		/* AL, Any.  */
  0		/* NV, Any.  */
};
/* Print floating-point vector immediate operand X to F, negating it
   first if NEGATE is true.  Return true on success, false if it isn't
   a constant we can handle.  */

static bool
aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
{
  rtx elt;

  if (!const_vec_duplicate_p (x, &elt))
    return false;

  REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
  if (negate)
    r = real_value_negate (&r);

  /* Handle the SVE single-bit immediates specially, since they have a
     fixed form in the assembly syntax.  */
  if (real_equal (&r, &dconst0))
    asm_fprintf (f, "0.0");
  else if (real_equal (&r, &dconst2))
    asm_fprintf (f, "2.0");
  else if (real_equal (&r, &dconst1))
    asm_fprintf (f, "1.0");
  else if (real_equal (&r, &dconsthalf))
    asm_fprintf (f, "0.5");
  else
    {
      const int buf_size = 20;
      char float_buf[buf_size] = {'\0'};
      real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
                                1, GET_MODE (elt));
      asm_fprintf (f, "%s", float_buf);
    }

  return true;
}
/* Return the assembly suffix letter ('b', 'h', 's' or 'd') for an
   element of SIZE bits.  Any other size is a caller bug.  */

static char
sizetochar (int size)
{
  if (size == 64)
    return 'd';
  if (size == 32)
    return 's';
  if (size == 16)
    return 'h';
  if (size == 8)
    return 'b';
  gcc_unreachable ();
}
/* Print operand X to file F in a target specific manner according to CODE.
   The acceptable formatting commands given by CODE are:
     'c':		An integer or symbol address without a preceding #
			sign.
     'C':		Take the duplicated element in a vector constant
			and print it in hex.
     'D':		Take the duplicated element in a vector constant
			and print it as an unsigned integer, in decimal.
     'e':		Print the sign/zero-extend size as a character 8->b,
			16->h, 32->w.  Can also be used for masks:
			0xff->b, 0xffff->h, 0xffffffff->w.
     'I':		If the operand is a duplicated vector constant,
			replace it with the duplicated scalar.  If the
			operand is then a floating-point constant, replace
			it with the integer bit representation.  Print the
			transformed constant as a signed decimal number.
     'p':		Prints N such that 2^N == X (X must be power of 2 and
			const int).
     'P':		Print the number of non-zero bits in X (a const_int).
     'H':		Print the higher numbered register of a pair (TImode)
			of regs.
     'm':		Print a condition (eq, ne, etc).
     'M':		Same as 'm', but invert condition.
     'N':		Take the duplicated element in a vector constant
			and print the negative of it in decimal.
     'b/h/s/d/q':	Print a scalar FP/SIMD register name.
     'S/T/U/V':		Print a FP/SIMD register name for a register list.
			The register printed is the FP/SIMD register name
			of X + 0/1/2/3 for S/T/U/V.
     'R':		Print a scalar Integer/FP/SIMD register name + 1.
     'X':		Print bottom 16 bits of integer constant in hex.
     'w/x':		Print a general register name or the zero register
			(32-bit or 64-bit).
     '0':		Print a normal operand, if it's a general register,
			then we assume DImode.
     'k':		Print NZCV for conditional compare instructions.
     'A':		Output address constant representing the first
			argument of X, specifying a relocation offset
			if appropriate.
     'L':		Output constant address specified by X
			with a relocation offset if appropriate.
     'G':		Prints address of X, specifying a PC relative
			relocation mode if appropriate.
     'y':		Output address of LDP or STP - this is used for
			some LDP/STPs which don't use a PARALLEL in their
			pattern (so the mode needs to be adjusted).
     'z':		Output address of a typical LDP or STP.  */

static void
aarch64_print_operand (FILE *f, rtx x, int code)
{
  rtx elt;
  switch (code)
    {
    case 'c':
      switch (GET_CODE (x))
	{
	case CONST_INT:
	  fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
	  break;

	case SYMBOL_REF:
	  output_addr_const (f, x);
	  break;

	case CONST:
	  if (GET_CODE (XEXP (x, 0)) == PLUS
	      && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
	    {
	      output_addr_const (f, x);
	      break;
	    }
	  /* Fall through.  */

	default:
	  output_operand_lossage ("unsupported operand for code '%c'", code);
	}
      break;

    case 'e':
      {
	x = unwrap_const_vec_duplicate (x);
	if (!CONST_INT_P (x))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	HOST_WIDE_INT val = INTVAL (x);
	if ((val & ~7) == 8 || val == 0xff)
	  fputc ('b', f);
	else if ((val & ~7) == 16 || val == 0xffff)
	  fputc ('h', f);
	else if ((val & ~7) == 32 || val == 0xffffffff)
	  fputc ('w', f);
	else
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }
      }
      break;

    case 'p':
      {
	int n;

	if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	asm_fprintf (f, "%d", n);
      }
      break;

    case 'P':
      if (!CONST_INT_P (x))
	{
	  output_operand_lossage ("invalid operand for '%%%c'", code);
	  return;
	}

      asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
      break;

    case 'H':
      if (x == const0_rtx)
	{
	  asm_fprintf (f, "xzr");
	  break;
	}

      if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
	{
	  output_operand_lossage ("invalid operand for '%%%c'", code);
	  return;
	}

      asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
      break;

    case 'I':
      {
	x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
	if (CONST_INT_P (x))
	  asm_fprintf (f, "%wd", INTVAL (x));
	else
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }
	break;
      }

    case 'M':
    case 'm':
      {
	int cond_code;
	/* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
	if (x == const_true_rtx)
	  {
	    if (code == 'M')
	      fputs ("nv", f);
	    return;
	  }

	if (!COMPARISON_P (x))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	cond_code = aarch64_get_condition_code (x);
	gcc_assert (cond_code >= 0);
	if (code == 'M')
	  cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
	/* SVE predicate comparisons use their own condition mnemonics.  */
	if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
	  fputs (aarch64_sve_condition_codes[cond_code], f);
	else
	  fputs (aarch64_condition_codes[cond_code], f);
      }
      break;

    case 'N':
      if (!const_vec_duplicate_p (x, &elt))
	{
	  output_operand_lossage ("invalid vector constant");
	  return;
	}

      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
	asm_fprintf (f, "%wd", -INTVAL (elt));
      else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
	       && aarch64_print_vector_float_operand (f, x, true))
	;
      else
	{
	  output_operand_lossage ("invalid vector constant");
	  return;
	}
      break;

    case 'b':
    case 'h':
    case 's':
    case 'd':
    case 'q':
      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
	{
	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
	  return;
	}
      asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
      break;

    case 'S':
    case 'T':
    case 'U':
    case 'V':
      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
	{
	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
	  return;
	}
      /* SVE data vectors print as "z" registers, Advanced SIMD as "v";
	 S/T/U/V select consecutive members of a register list.  */
      asm_fprintf (f, "%c%d",
		   aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
		   REGNO (x) - V0_REGNUM + (code - 'S'));
      break;

    case 'R':
      if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
	asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
      else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
	asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
      else
	output_operand_lossage ("incompatible register operand for '%%%c'",
				code);
      break;

    case 'X':
      if (!CONST_INT_P (x))
	{
	  output_operand_lossage ("invalid operand for '%%%c'", code);
	  return;
	}
      asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
      break;

    case 'C':
      {
	/* Print a replicated constant in hex.  */
	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }
	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
	asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
      }
      break;

    case 'D':
      {
	/* Print a replicated constant in decimal, treating it as
	   unsigned.  */
	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }
	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
	asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
      }
      break;

    case 'w':
    case 'x':
      if (x == const0_rtx
	  || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
	{
	  asm_fprintf (f, "%czr", code);
	  break;
	}

      if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
	{
	  asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
	  break;
	}

      if (REG_P (x) && REGNO (x) == SP_REGNUM)
	{
	  asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
	  break;
	}

      /* Fall through */

    case 0:
      if (x == NULL)
	{
	  output_operand_lossage ("missing operand");
	  return;
	}

      switch (GET_CODE (x))
	{
	case REG:
	  if (aarch64_sve_data_mode_p (GET_MODE (x)))
	    {
	      if (REG_NREGS (x) == 1)
		asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
	      else
		{
		  char suffix
		    = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
		  asm_fprintf (f, "{z%d.%c - z%d.%c}",
			       REGNO (x) - V0_REGNUM, suffix,
			       END_REGNO (x) - V0_REGNUM - 1, suffix);
		}
	    }
	  else
	    asm_fprintf (f, "%s", reg_names [REGNO (x)]);
	  break;

	case MEM:
	  output_address (GET_MODE (x), XEXP (x, 0));
	  break;

	case LABEL_REF:
	case SYMBOL_REF:
	  output_addr_const (asm_out_file, x);
	  break;

	case CONST_INT:
	  asm_fprintf (f, "%wd", INTVAL (x));
	  break;

	case CONST:
	  if (!VECTOR_MODE_P (GET_MODE (x)))
	    {
	      output_addr_const (asm_out_file, x);
	      break;
	    }
	  /* fall through */

	case CONST_VECTOR:
	  if (!const_vec_duplicate_p (x, &elt))
	    {
	      output_operand_lossage ("invalid vector constant");
	      return;
	    }

	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
	    asm_fprintf (f, "%wd", INTVAL (elt));
	  else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
		   && aarch64_print_vector_float_operand (f, x, false))
	    ;
	  else
	    {
	      output_operand_lossage ("invalid vector constant");
	      return;
	    }
	  break;

	case CONST_DOUBLE:
	  /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
	     be getting CONST_DOUBLEs holding integers.  */
	  gcc_assert (GET_MODE (x) != VOIDmode);
	  if (aarch64_float_const_zero_rtx_p (x))
	    {
	      fputc ('0', f);
	      break;
	    }
	  else if (aarch64_float_const_representable_p (x))
	    {
#define buf_size 20
	      char float_buf[buf_size] = {'\0'};
	      real_to_decimal_for_mode (float_buf,
					CONST_DOUBLE_REAL_VALUE (x),
					buf_size, buf_size,
					1, GET_MODE (x));
#undef buf_size
	      asm_fprintf (asm_out_file, "%s", float_buf);
	      break;
	    }

	  output_operand_lossage ("invalid constant");
	  return;
	default:
	  output_operand_lossage ("invalid operand");
	  return;
	}
      break;

    case 'A':
      if (GET_CODE (x) == HIGH)
	x = XEXP (x, 0);

      switch (aarch64_classify_symbolic_expression (x))
	{
	case SYMBOL_SMALL_GOT_4G:
	  asm_fprintf (asm_out_file, ":got:");
	  break;

	case SYMBOL_SMALL_TLSGD:
	  asm_fprintf (asm_out_file, ":tlsgd:");
	  break;

	case SYMBOL_SMALL_TLSDESC:
	  asm_fprintf (asm_out_file, ":tlsdesc:");
	  break;

	case SYMBOL_SMALL_TLSIE:
	  asm_fprintf (asm_out_file, ":gottprel:");
	  break;

	case SYMBOL_TLSLE24:
	  asm_fprintf (asm_out_file, ":tprel:");
	  break;

	case SYMBOL_TINY_GOT:
	  gcc_unreachable ();
	  break;

	default:
	  break;
	}
      output_addr_const (asm_out_file, x);
      break;

    case 'L':
      switch (aarch64_classify_symbolic_expression (x))
	{
	case SYMBOL_SMALL_GOT_4G:
	  asm_fprintf (asm_out_file, ":lo12:");
	  break;

	case SYMBOL_SMALL_TLSGD:
	  asm_fprintf (asm_out_file, ":tlsgd_lo12:");
	  break;

	case SYMBOL_SMALL_TLSDESC:
	  asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
	  break;

	case SYMBOL_SMALL_TLSIE:
	  asm_fprintf (asm_out_file, ":gottprel_lo12:");
	  break;

	case SYMBOL_TLSLE12:
	  asm_fprintf (asm_out_file, ":tprel_lo12:");
	  break;

	case SYMBOL_TLSLE24:
	  asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
	  break;

	case SYMBOL_TINY_GOT:
	  asm_fprintf (asm_out_file, ":got:");
	  break;

	case SYMBOL_TINY_TLSIE:
	  asm_fprintf (asm_out_file, ":gottprel:");
	  break;

	default:
	  break;
	}
      output_addr_const (asm_out_file, x);
      break;

    case 'G':
      switch (aarch64_classify_symbolic_expression (x))
	{
	case SYMBOL_TLSLE24:
	  asm_fprintf (asm_out_file, ":tprel_hi12:");
	  break;
	default:
	  break;
	}
      output_addr_const (asm_out_file, x);
      break;

    case 'k':
      {
	HOST_WIDE_INT cond_code;

	if (!CONST_INT_P (x))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	cond_code = INTVAL (x);
	gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
	asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
      }
      break;

    case 'y':
    case 'z':
      {
	machine_mode mode = GET_MODE (x);

	if (GET_CODE (x) != MEM
	    || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
					     code == 'y'
					     ? ADDR_QUERY_LDP_STP_N
					     : ADDR_QUERY_LDP_STP))
	  output_operand_lossage ("invalid operand prefix '%%%c'", code);
      }
      break;

    default:
      output_operand_lossage ("invalid operand prefix '%%%c'", code);
      return;
    }
}
/* Print address 'x' of a memory access with mode 'mode'.
   'op' is the context required by aarch64_classify_address.  It can either be
   MEM for a normal memory access or PARALLEL for LDP/STP.
   Return false (after reporting an error) if the address cannot be
   printed, true on success.  */

static bool
aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
                                aarch64_addr_query_type type)
{
  struct aarch64_address_info addr;
  unsigned int size, vec_flags;

  /* Check all addresses are Pmode - including ILP32.  */
  if (GET_MODE (x) != Pmode
      && (!CONST_INT_P (x)
          || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
    {
      output_operand_lossage ("invalid address mode");
      return false;
    }

  if (aarch64_classify_address (&addr, x, mode, true, type))
    switch (addr.type)
      {
      case ADDRESS_REG_IMM:
        if (known_eq (addr.const_offset, 0))
          {
            asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
            return true;
          }

        /* SVE addresses print the offset as a vector-length multiple.  */
        vec_flags = aarch64_classify_vector_mode (mode);
        if (vec_flags & VEC_ANY_SVE)
          {
            HOST_WIDE_INT vnum
              = exact_div (addr.const_offset,
                           aarch64_vl_bytes (mode, vec_flags)).to_constant ();
            asm_fprintf (f, "[%s, #%wd, mul vl]",
                         reg_names[REGNO (addr.base)], vnum);
            return true;
          }

        asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
                     INTVAL (addr.offset));
        return true;

      case ADDRESS_REG_REG:
        if (addr.shift == 0)
          asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
                       reg_names [REGNO (addr.offset)]);
        else
          asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
                       reg_names [REGNO (addr.offset)], addr.shift);
        return true;

      case ADDRESS_REG_UXTW:
        if (addr.shift == 0)
          asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
                       REGNO (addr.offset) - R0_REGNUM);
        else
          asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
                       REGNO (addr.offset) - R0_REGNUM, addr.shift);
        return true;

      case ADDRESS_REG_SXTW:
        if (addr.shift == 0)
          asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
                       REGNO (addr.offset) - R0_REGNUM);
        else
          asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
                       REGNO (addr.offset) - R0_REGNUM, addr.shift);
        return true;

      case ADDRESS_REG_WB:
        /* Writeback is only supported for fixed-width modes.  */
        size = GET_MODE_SIZE (mode).to_constant ();
        switch (GET_CODE (x))
          {
          case PRE_INC:
            asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
            return true;
          case POST_INC:
            asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
            return true;
          case PRE_DEC:
            asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
            return true;
          case POST_DEC:
            asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
            return true;
          case PRE_MODIFY:
            asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
                         INTVAL (addr.offset));
            return true;
          case POST_MODIFY:
            asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
                         INTVAL (addr.offset));
            return true;
          default:
            break;
          }
        break;

      case ADDRESS_LO_SUM:
        asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
        output_addr_const (f, addr.offset);
        asm_fprintf (f, "]");
        return true;

      case ADDRESS_SYMBOLIC:
        output_addr_const (f, x);
        return true;
      }

  return false;
}
9969 /* Print address 'x' of a memory access with mode 'mode'. */
9971 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
9973 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
9974 output_addr_const (f
, x
);
9978 aarch64_label_mentioned_p (rtx x
)
9983 if (GET_CODE (x
) == LABEL_REF
)
9986 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9987 referencing instruction, but they are constant offsets, not
9989 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
9992 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
9993 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
9999 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
10000 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
10003 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
10010 /* Implement REGNO_REG_CLASS. */
10013 aarch64_regno_regclass (unsigned regno
)
10015 if (GP_REGNUM_P (regno
))
10016 return GENERAL_REGS
;
10018 if (regno
== SP_REGNUM
)
10021 if (regno
== FRAME_POINTER_REGNUM
10022 || regno
== ARG_POINTER_REGNUM
)
10023 return POINTER_REGS
;
10025 if (FP_REGNUM_P (regno
))
10026 return (FP_LO8_REGNUM_P (regno
) ? FP_LO8_REGS
10027 : FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
);
10029 if (PR_REGNUM_P (regno
))
10030 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
10032 if (regno
== FFR_REGNUM
|| regno
== FFRT_REGNUM
)
10038 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
10039 If OFFSET is out of range, return an offset of an anchor point
10040 that is in range. Return 0 otherwise. */
10042 static HOST_WIDE_INT
10043 aarch64_anchor_offset (HOST_WIDE_INT offset
, HOST_WIDE_INT size
,
10046 /* Does it look like we'll need a 16-byte load/store-pair operation? */
10048 return (offset
+ 0x400) & ~0x7f0;
10050 /* For offsets that aren't a multiple of the access size, the limit is
10052 if (offset
& (size
- 1))
10054 /* BLKmode typically uses LDP of X-registers. */
10055 if (mode
== BLKmode
)
10056 return (offset
+ 512) & ~0x3ff;
10057 return (offset
+ 0x100) & ~0x1ff;
10060 /* Small negative offsets are supported. */
10061 if (IN_RANGE (offset
, -256, 0))
10064 if (mode
== TImode
|| mode
== TFmode
)
10065 return (offset
+ 0x100) & ~0x1ff;
10067 /* Use 12-bit offset by access size. */
10068 return offset
& (~0xfff * size
);
10072 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
10074 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
10075 where mask is selected by alignment and size of the offset.
10076 We try to pick as large a range for the offset as possible to
10077 maximize the chance of a CSE. However, for aligned addresses
10078 we limit the range to 4k so that structures with different sized
10079 elements are likely to use the same base. We need to be careful
10080 not to split a CONST for some forms of address expression, otherwise
10081 it will generate sub-optimal code. */
10083 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
10085 rtx base
= XEXP (x
, 0);
10086 rtx offset_rtx
= XEXP (x
, 1);
10087 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
10089 if (GET_CODE (base
) == PLUS
)
10091 rtx op0
= XEXP (base
, 0);
10092 rtx op1
= XEXP (base
, 1);
10094 /* Force any scaling into a temp for CSE. */
10095 op0
= force_reg (Pmode
, op0
);
10096 op1
= force_reg (Pmode
, op1
);
10098 /* Let the pointer register be in op0. */
10099 if (REG_POINTER (op1
))
10100 std::swap (op0
, op1
);
10102 /* If the pointer is virtual or frame related, then we know that
10103 virtual register instantiation or register elimination is going
10104 to apply a second constant. We want the two constants folded
10105 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
10106 if (virt_or_elim_regno_p (REGNO (op0
)))
10108 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
10109 NULL_RTX
, true, OPTAB_DIRECT
);
10110 return gen_rtx_PLUS (Pmode
, base
, op1
);
10113 /* Otherwise, in order to encourage CSE (and thence loop strength
10114 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
10115 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
10116 NULL_RTX
, true, OPTAB_DIRECT
);
10117 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
10120 HOST_WIDE_INT size
;
10121 if (GET_MODE_SIZE (mode
).is_constant (&size
))
10123 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
10125 if (base_offset
!= 0)
10127 base
= plus_constant (Pmode
, base
, base_offset
);
10128 base
= force_operand (base
, NULL_RTX
);
10129 return plus_constant (Pmode
, base
, offset
- base_offset
);
10138 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
10139 reg_class_t rclass
,
10141 secondary_reload_info
*sri
)
10143 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10144 LDR and STR. See the comment at the head of aarch64-sve.md for
10145 more details about the big-endian handling. */
10146 if (reg_class_subset_p (rclass
, FP_REGS
)
10147 && !((REG_P (x
) && HARD_REGISTER_P (x
))
10148 || aarch64_simd_valid_immediate (x
, NULL
))
10149 && mode
!= VNx16QImode
)
10151 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
10152 if ((vec_flags
& VEC_SVE_DATA
)
10153 && ((vec_flags
& VEC_PARTIAL
) || BYTES_BIG_ENDIAN
))
10155 sri
->icode
= CODE_FOR_aarch64_sve_reload_mem
;
10160 /* If we have to disable direct literal pool loads and stores because the
10161 function is too big, then we need a scratch register. */
10162 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
10163 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
10164 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
10165 && !aarch64_pcrelative_literal_loads
)
10167 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
10171 /* Without the TARGET_SIMD instructions we cannot move a Q register
10172 to a Q register directly. We need a scratch. */
10173 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
10174 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
10175 && reg_class_subset_p (rclass
, FP_REGS
))
10177 sri
->icode
= code_for_aarch64_reload_mov (mode
);
10181 /* A TFmode or TImode memory access should be handled via an FP_REGS
10182 because AArch64 has richer addressing modes for LDR/STR instructions
10183 than LDP/STP instructions. */
10184 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
10185 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
10188 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
10189 return GENERAL_REGS
;
10195 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED
, const int to
)
10197 gcc_assert (from
== ARG_POINTER_REGNUM
|| from
== FRAME_POINTER_REGNUM
);
10199 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10200 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
10201 if (frame_pointer_needed
)
10202 return to
== HARD_FRAME_POINTER_REGNUM
;
10207 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
10209 if (to
== HARD_FRAME_POINTER_REGNUM
)
10211 if (from
== ARG_POINTER_REGNUM
)
10212 return cfun
->machine
->frame
.hard_fp_offset
;
10214 if (from
== FRAME_POINTER_REGNUM
)
10215 return cfun
->machine
->frame
.hard_fp_offset
10216 - cfun
->machine
->frame
.locals_offset
;
10219 if (to
== STACK_POINTER_REGNUM
)
10221 if (from
== FRAME_POINTER_REGNUM
)
10222 return cfun
->machine
->frame
.frame_size
10223 - cfun
->machine
->frame
.locals_offset
;
10226 return cfun
->machine
->frame
.frame_size
;
10229 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
10233 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
10237 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
10242 aarch64_asm_trampoline_template (FILE *f
)
10247 if (aarch64_bti_enabled ())
10249 asm_fprintf (f
, "\thint\t34 // bti c\n");
10256 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", IP1_REGNUM
- R0_REGNUM
, offset1
);
10257 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
,
10262 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[IP1_REGNUM
], offset1
);
10263 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[STATIC_CHAIN_REGNUM
],
10266 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
10268 /* The trampoline needs an extra padding instruction. In case if BTI is
10269 enabled the padding instruction is replaced by the BTI instruction at
10271 if (!aarch64_bti_enabled ())
10272 assemble_aligned_integer (4, const0_rtx
);
10274 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
10275 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
10279 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
10281 rtx fnaddr
, mem
, a_tramp
;
10282 const int tramp_code_sz
= 16;
10284 /* Don't need to copy the trailing D-words, we fill those in below. */
10285 emit_block_move (m_tramp
, assemble_trampoline_template (),
10286 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
10287 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
10288 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
10289 if (GET_MODE (fnaddr
) != ptr_mode
)
10290 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
10291 emit_move_insn (mem
, fnaddr
);
10293 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
10294 emit_move_insn (mem
, chain_value
);
10296 /* XXX We should really define a "clear_cache" pattern and use
10297 gen_clear_cache(). */
10298 a_tramp
= XEXP (m_tramp
, 0);
10299 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
10300 LCT_NORMAL
, VOIDmode
, a_tramp
, ptr_mode
,
10301 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
10305 static unsigned char
10306 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
10308 /* ??? Logically we should only need to provide a value when
10309 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10310 can hold MODE, but at the moment we need to handle all modes.
10311 Just ignore any runtime parts for registers that can't store them. */
10312 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
10313 unsigned int nregs
, vec_flags
;
10316 case TAILCALL_ADDR_REGS
:
10320 case POINTER_AND_FP_REGS
:
10324 vec_flags
= aarch64_classify_vector_mode (mode
);
10325 if ((vec_flags
& VEC_SVE_DATA
)
10326 && constant_multiple_p (GET_MODE_SIZE (mode
),
10327 aarch64_vl_bytes (mode
, vec_flags
), &nregs
))
10329 return (vec_flags
& VEC_ADVSIMD
10330 ? CEIL (lowest_size
, UNITS_PER_VREG
)
10331 : CEIL (lowest_size
, UNITS_PER_WORD
));
10337 case PR_AND_FFR_REGS
:
10346 gcc_unreachable ();
10350 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
10352 if (regclass
== POINTER_REGS
)
10353 return GENERAL_REGS
;
10355 if (regclass
== STACK_REG
)
10358 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
10364 /* Register eliminiation can result in a request for
10365 SP+constant->FP_REGS. We cannot support such operations which
10366 use SP as source and an FP_REG as destination, so reject out
10368 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
10370 rtx lhs
= XEXP (x
, 0);
10372 /* Look through a possible SUBREG introduced by ILP32. */
10373 if (GET_CODE (lhs
) == SUBREG
)
10374 lhs
= SUBREG_REG (lhs
);
10376 gcc_assert (REG_P (lhs
));
10377 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
/* Implement ASM_OUTPUT_LABELREF.  Print NAME to F with the assembler's
   user-label prefix (%U).  */
void
aarch64_asm_output_labelref (FILE* f, const char *name)
{
  asm_fprintf (f, "%U%s", name);
}
10392 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
10394 if (priority
== DEFAULT_INIT_PRIORITY
)
10395 default_ctor_section_asm_out_constructor (symbol
, priority
);
10399 /* While priority is known to be in range [0, 65535], so 18 bytes
10400 would be enough, the compiler might not know that. To avoid
10401 -Wformat-truncation false positive, use a larger size. */
10403 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
10404 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
10405 switch_to_section (s
);
10406 assemble_align (POINTER_SIZE
);
10407 assemble_aligned_integer (POINTER_BYTES
, symbol
);
10412 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
10414 if (priority
== DEFAULT_INIT_PRIORITY
)
10415 default_dtor_section_asm_out_destructor (symbol
, priority
);
10419 /* While priority is known to be in range [0, 65535], so 18 bytes
10420 would be enough, the compiler might not know that. To avoid
10421 -Wformat-truncation false positive, use a larger size. */
10423 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
10424 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
10425 switch_to_section (s
);
10426 assemble_align (POINTER_SIZE
);
10427 assemble_aligned_integer (POINTER_BYTES
, symbol
);
10432 aarch64_output_casesi (rtx
*operands
)
10436 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
10438 static const char *const patterns
[4][2] =
10441 "ldrb\t%w3, [%0,%w1,uxtw]",
10442 "add\t%3, %4, %w3, sxtb #2"
10445 "ldrh\t%w3, [%0,%w1,uxtw #1]",
10446 "add\t%3, %4, %w3, sxth #2"
10449 "ldr\t%w3, [%0,%w1,uxtw #2]",
10450 "add\t%3, %4, %w3, sxtw #2"
10452 /* We assume that DImode is only generated when not optimizing and
10453 that we don't really need 64-bit address offsets. That would
10454 imply an object file with 8GB of code in a single function! */
10456 "ldr\t%w3, [%0,%w1,uxtw #2]",
10457 "add\t%3, %4, %w3, sxtw #2"
10461 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
10463 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
10464 index
= exact_log2 (GET_MODE_SIZE (mode
));
10466 gcc_assert (index
>= 0 && index
<= 3);
10468 /* Need to implement table size reduction, by chaning the code below. */
10469 output_asm_insn (patterns
[index
][0], operands
);
10470 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
10471 snprintf (buf
, sizeof (buf
),
10472 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
10473 output_asm_insn (buf
, operands
);
10474 output_asm_insn (patterns
[index
][1], operands
);
10475 output_asm_insn ("br\t%3", operands
);
10476 assemble_label (asm_out_file
, label
);
10481 /* Return size in bits of an arithmetic operand which is shifted/scaled and
10482 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
10486 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
10488 if (shift
>= 0 && shift
<= 3)
10491 for (size
= 8; size
<= 32; size
*= 2)
10493 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
10494 if (mask
== bits
<< shift
)
10501 /* Constant pools are per function only when PC relative
10502 literal loads are true or we are in the large memory
10506 aarch64_can_use_per_function_literal_pools_p (void)
10508 return (aarch64_pcrelative_literal_loads
10509 || aarch64_cmodel
== AARCH64_CMODEL_LARGE
);
10513 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
10515 /* We can't use blocks for constants when we're using a per-function
10517 return !aarch64_can_use_per_function_literal_pools_p ();
10520 /* Select appropriate section for constants depending
10521 on where we place literal pools. */
10524 aarch64_select_rtx_section (machine_mode mode
,
10526 unsigned HOST_WIDE_INT align
)
10528 if (aarch64_can_use_per_function_literal_pools_p ())
10529 return function_section (current_function_decl
);
10531 return default_elf_select_rtx_section (mode
, x
, align
);
10534 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
10536 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
10537 HOST_WIDE_INT offset
)
10539 /* When using per-function literal pools, we must ensure that any code
10540 section is aligned to the minimal instruction length, lest we get
10541 errors from the assembler re "unaligned instructions". */
10542 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
10543 ASM_OUTPUT_ALIGN (f
, 2);
10548 /* Helper function for rtx cost calculation. Strip a shift expression
10549 from X. Returns the inner operand if successful, or the original
10550 expression on failure. */
10552 aarch64_strip_shift (rtx x
)
10556 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
10557 we can convert both to ROR during final output. */
10558 if ((GET_CODE (op
) == ASHIFT
10559 || GET_CODE (op
) == ASHIFTRT
10560 || GET_CODE (op
) == LSHIFTRT
10561 || GET_CODE (op
) == ROTATERT
10562 || GET_CODE (op
) == ROTATE
)
10563 && CONST_INT_P (XEXP (op
, 1)))
10564 return XEXP (op
, 0);
10566 if (GET_CODE (op
) == MULT
10567 && CONST_INT_P (XEXP (op
, 1))
10568 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
10569 return XEXP (op
, 0);
10574 /* Helper function for rtx cost calculation. Strip an extend
10575 expression from X. Returns the inner operand if successful, or the
10576 original expression on failure. We deal with a number of possible
10577 canonicalization variations here. If STRIP_SHIFT is true, then
10578 we can strip off a shift also. */
10580 aarch64_strip_extend (rtx x
, bool strip_shift
)
10582 scalar_int_mode mode
;
10585 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
10588 /* Zero and sign extraction of a widened value. */
10589 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
10590 && XEXP (op
, 2) == const0_rtx
10591 && GET_CODE (XEXP (op
, 0)) == MULT
10592 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
10594 return XEXP (XEXP (op
, 0), 0);
10596 /* It can also be represented (for zero-extend) as an AND with an
10598 if (GET_CODE (op
) == AND
10599 && GET_CODE (XEXP (op
, 0)) == MULT
10600 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
10601 && CONST_INT_P (XEXP (op
, 1))
10602 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
10603 INTVAL (XEXP (op
, 1))) != 0)
10604 return XEXP (XEXP (op
, 0), 0);
10606 /* Now handle extended register, as this may also have an optional
10607 left shift by 1..4. */
10609 && GET_CODE (op
) == ASHIFT
10610 && CONST_INT_P (XEXP (op
, 1))
10611 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
10614 if (GET_CODE (op
) == ZERO_EXTEND
10615 || GET_CODE (op
) == SIGN_EXTEND
)
10624 /* Return true iff CODE is a shift supported in combination
10625 with arithmetic instructions. */
10628 aarch64_shift_p (enum rtx_code code
)
10630 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
10634 /* Return true iff X is a cheap shift without a sign extend. */
10637 aarch64_cheap_mult_shift_p (rtx x
)
10644 if (!(aarch64_tune_params
.extra_tuning_flags
10645 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
10648 if (GET_CODE (op0
) == SIGN_EXTEND
)
10651 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
10652 && UINTVAL (op1
) <= 4)
10655 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
10658 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
10660 if (l2
> 0 && l2
<= 4)
10666 /* Helper function for rtx cost calculation. Calculate the cost of
10667 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
10668 Return the calculated cost of the expression, recursing manually in to
10669 operands where needed. */
10672 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
10675 const struct cpu_cost_table
*extra_cost
10676 = aarch64_tune_params
.insn_extra_cost
;
10678 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
10679 machine_mode mode
= GET_MODE (x
);
10681 gcc_checking_assert (code
== MULT
);
10686 if (VECTOR_MODE_P (mode
))
10687 mode
= GET_MODE_INNER (mode
);
10689 /* Integer multiply/fma. */
10690 if (GET_MODE_CLASS (mode
) == MODE_INT
)
10692 /* The multiply will be canonicalized as a shift, cost it as such. */
10693 if (aarch64_shift_p (GET_CODE (x
))
10694 || (CONST_INT_P (op1
)
10695 && exact_log2 (INTVAL (op1
)) > 0))
10697 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
10698 || GET_CODE (op0
) == SIGN_EXTEND
;
10703 /* If the shift is considered cheap,
10704 then don't add any cost. */
10705 if (aarch64_cheap_mult_shift_p (x
))
10707 else if (REG_P (op1
))
10708 /* ARITH + shift-by-register. */
10709 cost
+= extra_cost
->alu
.arith_shift_reg
;
10710 else if (is_extend
)
10711 /* ARITH + extended register. We don't have a cost field
10712 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
10713 cost
+= extra_cost
->alu
.extend_arith
;
10715 /* ARITH + shift-by-immediate. */
10716 cost
+= extra_cost
->alu
.arith_shift
;
10719 /* LSL (immediate). */
10720 cost
+= extra_cost
->alu
.shift
;
10723 /* Strip extends as we will have costed them in the case above. */
10725 op0
= aarch64_strip_extend (op0
, true);
10727 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
10732 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
10733 compound and let the below cases handle it. After all, MNEG is a
10734 special-case alias of MSUB. */
10735 if (GET_CODE (op0
) == NEG
)
10737 op0
= XEXP (op0
, 0);
10741 /* Integer multiplies or FMAs have zero/sign extending variants. */
10742 if ((GET_CODE (op0
) == ZERO_EXTEND
10743 && GET_CODE (op1
) == ZERO_EXTEND
)
10744 || (GET_CODE (op0
) == SIGN_EXTEND
10745 && GET_CODE (op1
) == SIGN_EXTEND
))
10747 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
10748 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
10753 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
10754 cost
+= extra_cost
->mult
[0].extend_add
;
10756 /* MUL/SMULL/UMULL. */
10757 cost
+= extra_cost
->mult
[0].extend
;
10763 /* This is either an integer multiply or a MADD. In both cases
10764 we want to recurse and cost the operands. */
10765 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
10766 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
10772 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
10775 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
10784 /* Floating-point FMA/FMUL can also support negations of the
10785 operands, unless the rounding mode is upward or downward in
10786 which case FNMUL is different than FMUL with operand negation. */
10787 bool neg0
= GET_CODE (op0
) == NEG
;
10788 bool neg1
= GET_CODE (op1
) == NEG
;
10789 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
10792 op0
= XEXP (op0
, 0);
10794 op1
= XEXP (op1
, 0);
10798 /* FMADD/FNMADD/FNMSUB/FMSUB. */
10799 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
10802 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
10805 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
10806 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
10812 aarch64_address_cost (rtx x
,
10814 addr_space_t as ATTRIBUTE_UNUSED
,
10817 enum rtx_code c
= GET_CODE (x
);
10818 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
10819 struct aarch64_address_info info
;
10823 if (!aarch64_classify_address (&info
, x
, mode
, false))
10825 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
10827 /* This is a CONST or SYMBOL ref which will be split
10828 in a different way depending on the code model in use.
10829 Cost it through the generic infrastructure. */
10830 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
10831 /* Divide through by the cost of one instruction to
10832 bring it to the same units as the address costs. */
10833 cost_symbol_ref
/= COSTS_N_INSNS (1);
10834 /* The cost is then the cost of preparing the address,
10835 followed by an immediate (possibly 0) offset. */
10836 return cost_symbol_ref
+ addr_cost
->imm_offset
;
10840 /* This is most likely a jump table from a case
10842 return addr_cost
->register_offset
;
10848 case ADDRESS_LO_SUM
:
10849 case ADDRESS_SYMBOLIC
:
10850 case ADDRESS_REG_IMM
:
10851 cost
+= addr_cost
->imm_offset
;
10854 case ADDRESS_REG_WB
:
10855 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
10856 cost
+= addr_cost
->pre_modify
;
10857 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
10858 cost
+= addr_cost
->post_modify
;
10860 gcc_unreachable ();
10864 case ADDRESS_REG_REG
:
10865 cost
+= addr_cost
->register_offset
;
10868 case ADDRESS_REG_SXTW
:
10869 cost
+= addr_cost
->register_sextend
;
10872 case ADDRESS_REG_UXTW
:
10873 cost
+= addr_cost
->register_zextend
;
10877 gcc_unreachable ();
10881 if (info
.shift
> 0)
10883 /* For the sake of calculating the cost of the shifted register
10884 component, we can treat same sized modes in the same way. */
10885 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
10886 cost
+= addr_cost
->addr_scale_costs
.hi
;
10887 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
10888 cost
+= addr_cost
->addr_scale_costs
.si
;
10889 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
10890 cost
+= addr_cost
->addr_scale_costs
.di
;
10892 /* We can't tell, or this is a 128-bit vector. */
10893 cost
+= addr_cost
->addr_scale_costs
.ti
;
10899 /* Return the cost of a branch. If SPEED_P is true then the compiler is
10900 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
10904 aarch64_branch_cost (bool speed_p
, bool predictable_p
)
10906 /* When optimizing for speed, use the cost of unpredictable branches. */
10907 const struct cpu_branch_cost
*branch_costs
=
10908 aarch64_tune_params
.branch_costs
;
10910 if (!speed_p
|| predictable_p
)
10911 return branch_costs
->predictable
;
10913 return branch_costs
->unpredictable
;
10916 /* Return true if the RTX X in mode MODE is a zero or sign extract
10917 usable in an ADD or SUB (extended register) instruction. */
10919 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
10921 /* Catch add with a sign extract.
10922 This is add_<optab><mode>_multp2. */
10923 if (GET_CODE (x
) == SIGN_EXTRACT
10924 || GET_CODE (x
) == ZERO_EXTRACT
)
10926 rtx op0
= XEXP (x
, 0);
10927 rtx op1
= XEXP (x
, 1);
10928 rtx op2
= XEXP (x
, 2);
10930 if (GET_CODE (op0
) == MULT
10931 && CONST_INT_P (op1
)
10932 && op2
== const0_rtx
10933 && CONST_INT_P (XEXP (op0
, 1))
10934 && aarch64_is_extend_from_extract (mode
,
10941 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10943 else if (GET_CODE (x
) == SIGN_EXTEND
10944 || GET_CODE (x
) == ZERO_EXTEND
)
10945 return REG_P (XEXP (x
, 0));
10951 aarch64_frint_unspec_p (unsigned int u
)
10955 case UNSPEC_FRINTZ
:
10956 case UNSPEC_FRINTP
:
10957 case UNSPEC_FRINTM
:
10958 case UNSPEC_FRINTA
:
10959 case UNSPEC_FRINTN
:
10960 case UNSPEC_FRINTX
:
10961 case UNSPEC_FRINTI
:
10969 /* Return true iff X is an rtx that will match an extr instruction
10970 i.e. as described in the *extr<mode>5_insn family of patterns.
10971 OP0 and OP1 will be set to the operands of the shifts involved
10972 on success and will be NULL_RTX otherwise. */
10975 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
10978 scalar_int_mode mode
;
10979 if (!is_a
<scalar_int_mode
> (GET_MODE (x
), &mode
))
10982 *res_op0
= NULL_RTX
;
10983 *res_op1
= NULL_RTX
;
10985 if (GET_CODE (x
) != IOR
)
10991 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
10992 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
10994 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10995 if (GET_CODE (op1
) == ASHIFT
)
10996 std::swap (op0
, op1
);
10998 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
11001 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
11002 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
11004 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
11005 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
11007 *res_op0
= XEXP (op0
, 0);
11008 *res_op1
= XEXP (op1
, 0);
11016 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11017 storing it in *COST. Result is true if the total cost of the operation
11018 has now been calculated. */
11020 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
11024 enum rtx_code cmpcode
;
11026 if (COMPARISON_P (op0
))
11028 inner
= XEXP (op0
, 0);
11029 comparator
= XEXP (op0
, 1);
11030 cmpcode
= GET_CODE (op0
);
11035 comparator
= const0_rtx
;
11039 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
11041 /* Conditional branch. */
11042 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
11046 if (cmpcode
== NE
|| cmpcode
== EQ
)
11048 if (comparator
== const0_rtx
)
11050 /* TBZ/TBNZ/CBZ/CBNZ. */
11051 if (GET_CODE (inner
) == ZERO_EXTRACT
)
11053 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
11054 ZERO_EXTRACT
, 0, speed
);
11057 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
11062 else if (cmpcode
== LT
|| cmpcode
== GE
)
11065 if (comparator
== const0_rtx
)
11070 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
11073 if (GET_CODE (op1
) == COMPARE
)
11075 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
11076 if (XEXP (op1
, 1) == const0_rtx
)
11080 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
11081 const struct cpu_cost_table
*extra_cost
11082 = aarch64_tune_params
.insn_extra_cost
;
11084 if (GET_MODE_CLASS (mode
) == MODE_INT
)
11085 *cost
+= extra_cost
->alu
.arith
;
11087 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
11092 /* It's a conditional operation based on the status flags,
11093 so it must be some flavor of CSEL. */
11095 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
11096 if (GET_CODE (op1
) == NEG
11097 || GET_CODE (op1
) == NOT
11098 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
11099 op1
= XEXP (op1
, 0);
11100 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
11102 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
11103 op1
= XEXP (op1
, 0);
11104 op2
= XEXP (op2
, 0);
11107 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
11108 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
11112 /* We don't know what this is, cost all operands. */
11116 /* Check whether X is a bitfield operation of the form shift + extend that
11117 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
11118 operand to which the bitfield operation is applied. Otherwise return
11122 aarch64_extend_bitfield_pattern_p (rtx x
)
11124 rtx_code outer_code
= GET_CODE (x
);
11125 machine_mode outer_mode
= GET_MODE (x
);
11127 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
11128 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
11131 rtx inner
= XEXP (x
, 0);
11132 rtx_code inner_code
= GET_CODE (inner
);
11133 machine_mode inner_mode
= GET_MODE (inner
);
11136 switch (inner_code
)
11139 if (CONST_INT_P (XEXP (inner
, 1))
11140 && (inner_mode
== QImode
|| inner_mode
== HImode
))
11141 op
= XEXP (inner
, 0);
11144 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
11145 && (inner_mode
== QImode
|| inner_mode
== HImode
))
11146 op
= XEXP (inner
, 0);
11149 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
11150 && (inner_mode
== QImode
|| inner_mode
== HImode
))
11151 op
= XEXP (inner
, 0);
11160 /* Return true if the mask and a shift amount from an RTX of the form
11161 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
11162 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
11165 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode
, rtx mask
,
11168 return CONST_INT_P (mask
) && CONST_INT_P (shft_amnt
)
11169 && INTVAL (shft_amnt
) < GET_MODE_BITSIZE (mode
)
11170 && exact_log2 ((INTVAL (mask
) >> INTVAL (shft_amnt
)) + 1) >= 0
11172 & ((HOST_WIDE_INT_1U
<< INTVAL (shft_amnt
)) - 1)) == 0;
11175 /* Return true if the masks and a shift amount from an RTX of the form
11176 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
11177 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
11180 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode
,
11181 unsigned HOST_WIDE_INT mask1
,
11182 unsigned HOST_WIDE_INT shft_amnt
,
11183 unsigned HOST_WIDE_INT mask2
)
11185 unsigned HOST_WIDE_INT t
;
11187 /* Verify that there is no overlap in what bits are set in the two masks. */
11188 if (mask1
!= ~mask2
)
11191 /* Verify that mask2 is not all zeros or ones. */
11192 if (mask2
== 0 || mask2
== HOST_WIDE_INT_M1U
)
11195 /* The shift amount should always be less than the mode size. */
11196 gcc_assert (shft_amnt
< GET_MODE_BITSIZE (mode
));
11198 /* Verify that the mask being shifted is contiguous and would be in the
11199 least significant bits after shifting by shft_amnt. */
11200 t
= mask2
+ (HOST_WIDE_INT_1U
<< shft_amnt
);
11201 return (t
== (t
& -t
));
11204 /* Calculate the cost of calculating X, storing it in *COST. Result
11205 is true if the total cost of the operation has now been calculated. */
11207 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
11208 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
11211 const struct cpu_cost_table
*extra_cost
11212 = aarch64_tune_params
.insn_extra_cost
;
11213 int code
= GET_CODE (x
);
11214 scalar_int_mode int_mode
;
11216 /* By default, assume that everything has equivalent cost to the
11217 cheapest instruction. Any additional costs are applied as a delta
11218 above this default. */
11219 *cost
= COSTS_N_INSNS (1);
11224 /* The cost depends entirely on the operands to SET. */
11226 op0
= SET_DEST (x
);
11229 switch (GET_CODE (op0
))
11234 rtx address
= XEXP (op0
, 0);
11235 if (VECTOR_MODE_P (mode
))
11236 *cost
+= extra_cost
->ldst
.storev
;
11237 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11238 *cost
+= extra_cost
->ldst
.store
;
11239 else if (mode
== SFmode
)
11240 *cost
+= extra_cost
->ldst
.storef
;
11241 else if (mode
== DFmode
)
11242 *cost
+= extra_cost
->ldst
.stored
;
11245 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
11249 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
11253 if (! REG_P (SUBREG_REG (op0
)))
11254 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
11256 /* Fall through. */
11258 /* The cost is one per vector-register copied. */
11259 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
11261 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
11262 *cost
= COSTS_N_INSNS (nregs
);
11264 /* const0_rtx is in general free, but we will use an
11265 instruction to set a register to 0. */
11266 else if (REG_P (op1
) || op1
== const0_rtx
)
11268 /* The cost is 1 per register copied. */
11269 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
11270 *cost
= COSTS_N_INSNS (nregs
);
11273 /* Cost is just the cost of the RHS of the set. */
11274 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
11279 /* Bit-field insertion. Strip any redundant widening of
11280 the RHS to meet the width of the target. */
11281 if (GET_CODE (op1
) == SUBREG
)
11282 op1
= SUBREG_REG (op1
);
11283 if ((GET_CODE (op1
) == ZERO_EXTEND
11284 || GET_CODE (op1
) == SIGN_EXTEND
)
11285 && CONST_INT_P (XEXP (op0
, 1))
11286 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
11287 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
11288 op1
= XEXP (op1
, 0);
11290 if (CONST_INT_P (op1
))
11292 /* MOV immediate is assumed to always be cheap. */
11293 *cost
= COSTS_N_INSNS (1);
11299 *cost
+= extra_cost
->alu
.bfi
;
11300 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
11306 /* We can't make sense of this, assume default cost. */
11307 *cost
= COSTS_N_INSNS (1);
11313 /* If an instruction can incorporate a constant within the
11314 instruction, the instruction's expression avoids calling
11315 rtx_cost() on the constant. If rtx_cost() is called on a
11316 constant, then it is usually because the constant must be
11317 moved into a register by one or more instructions.
11319 The exception is constant 0, which can be expressed
11320 as XZR/WZR and is therefore free. The exception to this is
11321 if we have (set (reg) (const0_rtx)) in which case we must cost
11322 the move. However, we can catch that when we cost the SET, so
11323 we don't need to consider that here. */
11324 if (x
== const0_rtx
)
11328 /* To an approximation, building any other constant is
11329 proportionally expensive to the number of instructions
11330 required to build that constant. This is true whether we
11331 are compiling for SPEED or otherwise. */
11332 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
11333 int_mode
= word_mode
;
11334 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
11335 (NULL_RTX
, x
, false, int_mode
));
11341 /* First determine number of instructions to do the move
11342 as an integer constant. */
11343 if (!aarch64_float_const_representable_p (x
)
11344 && !aarch64_can_const_movi_rtx_p (x
, mode
)
11345 && aarch64_float_const_rtx_p (x
))
11347 unsigned HOST_WIDE_INT ival
;
11348 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
11349 gcc_assert (succeed
);
11351 scalar_int_mode imode
= (mode
== HFmode
11353 : int_mode_for_mode (mode
).require ());
11354 int ncost
= aarch64_internal_mov_immediate
11355 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
11356 *cost
+= COSTS_N_INSNS (ncost
);
11362 /* mov[df,sf]_aarch64. */
11363 if (aarch64_float_const_representable_p (x
))
11364 /* FMOV (scalar immediate). */
11365 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
11366 else if (!aarch64_float_const_zero_rtx_p (x
))
11368 /* This will be a load from memory. */
11369 if (mode
== DFmode
)
11370 *cost
+= extra_cost
->ldst
.loadd
;
11372 *cost
+= extra_cost
->ldst
.loadf
;
11375 /* Otherwise this is +0.0. We get this using MOVI d0, #0
11376 or MOV v0.s[0], wzr - neither of which are modeled by the
11377 cost tables. Just use the default cost. */
11387 /* For loads we want the base cost of a load, plus an
11388 approximation for the additional cost of the addressing
11390 rtx address
= XEXP (x
, 0);
11391 if (VECTOR_MODE_P (mode
))
11392 *cost
+= extra_cost
->ldst
.loadv
;
11393 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11394 *cost
+= extra_cost
->ldst
.load
;
11395 else if (mode
== SFmode
)
11396 *cost
+= extra_cost
->ldst
.loadf
;
11397 else if (mode
== DFmode
)
11398 *cost
+= extra_cost
->ldst
.loadd
;
11401 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
11410 if (VECTOR_MODE_P (mode
))
11415 *cost
+= extra_cost
->vect
.alu
;
11420 if (GET_MODE_CLASS (mode
) == MODE_INT
)
11422 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
11423 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
11426 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
11430 /* Cost this as SUB wzr, X. */
11431 op0
= CONST0_RTX (mode
);
11436 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11438 /* Support (neg(fma...)) as a single instruction only if
11439 sign of zeros is unimportant. This matches the decision
11440 making in aarch64.md. */
11441 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
11444 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
11447 if (GET_CODE (op0
) == MULT
)
11450 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
11455 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
11465 if (VECTOR_MODE_P (mode
))
11466 *cost
+= extra_cost
->vect
.alu
;
11468 *cost
+= extra_cost
->alu
.clz
;
11477 if (op1
== const0_rtx
11478 && GET_CODE (op0
) == AND
)
11481 mode
= GET_MODE (op0
);
11485 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
11487 /* TODO: A write to the CC flags possibly costs extra, this
11488 needs encoding in the cost tables. */
11490 mode
= GET_MODE (op0
);
11492 if (GET_CODE (op0
) == AND
)
11498 if (GET_CODE (op0
) == PLUS
)
11500 /* ADDS (and CMN alias). */
11505 if (GET_CODE (op0
) == MINUS
)
11512 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
11513 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
11514 && CONST_INT_P (XEXP (op0
, 2)))
11516 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
11517 Handle it here directly rather than going to cost_logic
11518 since we know the immediate generated for the TST is valid
11519 so we can avoid creating an intermediate rtx for it only
11520 for costing purposes. */
11522 *cost
+= extra_cost
->alu
.logical
;
11524 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
11525 ZERO_EXTRACT
, 0, speed
);
11529 if (GET_CODE (op1
) == NEG
)
11533 *cost
+= extra_cost
->alu
.arith
;
11535 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
11536 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
11542 Compare can freely swap the order of operands, and
11543 canonicalization puts the more complex operation first.
11544 But the integer MINUS logic expects the shift/extend
11545 operation in op1. */
11547 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
11555 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
11559 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
11561 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
11563 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
11564 /* FCMP supports constant 0.0 for no extra cost. */
11570 if (VECTOR_MODE_P (mode
))
11572 /* Vector compare. */
11574 *cost
+= extra_cost
->vect
.alu
;
11576 if (aarch64_float_const_zero_rtx_p (op1
))
11578 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
11592 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
11594 /* Detect valid immediates. */
11595 if ((GET_MODE_CLASS (mode
) == MODE_INT
11596 || (GET_MODE_CLASS (mode
) == MODE_CC
11597 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
11598 && CONST_INT_P (op1
)
11599 && aarch64_uimm12_shift (INTVAL (op1
)))
11602 /* SUB(S) (immediate). */
11603 *cost
+= extra_cost
->alu
.arith
;
11607 /* Look for SUB (extended register). */
11608 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
11609 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
11612 *cost
+= extra_cost
->alu
.extend_arith
;
11614 op1
= aarch64_strip_extend (op1
, true);
11615 *cost
+= rtx_cost (op1
, VOIDmode
,
11616 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
11620 rtx new_op1
= aarch64_strip_extend (op1
, false);
11622 /* Cost this as an FMA-alike operation. */
11623 if ((GET_CODE (new_op1
) == MULT
11624 || aarch64_shift_p (GET_CODE (new_op1
)))
11625 && code
!= COMPARE
)
11627 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
11628 (enum rtx_code
) code
,
11633 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
11637 if (VECTOR_MODE_P (mode
))
11640 *cost
+= extra_cost
->vect
.alu
;
11642 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11645 *cost
+= extra_cost
->alu
.arith
;
11647 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11650 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11664 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
11665 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
11668 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
11669 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
11673 if (GET_MODE_CLASS (mode
) == MODE_INT
11674 && (aarch64_plus_immediate (op1
, mode
)
11675 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
11677 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
11680 /* ADD (immediate). */
11681 *cost
+= extra_cost
->alu
.arith
;
11685 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
11687 /* Look for ADD (extended register). */
11688 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
11689 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
11692 *cost
+= extra_cost
->alu
.extend_arith
;
11694 op0
= aarch64_strip_extend (op0
, true);
11695 *cost
+= rtx_cost (op0
, VOIDmode
,
11696 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
11700 /* Strip any extend, leave shifts behind as we will
11701 cost them through mult_cost. */
11702 new_op0
= aarch64_strip_extend (op0
, false);
11704 if (GET_CODE (new_op0
) == MULT
11705 || aarch64_shift_p (GET_CODE (new_op0
)))
11707 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
11712 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
11716 if (VECTOR_MODE_P (mode
))
11719 *cost
+= extra_cost
->vect
.alu
;
11721 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11724 *cost
+= extra_cost
->alu
.arith
;
11726 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11729 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11736 *cost
= COSTS_N_INSNS (1);
11740 if (VECTOR_MODE_P (mode
))
11741 *cost
+= extra_cost
->vect
.alu
;
11743 *cost
+= extra_cost
->alu
.rev
;
11748 if (aarch_rev16_p (x
))
11750 *cost
= COSTS_N_INSNS (1);
11754 if (VECTOR_MODE_P (mode
))
11755 *cost
+= extra_cost
->vect
.alu
;
11757 *cost
+= extra_cost
->alu
.rev
;
11762 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
11764 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
11765 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
11767 *cost
+= extra_cost
->alu
.shift
;
11771 /* Fall through. */
11778 if (VECTOR_MODE_P (mode
))
11781 *cost
+= extra_cost
->vect
.alu
;
11786 && GET_CODE (op0
) == MULT
11787 && CONST_INT_P (XEXP (op0
, 1))
11788 && CONST_INT_P (op1
)
11789 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
11790 INTVAL (op1
)) != 0)
11792 /* This is a UBFM/SBFM. */
11793 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
11795 *cost
+= extra_cost
->alu
.bfx
;
11799 if (is_int_mode (mode
, &int_mode
))
11801 if (CONST_INT_P (op1
))
11803 /* We have a mask + shift version of a UBFIZ
11804 i.e. the *andim_ashift<mode>_bfiz pattern. */
11805 if (GET_CODE (op0
) == ASHIFT
11806 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
11809 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
11810 (enum rtx_code
) code
, 0, speed
);
11812 *cost
+= extra_cost
->alu
.bfx
;
11816 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
11818 /* We possibly get the immediate for free, this is not
11820 *cost
+= rtx_cost (op0
, int_mode
,
11821 (enum rtx_code
) code
, 0, speed
);
11823 *cost
+= extra_cost
->alu
.logical
;
11832 /* Handle ORN, EON, or BIC. */
11833 if (GET_CODE (op0
) == NOT
)
11834 op0
= XEXP (op0
, 0);
11836 new_op0
= aarch64_strip_shift (op0
);
11838 /* If we had a shift on op0 then this is a logical-shift-
11839 by-register/immediate operation. Otherwise, this is just
11840 a logical operation. */
11843 if (new_op0
!= op0
)
11845 /* Shift by immediate. */
11846 if (CONST_INT_P (XEXP (op0
, 1)))
11847 *cost
+= extra_cost
->alu
.log_shift
;
11849 *cost
+= extra_cost
->alu
.log_shift_reg
;
11852 *cost
+= extra_cost
->alu
.logical
;
11855 /* In both cases we want to cost both operands. */
11856 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
11858 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
11868 op0
= aarch64_strip_shift (x
);
11870 if (VECTOR_MODE_P (mode
))
11873 *cost
+= extra_cost
->vect
.alu
;
11877 /* MVN-shifted-reg. */
11880 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
11883 *cost
+= extra_cost
->alu
.log_shift
;
11887 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
11888 Handle the second form here taking care that 'a' in the above can
11890 else if (GET_CODE (op0
) == XOR
)
11892 rtx newop0
= XEXP (op0
, 0);
11893 rtx newop1
= XEXP (op0
, 1);
11894 rtx op0_stripped
= aarch64_strip_shift (newop0
);
11896 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
11897 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
11901 if (op0_stripped
!= newop0
)
11902 *cost
+= extra_cost
->alu
.log_shift
;
11904 *cost
+= extra_cost
->alu
.logical
;
11911 *cost
+= extra_cost
->alu
.logical
;
11918 /* If a value is written in SI mode, then zero extended to DI
11919 mode, the operation will in general be free as a write to
11920 a 'w' register implicitly zeroes the upper bits of an 'x'
11921 register. However, if this is
11923 (set (reg) (zero_extend (reg)))
11925 we must cost the explicit register move. */
11927 && GET_MODE (op0
) == SImode
11930 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
11932 /* If OP_COST is non-zero, then the cost of the zero extend
11933 is effectively the cost of the inner operation. Otherwise
11934 we have a MOV instruction and we take the cost from the MOV
11935 itself. This is true independently of whether we are
11936 optimizing for space or time. */
11942 else if (MEM_P (op0
))
11944 /* All loads can zero extend to any size for free. */
11945 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
11949 op0
= aarch64_extend_bitfield_pattern_p (x
);
11952 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
11954 *cost
+= extra_cost
->alu
.bfx
;
11960 if (VECTOR_MODE_P (mode
))
11963 *cost
+= extra_cost
->vect
.alu
;
11967 /* We generate an AND instead of UXTB/UXTH. */
11968 *cost
+= extra_cost
->alu
.logical
;
11974 if (MEM_P (XEXP (x
, 0)))
11979 rtx address
= XEXP (XEXP (x
, 0), 0);
11980 *cost
+= extra_cost
->ldst
.load_sign_extend
;
11983 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
11989 op0
= aarch64_extend_bitfield_pattern_p (x
);
11992 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
11994 *cost
+= extra_cost
->alu
.bfx
;
12000 if (VECTOR_MODE_P (mode
))
12001 *cost
+= extra_cost
->vect
.alu
;
12003 *cost
+= extra_cost
->alu
.extend
;
12011 if (CONST_INT_P (op1
))
12015 if (VECTOR_MODE_P (mode
))
12017 /* Vector shift (immediate). */
12018 *cost
+= extra_cost
->vect
.alu
;
12022 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
12024 *cost
+= extra_cost
->alu
.shift
;
12028 /* We can incorporate zero/sign extend for free. */
12029 if (GET_CODE (op0
) == ZERO_EXTEND
12030 || GET_CODE (op0
) == SIGN_EXTEND
)
12031 op0
= XEXP (op0
, 0);
12033 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
12038 if (VECTOR_MODE_P (mode
))
12041 /* Vector shift (register). */
12042 *cost
+= extra_cost
->vect
.alu
;
12048 *cost
+= extra_cost
->alu
.shift_reg
;
12050 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
12051 && CONST_INT_P (XEXP (op1
, 1))
12052 && known_eq (INTVAL (XEXP (op1
, 1)),
12053 GET_MODE_BITSIZE (mode
) - 1))
12055 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
12056 /* We already demanded XEXP (op1, 0) to be REG_P, so
12057 don't recurse into it. */
12061 return false; /* All arguments need to be in registers. */
12071 if (CONST_INT_P (op1
))
12073 /* ASR (immediate) and friends. */
12076 if (VECTOR_MODE_P (mode
))
12077 *cost
+= extra_cost
->vect
.alu
;
12079 *cost
+= extra_cost
->alu
.shift
;
12082 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
12087 if (VECTOR_MODE_P (mode
))
12090 /* Vector shift (register). */
12091 *cost
+= extra_cost
->vect
.alu
;
12096 /* ASR (register) and friends. */
12097 *cost
+= extra_cost
->alu
.shift_reg
;
12099 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
12100 && CONST_INT_P (XEXP (op1
, 1))
12101 && known_eq (INTVAL (XEXP (op1
, 1)),
12102 GET_MODE_BITSIZE (mode
) - 1))
12104 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
12105 /* We already demanded XEXP (op1, 0) to be REG_P, so
12106 don't recurse into it. */
12110 return false; /* All arguments need to be in registers. */
12115 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
12116 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
12120 *cost
+= extra_cost
->ldst
.load
;
12122 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
12123 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
12125 /* ADRP, followed by ADD. */
12126 *cost
+= COSTS_N_INSNS (1);
12128 *cost
+= 2 * extra_cost
->alu
.arith
;
12130 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
12131 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
12135 *cost
+= extra_cost
->alu
.arith
;
12140 /* One extra load instruction, after accessing the GOT. */
12141 *cost
+= COSTS_N_INSNS (1);
12143 *cost
+= extra_cost
->ldst
.load
;
12149 /* ADRP/ADD (immediate). */
12151 *cost
+= extra_cost
->alu
.arith
;
12159 if (VECTOR_MODE_P (mode
))
12160 *cost
+= extra_cost
->vect
.alu
;
12162 *cost
+= extra_cost
->alu
.bfx
;
12165 /* We can trust that the immediates used will be correct (there
12166 are no by-register forms), so we need only cost op0. */
12167 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
12171 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
12172 /* aarch64_rtx_mult_cost always handles recursion to its
12177 /* We can expand signed mod by power of 2 using a NEGS, two parallel
12178 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
12179 an unconditional negate. This case should only ever be reached through
12180 the set_smod_pow2_cheap check in expmed.c. */
12181 if (CONST_INT_P (XEXP (x
, 1))
12182 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
12183 && (mode
== SImode
|| mode
== DImode
))
12185 /* We expand to 4 instructions. Reset the baseline. */
12186 *cost
= COSTS_N_INSNS (4);
12189 *cost
+= 2 * extra_cost
->alu
.logical
12190 + 2 * extra_cost
->alu
.arith
;
12195 /* Fall-through. */
12199 /* Slighly prefer UMOD over SMOD. */
12200 if (VECTOR_MODE_P (mode
))
12201 *cost
+= extra_cost
->vect
.alu
;
12202 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
12203 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
12204 + extra_cost
->mult
[mode
== DImode
].idiv
12205 + (code
== MOD
? 1 : 0));
12207 return false; /* All arguments need to be in registers. */
12214 if (VECTOR_MODE_P (mode
))
12215 *cost
+= extra_cost
->vect
.alu
;
12216 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
12217 /* There is no integer SQRT, so only DIV and UDIV can get
12219 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
12220 /* Slighly prefer UDIV over SDIV. */
12221 + (code
== DIV
? 1 : 0));
12223 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
12225 return false; /* All arguments need to be in registers. */
12228 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
12229 XEXP (x
, 2), cost
, speed
);
12242 return false; /* All arguments must be in registers. */
12251 if (VECTOR_MODE_P (mode
))
12252 *cost
+= extra_cost
->vect
.alu
;
12254 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
12257 /* FMSUB, FNMADD, and FNMSUB are free. */
12258 if (GET_CODE (op0
) == NEG
)
12259 op0
= XEXP (op0
, 0);
12261 if (GET_CODE (op2
) == NEG
)
12262 op2
= XEXP (op2
, 0);
12264 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
12265 and the by-element operand as operand 0. */
12266 if (GET_CODE (op1
) == NEG
)
12267 op1
= XEXP (op1
, 0);
12269 /* Catch vector-by-element operations. The by-element operand can
12270 either be (vec_duplicate (vec_select (x))) or just
12271 (vec_select (x)), depending on whether we are multiplying by
12272 a vector or a scalar.
12274 Canonicalization is not very good in these cases, FMA4 will put the
12275 by-element operand as operand 0, FNMA4 will have it as operand 1. */
12276 if (GET_CODE (op0
) == VEC_DUPLICATE
)
12277 op0
= XEXP (op0
, 0);
12278 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
12279 op1
= XEXP (op1
, 0);
12281 if (GET_CODE (op0
) == VEC_SELECT
)
12282 op0
= XEXP (op0
, 0);
12283 else if (GET_CODE (op1
) == VEC_SELECT
)
12284 op1
= XEXP (op1
, 0);
12286 /* If the remaining parameters are not registers,
12287 get the cost to put them into registers. */
12288 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
12289 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
12290 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
12294 case UNSIGNED_FLOAT
:
12296 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
12302 if (VECTOR_MODE_P (mode
))
12304 /*Vector truncate. */
12305 *cost
+= extra_cost
->vect
.alu
;
12308 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
12312 case FLOAT_TRUNCATE
:
12315 if (VECTOR_MODE_P (mode
))
12317 /*Vector conversion. */
12318 *cost
+= extra_cost
->vect
.alu
;
12321 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
12328 /* Strip the rounding part. They will all be implemented
12329 by the fcvt* family of instructions anyway. */
12330 if (GET_CODE (x
) == UNSPEC
)
12332 unsigned int uns_code
= XINT (x
, 1);
12334 if (uns_code
== UNSPEC_FRINTA
12335 || uns_code
== UNSPEC_FRINTM
12336 || uns_code
== UNSPEC_FRINTN
12337 || uns_code
== UNSPEC_FRINTP
12338 || uns_code
== UNSPEC_FRINTZ
)
12339 x
= XVECEXP (x
, 0, 0);
12344 if (VECTOR_MODE_P (mode
))
12345 *cost
+= extra_cost
->vect
.alu
;
12347 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
12350 /* We can combine fmul by a power of 2 followed by a fcvt into a single
12351 fixed-point fcvt. */
12352 if (GET_CODE (x
) == MULT
12353 && ((VECTOR_MODE_P (mode
)
12354 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
12355 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
12357 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
12362 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
12366 if (VECTOR_MODE_P (mode
))
12368 /* ABS (vector). */
12370 *cost
+= extra_cost
->vect
.alu
;
12372 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12376 /* FABD, which is analogous to FADD. */
12377 if (GET_CODE (op0
) == MINUS
)
12379 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
12380 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
12382 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
12386 /* Simple FABS is analogous to FNEG. */
12388 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
12392 /* Integer ABS will either be split to
12393 two arithmetic instructions, or will be an ABS
12394 (scalar), which we don't model. */
12395 *cost
= COSTS_N_INSNS (2);
12397 *cost
+= 2 * extra_cost
->alu
.arith
;
12405 if (VECTOR_MODE_P (mode
))
12406 *cost
+= extra_cost
->vect
.alu
;
12409 /* FMAXNM/FMINNM/FMAX/FMIN.
12410 TODO: This may not be accurate for all implementations, but
12411 we do not model this in the cost tables. */
12412 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
12418 /* The floating point round to integer frint* instructions. */
12419 if (aarch64_frint_unspec_p (XINT (x
, 1)))
12422 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
12427 if (XINT (x
, 1) == UNSPEC_RBIT
)
12430 *cost
+= extra_cost
->alu
.rev
;
12438 /* Decompose <su>muldi3_highpart. */
12439 if (/* (truncate:DI */
12442 && GET_MODE (XEXP (x
, 0)) == TImode
12443 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
12445 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
12446 /* (ANY_EXTEND:TI (reg:DI))
12447 (ANY_EXTEND:TI (reg:DI))) */
12448 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
12449 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
12450 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
12451 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
12452 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
12453 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
12454 /* (const_int 64) */
12455 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
12456 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
12460 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
12461 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
12462 mode
, MULT
, 0, speed
);
12463 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
12464 mode
, MULT
, 1, speed
);
12468 /* Fall through. */
12474 && flag_aarch64_verbose_cost
)
12475 fprintf (dump_file
,
12476 "\nFailed to cost RTX. Assuming default cost.\n");
12481 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
12482 calculated for X. This cost is stored in *COST. Returns true
12483 if the total cost of X was calculated. */
12485 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
12486 int param
, int *cost
, bool speed
)
12488 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
12491 && flag_aarch64_verbose_cost
)
12493 print_rtl_single (dump_file
, x
);
12494 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
12495 speed
? "Hot" : "Cold",
12496 *cost
, result
? "final" : "partial");
12503 aarch64_register_move_cost (machine_mode mode
,
12504 reg_class_t from_i
, reg_class_t to_i
)
12506 enum reg_class from
= (enum reg_class
) from_i
;
12507 enum reg_class to
= (enum reg_class
) to_i
;
12508 const struct cpu_regmove_cost
*regmove_cost
12509 = aarch64_tune_params
.regmove_cost
;
12511 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
12512 if (to
== TAILCALL_ADDR_REGS
|| to
== POINTER_REGS
)
12515 if (from
== TAILCALL_ADDR_REGS
|| from
== POINTER_REGS
)
12516 from
= GENERAL_REGS
;
12518 /* Make RDFFR very expensive. In particular, if we know that the FFR
12519 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
12520 as a way of obtaining a PTRUE. */
12521 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
12522 && hard_reg_set_subset_p (reg_class_contents
[from_i
],
12523 reg_class_contents
[FFR_REGS
]))
12526 /* Moving between GPR and stack cost is the same as GP2GP. */
12527 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
12528 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
12529 return regmove_cost
->GP2GP
;
12531 /* To/From the stack register, we move via the gprs. */
12532 if (to
== STACK_REG
|| from
== STACK_REG
)
12533 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
12534 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
12536 if (known_eq (GET_MODE_SIZE (mode
), 16))
12538 /* 128-bit operations on general registers require 2 instructions. */
12539 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
12540 return regmove_cost
->GP2GP
* 2;
12541 else if (from
== GENERAL_REGS
)
12542 return regmove_cost
->GP2FP
* 2;
12543 else if (to
== GENERAL_REGS
)
12544 return regmove_cost
->FP2GP
* 2;
12546 /* When AdvSIMD instructions are disabled it is not possible to move
12547 a 128-bit value directly between Q registers. This is handled in
12548 secondary reload. A general register is used as a scratch to move
12549 the upper DI value and the lower DI value is moved directly,
12550 hence the cost is the sum of three moves. */
12552 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
12554 return regmove_cost
->FP2FP
;
12557 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
12558 return regmove_cost
->GP2GP
;
12559 else if (from
== GENERAL_REGS
)
12560 return regmove_cost
->GP2FP
;
12561 else if (to
== GENERAL_REGS
)
12562 return regmove_cost
->FP2GP
;
12564 return regmove_cost
->FP2FP
;
12568 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
12569 reg_class_t rclass ATTRIBUTE_UNUSED
,
12570 bool in ATTRIBUTE_UNUSED
)
12572 return aarch64_tune_params
.memmov_cost
;
12575 /* Implement TARGET_INIT_BUILTINS. */
12577 aarch64_init_builtins ()
12579 aarch64_general_init_builtins ();
12580 aarch64_sve::init_builtins ();
12583 /* Implement TARGET_FOLD_BUILTIN. */
12585 aarch64_fold_builtin (tree fndecl
, int nargs
, tree
*args
, bool)
12587 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
12588 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
12589 tree type
= TREE_TYPE (TREE_TYPE (fndecl
));
12590 switch (code
& AARCH64_BUILTIN_CLASS
)
12592 case AARCH64_BUILTIN_GENERAL
:
12593 return aarch64_general_fold_builtin (subcode
, type
, nargs
, args
);
12595 case AARCH64_BUILTIN_SVE
:
12598 gcc_unreachable ();
12601 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
12603 aarch64_gimple_fold_builtin (gimple_stmt_iterator
*gsi
)
12605 gcall
*stmt
= as_a
<gcall
*> (gsi_stmt (*gsi
));
12606 tree fndecl
= gimple_call_fndecl (stmt
);
12607 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
12608 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
12609 gimple
*new_stmt
= NULL
;
12610 switch (code
& AARCH64_BUILTIN_CLASS
)
12612 case AARCH64_BUILTIN_GENERAL
:
12613 new_stmt
= aarch64_general_gimple_fold_builtin (subcode
, stmt
);
12616 case AARCH64_BUILTIN_SVE
:
12617 new_stmt
= aarch64_sve::gimple_fold_builtin (subcode
, gsi
, stmt
);
12624 gsi_replace (gsi
, new_stmt
, true);
12628 /* Implement TARGET_EXPAND_BUILTIN. */
12630 aarch64_expand_builtin (tree exp
, rtx target
, rtx
, machine_mode
, int ignore
)
12632 tree fndecl
= TREE_OPERAND (CALL_EXPR_FN (exp
), 0);
12633 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
12634 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
12635 switch (code
& AARCH64_BUILTIN_CLASS
)
12637 case AARCH64_BUILTIN_GENERAL
:
12638 return aarch64_general_expand_builtin (subcode
, exp
, target
, ignore
);
12640 case AARCH64_BUILTIN_SVE
:
12641 return aarch64_sve::expand_builtin (subcode
, exp
, target
);
12643 gcc_unreachable ();
12646 /* Implement TARGET_BUILTIN_DECL. */
12648 aarch64_builtin_decl (unsigned int code
, bool initialize_p
)
12650 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
12651 switch (code
& AARCH64_BUILTIN_CLASS
)
12653 case AARCH64_BUILTIN_GENERAL
:
12654 return aarch64_general_builtin_decl (subcode
, initialize_p
);
12656 case AARCH64_BUILTIN_SVE
:
12657 return aarch64_sve::builtin_decl (subcode
, initialize_p
);
12659 gcc_unreachable ();
12662 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
12663 to optimize 1.0/sqrt. */
12666 use_rsqrt_p (machine_mode mode
)
12668 return (!flag_trapping_math
12669 && flag_unsafe_math_optimizations
12670 && ((aarch64_tune_params
.approx_modes
->recip_sqrt
12671 & AARCH64_APPROX_MODE (mode
))
12672 || flag_mrecip_low_precision_sqrt
));
12675 /* Function to decide when to use the approximate reciprocal square root
12679 aarch64_builtin_reciprocal (tree fndecl
)
12681 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
12683 if (!use_rsqrt_p (mode
))
12685 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
12686 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
12687 switch (code
& AARCH64_BUILTIN_CLASS
)
12689 case AARCH64_BUILTIN_GENERAL
:
12690 return aarch64_general_builtin_rsqrt (subcode
);
12692 case AARCH64_BUILTIN_SVE
:
12695 gcc_unreachable ();
12698 /* Emit instruction sequence to compute either the approximate square root
12699 or its approximate reciprocal, depending on the flag RECP, and return
12700 whether the sequence was emitted or not. */
12703 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
12705 machine_mode mode
= GET_MODE (dst
);
12707 if (GET_MODE_INNER (mode
) == HFmode
)
12709 gcc_assert (!recp
);
12715 if (!(flag_mlow_precision_sqrt
12716 || (aarch64_tune_params
.approx_modes
->sqrt
12717 & AARCH64_APPROX_MODE (mode
))))
12720 if (flag_finite_math_only
12721 || flag_trapping_math
12722 || !flag_unsafe_math_optimizations
12723 || optimize_function_for_size_p (cfun
))
12727 /* Caller assumes we cannot fail. */
12728 gcc_assert (use_rsqrt_p (mode
));
12730 machine_mode mmsk
= (VECTOR_MODE_P (mode
)
12731 ? related_int_vector_mode (mode
).require ()
12732 : int_mode_for_mode (mode
).require ());
12733 rtx xmsk
= gen_reg_rtx (mmsk
);
12735 /* When calculating the approximate square root, compare the
12736 argument with 0.0 and create a mask. */
12737 emit_insn (gen_rtx_SET (xmsk
,
12739 gen_rtx_EQ (mmsk
, src
,
12740 CONST0_RTX (mode
)))));
12742 /* Estimate the approximate reciprocal square root. */
12743 rtx xdst
= gen_reg_rtx (mode
);
12744 emit_insn (gen_aarch64_rsqrte (mode
, xdst
, src
));
12746 /* Iterate over the series twice for SF and thrice for DF. */
12747 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
12749 /* Optionally iterate over the series once less for faster performance
12750 while sacrificing the accuracy. */
12751 if ((recp
&& flag_mrecip_low_precision_sqrt
)
12752 || (!recp
&& flag_mlow_precision_sqrt
))
12755 /* Iterate over the series to calculate the approximate reciprocal square
12757 rtx x1
= gen_reg_rtx (mode
);
12758 while (iterations
--)
12760 rtx x2
= gen_reg_rtx (mode
);
12761 emit_set_insn (x2
, gen_rtx_MULT (mode
, xdst
, xdst
));
12763 emit_insn (gen_aarch64_rsqrts (mode
, x1
, src
, x2
));
12765 if (iterations
> 0)
12766 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, x1
));
12771 /* Qualify the approximate reciprocal square root when the argument is
12772 0.0 by squashing the intermediary result to 0.0. */
12773 rtx xtmp
= gen_reg_rtx (mmsk
);
12774 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
12775 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
12776 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
12778 /* Calculate the approximate square root. */
12779 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, src
));
12782 /* Finalize the approximation. */
12783 emit_set_insn (dst
, gen_rtx_MULT (mode
, xdst
, x1
));
12788 /* Emit the instruction sequence to compute the approximation for the division
12789 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
12792 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
12794 machine_mode mode
= GET_MODE (quo
);
12796 if (GET_MODE_INNER (mode
) == HFmode
)
12799 bool use_approx_division_p
= (flag_mlow_precision_div
12800 || (aarch64_tune_params
.approx_modes
->division
12801 & AARCH64_APPROX_MODE (mode
)));
12803 if (!flag_finite_math_only
12804 || flag_trapping_math
12805 || !flag_unsafe_math_optimizations
12806 || optimize_function_for_size_p (cfun
)
12807 || !use_approx_division_p
)
12810 if (!TARGET_SIMD
&& VECTOR_MODE_P (mode
))
12813 /* Estimate the approximate reciprocal. */
12814 rtx xrcp
= gen_reg_rtx (mode
);
12815 emit_insn (gen_aarch64_frecpe (mode
, xrcp
, den
));
12817 /* Iterate over the series twice for SF and thrice for DF. */
12818 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
12820 /* Optionally iterate over the series once less for faster performance,
12821 while sacrificing the accuracy. */
12822 if (flag_mlow_precision_div
)
12825 /* Iterate over the series to calculate the approximate reciprocal. */
12826 rtx xtmp
= gen_reg_rtx (mode
);
12827 while (iterations
--)
12829 emit_insn (gen_aarch64_frecps (mode
, xtmp
, xrcp
, den
));
12831 if (iterations
> 0)
12832 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
12835 if (num
!= CONST1_RTX (mode
))
12837 /* As the approximate reciprocal of DEN is already calculated, only
12838 calculate the approximate division when NUM is not 1.0. */
12839 rtx xnum
= force_reg (mode
, num
);
12840 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xnum
));
12843 /* Finalize the approximation. */
12844 emit_set_insn (quo
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
12848 /* Return the number of instructions that can be issued per cycle. */
12850 aarch64_sched_issue_rate (void)
12852 return aarch64_tune_params
.issue_rate
;
12855 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
12857 aarch64_sched_variable_issue (FILE *, int, rtx_insn
*insn
, int more
)
12859 if (DEBUG_INSN_P (insn
))
12862 rtx_code code
= GET_CODE (PATTERN (insn
));
12863 if (code
== USE
|| code
== CLOBBER
)
12866 if (get_attr_type (insn
) == TYPE_NO_INSN
)
12873 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
12875 int issue_rate
= aarch64_sched_issue_rate ();
12877 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
12881 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
12882 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
12883 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
12886 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
12889 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
12893 /* Vectorizer cost model target hooks. */
12895 /* Implement targetm.vectorize.builtin_vectorization_cost. */
12897 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
12899 int misalign ATTRIBUTE_UNUSED
)
12902 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
12905 if (vectype
!= NULL
)
12906 fp
= FLOAT_TYPE_P (vectype
);
12908 switch (type_of_cost
)
12911 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
12914 return costs
->scalar_load_cost
;
12917 return costs
->scalar_store_cost
;
12920 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
12923 return costs
->vec_align_load_cost
;
12926 return costs
->vec_store_cost
;
12928 case vec_to_scalar
:
12929 return costs
->vec_to_scalar_cost
;
12931 case scalar_to_vec
:
12932 return costs
->scalar_to_vec_cost
;
12934 case unaligned_load
:
12935 case vector_gather_load
:
12936 return costs
->vec_unalign_load_cost
;
12938 case unaligned_store
:
12939 case vector_scatter_store
:
12940 return costs
->vec_unalign_store_cost
;
12942 case cond_branch_taken
:
12943 return costs
->cond_taken_branch_cost
;
12945 case cond_branch_not_taken
:
12946 return costs
->cond_not_taken_branch_cost
;
12949 return costs
->vec_permute_cost
;
12951 case vec_promote_demote
:
12952 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
12954 case vec_construct
:
12955 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
12956 return elements
/ 2 + 1;
12959 gcc_unreachable ();
12963 /* Return true if STMT_INFO extends the result of a load. */
12965 aarch64_extending_load_p (stmt_vec_info stmt_info
)
12967 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
12968 if (!assign
|| !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign
)))
12971 tree rhs
= gimple_assign_rhs1 (stmt_info
->stmt
);
12972 tree lhs_type
= TREE_TYPE (gimple_assign_lhs (assign
));
12973 tree rhs_type
= TREE_TYPE (rhs
);
12974 if (!INTEGRAL_TYPE_P (lhs_type
)
12975 || !INTEGRAL_TYPE_P (rhs_type
)
12976 || TYPE_PRECISION (lhs_type
) <= TYPE_PRECISION (rhs_type
))
12979 stmt_vec_info def_stmt_info
= stmt_info
->vinfo
->lookup_def (rhs
);
12980 return (def_stmt_info
12981 && STMT_VINFO_DATA_REF (def_stmt_info
)
12982 && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info
)));
12985 /* Return true if STMT_INFO is an integer truncation. */
12987 aarch64_integer_truncation_p (stmt_vec_info stmt_info
)
12989 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
12990 if (!assign
|| !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign
)))
12993 tree lhs_type
= TREE_TYPE (gimple_assign_lhs (assign
));
12994 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (assign
));
12995 return (INTEGRAL_TYPE_P (lhs_type
)
12996 && INTEGRAL_TYPE_P (rhs_type
)
12997 && TYPE_PRECISION (lhs_type
) < TYPE_PRECISION (rhs_type
));
13000 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
13001 for STMT_INFO, which has cost kind KIND. Adjust the cost as necessary
13002 for SVE targets. */
13003 static unsigned int
13004 aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind
, stmt_vec_info stmt_info
,
13005 unsigned int stmt_cost
)
13007 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
13008 vector register size or number of units. Integer promotions of this
13009 type therefore map to SXT[BHW] or UXT[BHW].
13011 Most loads have extending forms that can do the sign or zero extension
13012 on the fly. Optimistically assume that a load followed by an extension
13013 will fold to this form during combine, and that the extension therefore
13015 if (kind
== vector_stmt
&& aarch64_extending_load_p (stmt_info
))
13018 /* For similar reasons, vector_stmt integer truncations are a no-op,
13019 because we can just ignore the unused upper bits of the source. */
13020 if (kind
== vector_stmt
&& aarch64_integer_truncation_p (stmt_info
))
13026 /* Implement targetm.vectorize.add_stmt_cost. */
13028 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
13029 struct _stmt_vec_info
*stmt_info
, int misalign
,
13030 enum vect_cost_model_location where
)
13032 unsigned *cost
= (unsigned *) data
;
13033 unsigned retval
= 0;
13035 if (flag_vect_cost_model
)
13037 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
13039 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
13041 if (stmt_info
&& vectype
&& aarch64_sve_mode_p (TYPE_MODE (vectype
)))
13042 stmt_cost
= aarch64_sve_adjust_stmt_cost (kind
, stmt_info
, stmt_cost
);
13044 /* Statements in an inner loop relative to the loop being
13045 vectorized are weighted more heavily. The value here is
13046 arbitrary and could potentially be improved with analysis. */
13047 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
13048 count
*= 50; /* FIXME */
13050 retval
= (unsigned) (count
* stmt_cost
);
13051 cost
[where
] += retval
;
13057 static void initialize_aarch64_code_model (struct gcc_options
*);
13059 /* Parse the TO_PARSE string and put the architecture struct that it
13060 selects into RES and the architectural features into ISA_FLAGS.
13061 Return an aarch64_parse_opt_result describing the parse result.
13062 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
13063 When the TO_PARSE string contains an invalid extension,
13064 a copy of the string is created and stored to INVALID_EXTENSION. */
13066 static enum aarch64_parse_opt_result
13067 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
13068 uint64_t *isa_flags
, std::string
*invalid_extension
)
13071 const struct processor
*arch
;
13074 ext
= strchr (to_parse
, '+');
13077 len
= ext
- to_parse
;
13079 len
= strlen (to_parse
);
13082 return AARCH64_PARSE_MISSING_ARG
;
13085 /* Loop through the list of supported ARCHes to find a match. */
13086 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
13088 if (strlen (arch
->name
) == len
13089 && strncmp (arch
->name
, to_parse
, len
) == 0)
13091 uint64_t isa_temp
= arch
->flags
;
13095 /* TO_PARSE string contains at least one extension. */
13096 enum aarch64_parse_opt_result ext_res
13097 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
13099 if (ext_res
!= AARCH64_PARSE_OK
)
13102 /* Extension parsing was successful. Confirm the result
13103 arch and ISA flags. */
13105 *isa_flags
= isa_temp
;
13106 return AARCH64_PARSE_OK
;
13110 /* ARCH name not found in list. */
13111 return AARCH64_PARSE_INVALID_ARG
;
13114 /* Parse the TO_PARSE string and put the result tuning in RES and the
13115 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
13116 describing the parse result. If there is an error parsing, RES and
13117 ISA_FLAGS are left unchanged.
13118 When the TO_PARSE string contains an invalid extension,
13119 a copy of the string is created and stored to INVALID_EXTENSION. */
13121 static enum aarch64_parse_opt_result
13122 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
13123 uint64_t *isa_flags
, std::string
*invalid_extension
)
13126 const struct processor
*cpu
;
13129 ext
= strchr (to_parse
, '+');
13132 len
= ext
- to_parse
;
13134 len
= strlen (to_parse
);
13137 return AARCH64_PARSE_MISSING_ARG
;
13140 /* Loop through the list of supported CPUs to find a match. */
13141 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
13143 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
13145 uint64_t isa_temp
= cpu
->flags
;
13150 /* TO_PARSE string contains at least one extension. */
13151 enum aarch64_parse_opt_result ext_res
13152 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
13154 if (ext_res
!= AARCH64_PARSE_OK
)
13157 /* Extension parsing was successfull. Confirm the result
13158 cpu and ISA flags. */
13160 *isa_flags
= isa_temp
;
13161 return AARCH64_PARSE_OK
;
13165 /* CPU name not found in list. */
13166 return AARCH64_PARSE_INVALID_ARG
;
13169 /* Parse the TO_PARSE string and put the cpu it selects into RES.
13170 Return an aarch64_parse_opt_result describing the parse result.
13171 If the parsing fails the RES does not change. */
13173 static enum aarch64_parse_opt_result
13174 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
13176 const struct processor
*cpu
;
13178 /* Loop through the list of supported CPUs to find a match. */
13179 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
13181 if (strcmp (cpu
->name
, to_parse
) == 0)
13184 return AARCH64_PARSE_OK
;
13188 /* CPU name not found in list. */
13189 return AARCH64_PARSE_INVALID_ARG
;
13192 /* Parse TOKEN, which has length LENGTH to see if it is an option
13193 described in FLAG. If it is, return the index bit for that fusion type.
13194 If not, error (printing OPTION_NAME) and return zero. */
13196 static unsigned int
13197 aarch64_parse_one_option_token (const char *token
,
13199 const struct aarch64_flag_desc
*flag
,
13200 const char *option_name
)
13202 for (; flag
->name
!= NULL
; flag
++)
13204 if (length
== strlen (flag
->name
)
13205 && !strncmp (flag
->name
, token
, length
))
13209 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name
, token
);
13213 /* Parse OPTION which is a comma-separated list of flags to enable.
13214 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
13215 default state we inherit from the CPU tuning structures. OPTION_NAME
13216 gives the top-level option we are parsing in the -moverride string,
13217 for use in error messages. */
13219 static unsigned int
13220 aarch64_parse_boolean_options (const char *option
,
13221 const struct aarch64_flag_desc
*flags
,
13222 unsigned int initial_state
,
13223 const char *option_name
)
13225 const char separator
= '.';
13226 const char* specs
= option
;
13227 const char* ntoken
= option
;
13228 unsigned int found_flags
= initial_state
;
13230 while ((ntoken
= strchr (specs
, separator
)))
13232 size_t token_length
= ntoken
- specs
;
13233 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
13237 /* If we find "none" (or, for simplicity's sake, an error) anywhere
13238 in the token stream, reset the supported operations. So:
13240 adrp+add.cmp+branch.none.adrp+add
13242 would have the result of turning on only adrp+add fusion. */
13246 found_flags
|= token_ops
;
13250 /* We ended with a comma, print something. */
13253 error ("%s string ill-formed\n", option_name
);
13257 /* We still have one more token to parse. */
13258 size_t token_length
= strlen (specs
);
13259 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
13266 found_flags
|= token_ops
;
13267 return found_flags
;
13270 /* Support for overriding instruction fusion. */
13273 aarch64_parse_fuse_string (const char *fuse_string
,
13274 struct tune_params
*tune
)
13276 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
13277 aarch64_fusible_pairs
,
13282 /* Support for overriding other tuning flags. */
13285 aarch64_parse_tune_string (const char *tune_string
,
13286 struct tune_params
*tune
)
13288 tune
->extra_tuning_flags
13289 = aarch64_parse_boolean_options (tune_string
,
13290 aarch64_tuning_flags
,
13291 tune
->extra_tuning_flags
,
13295 /* Parse the sve_width tuning moverride string in TUNE_STRING.
13296 Accept the valid SVE vector widths allowed by
13297 aarch64_sve_vector_bits_enum and use it to override sve_width
13301 aarch64_parse_sve_width_string (const char *tune_string
,
13302 struct tune_params
*tune
)
13306 int n
= sscanf (tune_string
, "%d", &width
);
13309 error ("invalid format for sve_width");
13321 error ("invalid sve_width value: %d", width
);
13323 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
13326 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
13327 we understand. If it is, extract the option string and handoff to
13328 the appropriate function. */
13331 aarch64_parse_one_override_token (const char* token
,
13333 struct tune_params
*tune
)
13335 const struct aarch64_tuning_override_function
*fn
13336 = aarch64_tuning_override_functions
;
13338 const char *option_part
= strchr (token
, '=');
13341 error ("tuning string missing in option (%s)", token
);
13345 /* Get the length of the option name. */
13346 length
= option_part
- token
;
13347 /* Skip the '=' to get to the option string. */
13350 for (; fn
->name
!= NULL
; fn
++)
13352 if (!strncmp (fn
->name
, token
, length
))
13354 fn
->parse_override (option_part
, tune
);
13359 error ("unknown tuning option (%s)",token
);
13363 /* A checking mechanism for the implementation of the tls size. */
13366 initialize_aarch64_tls_size (struct gcc_options
*opts
)
13368 if (aarch64_tls_size
== 0)
13369 aarch64_tls_size
= 24;
13371 switch (opts
->x_aarch64_cmodel_var
)
13373 case AARCH64_CMODEL_TINY
:
13374 /* Both the default and maximum TLS size allowed under tiny is 1M which
13375 needs two instructions to address, so we clamp the size to 24. */
13376 if (aarch64_tls_size
> 24)
13377 aarch64_tls_size
= 24;
13379 case AARCH64_CMODEL_SMALL
:
13380 /* The maximum TLS size allowed under small is 4G. */
13381 if (aarch64_tls_size
> 32)
13382 aarch64_tls_size
= 32;
13384 case AARCH64_CMODEL_LARGE
:
13385 /* The maximum TLS size allowed under large is 16E.
13386 FIXME: 16E should be 64bit, we only support 48bit offset now. */
13387 if (aarch64_tls_size
> 48)
13388 aarch64_tls_size
= 48;
13391 gcc_unreachable ();
13397 /* Parse STRING looking for options in the format:
13398 string :: option:string
13399 option :: name=substring
13401 substring :: defined by option. */
13404 aarch64_parse_override_string (const char* input_string
,
13405 struct tune_params
* tune
)
13407 const char separator
= ':';
13408 size_t string_length
= strlen (input_string
) + 1;
13409 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
13410 char *string
= string_root
;
13411 strncpy (string
, input_string
, string_length
);
13412 string
[string_length
- 1] = '\0';
13414 char* ntoken
= string
;
13416 while ((ntoken
= strchr (string
, separator
)))
13418 size_t token_length
= ntoken
- string
;
13419 /* Make this substring look like a string. */
13421 aarch64_parse_one_override_token (string
, token_length
, tune
);
13425 /* One last option to parse. */
13426 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
13427 free (string_root
);
13432 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
13434 if (accepted_branch_protection_string
)
13436 opts
->x_aarch64_branch_protection_string
13437 = xstrdup (accepted_branch_protection_string
);
13440 /* PR 70044: We have to be careful about being called multiple times for the
13441 same function. This means all changes should be repeatable. */
13443 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
13444 Disable the frame pointer flag so the mid-end will not use a frame
13445 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
13446 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
13447 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
13448 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
13449 if (opts
->x_flag_omit_frame_pointer
== 0)
13450 opts
->x_flag_omit_frame_pointer
= 2;
13452 /* If not optimizing for size, set the default
13453 alignment to what the target wants. */
13454 if (!opts
->x_optimize_size
)
13456 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
13457 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
13458 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
13459 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
13460 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
13461 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
13464 /* We default to no pc-relative literal loads. */
13466 aarch64_pcrelative_literal_loads
= false;
13468 /* If -mpc-relative-literal-loads is set on the command line, this
13469 implies that the user asked for PC relative literal loads. */
13470 if (opts
->x_pcrelative_literal_loads
== 1)
13471 aarch64_pcrelative_literal_loads
= true;
13473 /* In the tiny memory model it makes no sense to disallow PC relative
13474 literal pool loads. */
13475 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
13476 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
13477 aarch64_pcrelative_literal_loads
= true;
13479 /* When enabling the lower precision Newton series for the square root, also
13480 enable it for the reciprocal square root, since the latter is an
13481 intermediary step for the former. */
13482 if (flag_mlow_precision_sqrt
)
13483 flag_mrecip_low_precision_sqrt
= true;
13486 /* 'Unpack' up the internal tuning structs and update the options
13487 in OPTS. The caller must have set up selected_tune and selected_arch
13488 as all the other target-specific codegen decisions are
13489 derived from them. */
13492 aarch64_override_options_internal (struct gcc_options
*opts
)
13494 aarch64_tune_flags
= selected_tune
->flags
;
13495 aarch64_tune
= selected_tune
->sched_core
;
13496 /* Make a copy of the tuning parameters attached to the core, which
13497 we may later overwrite. */
13498 aarch64_tune_params
= *(selected_tune
->tune
);
13499 aarch64_architecture_version
= selected_arch
->architecture_version
;
13501 if (opts
->x_aarch64_override_tune_string
)
13502 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
13503 &aarch64_tune_params
);
13505 /* This target defaults to strict volatile bitfields. */
13506 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
13507 opts
->x_flag_strict_volatile_bitfields
= 1;
13509 if (aarch64_stack_protector_guard
== SSP_GLOBAL
13510 && opts
->x_aarch64_stack_protector_guard_offset_str
)
13512 error ("incompatible options %<-mstack-protector-guard=global%> and "
13513 "%<-mstack-protector-guard-offset=%s%>",
13514 aarch64_stack_protector_guard_offset_str
);
13517 if (aarch64_stack_protector_guard
== SSP_SYSREG
13518 && !(opts
->x_aarch64_stack_protector_guard_offset_str
13519 && opts
->x_aarch64_stack_protector_guard_reg_str
))
13521 error ("both %<-mstack-protector-guard-offset%> and "
13522 "%<-mstack-protector-guard-reg%> must be used "
13523 "with %<-mstack-protector-guard=sysreg%>");
13526 if (opts
->x_aarch64_stack_protector_guard_reg_str
)
13528 if (strlen (opts
->x_aarch64_stack_protector_guard_reg_str
) > 100)
13529 error ("specify a system register with a small string length.");
13532 if (opts
->x_aarch64_stack_protector_guard_offset_str
)
13535 const char *str
= aarch64_stack_protector_guard_offset_str
;
13537 long offs
= strtol (aarch64_stack_protector_guard_offset_str
, &end
, 0);
13538 if (!*str
|| *end
|| errno
)
13539 error ("%qs is not a valid offset in %qs", str
,
13540 "-mstack-protector-guard-offset=");
13541 aarch64_stack_protector_guard_offset
= offs
;
13544 initialize_aarch64_code_model (opts
);
13545 initialize_aarch64_tls_size (opts
);
13547 int queue_depth
= 0;
13548 switch (aarch64_tune_params
.autoprefetcher_model
)
13550 case tune_params::AUTOPREFETCHER_OFF
:
13553 case tune_params::AUTOPREFETCHER_WEAK
:
13556 case tune_params::AUTOPREFETCHER_STRONG
:
13557 queue_depth
= max_insn_queue_index
+ 1;
13560 gcc_unreachable ();
13563 /* We don't mind passing in global_options_set here as we don't use
13564 the *options_set structs anyway. */
13565 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13566 param_sched_autopref_queue_depth
, queue_depth
);
13568 /* Set up parameters to be used in prefetching algorithm. Do not
13569 override the defaults unless we are tuning for a core we have
13570 researched values for. */
13571 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
13572 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13573 param_simultaneous_prefetches
,
13574 aarch64_tune_params
.prefetch
->num_slots
);
13575 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
13576 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13577 param_l1_cache_size
,
13578 aarch64_tune_params
.prefetch
->l1_cache_size
);
13579 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
13580 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13581 param_l1_cache_line_size
,
13582 aarch64_tune_params
.prefetch
->l1_cache_line_size
);
13583 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
13584 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13585 param_l2_cache_size
,
13586 aarch64_tune_params
.prefetch
->l2_cache_size
);
13587 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
13588 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13589 param_prefetch_dynamic_strides
, 0);
13590 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
13591 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13592 param_prefetch_minimum_stride
,
13593 aarch64_tune_params
.prefetch
->minimum_stride
);
13595 /* Use the alternative scheduling-pressure algorithm by default. */
13596 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13597 param_sched_pressure_algorithm
,
13598 SCHED_PRESSURE_MODEL
);
13600 /* Validate the guard size. */
13601 int guard_size
= param_stack_clash_protection_guard_size
;
13603 if (guard_size
!= 12 && guard_size
!= 16)
13604 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
13605 "size. Given value %d (%llu KB) is out of range",
13606 guard_size
, (1ULL << guard_size
) / 1024ULL);
13608 /* Enforce that interval is the same size as size so the mid-end does the
13610 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13611 param_stack_clash_protection_probe_interval
,
13614 /* The maybe_set calls won't update the value if the user has explicitly set
13615 one. Which means we need to validate that probing interval and guard size
13618 = param_stack_clash_protection_probe_interval
;
13619 if (guard_size
!= probe_interval
)
13620 error ("stack clash guard size %<%d%> must be equal to probing interval "
13621 "%<%d%>", guard_size
, probe_interval
);
13623 /* Enable sw prefetching at specified optimization level for
13624 CPUS that have prefetch. Lower optimization level threshold by 1
13625 when profiling is enabled. */
13626 if (opts
->x_flag_prefetch_loop_arrays
< 0
13627 && !opts
->x_optimize_size
13628 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
13629 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
13630 opts
->x_flag_prefetch_loop_arrays
= 1;
13632 if (opts
->x_aarch64_arch_string
== NULL
)
13633 opts
->x_aarch64_arch_string
= selected_arch
->name
;
13634 if (opts
->x_aarch64_cpu_string
== NULL
)
13635 opts
->x_aarch64_cpu_string
= selected_cpu
->name
;
13636 if (opts
->x_aarch64_tune_string
== NULL
)
13637 opts
->x_aarch64_tune_string
= selected_tune
->name
;
13639 aarch64_override_options_after_change_1 (opts
);
13642 /* Print a hint with a suggestion for a core or architecture name that
13643 most closely resembles what the user passed in STR. ARCH is true if
13644 the user is asking for an architecture name. ARCH is false if the user
13645 is asking for a core name. */
13648 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
13650 auto_vec
<const char *> candidates
;
13651 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
13652 for (; entry
->name
!= NULL
; entry
++)
13653 candidates
.safe_push (entry
->name
);
13655 #ifdef HAVE_LOCAL_CPU_DETECT
13656 /* Add also "native" as possible value. */
13658 candidates
.safe_push ("native");
13662 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
13664 inform (input_location
, "valid arguments are: %s;"
13665 " did you mean %qs?", s
, hint
);
13667 inform (input_location
, "valid arguments are: %s", s
);
13672 /* Print a hint with a suggestion for a core name that most closely resembles
13673 what the user passed in STR. */
13676 aarch64_print_hint_for_core (const char *str
)
13678 aarch64_print_hint_for_core_or_arch (str
, false);
13681 /* Print a hint with a suggestion for an architecture name that most closely
13682 resembles what the user passed in STR. */
13685 aarch64_print_hint_for_arch (const char *str
)
13687 aarch64_print_hint_for_core_or_arch (str
, true);
13691 /* Print a hint with a suggestion for an extension name
13692 that most closely resembles what the user passed in STR. */
13695 aarch64_print_hint_for_extensions (const std::string
&str
)
13697 auto_vec
<const char *> candidates
;
13698 aarch64_get_all_extension_candidates (&candidates
);
13700 const char *hint
= candidates_list_and_hint (str
.c_str (), s
, candidates
);
13702 inform (input_location
, "valid arguments are: %s;"
13703 " did you mean %qs?", s
, hint
);
13705 inform (input_location
, "valid arguments are: %s;", s
);
13710 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
13711 specified in STR and throw errors if appropriate. Put the results if
13712 they are valid in RES and ISA_FLAGS. Return whether the option is
13716 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
13717 uint64_t *isa_flags
)
13719 std::string invalid_extension
;
13720 enum aarch64_parse_opt_result parse_res
13721 = aarch64_parse_cpu (str
, res
, isa_flags
, &invalid_extension
);
13723 if (parse_res
== AARCH64_PARSE_OK
)
13728 case AARCH64_PARSE_MISSING_ARG
:
13729 error ("missing cpu name in %<-mcpu=%s%>", str
);
13731 case AARCH64_PARSE_INVALID_ARG
:
13732 error ("unknown value %qs for %<-mcpu%>", str
);
13733 aarch64_print_hint_for_core (str
);
13735 case AARCH64_PARSE_INVALID_FEATURE
:
13736 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
13737 invalid_extension
.c_str (), str
);
13738 aarch64_print_hint_for_extensions (invalid_extension
);
13741 gcc_unreachable ();
13747 /* Parses CONST_STR for branch protection features specified in
13748 aarch64_branch_protect_types, and set any global variables required. Returns
13749 the parsing result and assigns LAST_STR to the last processed token from
13750 CONST_STR so that it can be used for error reporting. */
13753 aarch64_parse_opt_result
aarch64_parse_branch_protection (const char *const_str
,
13756 char *str_root
= xstrdup (const_str
);
13757 char* token_save
= NULL
;
13758 char *str
= strtok_r (str_root
, "+", &token_save
);
13759 enum aarch64_parse_opt_result res
= AARCH64_PARSE_OK
;
13761 res
= AARCH64_PARSE_MISSING_ARG
;
13764 char *next_str
= strtok_r (NULL
, "+", &token_save
);
13765 /* Reset the branch protection features to their defaults. */
13766 aarch64_handle_no_branch_protection (NULL
, NULL
);
13768 while (str
&& res
== AARCH64_PARSE_OK
)
13770 const aarch64_branch_protect_type
* type
= aarch64_branch_protect_types
;
13771 bool found
= false;
13772 /* Search for this type. */
13773 while (type
&& type
->name
&& !found
&& res
== AARCH64_PARSE_OK
)
13775 if (strcmp (str
, type
->name
) == 0)
13778 res
= type
->handler (str
, next_str
);
13780 next_str
= strtok_r (NULL
, "+", &token_save
);
13785 if (found
&& res
== AARCH64_PARSE_OK
)
13787 bool found_subtype
= true;
13788 /* Loop through each token until we find one that isn't a
13790 while (found_subtype
)
13792 found_subtype
= false;
13793 const aarch64_branch_protect_type
*subtype
= type
->subtypes
;
13794 /* Search for the subtype. */
13795 while (str
&& subtype
&& subtype
->name
&& !found_subtype
13796 && res
== AARCH64_PARSE_OK
)
13798 if (strcmp (str
, subtype
->name
) == 0)
13800 found_subtype
= true;
13801 res
= subtype
->handler (str
, next_str
);
13803 next_str
= strtok_r (NULL
, "+", &token_save
);
13811 res
= AARCH64_PARSE_INVALID_ARG
;
13814 /* Copy the last processed token into the argument to pass it back.
13815 Used by option and attribute validation to print the offending token. */
13818 if (str
) strcpy (*last_str
, str
);
13819 else *last_str
= NULL
;
13821 if (res
== AARCH64_PARSE_OK
)
13823 /* If needed, alloc the accepted string then copy in const_str.
13824 Used by override_option_after_change_1. */
13825 if (!accepted_branch_protection_string
)
13826 accepted_branch_protection_string
= (char *) xmalloc (
13827 BRANCH_PROTECT_STR_MAX
13829 strncpy (accepted_branch_protection_string
, const_str
,
13830 BRANCH_PROTECT_STR_MAX
+ 1);
13831 /* Forcibly null-terminate. */
13832 accepted_branch_protection_string
[BRANCH_PROTECT_STR_MAX
] = '\0';
13838 aarch64_validate_mbranch_protection (const char *const_str
)
13840 char *str
= (char *) xmalloc (strlen (const_str
));
13841 enum aarch64_parse_opt_result res
=
13842 aarch64_parse_branch_protection (const_str
, &str
);
13843 if (res
== AARCH64_PARSE_INVALID_ARG
)
13844 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str
);
13845 else if (res
== AARCH64_PARSE_MISSING_ARG
)
13846 error ("missing argument for %<-mbranch-protection=%>");
13848 return res
== AARCH64_PARSE_OK
;
13851 /* Validate a command-line -march option. Parse the arch and extensions
13852 (if any) specified in STR and throw errors if appropriate. Put the
13853 results, if they are valid, in RES and ISA_FLAGS. Return whether the
13854 option is valid. */
13857 aarch64_validate_march (const char *str
, const struct processor
**res
,
13858 uint64_t *isa_flags
)
13860 std::string invalid_extension
;
13861 enum aarch64_parse_opt_result parse_res
13862 = aarch64_parse_arch (str
, res
, isa_flags
, &invalid_extension
);
13864 if (parse_res
== AARCH64_PARSE_OK
)
13869 case AARCH64_PARSE_MISSING_ARG
:
13870 error ("missing arch name in %<-march=%s%>", str
);
13872 case AARCH64_PARSE_INVALID_ARG
:
13873 error ("unknown value %qs for %<-march%>", str
);
13874 aarch64_print_hint_for_arch (str
);
13876 case AARCH64_PARSE_INVALID_FEATURE
:
13877 error ("invalid feature modifier %qs in %<-march=%s%>",
13878 invalid_extension
.c_str (), str
);
13879 aarch64_print_hint_for_extensions (invalid_extension
);
13882 gcc_unreachable ();
13888 /* Validate a command-line -mtune option. Parse the cpu
13889 specified in STR and throw errors if appropriate. Put the
13890 result, if it is valid, in RES. Return whether the option is
13894 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
13896 enum aarch64_parse_opt_result parse_res
13897 = aarch64_parse_tune (str
, res
);
13899 if (parse_res
== AARCH64_PARSE_OK
)
13904 case AARCH64_PARSE_MISSING_ARG
:
13905 error ("missing cpu name in %<-mtune=%s%>", str
);
13907 case AARCH64_PARSE_INVALID_ARG
:
13908 error ("unknown value %qs for %<-mtune%>", str
);
13909 aarch64_print_hint_for_core (str
);
13912 gcc_unreachable ();
13917 /* Return the CPU corresponding to the enum CPU.
13918 If it doesn't specify a cpu, return the default. */
13920 static const struct processor
*
13921 aarch64_get_tune_cpu (enum aarch64_processor cpu
)
13923 if (cpu
!= aarch64_none
)
13924 return &all_cores
[cpu
];
13926 /* The & 0x3f is to extract the bottom 6 bits that encode the
13927 default cpu as selected by the --with-cpu GCC configure option
13929 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
13930 flags mechanism should be reworked to make it more sane. */
13931 return &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
13934 /* Return the architecture corresponding to the enum ARCH.
13935 If it doesn't specify a valid architecture, return the default. */
13937 static const struct processor
*
13938 aarch64_get_arch (enum aarch64_arch arch
)
13940 if (arch
!= aarch64_no_arch
)
13941 return &all_architectures
[arch
];
13943 const struct processor
*cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
13945 return &all_architectures
[cpu
->arch
];
13948 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
13951 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value
)
13953 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
13954 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
13955 deciding which .md file patterns to use and when deciding whether
13956 something is a legitimate address or constant. */
13957 if (value
== SVE_SCALABLE
|| value
== SVE_128
)
13958 return poly_uint16 (2, 2);
13960 return (int) value
/ 64;
13963 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
13964 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
13965 tuning structs. In particular it must set selected_tune and
13966 aarch64_isa_flags that define the available ISA features and tuning
13967 decisions. It must also set selected_arch as this will be used to
13968 output the .arch asm tags for each function. */
13971 aarch64_override_options (void)
13973 uint64_t cpu_isa
= 0;
13974 uint64_t arch_isa
= 0;
13975 aarch64_isa_flags
= 0;
13977 bool valid_cpu
= true;
13978 bool valid_tune
= true;
13979 bool valid_arch
= true;
13981 selected_cpu
= NULL
;
13982 selected_arch
= NULL
;
13983 selected_tune
= NULL
;
13985 if (aarch64_branch_protection_string
)
13986 aarch64_validate_mbranch_protection (aarch64_branch_protection_string
);
13988 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
13989 If either of -march or -mtune is given, they override their
13990 respective component of -mcpu. */
13991 if (aarch64_cpu_string
)
13992 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
13995 if (aarch64_arch_string
)
13996 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
13999 if (aarch64_tune_string
)
14000 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
14002 #ifdef SUBTARGET_OVERRIDE_OPTIONS
14003 SUBTARGET_OVERRIDE_OPTIONS
;
14006 /* If the user did not specify a processor, choose the default
14007 one for them. This will be the CPU set during configuration using
14008 --with-cpu, otherwise it is "generic". */
14013 selected_cpu
= &all_cores
[selected_arch
->ident
];
14014 aarch64_isa_flags
= arch_isa
;
14015 explicit_arch
= selected_arch
->arch
;
14019 /* Get default configure-time CPU. */
14020 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
14021 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
14025 explicit_tune_core
= selected_tune
->ident
;
14027 /* If both -mcpu and -march are specified check that they are architecturally
14028 compatible, warn if they're not and prefer the -march ISA flags. */
14029 else if (selected_arch
)
14031 if (selected_arch
->arch
!= selected_cpu
->arch
)
14033 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
14034 all_architectures
[selected_cpu
->arch
].name
,
14035 selected_arch
->name
);
14037 aarch64_isa_flags
= arch_isa
;
14038 explicit_arch
= selected_arch
->arch
;
14039 explicit_tune_core
= selected_tune
? selected_tune
->ident
14040 : selected_cpu
->ident
;
14044 /* -mcpu but no -march. */
14045 aarch64_isa_flags
= cpu_isa
;
14046 explicit_tune_core
= selected_tune
? selected_tune
->ident
14047 : selected_cpu
->ident
;
14048 gcc_assert (selected_cpu
);
14049 selected_arch
= &all_architectures
[selected_cpu
->arch
];
14050 explicit_arch
= selected_arch
->arch
;
14053 /* Set the arch as well as we will need it when outputing
14054 the .arch directive in assembly. */
14055 if (!selected_arch
)
14057 gcc_assert (selected_cpu
);
14058 selected_arch
= &all_architectures
[selected_cpu
->arch
];
14061 if (!selected_tune
)
14062 selected_tune
= selected_cpu
;
14064 if (aarch64_enable_bti
== 2)
14066 #ifdef TARGET_ENABLE_BTI
14067 aarch64_enable_bti
= 1;
14069 aarch64_enable_bti
= 0;
14073 /* Return address signing is currently not supported for ILP32 targets. For
14074 LP64 targets use the configured option in the absence of a command-line
14075 option for -mbranch-protection. */
14076 if (!TARGET_ILP32
&& accepted_branch_protection_string
== NULL
)
14078 #ifdef TARGET_ENABLE_PAC_RET
14079 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
14081 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
14085 #ifndef HAVE_AS_MABI_OPTION
14086 /* The compiler may have been configured with 2.23.* binutils, which does
14087 not have support for ILP32. */
14089 error ("assembler does not support %<-mabi=ilp32%>");
14092 /* Convert -msve-vector-bits to a VG count. */
14093 aarch64_sve_vg
= aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits
);
14095 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
14096 sorry ("return address signing is only supported for %<-mabi=lp64%>");
14098 /* Make sure we properly set up the explicit options. */
14099 if ((aarch64_cpu_string
&& valid_cpu
)
14100 || (aarch64_tune_string
&& valid_tune
))
14101 gcc_assert (explicit_tune_core
!= aarch64_none
);
14103 if ((aarch64_cpu_string
&& valid_cpu
)
14104 || (aarch64_arch_string
&& valid_arch
))
14105 gcc_assert (explicit_arch
!= aarch64_no_arch
);
14107 /* The pass to insert speculation tracking runs before
14108 shrink-wrapping and the latter does not know how to update the
14109 tracking status. So disable it in this case. */
14110 if (aarch64_track_speculation
)
14111 flag_shrink_wrap
= 0;
14113 aarch64_override_options_internal (&global_options
);
14115 /* Save these options as the default ones in case we push and pop them later
14116 while processing functions with potential target attributes. */
14117 target_option_default_node
= target_option_current_node
14118 = build_target_option_node (&global_options
);
14121 /* Implement targetm.override_options_after_change. */
14124 aarch64_override_options_after_change (void)
14126 aarch64_override_options_after_change_1 (&global_options
);
14129 static struct machine_function
*
14130 aarch64_init_machine_status (void)
14132 struct machine_function
*machine
;
14133 machine
= ggc_cleared_alloc
<machine_function
> ();
14138 aarch64_init_expanders (void)
14140 init_machine_status
= aarch64_init_machine_status
;
14143 /* A checking mechanism for the implementation of the various code models. */
14145 initialize_aarch64_code_model (struct gcc_options
*opts
)
14147 if (opts
->x_flag_pic
)
14149 switch (opts
->x_aarch64_cmodel_var
)
14151 case AARCH64_CMODEL_TINY
:
14152 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
14154 case AARCH64_CMODEL_SMALL
:
14155 #ifdef HAVE_AS_SMALL_PIC_RELOCS
14156 aarch64_cmodel
= (flag_pic
== 2
14157 ? AARCH64_CMODEL_SMALL_PIC
14158 : AARCH64_CMODEL_SMALL_SPIC
);
14160 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
14163 case AARCH64_CMODEL_LARGE
:
14164 sorry ("code model %qs with %<-f%s%>", "large",
14165 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
14168 gcc_unreachable ();
14172 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
14175 /* Implement TARGET_OPTION_SAVE. */
14178 aarch64_option_save (struct cl_target_option
*ptr
, struct gcc_options
*opts
)
14180 ptr
->x_aarch64_override_tune_string
= opts
->x_aarch64_override_tune_string
;
14181 ptr
->x_aarch64_branch_protection_string
14182 = opts
->x_aarch64_branch_protection_string
;
14185 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
14186 using the information saved in PTR. */
14189 aarch64_option_restore (struct gcc_options
*opts
, struct cl_target_option
*ptr
)
14191 opts
->x_explicit_tune_core
= ptr
->x_explicit_tune_core
;
14192 selected_tune
= aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
14193 opts
->x_explicit_arch
= ptr
->x_explicit_arch
;
14194 selected_arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
14195 opts
->x_aarch64_override_tune_string
= ptr
->x_aarch64_override_tune_string
;
14196 opts
->x_aarch64_branch_protection_string
14197 = ptr
->x_aarch64_branch_protection_string
;
14198 if (opts
->x_aarch64_branch_protection_string
)
14200 aarch64_parse_branch_protection (opts
->x_aarch64_branch_protection_string
,
14204 aarch64_override_options_internal (opts
);
14207 /* Implement TARGET_OPTION_PRINT. */
14210 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
14212 const struct processor
*cpu
14213 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
14214 uint64_t isa_flags
= ptr
->x_aarch64_isa_flags
;
14215 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
14216 std::string extension
14217 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
14219 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
14220 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
14221 arch
->name
, extension
.c_str ());
14224 static GTY(()) tree aarch64_previous_fndecl
;
14227 aarch64_reset_previous_fndecl (void)
14229 aarch64_previous_fndecl
= NULL
;
14232 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
14233 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
14234 make sure optab availability predicates are recomputed when necessary. */
14237 aarch64_save_restore_target_globals (tree new_tree
)
14239 if (TREE_TARGET_GLOBALS (new_tree
))
14240 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
14241 else if (new_tree
== target_option_default_node
)
14242 restore_target_globals (&default_target_globals
);
14244 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
14247 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
14248 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
14249 of the function, if such exists. This function may be called multiple
14250 times on a single function so use aarch64_previous_fndecl to avoid
14251 setting up identical state. */
14254 aarch64_set_current_function (tree fndecl
)
14256 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
14259 tree old_tree
= (aarch64_previous_fndecl
14260 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
14263 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
14265 /* If current function has no attributes but the previous one did,
14266 use the default node. */
14267 if (!new_tree
&& old_tree
)
14268 new_tree
= target_option_default_node
;
14270 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
14271 the default have been handled by aarch64_save_restore_target_globals from
14272 aarch64_pragma_target_parse. */
14273 if (old_tree
== new_tree
)
14276 aarch64_previous_fndecl
= fndecl
;
14278 /* First set the target options. */
14279 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
14281 aarch64_save_restore_target_globals (new_tree
);
14284 /* Enum describing the various ways we can handle attributes.
14285 In many cases we can reuse the generic option handling machinery. */
14287 enum aarch64_attr_opt_type
14289 aarch64_attr_mask
, /* Attribute should set a bit in target_flags. */
14290 aarch64_attr_bool
, /* Attribute sets or unsets a boolean variable. */
14291 aarch64_attr_enum
, /* Attribute sets an enum variable. */
14292 aarch64_attr_custom
/* Attribute requires a custom handling function. */
14295 /* All the information needed to handle a target attribute.
14296 NAME is the name of the attribute.
14297 ATTR_TYPE specifies the type of behavior of the attribute as described
14298 in the definition of enum aarch64_attr_opt_type.
14299 ALLOW_NEG is true if the attribute supports a "no-" form.
14300 HANDLER is the function that takes the attribute string as an argument
14301 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
14302 OPT_NUM is the enum specifying the option that the attribute modifies.
14303 This is needed for attributes that mirror the behavior of a command-line
14304 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
14305 aarch64_attr_enum. */
14307 struct aarch64_attribute_info
14310 enum aarch64_attr_opt_type attr_type
;
14312 bool (*handler
) (const char *);
14313 enum opt_code opt_num
;
14316 /* Handle the ARCH_STR argument to the arch= target attribute. */
14319 aarch64_handle_attr_arch (const char *str
)
14321 const struct processor
*tmp_arch
= NULL
;
14322 std::string invalid_extension
;
14323 enum aarch64_parse_opt_result parse_res
14324 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
, &invalid_extension
);
14326 if (parse_res
== AARCH64_PARSE_OK
)
14328 gcc_assert (tmp_arch
);
14329 selected_arch
= tmp_arch
;
14330 explicit_arch
= selected_arch
->arch
;
14336 case AARCH64_PARSE_MISSING_ARG
:
14337 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
14339 case AARCH64_PARSE_INVALID_ARG
:
14340 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str
);
14341 aarch64_print_hint_for_arch (str
);
14343 case AARCH64_PARSE_INVALID_FEATURE
:
14344 error ("invalid feature modifier %s of value (\"%s\") in "
14345 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
14346 aarch64_print_hint_for_extensions (invalid_extension
);
14349 gcc_unreachable ();
14355 /* Handle the argument CPU_STR to the cpu= target attribute. */
14358 aarch64_handle_attr_cpu (const char *str
)
14360 const struct processor
*tmp_cpu
= NULL
;
14361 std::string invalid_extension
;
14362 enum aarch64_parse_opt_result parse_res
14363 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
, &invalid_extension
);
14365 if (parse_res
== AARCH64_PARSE_OK
)
14367 gcc_assert (tmp_cpu
);
14368 selected_tune
= tmp_cpu
;
14369 explicit_tune_core
= selected_tune
->ident
;
14371 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
14372 explicit_arch
= selected_arch
->arch
;
14378 case AARCH64_PARSE_MISSING_ARG
:
14379 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
14381 case AARCH64_PARSE_INVALID_ARG
:
14382 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str
);
14383 aarch64_print_hint_for_core (str
);
14385 case AARCH64_PARSE_INVALID_FEATURE
:
14386 error ("invalid feature modifier %s of value (\"%s\") in "
14387 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
14388 aarch64_print_hint_for_extensions (invalid_extension
);
14391 gcc_unreachable ();
14397 /* Handle the argument STR to the branch-protection= attribute. */
14400 aarch64_handle_attr_branch_protection (const char* str
)
14402 char *err_str
= (char *) xmalloc (strlen (str
) + 1);
14403 enum aarch64_parse_opt_result res
= aarch64_parse_branch_protection (str
,
14405 bool success
= false;
14408 case AARCH64_PARSE_MISSING_ARG
:
14409 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
14412 case AARCH64_PARSE_INVALID_ARG
:
14413 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
14414 "=\")%> pragma or attribute", err_str
);
14416 case AARCH64_PARSE_OK
:
14418 /* Fall through. */
14419 case AARCH64_PARSE_INVALID_FEATURE
:
14422 gcc_unreachable ();
14428 /* Handle the argument STR to the tune= target attribute. */
14431 aarch64_handle_attr_tune (const char *str
)
14433 const struct processor
*tmp_tune
= NULL
;
14434 enum aarch64_parse_opt_result parse_res
14435 = aarch64_parse_tune (str
, &tmp_tune
);
14437 if (parse_res
== AARCH64_PARSE_OK
)
14439 gcc_assert (tmp_tune
);
14440 selected_tune
= tmp_tune
;
14441 explicit_tune_core
= selected_tune
->ident
;
14447 case AARCH64_PARSE_INVALID_ARG
:
14448 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str
);
14449 aarch64_print_hint_for_core (str
);
14452 gcc_unreachable ();
14458 /* Parse an architecture extensions target attribute string specified in STR.
14459 For example "+fp+nosimd". Show any errors if needed. Return TRUE
14460 if successful. Update aarch64_isa_flags to reflect the ISA features
14464 aarch64_handle_attr_isa_flags (char *str
)
14466 enum aarch64_parse_opt_result parse_res
;
14467 uint64_t isa_flags
= aarch64_isa_flags
;
14469 /* We allow "+nothing" in the beginning to clear out all architectural
14470 features if the user wants to handpick specific features. */
14471 if (strncmp ("+nothing", str
, 8) == 0)
14477 std::string invalid_extension
;
14478 parse_res
= aarch64_parse_extension (str
, &isa_flags
, &invalid_extension
);
14480 if (parse_res
== AARCH64_PARSE_OK
)
14482 aarch64_isa_flags
= isa_flags
;
14488 case AARCH64_PARSE_MISSING_ARG
:
14489 error ("missing value in %<target()%> pragma or attribute");
14492 case AARCH64_PARSE_INVALID_FEATURE
:
14493 error ("invalid feature modifier %s of value (\"%s\") in "
14494 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
14498 gcc_unreachable ();
14504 /* The target attributes that we support. On top of these we also support just
14505 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
14506 handled explicitly in aarch64_process_one_target_attr. */
14508 static const struct aarch64_attribute_info aarch64_attributes
[] =
14510 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
14511 OPT_mgeneral_regs_only
},
14512 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
14513 OPT_mfix_cortex_a53_835769
},
14514 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
14515 OPT_mfix_cortex_a53_843419
},
14516 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
14517 { "strict-align", aarch64_attr_mask
, true, NULL
, OPT_mstrict_align
},
14518 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
14519 OPT_momit_leaf_frame_pointer
},
14520 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
14521 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
14523 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
14524 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
14526 { "branch-protection", aarch64_attr_custom
, false,
14527 aarch64_handle_attr_branch_protection
, OPT_mbranch_protection_
},
14528 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
14529 OPT_msign_return_address_
},
14530 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
14533 /* Parse ARG_STR which contains the definition of one target attribute.
14534 Show appropriate errors if any or return true if the attribute is valid. */
14537 aarch64_process_one_target_attr (char *arg_str
)
14539 bool invert
= false;
14541 size_t len
= strlen (arg_str
);
14545 error ("malformed %<target()%> pragma or attribute");
14549 char *str_to_check
= (char *) alloca (len
+ 1);
14550 strcpy (str_to_check
, arg_str
);
14552 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
14553 It is easier to detect and handle it explicitly here rather than going
14554 through the machinery for the rest of the target attributes in this
14556 if (*str_to_check
== '+')
14557 return aarch64_handle_attr_isa_flags (str_to_check
);
14559 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
14564 char *arg
= strchr (str_to_check
, '=');
14566 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
14567 and point ARG to "foo". */
14573 const struct aarch64_attribute_info
*p_attr
;
14574 bool found
= false;
14575 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
14577 /* If the names don't match up, or the user has given an argument
14578 to an attribute that doesn't accept one, or didn't give an argument
14579 to an attribute that expects one, fail to match. */
14580 if (strcmp (str_to_check
, p_attr
->name
) != 0)
14584 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
14585 || p_attr
->attr_type
== aarch64_attr_enum
;
14587 if (attr_need_arg_p
^ (arg
!= NULL
))
14589 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
14593 /* If the name matches but the attribute does not allow "no-" versions
14594 then we can't match. */
14595 if (invert
&& !p_attr
->allow_neg
)
14597 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
14601 switch (p_attr
->attr_type
)
14603 /* Has a custom handler registered.
14604 For example, cpu=, arch=, tune=. */
14605 case aarch64_attr_custom
:
14606 gcc_assert (p_attr
->handler
);
14607 if (!p_attr
->handler (arg
))
14611 /* Either set or unset a boolean option. */
14612 case aarch64_attr_bool
:
14614 struct cl_decoded_option decoded
;
14616 generate_option (p_attr
->opt_num
, NULL
, !invert
,
14617 CL_TARGET
, &decoded
);
14618 aarch64_handle_option (&global_options
, &global_options_set
,
14619 &decoded
, input_location
);
14622 /* Set or unset a bit in the target_flags. aarch64_handle_option
14623 should know what mask to apply given the option number. */
14624 case aarch64_attr_mask
:
14626 struct cl_decoded_option decoded
;
14627 /* We only need to specify the option number.
14628 aarch64_handle_option will know which mask to apply. */
14629 decoded
.opt_index
= p_attr
->opt_num
;
14630 decoded
.value
= !invert
;
14631 aarch64_handle_option (&global_options
, &global_options_set
,
14632 &decoded
, input_location
);
14635 /* Use the option setting machinery to set an option to an enum. */
14636 case aarch64_attr_enum
:
14641 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
14642 &value
, CL_TARGET
);
14645 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
14646 NULL
, DK_UNSPECIFIED
, input_location
,
14651 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
14656 gcc_unreachable ();
14660 /* If we reached here we either have found an attribute and validated
14661 it or didn't match any. If we matched an attribute but its arguments
14662 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NUL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int count = 0;
  for (; *str != '\0'; str++)
    if (*str == c)
      count++;
  return count;
}
14684 /* Parse the tree in ARGS that contains the target attribute information
14685 and update the global target options space. */
14688 aarch64_process_target_attr (tree args
)
14690 if (TREE_CODE (args
) == TREE_LIST
)
14694 tree head
= TREE_VALUE (args
);
14697 if (!aarch64_process_target_attr (head
))
14700 args
= TREE_CHAIN (args
);
14706 if (TREE_CODE (args
) != STRING_CST
)
14708 error ("attribute %<target%> argument not a string");
14712 size_t len
= strlen (TREE_STRING_POINTER (args
));
14713 char *str_to_check
= (char *) alloca (len
+ 1);
14714 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
14718 error ("malformed %<target()%> pragma or attribute");
14722 /* Used to catch empty spaces between commas i.e.
14723 attribute ((target ("attr1,,attr2"))). */
14724 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
14726 /* Handle multiple target attributes separated by ','. */
14727 char *token
= strtok_r (str_to_check
, ",", &str_to_check
);
14729 unsigned int num_attrs
= 0;
14733 if (!aarch64_process_one_target_attr (token
))
14735 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
14739 token
= strtok_r (NULL
, ",", &str_to_check
);
14742 if (num_attrs
!= num_commas
+ 1)
14744 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
14751 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
14752 process attribute ((target ("..."))). */
14755 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
14757 struct cl_target_option cur_target
;
14760 tree new_target
, new_optimize
;
14761 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
14763 /* If what we're processing is the current pragma string then the
14764 target option node is already stored in target_option_current_node
14765 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
14766 having to re-parse the string. This is especially useful to keep
14767 arm_neon.h compile times down since that header contains a lot
14768 of intrinsics enclosed in pragmas. */
14769 if (!existing_target
&& args
== current_target_pragma
)
14771 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
14774 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
14776 old_optimize
= build_optimization_node (&global_options
);
14777 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
14779 /* If the function changed the optimization levels as well as setting
14780 target options, start with the optimizations specified. */
14781 if (func_optimize
&& func_optimize
!= old_optimize
)
14782 cl_optimization_restore (&global_options
,
14783 TREE_OPTIMIZATION (func_optimize
));
14785 /* Save the current target options to restore at the end. */
14786 cl_target_option_save (&cur_target
, &global_options
);
14788 /* If fndecl already has some target attributes applied to it, unpack
14789 them so that we add this attribute on top of them, rather than
14790 overwriting them. */
14791 if (existing_target
)
14793 struct cl_target_option
*existing_options
14794 = TREE_TARGET_OPTION (existing_target
);
14796 if (existing_options
)
14797 cl_target_option_restore (&global_options
, existing_options
);
14800 cl_target_option_restore (&global_options
,
14801 TREE_TARGET_OPTION (target_option_current_node
));
14803 ret
= aarch64_process_target_attr (args
);
14805 /* Set up any additional state. */
14808 aarch64_override_options_internal (&global_options
);
14809 /* Initialize SIMD builtins if we haven't already.
14810 Set current_target_pragma to NULL for the duration so that
14811 the builtin initialization code doesn't try to tag the functions
14812 being built with the attributes specified by any current pragma, thus
14813 going into an infinite recursion. */
14816 tree saved_current_target_pragma
= current_target_pragma
;
14817 current_target_pragma
= NULL
;
14818 aarch64_init_simd_builtins ();
14819 current_target_pragma
= saved_current_target_pragma
;
14821 new_target
= build_target_option_node (&global_options
);
14826 new_optimize
= build_optimization_node (&global_options
);
14830 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
14832 if (old_optimize
!= new_optimize
)
14833 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
14836 cl_target_option_restore (&global_options
, &cur_target
);
14838 if (old_optimize
!= new_optimize
)
14839 cl_optimization_restore (&global_options
,
14840 TREE_OPTIMIZATION (old_optimize
));
14844 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
14845 tri-bool options (yes, no, don't care) and the default value is
14846 DEF, determine whether to reject inlining. */
14849 aarch64_tribools_ok_for_inlining_p (int caller
, int callee
,
14850 int dont_care
, int def
)
14852 /* If the callee doesn't care, always allow inlining. */
14853 if (callee
== dont_care
)
14856 /* If the caller doesn't care, always allow inlining. */
14857 if (caller
== dont_care
)
14860 /* Otherwise, allow inlining if either the callee and caller values
14861 agree, or if the callee is using the default value. */
14862 return (callee
== caller
|| callee
== def
);
14865 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
14866 to inline CALLEE into CALLER based on target-specific info.
14867 Make sure that the caller and callee have compatible architectural
14868 features. Then go through the other possible target attributes
14869 and see if they can block inlining. Try not to reject always_inline
14870 callees unless they are incompatible architecturally. */
14873 aarch64_can_inline_p (tree caller
, tree callee
)
14875 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
14876 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
14878 struct cl_target_option
*caller_opts
14879 = TREE_TARGET_OPTION (caller_tree
? caller_tree
14880 : target_option_default_node
);
14882 struct cl_target_option
*callee_opts
14883 = TREE_TARGET_OPTION (callee_tree
? callee_tree
14884 : target_option_default_node
);
14886 /* Callee's ISA flags should be a subset of the caller's. */
14887 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
14888 != callee_opts
->x_aarch64_isa_flags
)
14891 /* Allow non-strict aligned functions inlining into strict
14893 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
14894 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
14895 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
14896 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
14899 bool always_inline
= lookup_attribute ("always_inline",
14900 DECL_ATTRIBUTES (callee
));
14902 /* If the architectural features match up and the callee is always_inline
14903 then the other attributes don't matter. */
14907 if (caller_opts
->x_aarch64_cmodel_var
14908 != callee_opts
->x_aarch64_cmodel_var
)
14911 if (caller_opts
->x_aarch64_tls_dialect
14912 != callee_opts
->x_aarch64_tls_dialect
)
14915 /* Honour explicit requests to workaround errata. */
14916 if (!aarch64_tribools_ok_for_inlining_p (
14917 caller_opts
->x_aarch64_fix_a53_err835769
,
14918 callee_opts
->x_aarch64_fix_a53_err835769
,
14919 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
14922 if (!aarch64_tribools_ok_for_inlining_p (
14923 caller_opts
->x_aarch64_fix_a53_err843419
,
14924 callee_opts
->x_aarch64_fix_a53_err843419
,
14925 2, TARGET_FIX_ERR_A53_843419
))
14928 /* If the user explicitly specified -momit-leaf-frame-pointer for the
14929 caller and calle and they don't match up, reject inlining. */
14930 if (!aarch64_tribools_ok_for_inlining_p (
14931 caller_opts
->x_flag_omit_leaf_frame_pointer
,
14932 callee_opts
->x_flag_omit_leaf_frame_pointer
,
14936 /* If the callee has specific tuning overrides, respect them. */
14937 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
14938 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
14941 /* If the user specified tuning override strings for the
14942 caller and callee and they don't match up, reject inlining.
14943 We just do a string compare here, we don't analyze the meaning
14944 of the string, as it would be too costly for little gain. */
14945 if (callee_opts
->x_aarch64_override_tune_string
14946 && caller_opts
->x_aarch64_override_tune_string
14947 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
14948 caller_opts
->x_aarch64_override_tune_string
) != 0))
14954 /* Return the ID of the TLDESC ABI, initializing the descriptor if hasn't
14958 aarch64_tlsdesc_abi_id ()
14960 predefined_function_abi
&tlsdesc_abi
= function_abis
[ARM_PCS_TLSDESC
];
14961 if (!tlsdesc_abi
.initialized_p ())
14963 HARD_REG_SET full_reg_clobbers
;
14964 CLEAR_HARD_REG_SET (full_reg_clobbers
);
14965 SET_HARD_REG_BIT (full_reg_clobbers
, R0_REGNUM
);
14966 SET_HARD_REG_BIT (full_reg_clobbers
, CC_REGNUM
);
14967 for (int regno
= P0_REGNUM
; regno
<= P15_REGNUM
; ++regno
)
14968 SET_HARD_REG_BIT (full_reg_clobbers
, regno
);
14969 tlsdesc_abi
.initialize (ARM_PCS_TLSDESC
, full_reg_clobbers
);
14971 return tlsdesc_abi
.id ();
14974 /* Return true if SYMBOL_REF X binds locally. */
14977 aarch64_symbol_binds_local_p (const_rtx x
)
14979 return (SYMBOL_REF_DECL (x
)
14980 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
14981 : SYMBOL_REF_LOCAL_P (x
));
14984 /* Return true if SYMBOL_REF X is thread local */
14986 aarch64_tls_symbol_p (rtx x
)
14988 if (! TARGET_HAVE_TLS
)
14991 if (GET_CODE (x
) != SYMBOL_REF
)
14994 return SYMBOL_REF_TLS_MODEL (x
) != 0;
14997 /* Classify a TLS symbol into one of the TLS kinds. */
14998 enum aarch64_symbol_type
14999 aarch64_classify_tls_symbol (rtx x
)
15001 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
15005 case TLS_MODEL_GLOBAL_DYNAMIC
:
15006 case TLS_MODEL_LOCAL_DYNAMIC
:
15007 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
15009 case TLS_MODEL_INITIAL_EXEC
:
15010 switch (aarch64_cmodel
)
15012 case AARCH64_CMODEL_TINY
:
15013 case AARCH64_CMODEL_TINY_PIC
:
15014 return SYMBOL_TINY_TLSIE
;
15016 return SYMBOL_SMALL_TLSIE
;
15019 case TLS_MODEL_LOCAL_EXEC
:
15020 if (aarch64_tls_size
== 12)
15021 return SYMBOL_TLSLE12
;
15022 else if (aarch64_tls_size
== 24)
15023 return SYMBOL_TLSLE24
;
15024 else if (aarch64_tls_size
== 32)
15025 return SYMBOL_TLSLE32
;
15026 else if (aarch64_tls_size
== 48)
15027 return SYMBOL_TLSLE48
;
15029 gcc_unreachable ();
15031 case TLS_MODEL_EMULATED
:
15032 case TLS_MODEL_NONE
:
15033 return SYMBOL_FORCE_TO_MEM
;
15036 gcc_unreachable ();
15040 /* Return the correct method for accessing X + OFFSET, where X is either
15041 a SYMBOL_REF or LABEL_REF. */
15043 enum aarch64_symbol_type
15044 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
15046 if (GET_CODE (x
) == LABEL_REF
)
15048 switch (aarch64_cmodel
)
15050 case AARCH64_CMODEL_LARGE
:
15051 return SYMBOL_FORCE_TO_MEM
;
15053 case AARCH64_CMODEL_TINY_PIC
:
15054 case AARCH64_CMODEL_TINY
:
15055 return SYMBOL_TINY_ABSOLUTE
;
15057 case AARCH64_CMODEL_SMALL_SPIC
:
15058 case AARCH64_CMODEL_SMALL_PIC
:
15059 case AARCH64_CMODEL_SMALL
:
15060 return SYMBOL_SMALL_ABSOLUTE
;
15063 gcc_unreachable ();
15067 if (GET_CODE (x
) == SYMBOL_REF
)
15069 if (aarch64_tls_symbol_p (x
))
15070 return aarch64_classify_tls_symbol (x
);
15072 switch (aarch64_cmodel
)
15074 case AARCH64_CMODEL_TINY
:
15075 /* When we retrieve symbol + offset address, we have to make sure
15076 the offset does not cause overflow of the final address. But
15077 we have no way of knowing the address of symbol at compile time
15078 so we can't accurately say if the distance between the PC and
15079 symbol + offset is outside the addressible range of +/-1MB in the
15080 TINY code model. So we limit the maximum offset to +/-64KB and
15081 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
15082 If offset_within_block_p is true we allow larger offsets.
15083 Furthermore force to memory if the symbol is a weak reference to
15084 something that doesn't resolve to a symbol in this module. */
15086 if (SYMBOL_REF_WEAK (x
) && !aarch64_symbol_binds_local_p (x
))
15087 return SYMBOL_FORCE_TO_MEM
;
15088 if (!(IN_RANGE (offset
, -0x10000, 0x10000)
15089 || offset_within_block_p (x
, offset
)))
15090 return SYMBOL_FORCE_TO_MEM
;
15092 return SYMBOL_TINY_ABSOLUTE
;
15094 case AARCH64_CMODEL_SMALL
:
15095 /* Same reasoning as the tiny code model, but the offset cap here is
15096 1MB, allowing +/-3.9GB for the offset to the symbol. */
15098 if (SYMBOL_REF_WEAK (x
) && !aarch64_symbol_binds_local_p (x
))
15099 return SYMBOL_FORCE_TO_MEM
;
15100 if (!(IN_RANGE (offset
, -0x100000, 0x100000)
15101 || offset_within_block_p (x
, offset
)))
15102 return SYMBOL_FORCE_TO_MEM
;
15104 return SYMBOL_SMALL_ABSOLUTE
;
15106 case AARCH64_CMODEL_TINY_PIC
:
15107 if (!aarch64_symbol_binds_local_p (x
))
15108 return SYMBOL_TINY_GOT
;
15109 return SYMBOL_TINY_ABSOLUTE
;
15111 case AARCH64_CMODEL_SMALL_SPIC
:
15112 case AARCH64_CMODEL_SMALL_PIC
:
15113 if (!aarch64_symbol_binds_local_p (x
))
15114 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
15115 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
15116 return SYMBOL_SMALL_ABSOLUTE
;
15118 case AARCH64_CMODEL_LARGE
:
15119 /* This is alright even in PIC code as the constant
15120 pool reference is always PC relative and within
15121 the same translation unit. */
15122 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
15123 return SYMBOL_SMALL_ABSOLUTE
;
15125 return SYMBOL_FORCE_TO_MEM
;
15128 gcc_unreachable ();
15132 /* By default push everything into the constant pool. */
15133 return SYMBOL_FORCE_TO_MEM
;
15137 aarch64_constant_address_p (rtx x
)
15139 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
15143 aarch64_legitimate_pic_operand_p (rtx x
)
15145 if (GET_CODE (x
) == SYMBOL_REF
15146 || (GET_CODE (x
) == CONST
15147 && GET_CODE (XEXP (x
, 0)) == PLUS
15148 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
15154 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
15155 that should be rematerialized rather than spilled. */
15158 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
15160 /* Support CSE and rematerialization of common constants. */
15161 if (CONST_INT_P (x
)
15162 || (CONST_DOUBLE_P (x
) && GET_MODE_CLASS (mode
) == MODE_FLOAT
)
15163 || GET_CODE (x
) == CONST_VECTOR
)
15166 /* Do not allow vector struct mode constants for Advanced SIMD.
15167 We could support 0 and -1 easily, but they need support in
15168 aarch64-simd.md. */
15169 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
15170 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
15173 /* Only accept variable-length vector constants if they can be
15176 ??? It would be possible to handle rematerialization of other
15177 constants via secondary reloads. */
15178 if (vec_flags
& VEC_ANY_SVE
)
15179 return aarch64_simd_valid_immediate (x
, NULL
);
15181 if (GET_CODE (x
) == HIGH
)
15184 /* Accept polynomial constants that can be calculated by using the
15185 destination of a move as the sole temporary. Constants that
15186 require a second temporary cannot be rematerialized (they can't be
15187 forced to memory and also aren't legitimate constants). */
15189 if (poly_int_rtx_p (x
, &offset
))
15190 return aarch64_offset_temporaries (false, offset
) <= 1;
15192 /* If an offset is being added to something else, we need to allow the
15193 base to be moved into the destination register, meaning that there
15194 are no free temporaries for the offset. */
15195 x
= strip_offset (x
, &offset
);
15196 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
15199 /* Do not allow const (plus (anchor_symbol, const_int)). */
15200 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
15203 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
15204 so spilling them is better than rematerialization. */
15205 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
15208 /* Label references are always constant. */
15209 if (GET_CODE (x
) == LABEL_REF
)
15216 aarch64_load_tp (rtx target
)
15219 || GET_MODE (target
) != Pmode
15220 || !register_operand (target
, Pmode
))
15221 target
= gen_reg_rtx (Pmode
);
15223 /* Can return in any reg. */
15224 emit_insn (gen_aarch64_load_tp_hard (target
));
15228 /* On AAPCS systems, this is the "struct __va_list". */
15229 static GTY(()) tree va_list_type
;
15231 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
15232 Return the type to use as __builtin_va_list.
15234 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
15246 aarch64_build_builtin_va_list (void)
15249 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
15251 /* Create the type. */
15252 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
15253 /* Give it the required name. */
15254 va_list_name
= build_decl (BUILTINS_LOCATION
,
15256 get_identifier ("__va_list"),
15258 DECL_ARTIFICIAL (va_list_name
) = 1;
15259 TYPE_NAME (va_list_type
) = va_list_name
;
15260 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
15262 /* Create the fields. */
15263 f_stack
= build_decl (BUILTINS_LOCATION
,
15264 FIELD_DECL
, get_identifier ("__stack"),
15266 f_grtop
= build_decl (BUILTINS_LOCATION
,
15267 FIELD_DECL
, get_identifier ("__gr_top"),
15269 f_vrtop
= build_decl (BUILTINS_LOCATION
,
15270 FIELD_DECL
, get_identifier ("__vr_top"),
15272 f_groff
= build_decl (BUILTINS_LOCATION
,
15273 FIELD_DECL
, get_identifier ("__gr_offs"),
15274 integer_type_node
);
15275 f_vroff
= build_decl (BUILTINS_LOCATION
,
15276 FIELD_DECL
, get_identifier ("__vr_offs"),
15277 integer_type_node
);
15279 /* Tell tree-stdarg pass about our internal offset fields.
15280 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
15281 purpose to identify whether the code is updating va_list internal
15282 offset fields through irregular way. */
15283 va_list_gpr_counter_field
= f_groff
;
15284 va_list_fpr_counter_field
= f_vroff
;
15286 DECL_ARTIFICIAL (f_stack
) = 1;
15287 DECL_ARTIFICIAL (f_grtop
) = 1;
15288 DECL_ARTIFICIAL (f_vrtop
) = 1;
15289 DECL_ARTIFICIAL (f_groff
) = 1;
15290 DECL_ARTIFICIAL (f_vroff
) = 1;
15292 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
15293 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
15294 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
15295 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
15296 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
15298 TYPE_FIELDS (va_list_type
) = f_stack
;
15299 DECL_CHAIN (f_stack
) = f_grtop
;
15300 DECL_CHAIN (f_grtop
) = f_vrtop
;
15301 DECL_CHAIN (f_vrtop
) = f_groff
;
15302 DECL_CHAIN (f_groff
) = f_vroff
;
15304 /* Compute its layout. */
15305 layout_type (va_list_type
);
15307 return va_list_type
;
15310 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
15312 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
15314 const CUMULATIVE_ARGS
*cum
;
15315 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
15316 tree stack
, grtop
, vrtop
, groff
, vroff
;
15318 int gr_save_area_size
= cfun
->va_list_gpr_size
;
15319 int vr_save_area_size
= cfun
->va_list_fpr_size
;
15322 cum
= &crtl
->args
.info
;
15323 if (cfun
->va_list_gpr_size
)
15324 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
15325 cfun
->va_list_gpr_size
);
15326 if (cfun
->va_list_fpr_size
)
15327 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
15328 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
15332 gcc_assert (cum
->aapcs_nvrn
== 0);
15333 vr_save_area_size
= 0;
15336 f_stack
= TYPE_FIELDS (va_list_type_node
);
15337 f_grtop
= DECL_CHAIN (f_stack
);
15338 f_vrtop
= DECL_CHAIN (f_grtop
);
15339 f_groff
= DECL_CHAIN (f_vrtop
);
15340 f_vroff
= DECL_CHAIN (f_groff
);
15342 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
15344 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
15346 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
15348 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
15350 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
15353 /* Emit code to initialize STACK, which points to the next varargs stack
15354 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
15355 by named arguments. STACK is 8-byte aligned. */
15356 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
15357 if (cum
->aapcs_stack_size
> 0)
15358 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
15359 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
15360 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15362 /* Emit code to initialize GRTOP, the top of the GR save area.
15363 virtual_incoming_args_rtx should have been 16 byte aligned. */
15364 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
15365 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
15366 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15368 /* Emit code to initialize VRTOP, the top of the VR save area.
15369 This address is gr_save_area_bytes below GRTOP, rounded
15370 down to the next 16-byte boundary. */
15371 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
15372 vr_offset
= ROUND_UP (gr_save_area_size
,
15373 STACK_BOUNDARY
/ BITS_PER_UNIT
);
15376 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
15377 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
15378 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15380 /* Emit code to initialize GROFF, the offset from GRTOP of the
15381 next GPR argument. */
15382 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
15383 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
15384 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15386 /* Likewise emit code to initialize VROFF, the offset from FTOP
15387 of the next VR argument. */
15388 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
15389 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
15390 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15393 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
15396 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
15397 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
15401 bool is_ha
; /* is HFA or HVA. */
15402 bool dw_align
; /* double-word align. */
15403 machine_mode ag_mode
= VOIDmode
;
15407 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
15408 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
15409 HOST_WIDE_INT size
, rsize
, adjust
, align
;
15410 tree t
, u
, cond1
, cond2
;
15412 indirect_p
= pass_va_arg_by_reference (type
);
15414 type
= build_pointer_type (type
);
15416 mode
= TYPE_MODE (type
);
15418 f_stack
= TYPE_FIELDS (va_list_type_node
);
15419 f_grtop
= DECL_CHAIN (f_stack
);
15420 f_vrtop
= DECL_CHAIN (f_grtop
);
15421 f_groff
= DECL_CHAIN (f_vrtop
);
15422 f_vroff
= DECL_CHAIN (f_groff
);
15424 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
15425 f_stack
, NULL_TREE
);
15426 size
= int_size_in_bytes (type
);
15430 = aarch64_function_arg_alignment (mode
, type
, &abi_break
) / BITS_PER_UNIT
;
15434 if (aarch64_vfp_is_call_or_return_candidate (mode
,
15440 /* No frontends can create types with variable-sized modes, so we
15441 shouldn't be asked to pass or return them. */
15442 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
15444 /* TYPE passed in fp/simd registers. */
15446 aarch64_err_no_fpadvsimd (mode
);
15448 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
15449 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
15450 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
15451 unshare_expr (valist
), f_vroff
, NULL_TREE
);
15453 rsize
= nregs
* UNITS_PER_VREG
;
15457 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
15458 adjust
= UNITS_PER_VREG
- ag_size
;
15460 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
15461 && size
< UNITS_PER_VREG
)
15463 adjust
= UNITS_PER_VREG
- size
;
15468 /* TYPE passed in general registers. */
15469 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
15470 unshare_expr (valist
), f_grtop
, NULL_TREE
);
15471 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
15472 unshare_expr (valist
), f_groff
, NULL_TREE
);
15473 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
15474 nregs
= rsize
/ UNITS_PER_WORD
;
15478 if (abi_break
&& warn_psabi
)
15479 inform (input_location
, "parameter passing for argument of type "
15480 "%qT changed in GCC 9.1", type
);
15484 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
15485 && size
< UNITS_PER_WORD
)
15487 adjust
= UNITS_PER_WORD
- size
;
15491 /* Get a local temporary for the field value. */
15492 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
15494 /* Emit code to branch if off >= 0. */
15495 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
15496 build_int_cst (TREE_TYPE (off
), 0));
15497 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
15501 /* Emit: offs = (offs + 15) & -16. */
15502 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
15503 build_int_cst (TREE_TYPE (off
), 15));
15504 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
15505 build_int_cst (TREE_TYPE (off
), -16));
15506 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
15511 /* Update ap.__[g|v]r_offs */
15512 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
15513 build_int_cst (TREE_TYPE (off
), rsize
));
15514 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
15518 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
15520 /* [cond2] if (ap.__[g|v]r_offs > 0) */
15521 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
15522 build_int_cst (TREE_TYPE (f_off
), 0));
15523 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
15525 /* String up: make sure the assignment happens before the use. */
15526 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
15527 COND_EXPR_ELSE (cond1
) = t
;
15529 /* Prepare the trees handling the argument that is passed on the stack;
15530 the top level node will store in ON_STACK. */
15531 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
15534 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
15535 t
= fold_build_pointer_plus_hwi (arg
, 15);
15536 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
15537 build_int_cst (TREE_TYPE (t
), -16));
15538 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
15542 /* Advance ap.__stack */
15543 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
15544 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
15545 build_int_cst (TREE_TYPE (t
), -8));
15546 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
15547 /* String up roundup and advance. */
15549 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
15550 /* String up with arg */
15551 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
15552 /* Big-endianness related address adjustment. */
15553 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
15554 && size
< UNITS_PER_WORD
)
15556 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
15557 size_int (UNITS_PER_WORD
- size
));
15558 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
15561 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
15562 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
15564 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
15567 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
15568 build_int_cst (TREE_TYPE (off
), adjust
));
15570 t
= fold_convert (sizetype
, t
);
15571 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
15575 /* type ha; // treat as "struct {ftype field[n];}"
15576 ... [computing offs]
15577 for (i = 0; i <nregs; ++i, offs += 16)
15578 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
15581 tree tmp_ha
, field_t
, field_ptr_t
;
15583 /* Declare a local variable. */
15584 tmp_ha
= create_tmp_var_raw (type
, "ha");
15585 gimple_add_tmp_var (tmp_ha
);
15587 /* Establish the base type. */
15591 field_t
= float_type_node
;
15592 field_ptr_t
= float_ptr_type_node
;
15595 field_t
= double_type_node
;
15596 field_ptr_t
= double_ptr_type_node
;
15599 field_t
= long_double_type_node
;
15600 field_ptr_t
= long_double_ptr_type_node
;
15603 field_t
= aarch64_fp16_type_node
;
15604 field_ptr_t
= aarch64_fp16_ptr_type_node
;
15609 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
15610 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
15611 field_ptr_t
= build_pointer_type (field_t
);
15618 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
15619 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
15621 t
= fold_convert (field_ptr_t
, addr
);
15622 t
= build2 (MODIFY_EXPR
, field_t
,
15623 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
15624 build1 (INDIRECT_REF
, field_t
, t
));
15626 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
15627 for (i
= 1; i
< nregs
; ++i
)
15629 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
15630 u
= fold_convert (field_ptr_t
, addr
);
15631 u
= build2 (MODIFY_EXPR
, field_t
,
15632 build2 (MEM_REF
, field_t
, tmp_ha
,
15633 build_int_cst (field_ptr_t
,
15635 int_size_in_bytes (field_t
)))),
15636 build1 (INDIRECT_REF
, field_t
, u
));
15637 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
15640 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
15641 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
15644 COND_EXPR_ELSE (cond2
) = t
;
15645 addr
= fold_convert (build_pointer_type (type
), cond1
);
15646 addr
= build_va_arg_indirect_ref (addr
);
15649 addr
= build_va_arg_indirect_ref (addr
);
15654 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
15657 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
,
15658 const function_arg_info
&arg
,
15659 int *pretend_size ATTRIBUTE_UNUSED
, int no_rtl
)
15661 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
15662 CUMULATIVE_ARGS local_cum
;
15663 int gr_saved
= cfun
->va_list_gpr_size
;
15664 int vr_saved
= cfun
->va_list_fpr_size
;
15666 /* The caller has advanced CUM up to, but not beyond, the last named
15667 argument. Advance a local copy of CUM past the last "real" named
15668 argument, to find out how many registers are left over. */
15670 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), arg
);
15672 /* Found out how many registers we need to save.
15673 Honor tree-stdvar analysis results. */
15674 if (cfun
->va_list_gpr_size
)
15675 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
15676 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
15677 if (cfun
->va_list_fpr_size
)
15678 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
15679 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
15683 gcc_assert (local_cum
.aapcs_nvrn
== 0);
15693 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
15694 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
15695 - gr_saved
* UNITS_PER_WORD
);
15696 mem
= gen_frame_mem (BLKmode
, ptr
);
15697 set_mem_alias_set (mem
, get_varargs_alias_set ());
15699 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
15704 /* We can't use move_block_from_reg, because it will use
15705 the wrong mode, storing D regs only. */
15706 machine_mode mode
= TImode
;
15707 int off
, i
, vr_start
;
15709 /* Set OFF to the offset from virtual_incoming_args_rtx of
15710 the first vector register. The VR save area lies below
15711 the GR one, and is aligned to 16 bytes. */
15712 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
15713 STACK_BOUNDARY
/ BITS_PER_UNIT
);
15714 off
-= vr_saved
* UNITS_PER_VREG
;
15716 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
15717 for (i
= 0; i
< vr_saved
; ++i
)
15721 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
15722 mem
= gen_frame_mem (mode
, ptr
);
15723 set_mem_alias_set (mem
, get_varargs_alias_set ());
15724 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
15725 off
+= UNITS_PER_VREG
;
15730 /* We don't save the size into *PRETEND_SIZE because we want to avoid
15731 any complication of having crtl->args.pretend_args_size changed. */
15732 cfun
->machine
->frame
.saved_varargs_size
15733 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
15734 STACK_BOUNDARY
/ BITS_PER_UNIT
)
15735 + vr_saved
* UNITS_PER_VREG
);
15739 aarch64_conditional_register_usage (void)
15744 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
15747 call_used_regs
[i
] = 1;
15751 for (i
= P0_REGNUM
; i
<= P15_REGNUM
; i
++)
15754 call_used_regs
[i
] = 1;
15757 /* Only allow the FFR and FFRT to be accessed via special patterns. */
15758 CLEAR_HARD_REG_BIT (operand_reg_set
, FFR_REGNUM
);
15759 CLEAR_HARD_REG_BIT (operand_reg_set
, FFRT_REGNUM
);
15761 /* When tracking speculation, we need a couple of call-clobbered registers
15762 to track the speculation state. It would be nice to just use
15763 IP0 and IP1, but currently there are numerous places that just
15764 assume these registers are free for other uses (eg pointer
15765 authentication). */
15766 if (aarch64_track_speculation
)
15768 fixed_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
15769 call_used_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
15770 fixed_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
15771 call_used_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
15775 /* Walk down the type tree of TYPE counting consecutive base elements.
15776 If *MODEP is VOIDmode, then set it to the first valid floating point
15777 type. If a non-floating point type is found, or if a floating point
15778 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
15779 otherwise return the count in the sub-tree. */
15781 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
15784 HOST_WIDE_INT size
;
15786 /* SVE types (and types containing SVE types) must be handled
15787 before calling this function. */
15788 gcc_assert (!aarch64_sve::builtin_type_p (type
));
15790 switch (TREE_CODE (type
))
15793 mode
= TYPE_MODE (type
);
15794 if (mode
!= DFmode
&& mode
!= SFmode
15795 && mode
!= TFmode
&& mode
!= HFmode
)
15798 if (*modep
== VOIDmode
)
15801 if (*modep
== mode
)
15807 mode
= TYPE_MODE (TREE_TYPE (type
));
15808 if (mode
!= DFmode
&& mode
!= SFmode
15809 && mode
!= TFmode
&& mode
!= HFmode
)
15812 if (*modep
== VOIDmode
)
15815 if (*modep
== mode
)
15821 /* Use V2SImode and V4SImode as representatives of all 64-bit
15822 and 128-bit vector types. */
15823 size
= int_size_in_bytes (type
);
15836 if (*modep
== VOIDmode
)
15839 /* Vector modes are considered to be opaque: two vectors are
15840 equivalent for the purposes of being homogeneous aggregates
15841 if they are the same size. */
15842 if (*modep
== mode
)
15850 tree index
= TYPE_DOMAIN (type
);
15852 /* Can't handle incomplete types nor sizes that are not
15854 if (!COMPLETE_TYPE_P (type
)
15855 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
15858 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
15861 || !TYPE_MAX_VALUE (index
)
15862 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
15863 || !TYPE_MIN_VALUE (index
)
15864 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
15868 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
15869 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
15871 /* There must be no padding. */
15872 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
15873 count
* GET_MODE_BITSIZE (*modep
)))
15885 /* Can't handle incomplete types nor sizes that are not
15887 if (!COMPLETE_TYPE_P (type
)
15888 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
15891 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
15893 if (TREE_CODE (field
) != FIELD_DECL
)
15896 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
15899 count
+= sub_count
;
15902 /* There must be no padding. */
15903 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
15904 count
* GET_MODE_BITSIZE (*modep
)))
15911 case QUAL_UNION_TYPE
:
15913 /* These aren't very interesting except in a degenerate case. */
15918 /* Can't handle incomplete types nor sizes that are not
15920 if (!COMPLETE_TYPE_P (type
)
15921 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
15924 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
15926 if (TREE_CODE (field
) != FIELD_DECL
)
15929 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
15932 count
= count
> sub_count
? count
: sub_count
;
15935 /* There must be no padding. */
15936 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
15937 count
* GET_MODE_BITSIZE (*modep
)))
15950 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
15951 type as described in AAPCS64 \S 4.1.2.
15953 See the comment above aarch64_composite_type_p for the notes on MODE. */
15956 aarch64_short_vector_p (const_tree type
,
15959 poly_int64 size
= -1;
15961 if (type
&& aarch64_sve::builtin_type_p (type
))
15964 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
15965 size
= int_size_in_bytes (type
);
15966 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
15967 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
15968 size
= GET_MODE_SIZE (mode
);
15970 return known_eq (size
, 8) || known_eq (size
, 16);
15973 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
15974 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
15975 array types. The C99 floating-point complex types are also considered
15976 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
15977 types, which are GCC extensions and out of the scope of AAPCS64, are
15978 treated as composite types here as well.
15980 Note that MODE itself is not sufficient in determining whether a type
15981 is such a composite type or not. This is because
15982 stor-layout.c:compute_record_mode may have already changed the MODE
15983 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
15984 structure with only one field may have its MODE set to the mode of the
15985 field. Also an integer mode whose size matches the size of the
15986 RECORD_TYPE type may be used to substitute the original mode
15987 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
15988 solely relied on. */
15991 aarch64_composite_type_p (const_tree type
,
15994 if (aarch64_short_vector_p (type
, mode
))
15997 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
16000 if (mode
== BLKmode
16001 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
16002 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
16008 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
16009 shall be passed or returned in simd/fp register(s) (providing these
16010 parameter passing registers are available).
16012 Upon successful return, *COUNT returns the number of needed registers,
16013 *BASE_MODE returns the mode of the individual register and when IS_HAF
16014 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
16015 floating-point aggregate or a homogeneous short-vector aggregate. */
16018 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
16020 machine_mode
*base_mode
,
16024 if (is_ha
!= NULL
) *is_ha
= false;
16026 if (type
&& aarch64_sve::builtin_type_p (type
))
16029 machine_mode new_mode
= VOIDmode
;
16030 bool composite_p
= aarch64_composite_type_p (type
, mode
);
16032 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
16033 || aarch64_short_vector_p (type
, mode
))
16038 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
16040 if (is_ha
!= NULL
) *is_ha
= true;
16042 new_mode
= GET_MODE_INNER (mode
);
16044 else if (type
&& composite_p
)
16046 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
16048 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
16050 if (is_ha
!= NULL
) *is_ha
= true;
16059 *base_mode
= new_mode
;
16063 /* Implement TARGET_STRUCT_VALUE_RTX. */
16066 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
16067 int incoming ATTRIBUTE_UNUSED
)
16069 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
16072 /* Implements target hook vector_mode_supported_p. */
16074 aarch64_vector_mode_supported_p (machine_mode mode
)
16076 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
16077 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
16080 /* Return the full-width SVE vector mode for element mode MODE, if one
16083 aarch64_full_sve_mode (scalar_mode mode
)
16100 return VNx16QImode
;
16102 return opt_machine_mode ();
16106 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
16109 aarch64_vq_mode (scalar_mode mode
)
16128 return opt_machine_mode ();
16132 /* Return appropriate SIMD container
16133 for MODE within a vector of WIDTH bits. */
16134 static machine_mode
16135 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
16137 if (TARGET_SVE
&& known_eq (width
, BITS_PER_SVE_VECTOR
))
16138 return aarch64_full_sve_mode (mode
).else_mode (word_mode
);
16140 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
16143 if (known_eq (width
, 128))
16144 return aarch64_vq_mode (mode
).else_mode (word_mode
);
16165 /* Return 128-bit container as the preferred SIMD mode for MODE. */
16166 static machine_mode
16167 aarch64_preferred_simd_mode (scalar_mode mode
)
16169 poly_int64 bits
= TARGET_SVE
? BITS_PER_SVE_VECTOR
: 128;
16170 return aarch64_simd_container_mode (mode
, bits
);
16173 /* Return a list of possible vector sizes for the vectorizer
16174 to iterate over. */
16175 static unsigned int
16176 aarch64_autovectorize_vector_modes (vector_modes
*modes
, bool)
16178 static const machine_mode sve_modes
[] = {
16179 /* Try using full vectors for all element types. */
16182 /* Try using 16-bit containers for 8-bit elements and full vectors
16183 for wider elements. */
16186 /* Try using 32-bit containers for 8-bit and 16-bit elements and
16187 full vectors for wider elements. */
16190 /* Try using 64-bit containers for all element types. */
16194 static const machine_mode advsimd_modes
[] = {
16195 /* Try using 128-bit vectors for all element types. */
16198 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
16199 for wider elements. */
16202 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
16203 for wider elements.
16205 TODO: We could support a limited form of V4QImode too, so that
16206 we use 32-bit vectors for 8-bit elements. */
16209 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
16210 for 64-bit elements.
16212 TODO: We could similarly support limited forms of V2QImode and V2HImode
16217 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
16220 - If we can't use N-byte Advanced SIMD vectors then the placement
16221 doesn't matter; we'll just continue as though the Advanced SIMD
16222 entry didn't exist.
16224 - If an SVE main loop with N bytes ends up being cheaper than an
16225 Advanced SIMD main loop with N bytes then by default we'll replace
16226 the Advanced SIMD version with the SVE one.
16228 - If an Advanced SIMD main loop with N bytes ends up being cheaper
16229 than an SVE main loop with N bytes then by default we'll try to
16230 use the SVE loop to vectorize the epilogue instead. */
16231 unsigned int sve_i
= TARGET_SVE
? 0 : ARRAY_SIZE (sve_modes
);
16232 unsigned int advsimd_i
= 0;
16233 while (advsimd_i
< ARRAY_SIZE (advsimd_modes
))
16235 if (sve_i
< ARRAY_SIZE (sve_modes
)
16236 && maybe_gt (GET_MODE_NUNITS (sve_modes
[sve_i
]),
16237 GET_MODE_NUNITS (advsimd_modes
[advsimd_i
])))
16238 modes
->safe_push (sve_modes
[sve_i
++]);
16240 modes
->safe_push (advsimd_modes
[advsimd_i
++]);
16242 while (sve_i
< ARRAY_SIZE (sve_modes
))
16243 modes
->safe_push (sve_modes
[sve_i
++]);
16245 unsigned int flags
= 0;
16246 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
16247 can compare SVE against Advanced SIMD and so that we can compare
16248 multiple SVE vectorization approaches against each other. There's
16249 not really any point doing this for Advanced SIMD only, since the
16250 first mode that works should always be the best. */
16251 if (TARGET_SVE
&& aarch64_sve_compare_costs
)
16252 flags
|= VECT_COMPARE_COSTS
;
16256 /* Implement TARGET_MANGLE_TYPE. */
16258 static const char *
16259 aarch64_mangle_type (const_tree type
)
16261 /* The AArch64 ABI documents say that "__va_list" has to be
16262 mangled as if it is in the "std" namespace. */
16263 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
16264 return "St9__va_list";
16266 /* Half-precision float. */
16267 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
16270 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
16272 if (TYPE_NAME (type
) != NULL
)
16275 if ((res
= aarch64_general_mangle_builtin_type (type
))
16276 || (res
= aarch64_sve::mangle_builtin_type (type
)))
16280 /* Use the default mangling. */
16284 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
16287 aarch64_verify_type_context (location_t loc
, type_context_kind context
,
16288 const_tree type
, bool silent_p
)
16290 return aarch64_sve::verify_type_context (loc
, context
, type
, silent_p
);
16293 /* Find the first rtx_insn before insn that will generate an assembly
16297 aarch64_prev_real_insn (rtx_insn
*insn
)
16304 insn
= prev_real_insn (insn
);
16306 while (insn
&& recog_memoized (insn
) < 0);
16312 is_madd_op (enum attr_type t1
)
16315 /* A number of these may be AArch32 only. */
16316 enum attr_type mlatypes
[] = {
16317 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
16318 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
16319 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
16322 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
16324 if (t1
== mlatypes
[i
])
16331 /* Check if there is a register dependency between a load and the insn
16332 for which we hold recog_data. */
16335 dep_between_memop_and_curr (rtx memop
)
16340 gcc_assert (GET_CODE (memop
) == SET
);
16342 if (!REG_P (SET_DEST (memop
)))
16345 load_reg
= SET_DEST (memop
);
16346 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
16348 rtx operand
= recog_data
.operand
[opno
];
16349 if (REG_P (operand
)
16350 && reg_overlap_mentioned_p (load_reg
, operand
))
16358 /* When working around the Cortex-A53 erratum 835769,
16359 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
16360 instruction and has a preceding memory instruction such that a NOP
16361 should be inserted between them. */
16364 aarch64_madd_needs_nop (rtx_insn
* insn
)
16366 enum attr_type attr_type
;
16370 if (!TARGET_FIX_ERR_A53_835769
)
16373 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
16376 attr_type
= get_attr_type (insn
);
16377 if (!is_madd_op (attr_type
))
16380 prev
= aarch64_prev_real_insn (insn
);
16381 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
16382 Restore recog state to INSN to avoid state corruption. */
16383 extract_constrain_insn_cached (insn
);
16385 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
16388 body
= single_set (prev
);
16390 /* If the previous insn is a memory op and there is no dependency between
16391 it and the DImode madd, emit a NOP between them. If body is NULL then we
16392 have a complex memory operation, probably a load/store pair.
16393 Be conservative for now and emit a NOP. */
16394 if (GET_MODE (recog_data
.operand
[0]) == DImode
16395 && (!body
|| !dep_between_memop_and_curr (body
)))
16403 /* Implement FINAL_PRESCAN_INSN. */
16406 aarch64_final_prescan_insn (rtx_insn
*insn
)
16408 if (aarch64_madd_needs_nop (insn
))
16409 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
16413 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
16417 aarch64_sve_index_immediate_p (rtx base_or_step
)
16419 return (CONST_INT_P (base_or_step
)
16420 && IN_RANGE (INTVAL (base_or_step
), -16, 15));
16423 /* Return true if X is a valid immediate for the SVE ADD and SUB
16424 instructions. Negate X first if NEGATE_P is true. */
16427 aarch64_sve_arith_immediate_p (rtx x
, bool negate_p
)
16431 if (!const_vec_duplicate_p (x
, &elt
)
16432 || !CONST_INT_P (elt
))
16435 HOST_WIDE_INT val
= INTVAL (elt
);
16438 val
&= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x
)));
16441 return IN_RANGE (val
, 0, 0xff);
16442 return IN_RANGE (val
, 0, 0xff00);
16445 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
16446 instructions. Negate X first if NEGATE_P is true. */
16449 aarch64_sve_sqadd_sqsub_immediate_p (rtx x
, bool negate_p
)
16453 if (!const_vec_duplicate_p (x
, &elt
)
16454 || !CONST_INT_P (elt
))
16457 if (!aarch64_sve_arith_immediate_p (x
, negate_p
))
16460 /* After the optional negation, the immediate must be nonnegative.
16461 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
16462 instead of SQADD Zn.B, Zn.B, #129. */
16463 return negate_p
== (INTVAL (elt
) < 0);
16466 /* Return true if X is a valid immediate operand for an SVE logical
16467 instruction such as AND. */
16470 aarch64_sve_bitmask_immediate_p (rtx x
)
16474 return (const_vec_duplicate_p (x
, &elt
)
16475 && CONST_INT_P (elt
)
16476 && aarch64_bitmask_imm (INTVAL (elt
),
16477 GET_MODE_INNER (GET_MODE (x
))));
16480 /* Return true if X is a valid immediate for the SVE DUP and CPY
16484 aarch64_sve_dup_immediate_p (rtx x
)
16486 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
16487 if (!CONST_INT_P (x
))
16490 HOST_WIDE_INT val
= INTVAL (x
);
16492 return IN_RANGE (val
, -0x80, 0x7f);
16493 return IN_RANGE (val
, -0x8000, 0x7f00);
16496 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
16497 SIGNED_P says whether the operand is signed rather than unsigned. */
16500 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
16502 x
= unwrap_const_vec_duplicate (x
);
16503 return (CONST_INT_P (x
)
16505 ? IN_RANGE (INTVAL (x
), -16, 15)
16506 : IN_RANGE (INTVAL (x
), 0, 127)));
16509 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
16510 instruction. Negate X first if NEGATE_P is true. */
16513 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
16518 if (!const_vec_duplicate_p (x
, &elt
)
16519 || GET_CODE (elt
) != CONST_DOUBLE
)
16522 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
16525 r
= real_value_negate (&r
);
16527 if (real_equal (&r
, &dconst1
))
16529 if (real_equal (&r
, &dconsthalf
))
16534 /* Return true if X is a valid immediate operand for an SVE FMUL
16538 aarch64_sve_float_mul_immediate_p (rtx x
)
16542 return (const_vec_duplicate_p (x
, &elt
)
16543 && GET_CODE (elt
) == CONST_DOUBLE
16544 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
)
16545 || real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconst2
)));
16548 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
16549 for the Advanced SIMD operation described by WHICH and INSN. If INFO
16550 is nonnull, use it to describe valid immediates. */
16552 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
16553 simd_immediate_info
*info
,
16554 enum simd_immediate_check which
,
16555 simd_immediate_info::insn_type insn
)
16557 /* Try a 4-byte immediate with LSL. */
16558 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
16559 if ((val32
& (0xff << shift
)) == val32
)
16562 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
16563 simd_immediate_info::LSL
, shift
);
16567 /* Try a 2-byte immediate with LSL. */
16568 unsigned int imm16
= val32
& 0xffff;
16569 if (imm16
== (val32
>> 16))
16570 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
16571 if ((imm16
& (0xff << shift
)) == imm16
)
16574 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
16575 simd_immediate_info::LSL
, shift
);
16579 /* Try a 4-byte immediate with MSL, except for cases that MVN
16581 if (which
== AARCH64_CHECK_MOV
)
16582 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
16584 unsigned int low
= (1 << shift
) - 1;
16585 if (((val32
& (0xff << shift
)) | low
) == val32
)
16588 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
16589 simd_immediate_info::MSL
, shift
);
16597 /* Return true if replicating VAL64 is a valid immediate for the
16598 Advanced SIMD operation described by WHICH. If INFO is nonnull,
16599 use it to describe valid immediates. */
16601 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
16602 simd_immediate_info
*info
,
16603 enum simd_immediate_check which
)
16605 unsigned int val32
= val64
& 0xffffffff;
16606 unsigned int val16
= val64
& 0xffff;
16607 unsigned int val8
= val64
& 0xff;
16609 if (val32
== (val64
>> 32))
16611 if ((which
& AARCH64_CHECK_ORR
) != 0
16612 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
16613 simd_immediate_info::MOV
))
16616 if ((which
& AARCH64_CHECK_BIC
) != 0
16617 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
16618 simd_immediate_info::MVN
))
16621 /* Try using a replicated byte. */
16622 if (which
== AARCH64_CHECK_MOV
16623 && val16
== (val32
>> 16)
16624 && val8
== (val16
>> 8))
16627 *info
= simd_immediate_info (QImode
, val8
);
16632 /* Try using a bit-to-bytemask. */
16633 if (which
== AARCH64_CHECK_MOV
)
16636 for (i
= 0; i
< 64; i
+= 8)
16638 unsigned char byte
= (val64
>> i
) & 0xff;
16639 if (byte
!= 0 && byte
!= 0xff)
16645 *info
= simd_immediate_info (DImode
, val64
);
16652 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
16653 instruction. If INFO is nonnull, use it to describe valid immediates. */
16656 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
16657 simd_immediate_info
*info
)
16659 scalar_int_mode mode
= DImode
;
16660 unsigned int val32
= val64
& 0xffffffff;
16661 if (val32
== (val64
>> 32))
16664 unsigned int val16
= val32
& 0xffff;
16665 if (val16
== (val32
>> 16))
16668 unsigned int val8
= val16
& 0xff;
16669 if (val8
== (val16
>> 8))
16673 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
16674 if (IN_RANGE (val
, -0x80, 0x7f))
16676 /* DUP with no shift. */
16678 *info
= simd_immediate_info (mode
, val
);
16681 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
16683 /* DUP with LSL #8. */
16685 *info
= simd_immediate_info (mode
, val
);
16688 if (aarch64_bitmask_imm (val64
, mode
))
16692 *info
= simd_immediate_info (mode
, val
);
16698 /* Return true if X is an UNSPEC_PTRUE constant of the form:
16700 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
16702 where PATTERN is the svpattern as a CONST_INT and where ZERO
16703 is a zero constant of the required PTRUE mode (which can have
16704 fewer elements than X's mode, if zero bits are significant).
16706 If so, and if INFO is nonnull, describe the immediate in INFO. */
16708 aarch64_sve_ptrue_svpattern_p (rtx x
, struct simd_immediate_info
*info
)
16710 if (GET_CODE (x
) != CONST
)
16714 if (GET_CODE (x
) != UNSPEC
|| XINT (x
, 1) != UNSPEC_PTRUE
)
16719 aarch64_svpattern pattern
16720 = (aarch64_svpattern
) INTVAL (XVECEXP (x
, 0, 0));
16721 machine_mode pred_mode
= GET_MODE (XVECEXP (x
, 0, 1));
16722 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (pred_mode
);
16723 *info
= simd_immediate_info (int_mode
, pattern
);
16728 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
16729 it to describe valid immediates. */
16732 aarch64_sve_pred_valid_immediate (rtx x
, simd_immediate_info
*info
)
16734 if (aarch64_sve_ptrue_svpattern_p (x
, info
))
16737 if (x
== CONST0_RTX (GET_MODE (x
)))
16740 *info
= simd_immediate_info (DImode
, 0);
16744 /* Analyze the value as a VNx16BImode. This should be relatively
16745 efficient, since rtx_vector_builder has enough built-in capacity
16746 to store all VLA predicate constants without needing the heap. */
16747 rtx_vector_builder builder
;
16748 if (!aarch64_get_sve_pred_bits (builder
, x
))
16751 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
16752 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
16754 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
16755 aarch64_svpattern pattern
= aarch64_svpattern_for_vl (mode
, vl
);
16756 if (pattern
!= AARCH64_NUM_SVPATTERNS
)
16760 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
16761 *info
= simd_immediate_info (int_mode
, pattern
);
16769 /* Return true if OP is a valid SIMD immediate for the operation
16770 described by WHICH. If INFO is nonnull, use it to describe valid
16773 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
16774 enum simd_immediate_check which
)
16776 machine_mode mode
= GET_MODE (op
);
16777 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
16778 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
16781 if (vec_flags
& VEC_SVE_PRED
)
16782 return aarch64_sve_pred_valid_immediate (op
, info
);
16784 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
16786 unsigned int n_elts
;
16787 if (GET_CODE (op
) == CONST_VECTOR
16788 && CONST_VECTOR_DUPLICATE_P (op
))
16789 n_elts
= CONST_VECTOR_NPATTERNS (op
);
16790 else if ((vec_flags
& VEC_SVE_DATA
)
16791 && const_vec_series_p (op
, &base
, &step
))
16793 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
16794 if (!aarch64_sve_index_immediate_p (base
)
16795 || !aarch64_sve_index_immediate_p (step
))
16800 /* Get the corresponding container mode. E.g. an INDEX on V2SI
16801 should yield two integer values per 128-bit block, meaning
16802 that we need to treat it in the same way as V2DI and then
16803 ignore the upper 32 bits of each element. */
16804 elt_mode
= aarch64_sve_container_int_mode (mode
);
16805 *info
= simd_immediate_info (elt_mode
, base
, step
);
16809 else if (GET_CODE (op
) == CONST_VECTOR
16810 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
16811 /* N_ELTS set above. */;
16815 scalar_float_mode elt_float_mode
;
16817 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
16819 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
16820 if (aarch64_float_const_zero_rtx_p (elt
)
16821 || aarch64_float_const_representable_p (elt
))
16824 *info
= simd_immediate_info (elt_float_mode
, elt
);
16829 /* If all elements in an SVE vector have the same value, we have a free
16830 choice between using the element mode and using the container mode.
16831 Using the element mode means that unused parts of the vector are
16832 duplicates of the used elements, while using the container mode means
16833 that the unused parts are an extension of the used elements. Using the
16834 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
16835 for its container mode VNx4SI while 0x00000101 isn't.
16837 If not all elements in an SVE vector have the same value, we need the
16838 transition from one element to the next to occur at container boundaries.
16839 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
16840 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
16841 scalar_int_mode elt_int_mode
;
16842 if ((vec_flags
& VEC_SVE_DATA
) && n_elts
> 1)
16843 elt_int_mode
= aarch64_sve_container_int_mode (mode
);
16845 elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
16847 unsigned int elt_size
= GET_MODE_SIZE (elt_int_mode
);
16851 /* Expand the vector constant out into a byte vector, with the least
16852 significant byte of the register first. */
16853 auto_vec
<unsigned char, 16> bytes
;
16854 bytes
.reserve (n_elts
* elt_size
);
16855 for (unsigned int i
= 0; i
< n_elts
; i
++)
16857 /* The vector is provided in gcc endian-neutral fashion.
16858 For aarch64_be Advanced SIMD, it must be laid out in the vector
16859 register in reverse order. */
16860 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
16861 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
16863 if (elt_mode
!= elt_int_mode
)
16864 elt
= gen_lowpart (elt_int_mode
, elt
);
16866 if (!CONST_INT_P (elt
))
16869 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
16870 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
16872 bytes
.quick_push (elt_val
& 0xff);
16873 elt_val
>>= BITS_PER_UNIT
;
16877 /* The immediate must repeat every eight bytes. */
16878 unsigned int nbytes
= bytes
.length ();
16879 for (unsigned i
= 8; i
< nbytes
; ++i
)
16880 if (bytes
[i
] != bytes
[i
- 8])
16883 /* Get the repeating 8-byte value as an integer. No endian correction
16884 is needed here because bytes is already in lsb-first order. */
16885 unsigned HOST_WIDE_INT val64
= 0;
16886 for (unsigned int i
= 0; i
< 8; i
++)
16887 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
16888 << (i
* BITS_PER_UNIT
));
16890 if (vec_flags
& VEC_SVE_DATA
)
16891 return aarch64_sve_valid_immediate (val64
, info
);
16893 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
16896 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
16897 has a step in the range of INDEX. Return the index expression if so,
16898 otherwise return null. */
16900 aarch64_check_zero_based_sve_index_immediate (rtx x
)
16903 if (const_vec_series_p (x
, &base
, &step
)
16904 && base
== const0_rtx
16905 && aarch64_sve_index_immediate_p (step
))
16910 /* Check of immediate shift constants are within range. */
16912 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
16914 x
= unwrap_const_vec_duplicate (x
);
16915 if (!CONST_INT_P (x
))
16917 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
16919 return IN_RANGE (INTVAL (x
), 0, bit_width
- 1);
16921 return IN_RANGE (INTVAL (x
), 1, bit_width
);
16924 /* Return the bitmask CONST_INT to select the bits required by a zero extract
16925 operation of width WIDTH at bit position POS. */
16928 aarch64_mask_from_zextract_ops (rtx width
, rtx pos
)
16930 gcc_assert (CONST_INT_P (width
));
16931 gcc_assert (CONST_INT_P (pos
));
16933 unsigned HOST_WIDE_INT mask
16934 = ((unsigned HOST_WIDE_INT
) 1 << UINTVAL (width
)) - 1;
16935 return GEN_INT (mask
<< UINTVAL (pos
));
16939 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
16941 if (GET_CODE (x
) == HIGH
16942 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
16945 if (CONST_INT_P (x
))
16948 if (VECTOR_MODE_P (GET_MODE (x
)))
16950 /* Require predicate constants to be VNx16BI before RA, so that we
16951 force everything to have a canonical form. */
16952 if (!lra_in_progress
16953 && !reload_completed
16954 && GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_BOOL
16955 && GET_MODE (x
) != VNx16BImode
)
16958 return aarch64_simd_valid_immediate (x
, NULL
);
16961 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
16964 if (TARGET_SVE
&& aarch64_sve_cnt_immediate_p (x
))
16967 return aarch64_classify_symbolic_expression (x
)
16968 == SYMBOL_TINY_ABSOLUTE
;
16971 /* Return a const_int vector of VAL. */
16973 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
16975 rtx c
= gen_int_mode (val
, GET_MODE_INNER (mode
));
16976 return gen_const_vec_duplicate (mode
, c
);
16979 /* Check OP is a legal scalar immediate for the MOVI instruction. */
16982 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
16984 machine_mode vmode
;
16986 vmode
= aarch64_simd_container_mode (mode
, 64);
16987 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
16988 return aarch64_simd_valid_immediate (op_v
, NULL
);
16991 /* Construct and return a PARALLEL RTX vector with elements numbering the
16992 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
16993 the vector - from the perspective of the architecture. This does not
16994 line up with GCC's perspective on lane numbers, so we end up with
16995 different masks depending on our target endian-ness. The diagram
16996 below may help. We must draw the distinction when building masks
16997 which select one half of the vector. An instruction selecting
16998 architectural low-lanes for a big-endian target, must be described using
16999 a mask selecting GCC high-lanes.
17001 Big-Endian Little-Endian
17003 GCC 0 1 2 3 3 2 1 0
17004 | x | x | x | x | | x | x | x | x |
17005 Architecture 3 2 1 0 3 2 1 0
17007 Low Mask: { 2, 3 } { 0, 1 }
17008 High Mask: { 0, 1 } { 2, 3 }
17010 MODE Is the mode of the vector and NUNITS is the number of units in it. */
17013 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
17015 rtvec v
= rtvec_alloc (nunits
/ 2);
17016 int high_base
= nunits
/ 2;
17022 if (BYTES_BIG_ENDIAN
)
17023 base
= high
? low_base
: high_base
;
17025 base
= high
? high_base
: low_base
;
17027 for (i
= 0; i
< nunits
/ 2; i
++)
17028 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
17030 t1
= gen_rtx_PARALLEL (mode
, v
);
17034 /* Check OP for validity as a PARALLEL RTX vector with elements
17035 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
17036 from the perspective of the architecture. See the diagram above
17037 aarch64_simd_vect_par_cnst_half for more details. */
17040 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
17044 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
17047 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
17048 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
17049 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
17052 if (count_op
!= count_ideal
)
17055 for (i
= 0; i
< count_ideal
; i
++)
17057 rtx elt_op
= XVECEXP (op
, 0, i
);
17058 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
17060 if (!CONST_INT_P (elt_op
)
17061 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
17067 /* Return a PARALLEL containing NELTS elements, with element I equal
17068 to BASE + I * STEP. */
17071 aarch64_gen_stepped_int_parallel (unsigned int nelts
, int base
, int step
)
17073 rtvec vec
= rtvec_alloc (nelts
);
17074 for (unsigned int i
= 0; i
< nelts
; ++i
)
17075 RTVEC_ELT (vec
, i
) = gen_int_mode (base
+ i
* step
, DImode
);
17076 return gen_rtx_PARALLEL (VOIDmode
, vec
);
17079 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
17080 series with step STEP. */
17083 aarch64_stepped_int_parallel_p (rtx op
, int step
)
17085 if (GET_CODE (op
) != PARALLEL
|| !CONST_INT_P (XVECEXP (op
, 0, 0)))
17088 unsigned HOST_WIDE_INT base
= UINTVAL (XVECEXP (op
, 0, 0));
17089 for (int i
= 1; i
< XVECLEN (op
, 0); ++i
)
17090 if (!CONST_INT_P (XVECEXP (op
, 0, i
))
17091 || UINTVAL (XVECEXP (op
, 0, i
)) != base
+ i
* step
)
17097 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
17098 HIGH (exclusive). */
17100 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
17103 HOST_WIDE_INT lane
;
17104 gcc_assert (CONST_INT_P (operand
));
17105 lane
= INTVAL (operand
);
17107 if (lane
< low
|| lane
>= high
)
17110 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
17112 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
17116 /* Peform endian correction on lane number N, which indexes a vector
17117 of mode MODE, and return the result as an SImode rtx. */
17120 aarch64_endian_lane_rtx (machine_mode mode
, unsigned int n
)
17122 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode
), n
), SImode
);
17125 /* Return TRUE if OP is a valid vector addressing mode. */
17128 aarch64_simd_mem_operand_p (rtx op
)
17130 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
17131 || REG_P (XEXP (op
, 0)));
17134 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
17137 aarch64_sve_ld1r_operand_p (rtx op
)
17139 struct aarch64_address_info addr
;
17143 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
17144 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
17145 && addr
.type
== ADDRESS_REG_IMM
17146 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
17149 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
17151 aarch64_sve_ld1rq_operand_p (rtx op
)
17153 struct aarch64_address_info addr
;
17154 scalar_mode elem_mode
= GET_MODE_INNER (GET_MODE (op
));
17156 || !aarch64_classify_address (&addr
, XEXP (op
, 0), elem_mode
, false))
17159 if (addr
.type
== ADDRESS_REG_IMM
)
17160 return offset_4bit_signed_scaled_p (TImode
, addr
.const_offset
);
17162 if (addr
.type
== ADDRESS_REG_REG
)
17163 return (1U << addr
.shift
) == GET_MODE_SIZE (elem_mode
);
17168 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
17170 aarch64_sve_ldff1_operand_p (rtx op
)
17175 struct aarch64_address_info addr
;
17176 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
), false))
17179 if (addr
.type
== ADDRESS_REG_IMM
)
17180 return known_eq (addr
.const_offset
, 0);
17182 return addr
.type
== ADDRESS_REG_REG
;
17185 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
17187 aarch64_sve_ldnf1_operand_p (rtx op
)
17189 struct aarch64_address_info addr
;
17192 && aarch64_classify_address (&addr
, XEXP (op
, 0),
17193 GET_MODE (op
), false)
17194 && addr
.type
== ADDRESS_REG_IMM
);
17197 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
17198 The conditions for STR are the same. */
17200 aarch64_sve_ldr_operand_p (rtx op
)
17202 struct aarch64_address_info addr
;
17205 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
17206 false, ADDR_QUERY_ANY
)
17207 && addr
.type
== ADDRESS_REG_IMM
);
17210 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
17211 addressing memory of mode MODE. */
17213 aarch64_sve_prefetch_operand_p (rtx op
, machine_mode mode
)
17215 struct aarch64_address_info addr
;
17216 if (!aarch64_classify_address (&addr
, op
, mode
, false))
17219 if (addr
.type
== ADDRESS_REG_IMM
)
17220 return known_eq (addr
.const_offset
, 0);
17222 return addr
.type
== ADDRESS_REG_REG
;
17225 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
17226 We need to be able to access the individual pieces, so the range
17227 is different from LD[234] and ST[234]. */
17229 aarch64_sve_struct_memory_operand_p (rtx op
)
17234 machine_mode mode
= GET_MODE (op
);
17235 struct aarch64_address_info addr
;
17236 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
17238 || addr
.type
!= ADDRESS_REG_IMM
)
17241 poly_int64 first
= addr
.const_offset
;
17242 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
17243 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
17244 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
17247 /* Emit a register copy from operand to operand, taking care not to
17248 early-clobber source registers in the process.
17250 COUNT is the number of components into which the copy needs to be
17253 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
17254 unsigned int count
)
17257 int rdest
= REGNO (operands
[0]);
17258 int rsrc
= REGNO (operands
[1]);
17260 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
17262 for (i
= 0; i
< count
; i
++)
17263 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
17264 gen_rtx_REG (mode
, rsrc
+ i
));
17266 for (i
= 0; i
< count
; i
++)
17267 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
17268 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
17271 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
17272 one of VSTRUCT modes: OI, CI, or XI. */
17274 aarch64_simd_attr_length_rglist (machine_mode mode
)
17276 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
17277 return (GET_MODE_SIZE (mode
).to_constant () / UNITS_PER_VREG
) * 4;
17280 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
17281 alignment of a vector to 128 bits. SVE predicates have an alignment of
17283 static HOST_WIDE_INT
17284 aarch64_simd_vector_alignment (const_tree type
)
17286 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
17287 be set for non-predicate vectors of booleans. Modes are the most
17288 direct way we have of identifying real SVE predicate types. */
17289 if (GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
)
17291 widest_int min_size
17292 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type
)));
17293 return wi::umin (min_size
, 128).to_uhwi ();
17296 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
17298 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
17300 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
17302 /* If the length of the vector is fixed, try to align to that length,
17303 otherwise don't try to align at all. */
17304 HOST_WIDE_INT result
;
17305 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
17306 result
= TYPE_ALIGN (TREE_TYPE (type
));
17309 return TYPE_ALIGN (type
);
17312 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
17314 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
17319 /* For fixed-length vectors, check that the vectorizer will aim for
17320 full-vector alignment. This isn't true for generic GCC vectors
17321 that are wider than the ABI maximum of 128 bits. */
17322 poly_uint64 preferred_alignment
=
17323 aarch64_vectorize_preferred_vector_alignment (type
);
17324 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
17325 && maybe_ne (wi::to_widest (TYPE_SIZE (type
)),
17326 preferred_alignment
))
17329 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
17333 /* Return true if the vector misalignment factor is supported by the
17336 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
17337 const_tree type
, int misalignment
,
17340 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
17342 /* Return if movmisalign pattern is not supported for this mode. */
17343 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
17346 /* Misalignment factor is unknown at compile time. */
17347 if (misalignment
== -1)
17350 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
17354 /* If VALS is a vector constant that can be loaded into a register
17355 using DUP, generate instructions to do so and return an RTX to
17356 assign to the register. Otherwise return NULL_RTX. */
17358 aarch64_simd_dup_constant (rtx vals
)
17360 machine_mode mode
= GET_MODE (vals
);
17361 machine_mode inner_mode
= GET_MODE_INNER (mode
);
17364 if (!const_vec_duplicate_p (vals
, &x
))
17367 /* We can load this constant by using DUP and a constant in a
17368 single ARM register. This will be cheaper than a vector
17370 x
= copy_to_mode_reg (inner_mode
, x
);
17371 return gen_vec_duplicate (mode
, x
);
17375 /* Generate code to load VALS, which is a PARALLEL containing only
17376 constants (for vec_init) or CONST_VECTOR, efficiently into a
17377 register. Returns an RTX to copy into the register, or NULL_RTX
17378 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
17380 aarch64_simd_make_constant (rtx vals
)
17382 machine_mode mode
= GET_MODE (vals
);
17384 rtx const_vec
= NULL_RTX
;
17388 if (GET_CODE (vals
) == CONST_VECTOR
)
17390 else if (GET_CODE (vals
) == PARALLEL
)
17392 /* A CONST_VECTOR must contain only CONST_INTs and
17393 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
17394 Only store valid constants in a CONST_VECTOR. */
17395 int n_elts
= XVECLEN (vals
, 0);
17396 for (i
= 0; i
< n_elts
; ++i
)
17398 rtx x
= XVECEXP (vals
, 0, i
);
17399 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
17402 if (n_const
== n_elts
)
17403 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
17406 gcc_unreachable ();
17408 if (const_vec
!= NULL_RTX
17409 && aarch64_simd_valid_immediate (const_vec
, NULL
))
17410 /* Load using MOVI/MVNI. */
17412 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
17413 /* Loaded using DUP. */
17415 else if (const_vec
!= NULL_RTX
)
17416 /* Load from constant pool. We cannot take advantage of single-cycle
17417 LD1 because we need a PC-relative addressing mode. */
17420 /* A PARALLEL containing something not valid inside CONST_VECTOR.
17421 We cannot construct an initializer. */
17425 /* Expand a vector initialisation sequence, such that TARGET is
17426 initialised to contain VALS. */
17429 aarch64_expand_vector_init (rtx target
, rtx vals
)
17431 machine_mode mode
= GET_MODE (target
);
17432 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
17433 /* The number of vector elements. */
17434 int n_elts
= XVECLEN (vals
, 0);
17435 /* The number of vector elements which are not constant. */
17437 rtx any_const
= NULL_RTX
;
17438 /* The first element of vals. */
17439 rtx v0
= XVECEXP (vals
, 0, 0);
17440 bool all_same
= true;
17442 /* This is a special vec_init<M><N> where N is not an element mode but a
17443 vector mode with half the elements of M. We expect to find two entries
17444 of mode N in VALS and we must put their concatentation into TARGET. */
17445 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
17447 gcc_assert (known_eq (GET_MODE_SIZE (mode
),
17448 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals
, 0, 0)))));
17449 rtx lo
= XVECEXP (vals
, 0, 0);
17450 rtx hi
= XVECEXP (vals
, 0, 1);
17451 machine_mode narrow_mode
= GET_MODE (lo
);
17452 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
);
17453 gcc_assert (narrow_mode
== GET_MODE (hi
));
17455 /* When we want to concatenate a half-width vector with zeroes we can
17456 use the aarch64_combinez[_be] patterns. Just make sure that the
17457 zeroes are in the right half. */
17458 if (BYTES_BIG_ENDIAN
17459 && aarch64_simd_imm_zero (lo
, narrow_mode
)
17460 && general_operand (hi
, narrow_mode
))
17461 emit_insn (gen_aarch64_combinez_be (narrow_mode
, target
, hi
, lo
));
17462 else if (!BYTES_BIG_ENDIAN
17463 && aarch64_simd_imm_zero (hi
, narrow_mode
)
17464 && general_operand (lo
, narrow_mode
))
17465 emit_insn (gen_aarch64_combinez (narrow_mode
, target
, lo
, hi
));
17468 /* Else create the two half-width registers and combine them. */
17470 lo
= force_reg (GET_MODE (lo
), lo
);
17472 hi
= force_reg (GET_MODE (hi
), hi
);
17474 if (BYTES_BIG_ENDIAN
)
17475 std::swap (lo
, hi
);
17476 emit_insn (gen_aarch64_simd_combine (narrow_mode
, target
, lo
, hi
));
17481 /* Count the number of variable elements to initialise. */
17482 for (int i
= 0; i
< n_elts
; ++i
)
17484 rtx x
= XVECEXP (vals
, 0, i
);
17485 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
17490 all_same
&= rtx_equal_p (x
, v0
);
17493 /* No variable elements, hand off to aarch64_simd_make_constant which knows
17494 how best to handle this. */
17497 rtx constant
= aarch64_simd_make_constant (vals
);
17498 if (constant
!= NULL_RTX
)
17500 emit_move_insn (target
, constant
);
17505 /* Splat a single non-constant element if we can. */
17508 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
17509 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
17513 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
17514 gcc_assert (icode
!= CODE_FOR_nothing
);
17516 /* If there are only variable elements, try to optimize
17517 the insertion using dup for the most common element
17518 followed by insertions. */
17520 /* The algorithm will fill matches[*][0] with the earliest matching element,
17521 and matches[X][1] with the count of duplicate elements (if X is the
17522 earliest element which has duplicates). */
17524 if (n_var
== n_elts
&& n_elts
<= 16)
17526 int matches
[16][2] = {0};
17527 for (int i
= 0; i
< n_elts
; i
++)
17529 for (int j
= 0; j
<= i
; j
++)
17531 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
17539 int maxelement
= 0;
17541 for (int i
= 0; i
< n_elts
; i
++)
17542 if (matches
[i
][1] > maxv
)
17545 maxv
= matches
[i
][1];
17548 /* Create a duplicate of the most common element, unless all elements
17549 are equally useless to us, in which case just immediately set the
17550 vector register using the first element. */
17554 /* For vectors of two 64-bit elements, we can do even better. */
17556 && (inner_mode
== E_DImode
17557 || inner_mode
== E_DFmode
))
17560 rtx x0
= XVECEXP (vals
, 0, 0);
17561 rtx x1
= XVECEXP (vals
, 0, 1);
17562 /* Combine can pick up this case, but handling it directly
17563 here leaves clearer RTL.
17565 This is load_pair_lanes<mode>, and also gives us a clean-up
17566 for store_pair_lanes<mode>. */
17567 if (memory_operand (x0
, inner_mode
)
17568 && memory_operand (x1
, inner_mode
)
17569 && !STRICT_ALIGNMENT
17570 && rtx_equal_p (XEXP (x1
, 0),
17571 plus_constant (Pmode
,
17573 GET_MODE_SIZE (inner_mode
))))
17576 if (inner_mode
== DFmode
)
17577 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
17579 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
17584 /* The subreg-move sequence below will move into lane zero of the
17585 vector register. For big-endian we want that position to hold
17586 the last element of VALS. */
17587 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
17588 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
17589 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
17593 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
17594 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
17597 /* Insert the rest. */
17598 for (int i
= 0; i
< n_elts
; i
++)
17600 rtx x
= XVECEXP (vals
, 0, i
);
17601 if (matches
[i
][0] == maxelement
)
17603 x
= copy_to_mode_reg (inner_mode
, x
);
17604 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
17609 /* Initialise a vector which is part-variable. We want to first try
17610 to build those lanes which are constant in the most efficient way we
17612 if (n_var
!= n_elts
)
17614 rtx copy
= copy_rtx (vals
);
17616 /* Load constant part of vector. We really don't care what goes into the
17617 parts we will overwrite, but we're more likely to be able to load the
17618 constant efficiently if it has fewer, larger, repeating parts
17619 (see aarch64_simd_valid_immediate). */
17620 for (int i
= 0; i
< n_elts
; i
++)
17622 rtx x
= XVECEXP (vals
, 0, i
);
17623 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
17625 rtx subst
= any_const
;
17626 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
17628 /* Look in the copied vector, as more elements are const. */
17629 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
17630 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
17636 XVECEXP (copy
, 0, i
) = subst
;
17638 aarch64_expand_vector_init (target
, copy
);
17641 /* Insert the variable lanes directly. */
17642 for (int i
= 0; i
< n_elts
; i
++)
17644 rtx x
= XVECEXP (vals
, 0, i
);
17645 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
17647 x
= copy_to_mode_reg (inner_mode
, x
);
17648 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
17652 /* Emit RTL corresponding to:
17653 insr TARGET, ELEM. */
17656 emit_insr (rtx target
, rtx elem
)
17658 machine_mode mode
= GET_MODE (target
);
17659 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
17660 elem
= force_reg (elem_mode
, elem
);
17662 insn_code icode
= optab_handler (vec_shl_insert_optab
, mode
);
17663 gcc_assert (icode
!= CODE_FOR_nothing
);
17664 emit_insn (GEN_FCN (icode
) (target
, target
, elem
));
17667 /* Subroutine of aarch64_sve_expand_vector_init for handling
17668 trailing constants.
17669 This function works as follows:
17670 (a) Create a new vector consisting of trailing constants.
17671 (b) Initialize TARGET with the constant vector using emit_move_insn.
17672 (c) Insert remaining elements in TARGET using insr.
17673 NELTS is the total number of elements in original vector while
17674 while NELTS_REQD is the number of elements that are actually
17677 ??? The heuristic used is to do above only if number of constants
17678 is at least half the total number of elements. May need fine tuning. */
17681 aarch64_sve_expand_vector_init_handle_trailing_constants
17682 (rtx target
, const rtx_vector_builder
&builder
, int nelts
, int nelts_reqd
)
17684 machine_mode mode
= GET_MODE (target
);
17685 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
17686 int n_trailing_constants
= 0;
17688 for (int i
= nelts_reqd
- 1;
17689 i
>= 0 && aarch64_legitimate_constant_p (elem_mode
, builder
.elt (i
));
17691 n_trailing_constants
++;
17693 if (n_trailing_constants
>= nelts_reqd
/ 2)
17695 rtx_vector_builder
v (mode
, 1, nelts
);
17696 for (int i
= 0; i
< nelts
; i
++)
17697 v
.quick_push (builder
.elt (i
+ nelts_reqd
- n_trailing_constants
));
17698 rtx const_vec
= v
.build ();
17699 emit_move_insn (target
, const_vec
);
17701 for (int i
= nelts_reqd
- n_trailing_constants
- 1; i
>= 0; i
--)
17702 emit_insr (target
, builder
.elt (i
));
17710 /* Subroutine of aarch64_sve_expand_vector_init.
17712 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
17713 (b) Skip trailing elements from BUILDER, which are the same as
17714 element NELTS_REQD - 1.
17715 (c) Insert earlier elements in reverse order in TARGET using insr. */
17718 aarch64_sve_expand_vector_init_insert_elems (rtx target
,
17719 const rtx_vector_builder
&builder
,
17722 machine_mode mode
= GET_MODE (target
);
17723 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
17725 struct expand_operand ops
[2];
17726 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
17727 gcc_assert (icode
!= CODE_FOR_nothing
);
17729 create_output_operand (&ops
[0], target
, mode
);
17730 create_input_operand (&ops
[1], builder
.elt (nelts_reqd
- 1), elem_mode
);
17731 expand_insn (icode
, 2, ops
);
17733 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
17734 for (int i
= nelts_reqd
- ndups
- 1; i
>= 0; i
--)
17735 emit_insr (target
, builder
.elt (i
));
17738 /* Subroutine of aarch64_sve_expand_vector_init to handle case
17739 when all trailing elements of builder are same.
17740 This works as follows:
17741 (a) Use expand_insn interface to broadcast last vector element in TARGET.
17742 (b) Insert remaining elements in TARGET using insr.
17744 ??? The heuristic used is to do above if number of same trailing elements
17745 is at least 3/4 of total number of elements, loosely based on
17746 heuristic from mostly_zeros_p. May need fine-tuning. */
17749 aarch64_sve_expand_vector_init_handle_trailing_same_elem
17750 (rtx target
, const rtx_vector_builder
&builder
, int nelts_reqd
)
17752 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
17753 if (ndups
>= (3 * nelts_reqd
) / 4)
17755 aarch64_sve_expand_vector_init_insert_elems (target
, builder
,
17756 nelts_reqd
- ndups
+ 1);
17763 /* Initialize register TARGET from BUILDER. NELTS is the constant number
17764 of elements in BUILDER.
17766 The function tries to initialize TARGET from BUILDER if it fits one
17767 of the special cases outlined below.
17769 Failing that, the function divides BUILDER into two sub-vectors:
17770 v_even = even elements of BUILDER;
17771 v_odd = odd elements of BUILDER;
17773 and recursively calls itself with v_even and v_odd.
17775 if (recursive call succeeded for v_even or v_odd)
17776 TARGET = zip (v_even, v_odd)
17778 The function returns true if it managed to build TARGET from BUILDER
17779 with one of the special cases, false otherwise.
17781 Example: {a, 1, b, 2, c, 3, d, 4}
17783 The vector gets divided into:
17784 v_even = {a, b, c, d}
17785 v_odd = {1, 2, 3, 4}
17787 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
17788 initialize tmp2 from constant vector v_odd using emit_move_insn.
17790 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
17791 4 elements, so we construct tmp1 from v_even using insr:
17798 TARGET = zip (tmp1, tmp2)
17799 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
17802 aarch64_sve_expand_vector_init (rtx target
, const rtx_vector_builder
&builder
,
17803 int nelts
, int nelts_reqd
)
17805 machine_mode mode
= GET_MODE (target
);
17807 /* Case 1: Vector contains trailing constants. */
17809 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17810 (target
, builder
, nelts
, nelts_reqd
))
17813 /* Case 2: Vector contains leading constants. */
17815 rtx_vector_builder
rev_builder (mode
, 1, nelts_reqd
);
17816 for (int i
= 0; i
< nelts_reqd
; i
++)
17817 rev_builder
.quick_push (builder
.elt (nelts_reqd
- i
- 1));
17818 rev_builder
.finalize ();
17820 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17821 (target
, rev_builder
, nelts
, nelts_reqd
))
17823 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
17827 /* Case 3: Vector contains trailing same element. */
17829 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17830 (target
, builder
, nelts_reqd
))
17833 /* Case 4: Vector contains leading same element. */
17835 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17836 (target
, rev_builder
, nelts_reqd
) && nelts_reqd
== nelts
)
17838 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
17842 /* Avoid recursing below 4-elements.
17843 ??? The threshold 4 may need fine-tuning. */
17845 if (nelts_reqd
<= 4)
17848 rtx_vector_builder
v_even (mode
, 1, nelts
);
17849 rtx_vector_builder
v_odd (mode
, 1, nelts
);
17851 for (int i
= 0; i
< nelts
* 2; i
+= 2)
17853 v_even
.quick_push (builder
.elt (i
));
17854 v_odd
.quick_push (builder
.elt (i
+ 1));
17857 v_even
.finalize ();
17860 rtx tmp1
= gen_reg_rtx (mode
);
17861 bool did_even_p
= aarch64_sve_expand_vector_init (tmp1
, v_even
,
17862 nelts
, nelts_reqd
/ 2);
17864 rtx tmp2
= gen_reg_rtx (mode
);
17865 bool did_odd_p
= aarch64_sve_expand_vector_init (tmp2
, v_odd
,
17866 nelts
, nelts_reqd
/ 2);
17868 if (!did_even_p
&& !did_odd_p
)
17871 /* Initialize v_even and v_odd using INSR if it didn't match any of the
17872 special cases and zip v_even, v_odd. */
17875 aarch64_sve_expand_vector_init_insert_elems (tmp1
, v_even
, nelts_reqd
/ 2);
17878 aarch64_sve_expand_vector_init_insert_elems (tmp2
, v_odd
, nelts_reqd
/ 2);
17880 rtvec v
= gen_rtvec (2, tmp1
, tmp2
);
17881 emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
17885 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
17888 aarch64_sve_expand_vector_init (rtx target
, rtx vals
)
17890 machine_mode mode
= GET_MODE (target
);
17891 int nelts
= XVECLEN (vals
, 0);
17893 rtx_vector_builder
v (mode
, 1, nelts
);
17894 for (int i
= 0; i
< nelts
; i
++)
17895 v
.quick_push (XVECEXP (vals
, 0, i
));
17898 /* If neither sub-vectors of v could be initialized specially,
17899 then use INSR to insert all elements from v into TARGET.
17900 ??? This might not be optimal for vectors with large
17901 initializers like 16-element or above.
17902 For nelts < 4, it probably isn't useful to handle specially. */
17905 || !aarch64_sve_expand_vector_init (target
, v
, nelts
, nelts
))
17906 aarch64_sve_expand_vector_init_insert_elems (target
, v
, nelts
);
17909 /* Check whether VALUE is a vector constant in which every element
17910 is either a power of 2 or a negated power of 2. If so, return
17911 a constant vector of log2s, and flip CODE between PLUS and MINUS
17912 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
17915 aarch64_convert_mult_to_shift (rtx value
, rtx_code
&code
)
17917 if (GET_CODE (value
) != CONST_VECTOR
)
17920 rtx_vector_builder builder
;
17921 if (!builder
.new_unary_operation (GET_MODE (value
), value
, false))
17924 scalar_mode int_mode
= GET_MODE_INNER (GET_MODE (value
));
17925 /* 1 if the result of the multiplication must be negated,
17926 0 if it mustn't, or -1 if we don't yet care. */
17928 unsigned int encoded_nelts
= const_vector_encoded_nelts (value
);
17929 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
17931 rtx elt
= CONST_VECTOR_ENCODED_ELT (value
, i
);
17932 if (!CONST_SCALAR_INT_P (elt
))
17934 rtx_mode_t
val (elt
, int_mode
);
17935 wide_int pow2
= wi::neg (val
);
17938 /* It matters whether we negate or not. Make that choice,
17939 and make sure that it's consistent with previous elements. */
17940 if (negate
== !wi::neg_p (val
))
17942 negate
= wi::neg_p (val
);
17946 /* POW2 is now the value that we want to be a power of 2. */
17947 int shift
= wi::exact_log2 (pow2
);
17950 builder
.quick_push (gen_int_mode (shift
, int_mode
));
17953 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
17955 else if (negate
== 1)
17956 code
= code
== PLUS
? MINUS
: PLUS
;
17957 return builder
.build ();
17960 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
17961 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
17962 operands array, in the same order as for fma_optab. Return true if
17963 the function emitted all the necessary instructions, false if the caller
17964 should generate the pattern normally with the new OPERANDS array. */
17967 aarch64_prepare_sve_int_fma (rtx
*operands
, rtx_code code
)
17969 machine_mode mode
= GET_MODE (operands
[0]);
17970 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[2], code
))
17972 rtx product
= expand_binop (mode
, vashl_optab
, operands
[1], shifts
,
17973 NULL_RTX
, true, OPTAB_DIRECT
);
17974 force_expand_binop (mode
, code
== PLUS
? add_optab
: sub_optab
,
17975 operands
[3], product
, operands
[0], true,
17979 operands
[2] = force_reg (mode
, operands
[2]);
17983 /* Likewise, but for a conditional pattern. */
17986 aarch64_prepare_sve_cond_int_fma (rtx
*operands
, rtx_code code
)
17988 machine_mode mode
= GET_MODE (operands
[0]);
17989 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[3], code
))
17991 rtx product
= expand_binop (mode
, vashl_optab
, operands
[2], shifts
,
17992 NULL_RTX
, true, OPTAB_DIRECT
);
17993 emit_insn (gen_cond (code
, mode
, operands
[0], operands
[1],
17994 operands
[4], product
, operands
[5]));
17997 operands
[3] = force_reg (mode
, operands
[3]);
18001 static unsigned HOST_WIDE_INT
18002 aarch64_shift_truncation_mask (machine_mode mode
)
18004 if (!SHIFT_COUNT_TRUNCATED
|| aarch64_vector_data_mode_p (mode
))
18006 return GET_MODE_UNIT_BITSIZE (mode
) - 1;
18009 /* Select a format to encode pointers in exception handling data. */
18011 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
18014 switch (aarch64_cmodel
)
18016 case AARCH64_CMODEL_TINY
:
18017 case AARCH64_CMODEL_TINY_PIC
:
18018 case AARCH64_CMODEL_SMALL
:
18019 case AARCH64_CMODEL_SMALL_PIC
:
18020 case AARCH64_CMODEL_SMALL_SPIC
:
18021 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
18023 type
= DW_EH_PE_sdata4
;
18026 /* No assumptions here. 8-byte relocs required. */
18027 type
= DW_EH_PE_sdata8
;
18030 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
18033 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
18036 aarch64_asm_output_variant_pcs (FILE *stream
, const tree decl
, const char* name
)
18038 if (TREE_CODE (decl
) == FUNCTION_DECL
)
18040 arm_pcs pcs
= (arm_pcs
) fndecl_abi (decl
).id ();
18041 if (pcs
== ARM_PCS_SIMD
|| pcs
== ARM_PCS_SVE
)
18043 fprintf (stream
, "\t.variant_pcs\t");
18044 assemble_name (stream
, name
);
18045 fprintf (stream
, "\n");
18050 /* The last .arch and .tune assembly strings that we printed. */
18051 static std::string aarch64_last_printed_arch_string
;
18052 static std::string aarch64_last_printed_tune_string
;
18054 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
18055 by the function fndecl. */
18058 aarch64_declare_function_name (FILE *stream
, const char* name
,
18061 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
18063 struct cl_target_option
*targ_options
;
18065 targ_options
= TREE_TARGET_OPTION (target_parts
);
18067 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
18068 gcc_assert (targ_options
);
18070 const struct processor
*this_arch
18071 = aarch64_get_arch (targ_options
->x_explicit_arch
);
18073 uint64_t isa_flags
= targ_options
->x_aarch64_isa_flags
;
18074 std::string extension
18075 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
18077 /* Only update the assembler .arch string if it is distinct from the last
18078 such string we printed. */
18079 std::string to_print
= this_arch
->name
+ extension
;
18080 if (to_print
!= aarch64_last_printed_arch_string
)
18082 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
18083 aarch64_last_printed_arch_string
= to_print
;
18086 /* Print the cpu name we're tuning for in the comments, might be
18087 useful to readers of the generated asm. Do it only when it changes
18088 from function to function and verbose assembly is requested. */
18089 const struct processor
*this_tune
18090 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
18092 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
18094 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
18096 aarch64_last_printed_tune_string
= this_tune
->name
;
18099 aarch64_asm_output_variant_pcs (stream
, fndecl
, name
);
18101 /* Don't forget the type directive for ELF. */
18102 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
18103 ASM_OUTPUT_LABEL (stream
, name
);
18106 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
18109 aarch64_asm_output_alias (FILE *stream
, const tree decl
, const tree target
)
18111 const char *name
= XSTR (XEXP (DECL_RTL (decl
), 0), 0);
18112 const char *value
= IDENTIFIER_POINTER (target
);
18113 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
18114 ASM_OUTPUT_DEF (stream
, name
, value
);
18117 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
18118 function symbol references. */
18121 aarch64_asm_output_external (FILE *stream
, tree decl
, const char* name
)
18123 default_elf_asm_output_external (stream
, decl
, name
);
18124 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
18127 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
18128 Used to output the .cfi_b_key_frame directive when signing the current
18129 function with the B key. */
18132 aarch64_post_cfi_startproc (FILE *f
, tree ignored ATTRIBUTE_UNUSED
)
18134 if (cfun
->machine
->frame
.laid_out
&& aarch64_return_address_signing_enabled ()
18135 && aarch64_ra_sign_key
== AARCH64_KEY_B
)
18136 asm_fprintf (f
, "\t.cfi_b_key_frame\n");
18139 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
18142 aarch64_start_file (void)
18144 struct cl_target_option
*default_options
18145 = TREE_TARGET_OPTION (target_option_default_node
);
18147 const struct processor
*default_arch
18148 = aarch64_get_arch (default_options
->x_explicit_arch
);
18149 uint64_t default_isa_flags
= default_options
->x_aarch64_isa_flags
;
18150 std::string extension
18151 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
18152 default_arch
->flags
);
18154 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
18155 aarch64_last_printed_tune_string
= "";
18156 asm_fprintf (asm_out_file
, "\t.arch %s\n",
18157 aarch64_last_printed_arch_string
.c_str ());
18159 default_file_start ();
18162 /* Emit load exclusive. */
18165 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
18166 rtx mem
, rtx model_rtx
)
18168 if (mode
== TImode
)
18169 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode
, rval
),
18170 gen_highpart (DImode
, rval
),
18173 emit_insn (gen_aarch64_load_exclusive (mode
, rval
, mem
, model_rtx
));
18176 /* Emit store exclusive. */
18179 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
18180 rtx mem
, rtx rval
, rtx model_rtx
)
18182 if (mode
== TImode
)
18183 emit_insn (gen_aarch64_store_exclusive_pair
18184 (bval
, mem
, operand_subword (rval
, 0, 0, TImode
),
18185 operand_subword (rval
, 1, 0, TImode
), model_rtx
));
18187 emit_insn (gen_aarch64_store_exclusive (mode
, bval
, mem
, rval
, model_rtx
));
18190 /* Mark the previous jump instruction as unlikely. */
18193 aarch64_emit_unlikely_jump (rtx insn
)
18195 rtx_insn
*jump
= emit_jump_insn (insn
);
18196 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
18199 /* We store the names of the various atomic helpers in a 5x4 array.
18200 Return the libcall function given MODE, MODEL and NAMES. */
18203 aarch64_atomic_ool_func(machine_mode mode
, rtx model_rtx
,
18204 const atomic_ool_names
*names
)
18206 memmodel model
= memmodel_base (INTVAL (model_rtx
));
18207 int mode_idx
, model_idx
;
18227 gcc_unreachable ();
18232 case MEMMODEL_RELAXED
:
18235 case MEMMODEL_CONSUME
:
18236 case MEMMODEL_ACQUIRE
:
18239 case MEMMODEL_RELEASE
:
18242 case MEMMODEL_ACQ_REL
:
18243 case MEMMODEL_SEQ_CST
:
18247 gcc_unreachable ();
18250 return init_one_libfunc_visibility (names
->str
[mode_idx
][model_idx
],
18251 VISIBILITY_HIDDEN
);
18254 #define DEF0(B, N) \
18255 { "__aarch64_" #B #N "_relax", \
18256 "__aarch64_" #B #N "_acq", \
18257 "__aarch64_" #B #N "_rel", \
18258 "__aarch64_" #B #N "_acq_rel" }
18260 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
18261 { NULL, NULL, NULL, NULL }
18262 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
18264 static const atomic_ool_names aarch64_ool_cas_names
= { { DEF5(cas
) } };
18265 const atomic_ool_names aarch64_ool_swp_names
= { { DEF4(swp
) } };
18266 const atomic_ool_names aarch64_ool_ldadd_names
= { { DEF4(ldadd
) } };
18267 const atomic_ool_names aarch64_ool_ldset_names
= { { DEF4(ldset
) } };
18268 const atomic_ool_names aarch64_ool_ldclr_names
= { { DEF4(ldclr
) } };
18269 const atomic_ool_names aarch64_ool_ldeor_names
= { { DEF4(ldeor
) } };
18275 /* Expand a compare and swap pattern. */
18278 aarch64_expand_compare_and_swap (rtx operands
[])
18280 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
, cc_reg
;
18281 machine_mode mode
, r_mode
;
18283 bval
= operands
[0];
18284 rval
= operands
[1];
18286 oldval
= operands
[3];
18287 newval
= operands
[4];
18288 is_weak
= operands
[5];
18289 mod_s
= operands
[6];
18290 mod_f
= operands
[7];
18291 mode
= GET_MODE (mem
);
18293 /* Normally the succ memory model must be stronger than fail, but in the
18294 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
18295 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
18296 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
18297 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
18298 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
18301 if (mode
== QImode
|| mode
== HImode
)
18304 rval
= gen_reg_rtx (r_mode
);
18309 /* The CAS insn requires oldval and rval overlap, but we need to
18310 have a copy of oldval saved across the operation to tell if
18311 the operation is successful. */
18312 if (reg_overlap_mentioned_p (rval
, oldval
))
18313 rval
= copy_to_mode_reg (r_mode
, oldval
);
18315 emit_move_insn (rval
, gen_lowpart (r_mode
, oldval
));
18317 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
,
18319 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
18321 else if (TARGET_OUTLINE_ATOMICS
)
18323 /* Oldval must satisfy compare afterward. */
18324 if (!aarch64_plus_operand (oldval
, mode
))
18325 oldval
= force_reg (mode
, oldval
);
18326 rtx func
= aarch64_atomic_ool_func (mode
, mod_s
, &aarch64_ool_cas_names
);
18327 rval
= emit_library_call_value (func
, NULL_RTX
, LCT_NORMAL
, r_mode
,
18328 oldval
, mode
, newval
, mode
,
18329 XEXP (mem
, 0), Pmode
);
18330 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
18334 /* The oldval predicate varies by mode. Test it and force to reg. */
18335 insn_code code
= code_for_aarch64_compare_and_swap (mode
);
18336 if (!insn_data
[code
].operand
[2].predicate (oldval
, mode
))
18337 oldval
= force_reg (mode
, oldval
);
18339 emit_insn (GEN_FCN (code
) (rval
, mem
, oldval
, newval
,
18340 is_weak
, mod_s
, mod_f
));
18341 cc_reg
= gen_rtx_REG (CCmode
, CC_REGNUM
);
18344 if (r_mode
!= mode
)
18345 rval
= gen_lowpart (mode
, rval
);
18346 emit_move_insn (operands
[1], rval
);
18348 x
= gen_rtx_EQ (SImode
, cc_reg
, const0_rtx
);
18349 emit_insn (gen_rtx_SET (bval
, x
));
18352 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
18353 sequence implementing an atomic operation. */
18356 aarch64_emit_post_barrier (enum memmodel model
)
18358 const enum memmodel base_model
= memmodel_base (model
);
18360 if (is_mm_sync (model
)
18361 && (base_model
== MEMMODEL_ACQUIRE
18362 || base_model
== MEMMODEL_ACQ_REL
18363 || base_model
== MEMMODEL_SEQ_CST
))
18365 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
18369 /* Split a compare and swap pattern. */
18372 aarch64_split_compare_and_swap (rtx operands
[])
18374 rtx rval
, mem
, oldval
, newval
, scratch
, x
, model_rtx
;
18377 rtx_code_label
*label1
, *label2
;
18378 enum memmodel model
;
18380 rval
= operands
[0];
18382 oldval
= operands
[2];
18383 newval
= operands
[3];
18384 is_weak
= (operands
[4] != const0_rtx
);
18385 model_rtx
= operands
[5];
18386 scratch
= operands
[7];
18387 mode
= GET_MODE (mem
);
18388 model
= memmodel_from_int (INTVAL (model_rtx
));
18390 /* When OLDVAL is zero and we want the strong version we can emit a tighter
18393 LD[A]XR rval, [mem]
18395 ST[L]XR scratch, newval, [mem]
18396 CBNZ scratch, .label1
18399 bool strong_zero_p
= (!is_weak
&& !aarch64_track_speculation
&&
18400 oldval
== const0_rtx
&& mode
!= TImode
);
18405 label1
= gen_label_rtx ();
18406 emit_label (label1
);
18408 label2
= gen_label_rtx ();
18410 /* The initial load can be relaxed for a __sync operation since a final
18411 barrier will be emitted to stop code hoisting. */
18412 if (is_mm_sync (model
))
18413 aarch64_emit_load_exclusive (mode
, rval
, mem
, GEN_INT (MEMMODEL_RELAXED
));
18415 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
18418 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
18421 rtx cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
18422 x
= gen_rtx_NE (VOIDmode
, cc_reg
, const0_rtx
);
18424 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
18425 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
18426 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
18428 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
18432 if (aarch64_track_speculation
)
18434 /* Emit an explicit compare instruction, so that we can correctly
18435 track the condition codes. */
18436 rtx cc_reg
= aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
18437 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
18440 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
18442 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
18443 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
18444 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
18447 aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
18449 emit_label (label2
);
18451 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
18452 to set the condition flags. If this is not used it will be removed by
18455 aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
18457 /* Emit any final barrier needed for a __sync operation. */
18458 if (is_mm_sync (model
))
18459 aarch64_emit_post_barrier (model
);
18462 /* Split an atomic operation. */
18465 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
18466 rtx value
, rtx model_rtx
, rtx cond
)
18468 machine_mode mode
= GET_MODE (mem
);
18469 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
18470 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
18471 const bool is_sync
= is_mm_sync (model
);
18472 rtx_code_label
*label
;
18475 /* Split the atomic operation into a sequence. */
18476 label
= gen_label_rtx ();
18477 emit_label (label
);
18480 new_out
= gen_lowpart (wmode
, new_out
);
18482 old_out
= gen_lowpart (wmode
, old_out
);
18485 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
18487 /* The initial load can be relaxed for a __sync operation since a final
18488 barrier will be emitted to stop code hoisting. */
18490 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
18491 GEN_INT (MEMMODEL_RELAXED
));
18493 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
18502 x
= gen_rtx_AND (wmode
, old_out
, value
);
18503 emit_insn (gen_rtx_SET (new_out
, x
));
18504 x
= gen_rtx_NOT (wmode
, new_out
);
18505 emit_insn (gen_rtx_SET (new_out
, x
));
18509 if (CONST_INT_P (value
))
18511 value
= GEN_INT (-INTVAL (value
));
18514 /* Fall through. */
18517 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
18518 emit_insn (gen_rtx_SET (new_out
, x
));
18522 aarch64_emit_store_exclusive (mode
, cond
, mem
,
18523 gen_lowpart (mode
, new_out
), model_rtx
);
18525 if (aarch64_track_speculation
)
18527 /* Emit an explicit compare instruction, so that we can correctly
18528 track the condition codes. */
18529 rtx cc_reg
= aarch64_gen_compare_reg (NE
, cond
, const0_rtx
);
18530 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
18533 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
18535 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
18536 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
18537 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
18539 /* Emit any final barrier needed for a __sync operation. */
18541 aarch64_emit_post_barrier (model
);
18545 aarch64_init_libfuncs (void)
18547 /* Half-precision float operations. The compiler handles all operations
18548 with NULL libfuncs by converting to SFmode. */
18551 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
18552 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
18555 set_optab_libfunc (add_optab
, HFmode
, NULL
);
18556 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
18557 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
18558 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
18559 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
18562 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
18563 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
18564 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
18565 set_optab_libfunc (le_optab
, HFmode
, NULL
);
18566 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
18567 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
18568 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
18571 /* Target hook for c_mode_for_suffix. */
18572 static machine_mode
18573 aarch64_c_mode_for_suffix (char suffix
)
18581 /* We can only represent floating point constants which will fit in
18582 "quarter-precision" values. These values are characterised by
18583 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
18586 (-1)^s * (n/16) * 2^r
18589 's' is the sign bit.
18590 'n' is an integer in the range 16 <= n <= 31.
18591 'r' is an integer in the range -3 <= r <= 4. */
18593 /* Return true iff X can be represented by a quarter-precision
18594 floating point immediate operand X. Note, we cannot represent 0.0. */
18596 aarch64_float_const_representable_p (rtx x
)
18598 /* This represents our current view of how many bits
18599 make up the mantissa. */
18600 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
18602 unsigned HOST_WIDE_INT mantissa
, mask
;
18603 REAL_VALUE_TYPE r
, m
;
18606 x
= unwrap_const_vec_duplicate (x
);
18607 if (!CONST_DOUBLE_P (x
))
18610 if (GET_MODE (x
) == VOIDmode
18611 || (GET_MODE (x
) == HFmode
&& !TARGET_FP_F16INST
))
18614 r
= *CONST_DOUBLE_REAL_VALUE (x
);
18616 /* We cannot represent infinities, NaNs or +/-zero. We won't
18617 know if we have +zero until we analyse the mantissa, but we
18618 can reject the other invalid values. */
18619 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
18620 || REAL_VALUE_MINUS_ZERO (r
))
18623 /* Extract exponent. */
18624 r
= real_value_abs (&r
);
18625 exponent
= REAL_EXP (&r
);
18627 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
18628 highest (sign) bit, with a fixed binary point at bit point_pos.
18629 m1 holds the low part of the mantissa, m2 the high part.
18630 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
18631 bits for the mantissa, this can fail (low bits will be lost). */
18632 real_ldexp (&m
, &r
, point_pos
- exponent
);
18633 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
18635 /* If the low part of the mantissa has bits set we cannot represent
18637 if (w
.ulow () != 0)
18639 /* We have rejected the lower HOST_WIDE_INT, so update our
18640 understanding of how many bits lie in the mantissa and
18641 look only at the high HOST_WIDE_INT. */
18642 mantissa
= w
.elt (1);
18643 point_pos
-= HOST_BITS_PER_WIDE_INT
;
18645 /* We can only represent values with a mantissa of the form 1.xxxx. */
18646 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
18647 if ((mantissa
& mask
) != 0)
18650 /* Having filtered unrepresentable values, we may now remove all
18651 but the highest 5 bits. */
18652 mantissa
>>= point_pos
- 5;
18654 /* We cannot represent the value 0.0, so reject it. This is handled
18659 /* Then, as bit 4 is always set, we can mask it off, leaving
18660 the mantissa in the range [0, 15]. */
18661 mantissa
&= ~(1 << 4);
18662 gcc_assert (mantissa
<= 15);
18664 /* GCC internally does not use IEEE754-like encoding (where normalized
18665 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
18666 Our mantissa values are shifted 4 places to the left relative to
18667 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
18668 by 5 places to correct for GCC's representation. */
18669 exponent
= 5 - exponent
;
18671 return (exponent
>= 0 && exponent
<= 7);
18674 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
18675 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
18676 output MOVI/MVNI, ORR or BIC immediate. */
18678 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
18679 enum simd_immediate_check which
)
18682 static char templ
[40];
18683 const char *mnemonic
;
18684 const char *shift_op
;
18685 unsigned int lane_count
= 0;
18688 struct simd_immediate_info info
;
18690 /* This will return true to show const_vector is legal for use as either
18691 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
18692 It will also update INFO to show how the immediate should be generated.
18693 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
18694 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
18695 gcc_assert (is_valid
);
18697 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
18698 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
18700 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
18702 gcc_assert (info
.insn
== simd_immediate_info::MOV
18703 && info
.u
.mov
.shift
== 0);
18704 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
18705 move immediate path. */
18706 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
18707 info
.u
.mov
.value
= GEN_INT (0);
18710 const unsigned int buf_size
= 20;
18711 char float_buf
[buf_size
] = {'\0'};
18712 real_to_decimal_for_mode (float_buf
,
18713 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
18714 buf_size
, buf_size
, 1, info
.elt_mode
);
18716 if (lane_count
== 1)
18717 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
18719 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
18720 lane_count
, element_char
, float_buf
);
18725 gcc_assert (CONST_INT_P (info
.u
.mov
.value
));
18727 if (which
== AARCH64_CHECK_MOV
)
18729 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
18730 shift_op
= (info
.u
.mov
.modifier
== simd_immediate_info::MSL
18732 if (lane_count
== 1)
18733 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
18734 mnemonic
, UINTVAL (info
.u
.mov
.value
));
18735 else if (info
.u
.mov
.shift
)
18736 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
18737 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
18738 element_char
, UINTVAL (info
.u
.mov
.value
), shift_op
,
18741 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
18742 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
18743 element_char
, UINTVAL (info
.u
.mov
.value
));
18747 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
18748 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
18749 if (info
.u
.mov
.shift
)
18750 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
18751 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
18752 element_char
, UINTVAL (info
.u
.mov
.value
), "lsl",
18755 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
18756 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
18757 element_char
, UINTVAL (info
.u
.mov
.value
));
18763 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
18766 /* If a floating point number was passed and we desire to use it in an
18767 integer mode do the conversion to integer. */
18768 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
18770 unsigned HOST_WIDE_INT ival
;
18771 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
18772 gcc_unreachable ();
18773 immediate
= gen_int_mode (ival
, mode
);
18776 machine_mode vmode
;
18777 /* use a 64 bit mode for everything except for DI/DF mode, where we use
18778 a 128 bit vector mode. */
18779 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
18781 vmode
= aarch64_simd_container_mode (mode
, width
);
18782 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
18783 return aarch64_output_simd_mov_immediate (v_op
, width
);
18786 /* Return the output string to use for moving immediate CONST_VECTOR
18787 into an SVE register. */
18790 aarch64_output_sve_mov_immediate (rtx const_vector
)
18792 static char templ
[40];
18793 struct simd_immediate_info info
;
18796 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
18797 gcc_assert (is_valid
);
18799 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
18801 machine_mode vec_mode
= GET_MODE (const_vector
);
18802 if (aarch64_sve_pred_mode_p (vec_mode
))
18804 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
18805 if (info
.insn
== simd_immediate_info::MOV
)
18807 gcc_assert (info
.u
.mov
.value
== const0_rtx
);
18808 snprintf (buf
, sizeof (buf
), "pfalse\t%%0.b");
18812 gcc_assert (info
.insn
== simd_immediate_info::PTRUE
);
18813 unsigned int total_bytes
;
18814 if (info
.u
.pattern
== AARCH64_SV_ALL
18815 && BYTES_PER_SVE_VECTOR
.is_constant (&total_bytes
))
18816 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", element_char
,
18817 total_bytes
/ GET_MODE_SIZE (info
.elt_mode
));
18819 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, %s", element_char
,
18820 svpattern_token (info
.u
.pattern
));
18825 if (info
.insn
== simd_immediate_info::INDEX
)
18827 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
18828 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
18829 element_char
, INTVAL (info
.u
.index
.base
),
18830 INTVAL (info
.u
.index
.step
));
18834 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
18836 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
18837 info
.u
.mov
.value
= GEN_INT (0);
18840 const int buf_size
= 20;
18841 char float_buf
[buf_size
] = {};
18842 real_to_decimal_for_mode (float_buf
,
18843 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
18844 buf_size
, buf_size
, 1, info
.elt_mode
);
18846 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
18847 element_char
, float_buf
);
18852 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
18853 element_char
, INTVAL (info
.u
.mov
.value
));
18857 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
18858 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
18862 aarch64_output_sve_ptrues (rtx const_unspec
)
18864 static char templ
[40];
18866 struct simd_immediate_info info
;
18867 bool is_valid
= aarch64_simd_valid_immediate (const_unspec
, &info
);
18868 gcc_assert (is_valid
&& info
.insn
== simd_immediate_info::PTRUE
);
18870 char element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
18871 snprintf (templ
, sizeof (templ
), "ptrues\t%%0.%c, %s", element_char
,
18872 svpattern_token (info
.u
.pattern
));
18876 /* Split operands into moves from op[1] + op[2] into op[0]. */
18879 aarch64_split_combinev16qi (rtx operands
[3])
18881 unsigned int dest
= REGNO (operands
[0]);
18882 unsigned int src1
= REGNO (operands
[1]);
18883 unsigned int src2
= REGNO (operands
[2]);
18884 machine_mode halfmode
= GET_MODE (operands
[1]);
18885 unsigned int halfregs
= REG_NREGS (operands
[1]);
18886 rtx destlo
, desthi
;
18888 gcc_assert (halfmode
== V16QImode
);
18890 if (src1
== dest
&& src2
== dest
+ halfregs
)
18892 /* No-op move. Can't split to nothing; emit something. */
18893 emit_note (NOTE_INSN_DELETED
);
18897 /* Preserve register attributes for variable tracking. */
18898 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
18899 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
18900 GET_MODE_SIZE (halfmode
));
18902 /* Special case of reversed high/low parts. */
18903 if (reg_overlap_mentioned_p (operands
[2], destlo
)
18904 && reg_overlap_mentioned_p (operands
[1], desthi
))
18906 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
18907 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
18908 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
18910 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
18912 /* Try to avoid unnecessary moves if part of the result
18913 is in the right place already. */
18915 emit_move_insn (destlo
, operands
[1]);
18916 if (src2
!= dest
+ halfregs
)
18917 emit_move_insn (desthi
, operands
[2]);
18921 if (src2
!= dest
+ halfregs
)
18922 emit_move_insn (desthi
, operands
[2]);
18924 emit_move_insn (destlo
, operands
[1]);
18928 /* vec_perm support. */
18930 struct expand_vec_perm_d
18932 rtx target
, op0
, op1
;
18933 vec_perm_indices perm
;
18934 machine_mode vmode
;
18935 unsigned int vec_flags
;
18940 /* Generate a variable permutation. */
18943 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
18945 machine_mode vmode
= GET_MODE (target
);
18946 bool one_vector_p
= rtx_equal_p (op0
, op1
);
18948 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
18949 gcc_checking_assert (GET_MODE (op0
) == vmode
);
18950 gcc_checking_assert (GET_MODE (op1
) == vmode
);
18951 gcc_checking_assert (GET_MODE (sel
) == vmode
);
18952 gcc_checking_assert (TARGET_SIMD
);
18956 if (vmode
== V8QImode
)
18958 /* Expand the argument to a V16QI mode by duplicating it. */
18959 rtx pair
= gen_reg_rtx (V16QImode
);
18960 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
18961 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
18965 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
18972 if (vmode
== V8QImode
)
18974 pair
= gen_reg_rtx (V16QImode
);
18975 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
18976 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
18980 pair
= gen_reg_rtx (OImode
);
18981 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
18982 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
18987 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
18988 NELT is the number of elements in the vector. */
18991 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
18994 machine_mode vmode
= GET_MODE (target
);
18995 bool one_vector_p
= rtx_equal_p (op0
, op1
);
18998 /* The TBL instruction does not use a modulo index, so we must take care
18999 of that ourselves. */
19000 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
19001 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
19002 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
19004 /* For big-endian, we also need to reverse the index within the vector
19005 (but not which vector). */
19006 if (BYTES_BIG_ENDIAN
)
19008 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
19010 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
19011 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
19012 NULL
, 0, OPTAB_LIB_WIDEN
);
19014 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
19017 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
19020 emit_unspec2 (rtx target
, int code
, rtx op0
, rtx op1
)
19022 emit_insn (gen_rtx_SET (target
,
19023 gen_rtx_UNSPEC (GET_MODE (target
),
19024 gen_rtvec (2, op0
, op1
), code
)));
19027 /* Expand an SVE vec_perm with the given operands. */
19030 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
19032 machine_mode data_mode
= GET_MODE (target
);
19033 machine_mode sel_mode
= GET_MODE (sel
);
19034 /* Enforced by the pattern condition. */
19035 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
19037 /* Note: vec_perm indices are supposed to wrap when they go beyond the
19038 size of the two value vectors, i.e. the upper bits of the indices
19039 are effectively ignored. SVE TBL instead produces 0 for any
19040 out-of-range indices, so we need to modulo all the vec_perm indices
19041 to ensure they are all in range. */
19042 rtx sel_reg
= force_reg (sel_mode
, sel
);
19044 /* Check if the sel only references the first values vector. */
19045 if (GET_CODE (sel
) == CONST_VECTOR
19046 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
19048 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
19052 /* Check if the two values vectors are the same. */
19053 if (rtx_equal_p (op0
, op1
))
19055 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
19056 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
19057 NULL
, 0, OPTAB_DIRECT
);
19058 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
19062 /* Run TBL on for each value vector and combine the results. */
19064 rtx res0
= gen_reg_rtx (data_mode
);
19065 rtx res1
= gen_reg_rtx (data_mode
);
19066 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
19067 if (GET_CODE (sel
) != CONST_VECTOR
19068 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
19070 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
19072 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
19073 NULL
, 0, OPTAB_DIRECT
);
19075 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
19076 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
19077 NULL
, 0, OPTAB_DIRECT
);
19078 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
19079 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
19080 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
19082 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
19085 /* Recognize patterns suitable for the TRN instructions. */
19087 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
19090 poly_uint64 nelt
= d
->perm
.length ();
19091 rtx out
, in0
, in1
, x
;
19092 machine_mode vmode
= d
->vmode
;
19094 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
19097 /* Note that these are little-endian tests.
19098 We correct for big-endian later. */
19099 if (!d
->perm
[0].is_constant (&odd
)
19100 || (odd
!= 0 && odd
!= 1)
19101 || !d
->perm
.series_p (0, 2, odd
, 2)
19102 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
19111 /* We don't need a big-endian lane correction for SVE; see the comment
19112 at the head of aarch64-sve.md for details. */
19113 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
19115 x
= in0
, in0
= in1
, in1
= x
;
19120 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
19121 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
19125 /* Recognize patterns suitable for the UZP instructions. */
19127 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
19130 rtx out
, in0
, in1
, x
;
19131 machine_mode vmode
= d
->vmode
;
19133 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
19136 /* Note that these are little-endian tests.
19137 We correct for big-endian later. */
19138 if (!d
->perm
[0].is_constant (&odd
)
19139 || (odd
!= 0 && odd
!= 1)
19140 || !d
->perm
.series_p (0, 1, odd
, 2))
19149 /* We don't need a big-endian lane correction for SVE; see the comment
19150 at the head of aarch64-sve.md for details. */
19151 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
19153 x
= in0
, in0
= in1
, in1
= x
;
19158 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
19159 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
19163 /* Recognize patterns suitable for the ZIP instructions. */
19165 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
19168 poly_uint64 nelt
= d
->perm
.length ();
19169 rtx out
, in0
, in1
, x
;
19170 machine_mode vmode
= d
->vmode
;
19172 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
19175 /* Note that these are little-endian tests.
19176 We correct for big-endian later. */
19177 poly_uint64 first
= d
->perm
[0];
19178 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
19179 || !d
->perm
.series_p (0, 2, first
, 1)
19180 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
19182 high
= maybe_ne (first
, 0U);
19190 /* We don't need a big-endian lane correction for SVE; see the comment
19191 at the head of aarch64-sve.md for details. */
19192 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
19194 x
= in0
, in0
= in1
, in1
= x
;
19199 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
19200 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
/* Recognize patterns for the EXT insn.  Return true (and emit the
   extraction sequence, unless D->testing_p) if D is a single-step
   rotation that EXT can implement.  */

static bool
aarch64_evpc_ext (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT location;
  rtx offset;

  /* The first element always refers to the first vector.
     Check if the extracted indices are increasing by one.  */
  if (d->vec_flags == VEC_SVE_PRED
      || !d->perm[0].is_constant (&location)
      || !d->perm.series_p (0, 1, location, 1))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  /* The case where (location == 0) is a no-op for both big- and little-endian,
     and is removed by the mid-end at optimization levels -O1 and higher.

     We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
    {
      /* After setup, we want the high elements of the first vector (stored
	 at the LSB end of the register), and the low elements of the second
	 vector (stored at the MSB end of the register). So swap.  */
      std::swap (d->op0, d->op1);
      /* location != 0 (above), so safe to assume (nelt - location) < nelt.
	 to_constant () is safe since this is restricted to Advanced SIMD
	 vectors.  */
      location = d->perm.length ().to_constant () - location;
    }

  offset = GEN_INT (location);
  emit_set_insn (d->target,
		 gen_rtx_UNSPEC (d->vmode,
				 gen_rtvec (3, d->op0, d->op1, offset),
				 UNSPEC_EXT));
  return true;
}
/* Recognize patterns for the REV{64,32,16} insns, which reverse elements
   within each 64-bit, 32-bit or 16-bit granule.  */

static bool
aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT diff;
  unsigned int i, size, unspec;
  machine_mode pred_mode;

  if (d->vec_flags == VEC_SVE_PRED
      || !d->one_vector_p
      || !d->perm[0].is_constant (&diff))
    return false;

  /* Granule size in bytes: element (diff + 1) - 1 maps to element 0,
     so (diff + 1) elements form one reversed group.  */
  size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
  if (size == 8)
    {
      unspec = UNSPEC_REV64;
      pred_mode = VNx2BImode;
    }
  else if (size == 4)
    {
      unspec = UNSPEC_REV32;
      pred_mode = VNx4BImode;
    }
  else if (size == 2)
    {
      unspec = UNSPEC_REV16;
      pred_mode = VNx8BImode;
    }
  else
    return false;

  unsigned int step = diff + 1;
  for (i = 0; i < step; ++i)
    if (!d->perm.series_p (i, step, diff - i, step))
      return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  if (d->vec_flags == VEC_SVE_DATA)
    {
      machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
      rtx target = gen_reg_rtx (int_mode);
      if (BYTES_BIG_ENDIAN)
	/* The act of taking a subreg between INT_MODE and d->vmode
	   is itself a reversing operation on big-endian targets;
	   see the comment at the head of aarch64-sve.md for details.
	   First reinterpret OP0 as INT_MODE without using a subreg
	   and without changing the contents.  */
	emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
      else
	{
	  /* For SVE we use REV[BHW] unspecs derived from the element size
	     of v->mode and vector modes whose elements have SIZE bytes.
	     This ensures that the vector modes match the predicate modes.  */
	  int unspec = aarch64_sve_rev_unspec (d->vmode);
	  rtx pred = aarch64_ptrue_reg (pred_mode);
	  emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
				       gen_lowpart (int_mode, d->op0)));
	}
      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
      return true;
    }

  rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
  emit_set_insn (d->target, src);
  return true;
}
/* Recognize patterns for the REV insn, which reverses elements within
   a full vector.  */

static bool
aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
{
  poly_uint64 nelt = d->perm.length ();

  /* Only a single SVE input is supported here; Advanced SIMD uses the
     granule-local REV patterns instead.  */
  if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
    return false;

  /* The permutation must select nelt-1, nelt-2, ..., 0.  */
  if (!d->perm.series_p (0, 1, nelt - 1, -1))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
  emit_set_insn (d->target, src);
  return true;
}
/* Recognize patterns that broadcast a single element, implementable
   with a DUP of one lane.  */

static bool
aarch64_evpc_dup (struct expand_vec_perm_d *d)
{
  rtx out = d->target;
  rtx in0;
  HOST_WIDE_INT elt;
  machine_mode vmode = d->vmode;
  rtx lane;

  /* The encoding must describe a single repeated element with a
     compile-time constant index.  */
  if (d->vec_flags == VEC_SVE_PRED
      || d->perm.encoding ().encoded_nelts () != 1
      || !d->perm[0].is_constant (&elt))
    return false;

  /* SVE DUP has a limited immediate lane range; reject indices beyond
     what the instruction can encode.  */
  if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  /* The generic preparation in aarch64_expand_vec_perm_const_1
     swaps the operand order and the permute indices if it finds
     d->perm[0] to be in the second operand.  Thus, we can always
     use d->op0 and need not do any extra arithmetic to get the
     correct lane number.  */
  in0 = d->op0;
  lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */

  rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
  rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
  emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
  return true;
}
/* Implement D as a general two-register permute via the Advanced SIMD
   TBL instruction.  */

static bool
aarch64_evpc_tbl (struct expand_vec_perm_d *d)
{
  rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
  machine_mode vmode = d->vmode;

  /* Make sure that the indices are constant.  */
  unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
  for (unsigned int i = 0; i < encoded_nelts; ++i)
    if (!d->perm[i].is_constant ())
      return false;

  if (d->testing_p)
    return true;

  /* Generic code will try constant permutation twice.  Once with the
     original mode and again with the elements lowered to QImode.
     So wait and don't do the selector expansion ourselves.  */
  if (vmode != V8QImode && vmode != V16QImode)
    return false;

  /* to_constant is safe since this routine is specific to Advanced SIMD
     vectors.  */
  unsigned int nelt = d->perm.length ().to_constant ();
  for (unsigned int i = 0; i < nelt; ++i)
    /* If big-endian and two vectors we end up with a weird mixed-endian
       mode on NEON.  Reverse the index within each word but not the word
       itself.  to_constant is safe because we checked is_constant above.  */
    rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
			? d->perm[i].to_constant () ^ (nelt - 1)
			: d->perm[i].to_constant ());

  sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
  sel = force_reg (vmode, sel);

  aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
  return true;
}
/* Try to implement D using an SVE TBL instruction.  */

static bool
aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
{
  unsigned HOST_WIDE_INT nelt;

  /* Permuting two variable-length vectors could overflow the
     index range.  */
  if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
    return false;

  if (d->testing_p)
    return true;

  /* Build the selector as an integer vector of the same shape as VMODE.  */
  machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
  rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
  if (d->one_vector_p)
    emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
  else
    aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
  return true;
}
/* Try to implement D using SVE SEL instruction.  */

static bool
aarch64_evpc_sel (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  int unit_size = GET_MODE_UNIT_SIZE (vmode);

  if (d->vec_flags != VEC_SVE_DATA
      || unit_size > 8)
    return false;

  int n_patterns = d->perm.encoding ().npatterns ();
  poly_int64 vec_len = d->perm.length ();

  /* Each leading element must come from the same lane of either input:
     index i selects op0, index vec_len + i selects op1.  */
  for (int i = 0; i < n_patterns; ++i)
    if (!known_eq (d->perm[i], i)
	&& !known_eq (d->perm[i], vec_len + i))
      return false;

  /* The remaining elements must continue the same per-lane choice made
     by the first n_patterns elements.  */
  for (int i = n_patterns; i < n_patterns * 2; i++)
    if (!d->perm.series_p (i, n_patterns, i, n_patterns)
	&& !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
      return false;

  if (d->testing_p)
    return true;

  machine_mode pred_mode = aarch64_sve_pred_mode (vmode);

  /* Build a predicate that is true for lanes taken from op0.  */
  rtx_vector_builder builder (pred_mode, n_patterns, 2);
  for (int i = 0; i < n_patterns * 2; i++)
    {
      rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
					  : CONST0_RTX (BImode);
      builder.quick_push (elem);
    }

  rtx const_vec = builder.build ();
  rtx pred = force_reg (pred_mode, const_vec);
  /* TARGET = PRED ? OP0 : OP1 (note the operand order of vcond_mask).  */
  emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op1, d->op0, pred));
  return true;
}
/* Expand or test-expand the constant permutation D.  Tries each
   special-case pattern matcher in turn before falling back to a
   general table-based permute.  Returns true on success.  */

static bool
aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* The pattern matching functions above are written to look for a small
     number to begin the sequence (0, 1, N/2).  If we begin with an index
     from the second operand, we can swap the operands.  */
  poly_int64 nelt = d->perm.length ();
  if (known_ge (d->perm[0], nelt))
    {
      d->perm.rotate_inputs (1);
      std::swap (d->op0, d->op1);
    }

  if ((d->vec_flags == VEC_ADVSIMD
       || d->vec_flags == VEC_SVE_DATA
       || d->vec_flags == VEC_SVE_PRED)
      && known_gt (nelt, 1))
    {
      /* Cheap single-instruction forms first, TBL fallbacks last.  */
      if (aarch64_evpc_rev_local (d))
	return true;
      else if (aarch64_evpc_rev_global (d))
	return true;
      else if (aarch64_evpc_ext (d))
	return true;
      else if (aarch64_evpc_dup (d))
	return true;
      else if (aarch64_evpc_zip (d))
	return true;
      else if (aarch64_evpc_uzp (d))
	return true;
      else if (aarch64_evpc_trn (d))
	return true;
      else if (aarch64_evpc_sel (d))
	return true;
      if (d->vec_flags == VEC_SVE_DATA)
	return aarch64_evpc_sve_tbl (d);
      else if (d->vec_flags == VEC_ADVSIMD)
	return aarch64_evpc_tbl (d);
    }
  return false;
}
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */

static bool
aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
				  rtx op1, const vec_perm_indices &sel)
{
  struct expand_vec_perm_d d;

  /* Check whether the mask can be applied to a single vector.  */
  if (sel.ninputs () == 1
      || (op0 && rtx_equal_p (op0, op1)))
    d.one_vector_p = true;
  else if (sel.all_from_input_p (0))
    {
      d.one_vector_p = true;
      op1 = op0;
    }
  else if (sel.all_from_input_p (1))
    {
      d.one_vector_p = true;
      op0 = op1;
    }
  else
    d.one_vector_p = false;

  d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
		     sel.nelts_per_input ());
  d.vmode = vmode;
  d.vec_flags = aarch64_classify_vector_mode (d.vmode);
  d.target = target;
  d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
  if (op0 == op1)
    d.op1 = d.op0;
  else
    d.op1 = op1 ? force_reg (vmode, op1) : NULL_RTX;
  /* A null TARGET means the caller only asks whether the permutation
     is supported, not for any code to be emitted.  */
  d.testing_p = !target;

  if (!d.testing_p)
    return aarch64_expand_vec_perm_const_1 (&d);

  /* In testing mode no insns may be emitted; assert that the matchers
     kept their side of that contract.  */
  rtx_insn *last = get_last_insn ();
  bool ret = aarch64_expand_vec_perm_const_1 (&d);
  gcc_assert (last == get_last_insn ());

  return ret;
}
/* Generate a byte permute mask for a register of mode MODE,
   which has NUNITS units.  */

rtx
aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
{
  /* We have to reverse each vector because we dont have
     a permuted load that can reverse-load according to ABI rules.  */
  rtx mask;
  rtvec v = rtvec_alloc (16);
  unsigned int i, j;
  unsigned int usize = GET_MODE_UNIT_SIZE (mode);

  /* Only meaningful on big-endian, and only for 128-bit SIMD modes.  */
  gcc_assert (BYTES_BIG_ENDIAN);
  gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));

  /* Byte j of unit i maps to the mirrored byte within the same unit.  */
  for (i = 0; i < nunits; i++)
    for (j = 0; j < usize; j++)
      RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
  mask = gen_rtx_CONST_VECTOR (V16QImode, v);
  return force_reg (V16QImode, mask);
}
/* Expand an SVE integer comparison using the SVE equivalent of:

     (set TARGET (CODE OP0 OP1)).  */

void
aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);
  rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
				      op0, op1);
  /* The helper may have produced the result elsewhere; copy it into
     TARGET if so.  */
  if (!rtx_equal_p (target, res))
    emit_move_insn (target, res);
}
/* Return the UNSPEC_COND_* code for comparison CODE.  */

static unsigned int
aarch64_unspec_cond_code (rtx_code code)
{
  switch (code)
    {
    case NE:
      return UNSPEC_COND_FCMNE;
    case EQ:
      return UNSPEC_COND_FCMEQ;
    case LT:
      return UNSPEC_COND_FCMLT;
    case GT:
      return UNSPEC_COND_FCMGT;
    case LE:
      return UNSPEC_COND_FCMLE;
    case GE:
      return UNSPEC_COND_FCMGE;
    case UNORDERED:
      return UNSPEC_COND_FCMUO;
    default:
      /* Only the comparisons above have an SVE FP compare form.  */
      gcc_unreachable ();
    }
}
/* Emit:

      (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))

   where <X> is the operation associated with comparison CODE.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
			  bool known_ptrue_p, rtx op0, rtx op1)
{
  /* The KNOWN_PTRUE_P flag is carried as an SImode immediate operand.  */
  rtx flag = gen_int_mode (known_ptrue_p, SImode);
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
			       gen_rtvec (4, pred, flag, op0, op1),
			       aarch64_unspec_cond_code (code));
  emit_set_insn (target, unspec);
}
/* Emit the SVE equivalent of:

      (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
      (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
      (set TARGET (ior:PRED_MODE TMP1 TMP2))

   where <Xi> is the operation associated with comparison CODEi.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
			      rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (pred);
  rtx tmp1 = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
  rtx tmp2 = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
  aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
}
/* Emit the SVE equivalent of:

      (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
      (set TARGET (not TMP))

   where <X> is the operation associated with comparison CODE.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
				 bool known_ptrue_p, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (pred);
  rtx tmp = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
  aarch64_emit_unop (target, one_cmpl_optab, tmp);
}
/* Expand an SVE floating-point comparison using the SVE equivalent of:

     (set TARGET (CODE OP0 OP1))

   If CAN_INVERT_P is true, the caller can also handle inverted results;
   return true if the result is in fact inverted.  */

bool
aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
				  rtx op0, rtx op1, bool can_invert_p)
{
  machine_mode pred_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);

  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  switch (code)
    {
    case UNORDERED:
      /* UNORDERED has no immediate form.  */
      op1 = force_reg (data_mode, op1);
      /* fall through */
    case LT:
    case LE:
    case GT:
    case GE:
    case EQ:
    case NE:
      {
	/* There is native support for the comparison.  */
	aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
	return false;
      }

    case LTGT:
      /* This is a trapping operation (LT or GT).  */
      aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
      return false;

    case UNEQ:
      if (!flag_trapping_math)
	{
	  /* This would trap for signaling NaNs.  */
	  op1 = force_reg (data_mode, op1);
	  aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
					ptrue, true, op0, op1);
	  return false;
	}
      /* fall through */
    case UNLT:
    case UNLE:
    case UNGT:
    case UNGE:
      if (flag_trapping_math)
	{
	  /* Work out which elements are ordered.  */
	  rtx ordered = gen_reg_rtx (pred_mode);
	  op1 = force_reg (data_mode, op1);
	  aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
					   ptrue, true, op0, op1);

	  /* Test the opposite condition for the ordered elements,
	     then invert the result.  */
	  if (code == UNEQ)
	    code = NE;
	  else
	    code = reverse_condition_maybe_unordered (code);
	  if (can_invert_p)
	    {
	      aarch64_emit_sve_fp_cond (target, code,
					ordered, false, op0, op1);
	      return true;
	    }
	  aarch64_emit_sve_invert_fp_cond (target, code,
					   ordered, false, op0, op1);
	  return false;
	}
      break;

    case ORDERED:
      /* ORDERED has no immediate form.  */
      op1 = force_reg (data_mode, op1);
      break;

    default:
      gcc_unreachable ();
    }

  /* There is native support for the inverse comparison.  */
  code = reverse_condition_maybe_unordered (code);
  if (can_invert_p)
    {
      aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
      return true;
    }
  aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
  return false;
}
/* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
   of the data being selected and CMP_MODE is the mode of the values being
   compared.  */

void
aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
			  rtx *ops)
{
  machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
  rtx pred = gen_reg_rtx (pred_mode);
  if (FLOAT_MODE_P (cmp_mode))
    {
      /* A true return means the comparison was emitted inverted, so the
	 select arms must be swapped to compensate.  */
      if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
					    ops[4], ops[5], true))
	std::swap (ops[1], ops[2]);
    }
  else
    aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);

  if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
    ops[1] = force_reg (data_mode, ops[1]);
  /* The "false" value can only be zero if the "true" value is a constant.  */
  if (register_operand (ops[1], data_mode)
      || !aarch64_simd_reg_or_zero (ops[2], data_mode))
    ops[2] = force_reg (data_mode, ops[2]);

  rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
  emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
}
19820 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
19821 true. However due to issues with register allocation it is preferable
19822 to avoid tieing integer scalar and FP scalar modes. Executing integer
19823 operations in general registers is better than treating them as scalar
19824 vector operations. This reduces latency and avoids redundant int<->FP
19825 moves. So tie modes if they are either the same class, or vector modes
19826 with other vector modes, vector structs or any scalar mode. */
19829 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
19831 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
19834 /* We specifically want to allow elements of "structure" modes to
19835 be tieable to the structure. This more general condition allows
19836 other rarer situations too. The reason we don't extend this to
19837 predicate modes is that there are no predicate structure modes
19838 nor any specific instructions for extracting part of a predicate
19840 if (aarch64_vector_data_mode_p (mode1
)
19841 && aarch64_vector_data_mode_p (mode2
))
19844 /* Also allow any scalar modes with vectors. */
19845 if (aarch64_vector_mode_supported_p (mode1
)
19846 || aarch64_vector_mode_supported_p (mode2
))
/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */

static rtx
aarch64_move_pointer (rtx pointer, poly_int64 amount)
{
  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);

  /* Keep the MEM's attributes (alias set, alignment) in sync with the
     new address.  */
  return adjust_automodify_address (pointer, GET_MODE (pointer),
				    next, amount);
}
/* Return a new RTX holding the result of moving POINTER forward by the
   size of the mode it points to.  */

static rtx
aarch64_progress_pointer (rtx pointer)
{
  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
}
/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
   MODE bytes.  */

static void
aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
					      machine_mode mode)
{
  rtx reg = gen_reg_rtx (mode);

  /* "Cast" the pointers to the correct mode.  */
  *src = adjust_address (*src, mode, 0);
  *dst = adjust_address (*dst, mode, 0);
  /* Emit the memcpy.  */
  emit_move_insn (reg, *src);
  emit_move_insn (*dst, reg);
  /* Move the pointers forward.  */
  *src = aarch64_progress_pointer (*src);
  *dst = aarch64_progress_pointer (*dst);
}
/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
   we succeed, otherwise return false.  */

bool
aarch64_expand_cpymem (rtx *operands)
{
  int n, mode_bits;
  rtx dst = operands[0];
  rtx src = operands[1];
  rtx base;
  machine_mode cur_mode = BLKmode, next_mode;
  bool speed_p = !optimize_function_for_size_p (cfun);

  /* When optimizing for size, give a better estimate of the length of a
     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
     will always require an even number of instructions to do now.  And each
     operation requires both a load+store, so devide the max number by 2.  */
  int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;

  /* We can't do anything smart if the amount to copy is not constant.  */
  if (!CONST_INT_P (operands[2]))
    return false;

  n = INTVAL (operands[2]);

  /* Try to keep the number of instructions low.  For all cases we will do at
     most two moves for the residual amount, since we'll always overlap the
     remainder.  */
  if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
    return false;

  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
  dst = adjust_automodify_address (dst, VOIDmode, base, 0);

  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
  src = adjust_automodify_address (src, VOIDmode, base, 0);

  /* Convert n to bits to make the rest of the code simpler.  */
  n = n * BITS_PER_UNIT;

  /* Maximum amount to copy in one go.  The AArch64 back-end has integer modes
     larger than TImode, but we should not use them for loads/stores here.  */
  const int copy_limit = GET_MODE_BITSIZE (TImode);

  while (n > 0)
    {
      /* Find the largest mode in which to do the copy in without over reading
	 or writing.  */
      opt_scalar_int_mode mode_iter;
      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
	  cur_mode = mode_iter.require ();

      gcc_assert (cur_mode != BLKmode);

      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);

      n -= mode_bits;

      /* Do certain trailing copies as overlapping if it's going to be
	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
	 byte copy it's more efficient to do two overlapping 8 byte copies than
	 8 + 6 + 1.  */
      if (n > 0 && n <= 8 * BITS_PER_UNIT)
	{
	  next_mode = smallest_mode_for_size (n, MODE_INT);
	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
	  /* Step back so the final copy overlaps the previous one rather
	     than emitting smaller residual copies.  */
	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
	  n = n_bits;
	}
    }

  return true;
}
/* Split a DImode store of a CONST_INT SRC to MEM DST as two
   SImode stores.  Handle the case when the constant has identical
   bottom and top halves.  This is beneficial when the two stores can be
   merged into an STP and we avoid synthesising potentially expensive
   immediates twice.  Return true if such a split is possible.  */

bool
aarch64_split_dimode_const_store (rtx dst, rtx src)
{
  rtx lo = gen_lowpart (SImode, src);
  rtx hi = gen_highpart_mode (SImode, DImode, src);

  bool size_p = optimize_function_for_size_p (cfun);

  /* Only worthwhile when both halves are the same constant.  */
  if (!rtx_equal_p (lo, hi))
    return false;

  unsigned int orig_cost
    = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
  unsigned int lo_cost
    = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);

  /* We want to transform:
     MOV	x1, 49370
     MOVK	x1, 0x140, lsl 16
     MOVK	x1, 0xc0da, lsl 32
     MOVK	x1, 0x140, lsl 48
     STR	x1, [x0]
   into:
     MOV	w1, 49370
     MOVK	w1, 0x140, lsl 16
     STP	w1, w1, [x0]
   So we want to perform this only when we save two instructions
   or more.  When optimizing for size, however, accept any code size
   savings we can.  */
  if (size_p && orig_cost <= lo_cost)
    return false;

  if (!size_p
      && (orig_cost <= lo_cost + 1))
    return false;

  rtx mem_lo = adjust_address (dst, SImode, 0);
  if (!aarch64_mem_pair_operand (mem_lo, SImode))
    return false;

  rtx tmp_reg = gen_reg_rtx (SImode);
  aarch64_expand_mov_immediate (tmp_reg, lo);
  rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
  /* Don't emit an explicit store pair as this may not be always profitable.
     Let the sched-fusion logic decide whether to merge them.  */
  emit_move_insn (mem_lo, tmp_reg);
  emit_move_insn (mem_hi, tmp_reg);

  return true;
}
/* Generate RTL for a conditional branch with rtx comparison CODE in
   mode CC_MODE.  The destination of the unlikely conditional branch
   is LABEL_REF.  */

void
aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
			      rtx label_ref)
{
  rtx x;
  /* Compare the CC register against zero under CODE.  */
  x = gen_rtx_fmt_ee (code, VOIDmode,
		      gen_rtx_REG (cc_mode, CC_REGNUM),
		      const0_rtx);

  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
			    pc_rtx);
  /* Mark the jump as unlikely so the branch predictor hints follow.  */
  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
}
/* Generate DImode scratch registers for 128-bit (TImode) addition.

   OP1 represents the TImode destination operand 1
   OP2 represents the TImode destination operand 2
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */

void
aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
			    rtx *low_in1, rtx *low_in2,
			    rtx *high_dest, rtx *high_in1,
			    rtx *high_in2)
{
  *low_dest = gen_reg_rtx (DImode);
  *low_in1 = gen_lowpart (DImode, op1);
  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
				  subreg_lowpart_offset (DImode, TImode));
  *high_dest = gen_reg_rtx (DImode);
  *high_in1 = gen_highpart (DImode, op1);
  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
				   subreg_highpart_offset (DImode, TImode));
}
/* Generate DImode scratch registers for 128-bit (TImode) subtraction.

   This function differs from 'arch64_addti_scratch_regs' in that
   OP1 can be an immediate constant (zero).  We must call
   subreg_highpart_offset with DImode and TImode arguments, otherwise
   VOIDmode will be used for the const_int which generates an internal
   error from subreg_size_highpart_offset which does not expect a size of zero.

   OP1 represents the TImode destination operand 1
   OP2 represents the TImode destination operand 2
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */

void
aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
			     rtx *low_in1, rtx *low_in2,
			     rtx *high_dest, rtx *high_in1,
			     rtx *high_in2)
{
  *low_dest = gen_reg_rtx (DImode);
  *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
				  subreg_lowpart_offset (DImode, TImode));

  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
				  subreg_lowpart_offset (DImode, TImode));
  *high_dest = gen_reg_rtx (DImode);

  *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
				   subreg_highpart_offset (DImode, TImode));
  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
				   subreg_highpart_offset (DImode, TImode));
}
/* Generate RTL for 128-bit (TImode) subtraction with overflow.

   OP0 represents the TImode destination operand 0
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2
   UNSIGNED_P is true if the operation is being performed on unsigned
   values.  */

void
aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
		       rtx low_in2, rtx high_dest, rtx high_in1,
		       rtx high_in2, bool unsigned_p)
{
  if (low_in2 == const0_rtx)
    {
      /* Subtracting zero in the low half: only the high half needs a
	 real (overflow-checking) subtraction.  */
      low_dest = low_in1;
      high_in2 = force_reg (DImode, high_in2);
      if (unsigned_p)
	emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
      else
	emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
    }
  else
    {
      if (CONST_INT_P (low_in2))
	{
	  high_in2 = force_reg (DImode, high_in2);
	  /* Pass the negated constant so the pattern can match an ADD
	     form of the immediate as well.  */
	  emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
					      GEN_INT (-INTVAL (low_in2))));
	}
      else
	emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));

      /* High half consumes the borrow; C-flag form for unsigned,
	 V-flag form for signed overflow detection.  */
      if (unsigned_p)
	emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
      else
	emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
    }

  emit_move_insn (gen_lowpart (DImode, op0), low_dest);
  emit_move_insn (gen_highpart (DImode, op0), high_dest);
}
20158 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
20160 static unsigned HOST_WIDE_INT
20161 aarch64_asan_shadow_offset (void)
20164 return (HOST_WIDE_INT_1
<< 29);
20166 return (HOST_WIDE_INT_1
<< 36);
/* Implement TARGET_GEN_CCMP_FIRST.  Emit the first compare of a ccmp
   sequence; returns the comparison rtx on success, NULL_RTX on failure.
   PREP_SEQ receives the operand-preparation insns, GEN_SEQ the compare
   insns themselves.  */

static rtx
aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
			int code, tree treeop0, tree treeop1)
{
  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
  rtx op0, op1;
  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
  insn_code icode;
  struct expand_operand ops[4];

  start_sequence ();
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  op_mode = GET_MODE (op0);
  if (op_mode == VOIDmode)
    op_mode = GET_MODE (op1);

  switch (op_mode)
    {
    case E_QImode:
    case E_HImode:
    case E_SImode:
      cmp_mode = SImode;
      icode = CODE_FOR_cmpsi;
      break;

    case E_DImode:
      cmp_mode = DImode;
      icode = CODE_FOR_cmpdi;
      break;

    case E_SFmode:
      cmp_mode = SFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
      break;

    case E_DFmode:
      cmp_mode = DFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
      break;

    default:
      end_sequence ();
      return NULL_RTX;
    }

  op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
  op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
  if (!op0 || !op1)
    {
      end_sequence ();
      return NULL_RTX;
    }
  *prep_seq = get_insns ();
  end_sequence ();

  create_fixed_operand (&ops[0], op0);
  create_fixed_operand (&ops[1], op1);

  start_sequence ();
  if (!maybe_expand_insn (icode, 2, ops))
    {
      end_sequence ();
      return NULL_RTX;
    }
  *gen_seq = get_insns ();
  end_sequence ();

  return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
			 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
}
/* Implement TARGET_GEN_CCMP_NEXT.  Emit a conditional compare that
   chains onto PREV (the result of a previous gen_ccmp_first/next).
   BIT_CODE is AND or IOR and selects how the two conditions combine;
   returns the new comparison rtx, or NULL_RTX on failure.  */

static rtx
aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
		       int cmp_code, tree treeop0, tree treeop1, int bit_code)
{
  rtx op0, op1, target;
  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
  insn_code icode;
  struct expand_operand ops[6];
  int aarch64_cond;

  push_to_sequence (*prep_seq);
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  op_mode = GET_MODE (op0);
  if (op_mode == VOIDmode)
    op_mode = GET_MODE (op1);

  switch (op_mode)
    {
    case E_QImode:
    case E_HImode:
    case E_SImode:
      cmp_mode = SImode;
      icode = CODE_FOR_ccmpsi;
      break;

    case E_DImode:
      cmp_mode = DImode;
      icode = CODE_FOR_ccmpdi;
      break;

    case E_SFmode:
      cmp_mode = SFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
      break;

    case E_DFmode:
      cmp_mode = DFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
      break;

    default:
      end_sequence ();
      return NULL_RTX;
    }

  op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
  op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
  if (!op0 || !op1)
    {
      end_sequence ();
      return NULL_RTX;
    }
  *prep_seq = get_insns ();
  end_sequence ();

  target = gen_rtx_REG (cc_mode, CC_REGNUM);
  aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);

  if (bit_code != AND)
    {
      /* For an IOR combination, invert the previous condition: the ccmp
	 pattern only performs the second compare when the first condition
	 holds, so we test the inverse and invert the AArch64 cond too.  */
      prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
						GET_MODE (XEXP (prev, 0))),
			     VOIDmode, XEXP (prev, 0), const0_rtx);
      aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
    }

  create_fixed_operand (&ops[0], XEXP (prev, 0));
  create_fixed_operand (&ops[1], target);
  create_fixed_operand (&ops[2], op0);
  create_fixed_operand (&ops[3], op1);
  create_fixed_operand (&ops[4], prev);
  create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));

  push_to_sequence (*gen_seq);
  if (!maybe_expand_insn (icode, 6, ops))
    {
      end_sequence ();
      return NULL_RTX;
    }

  *gen_seq = get_insns ();
  end_sequence ();

  return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
}
20333 #undef TARGET_GEN_CCMP_FIRST
20334 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
20336 #undef TARGET_GEN_CCMP_NEXT
20337 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
/* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
   instruction fusion of some sort.  */

static bool
aarch64_macro_fusion_p (void)
{
  /* Any bit set in the tuning's fusible_ops mask enables macro fusion.  */
  return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
}
20349 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
20350 should be kept together during scheduling. */
20353 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
20356 rtx prev_set
= single_set (prev
);
20357 rtx curr_set
= single_set (curr
);
20358 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
20359 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
20361 if (!aarch64_macro_fusion_p ())
20364 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
20366 /* We are trying to match:
20367 prev (mov) == (set (reg r0) (const_int imm16))
20368 curr (movk) == (set (zero_extract (reg r0)
20371 (const_int imm16_1)) */
20373 set_dest
= SET_DEST (curr_set
);
20375 if (GET_CODE (set_dest
) == ZERO_EXTRACT
20376 && CONST_INT_P (SET_SRC (curr_set
))
20377 && CONST_INT_P (SET_SRC (prev_set
))
20378 && CONST_INT_P (XEXP (set_dest
, 2))
20379 && INTVAL (XEXP (set_dest
, 2)) == 16
20380 && REG_P (XEXP (set_dest
, 0))
20381 && REG_P (SET_DEST (prev_set
))
20382 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
20388 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
20391 /* We're trying to match:
20392 prev (adrp) == (set (reg r1)
20393 (high (symbol_ref ("SYM"))))
20394 curr (add) == (set (reg r0)
20396 (symbol_ref ("SYM"))))
20397 Note that r0 need not necessarily be the same as r1, especially
20398 during pre-regalloc scheduling. */
20400 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
20401 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
20403 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
20404 && REG_P (XEXP (SET_SRC (curr_set
), 0))
20405 && REGNO (XEXP (SET_SRC (curr_set
), 0))
20406 == REGNO (SET_DEST (prev_set
))
20407 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
20408 XEXP (SET_SRC (curr_set
), 1)))
20413 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
20416 /* We're trying to match:
20417 prev (movk) == (set (zero_extract (reg r0)
20420 (const_int imm16_1))
20421 curr (movk) == (set (zero_extract (reg r0)
20424 (const_int imm16_2)) */
20426 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
20427 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
20428 && REG_P (XEXP (SET_DEST (prev_set
), 0))
20429 && REG_P (XEXP (SET_DEST (curr_set
), 0))
20430 && REGNO (XEXP (SET_DEST (prev_set
), 0))
20431 == REGNO (XEXP (SET_DEST (curr_set
), 0))
20432 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
20433 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
20434 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
20435 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
20436 && CONST_INT_P (SET_SRC (prev_set
))
20437 && CONST_INT_P (SET_SRC (curr_set
)))
20441 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
20443 /* We're trying to match:
20444 prev (adrp) == (set (reg r0)
20445 (high (symbol_ref ("SYM"))))
20446 curr (ldr) == (set (reg r1)
20447 (mem (lo_sum (reg r0)
20448 (symbol_ref ("SYM")))))
20450 curr (ldr) == (set (reg r1)
20453 (symbol_ref ("SYM")))))) */
20454 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
20455 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
20457 rtx curr_src
= SET_SRC (curr_set
);
20459 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
20460 curr_src
= XEXP (curr_src
, 0);
20462 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
20463 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
20464 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
20465 == REGNO (SET_DEST (prev_set
))
20466 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
20467 XEXP (SET_SRC (prev_set
), 0)))
20472 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
20473 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
20474 && prev_set
&& curr_set
&& any_condjump_p (curr
)
20475 && GET_CODE (SET_SRC (prev_set
)) == COMPARE
20476 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set
), 0)))
20477 && reg_referenced_p (SET_DEST (prev_set
), PATTERN (curr
)))
20480 /* Fuse flag-setting ALU instructions and conditional branch. */
20481 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
20482 && any_condjump_p (curr
))
20484 unsigned int condreg1
, condreg2
;
20486 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
20487 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
20489 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
20491 && modified_in_p (cc_reg_1
, prev
))
20493 enum attr_type prev_type
= get_attr_type (prev
);
20495 /* FIXME: this misses some which is considered simple arthematic
20496 instructions for ThunderX. Simple shifts are missed here. */
20497 if (prev_type
== TYPE_ALUS_SREG
20498 || prev_type
== TYPE_ALUS_IMM
20499 || prev_type
== TYPE_LOGICS_REG
20500 || prev_type
== TYPE_LOGICS_IMM
)
20505 /* Fuse ALU instructions and CBZ/CBNZ. */
20508 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ
)
20509 && any_condjump_p (curr
))
20511 /* We're trying to match:
20512 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
20513 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
20515 (label_ref ("SYM"))
20517 if (SET_DEST (curr_set
) == (pc_rtx
)
20518 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
20519 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
20520 && REG_P (SET_DEST (prev_set
))
20521 && REGNO (SET_DEST (prev_set
))
20522 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
20524 /* Fuse ALU operations followed by conditional branch instruction. */
20525 switch (get_attr_type (prev
))
20528 case TYPE_ALU_SREG
:
20531 case TYPE_ADCS_REG
:
20532 case TYPE_ADCS_IMM
:
20533 case TYPE_LOGIC_REG
:
20534 case TYPE_LOGIC_IMM
:
20538 case TYPE_SHIFT_REG
:
20539 case TYPE_SHIFT_IMM
:
20554 /* Return true iff the instruction fusion described by OP is enabled. */
20557 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op
)
20559 return (aarch64_tune_params
.fusible_ops
& op
) != 0;
20562 /* If MEM is in the form of [base+offset], extract the two parts
20563 of address and set to BASE and OFFSET, otherwise return false
20564 after clearing BASE and OFFSET. */
20567 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
20571 gcc_assert (MEM_P (mem
));
20573 addr
= XEXP (mem
, 0);
20578 *offset
= const0_rtx
;
20582 if (GET_CODE (addr
) == PLUS
20583 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
20585 *base
= XEXP (addr
, 0);
20586 *offset
= XEXP (addr
, 1);
20591 *offset
= NULL_RTX
;
20596 /* Types for scheduling fusion. */
20597 enum sched_fusion_type
20599 SCHED_FUSION_NONE
= 0,
20600 SCHED_FUSION_LD_SIGN_EXTEND
,
20601 SCHED_FUSION_LD_ZERO_EXTEND
,
20607 /* If INSN is a load or store of address in the form of [base+offset],
20608 extract the two parts and set to BASE and OFFSET. Return scheduling
20609 fusion type this INSN is. */
20611 static enum sched_fusion_type
20612 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
20615 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
20617 gcc_assert (INSN_P (insn
));
20618 x
= PATTERN (insn
);
20619 if (GET_CODE (x
) != SET
)
20620 return SCHED_FUSION_NONE
;
20623 dest
= SET_DEST (x
);
20625 machine_mode dest_mode
= GET_MODE (dest
);
20627 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
20628 return SCHED_FUSION_NONE
;
20630 if (GET_CODE (src
) == SIGN_EXTEND
)
20632 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
20633 src
= XEXP (src
, 0);
20634 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
20635 return SCHED_FUSION_NONE
;
20637 else if (GET_CODE (src
) == ZERO_EXTEND
)
20639 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
20640 src
= XEXP (src
, 0);
20641 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
20642 return SCHED_FUSION_NONE
;
20645 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
20646 extract_base_offset_in_addr (src
, base
, offset
);
20647 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
20649 fusion
= SCHED_FUSION_ST
;
20650 extract_base_offset_in_addr (dest
, base
, offset
);
20653 return SCHED_FUSION_NONE
;
20655 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
20656 fusion
= SCHED_FUSION_NONE
;
20661 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
20663 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
20664 and PRI are only calculated for these instructions. For other instruction,
20665 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
20666 type instruction fusion can be added by returning different priorities.
20668 It's important that irrelevant instructions get the largest FUSION_PRI. */
20671 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
20672 int *fusion_pri
, int *pri
)
20676 enum sched_fusion_type fusion
;
20678 gcc_assert (INSN_P (insn
));
20681 fusion
= fusion_load_store (insn
, &base
, &offset
);
20682 if (fusion
== SCHED_FUSION_NONE
)
20689 /* Set FUSION_PRI according to fusion type and base register. */
20690 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
20692 /* Calculate PRI. */
20695 /* INSN with smaller offset goes first. */
20696 off_val
= (int)(INTVAL (offset
));
20698 tmp
-= (off_val
& 0xfffff);
20700 tmp
+= ((- off_val
) & 0xfffff);
20706 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
20707 Adjust priority of sha1h instructions so they are scheduled before
20708 other SHA1 instructions. */
20711 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
20713 rtx x
= PATTERN (insn
);
20715 if (GET_CODE (x
) == SET
)
20719 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
20720 return priority
+ 10;
20726 /* Given OPERANDS of consecutive load/store, check if we can merge
20727 them into ldp/stp. LOAD is true if they are load instructions.
20728 MODE is the mode of memory operands. */
20731 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
20734 HOST_WIDE_INT offval_1
, offval_2
, msize
;
20735 enum reg_class rclass_1
, rclass_2
;
20736 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
20740 mem_1
= operands
[1];
20741 mem_2
= operands
[3];
20742 reg_1
= operands
[0];
20743 reg_2
= operands
[2];
20744 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
20745 if (REGNO (reg_1
) == REGNO (reg_2
))
20750 mem_1
= operands
[0];
20751 mem_2
= operands
[2];
20752 reg_1
= operands
[1];
20753 reg_2
= operands
[3];
20756 /* The mems cannot be volatile. */
20757 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
20760 /* If we have SImode and slow unaligned ldp,
20761 check the alignment to be at least 8 byte. */
20763 && (aarch64_tune_params
.extra_tuning_flags
20764 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
20766 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
20769 /* Check if the addresses are in the form of [base+offset]. */
20770 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
20771 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
20773 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
20774 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
20777 /* Check if the bases are same. */
20778 if (!rtx_equal_p (base_1
, base_2
))
20781 /* The operands must be of the same size. */
20782 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
20783 GET_MODE_SIZE (GET_MODE (mem_2
))));
20785 offval_1
= INTVAL (offset_1
);
20786 offval_2
= INTVAL (offset_2
);
20787 /* We should only be trying this for fixed-sized modes. There is no
20788 SVE LDP/STP instruction. */
20789 msize
= GET_MODE_SIZE (mode
).to_constant ();
20790 /* Check if the offsets are consecutive. */
20791 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
20794 /* Check if the addresses are clobbered by load. */
20797 if (reg_mentioned_p (reg_1
, mem_1
))
20800 /* In increasing order, the last load can clobber the address. */
20801 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
20805 /* One of the memory accesses must be a mempair operand.
20806 If it is not the first one, they need to be swapped by the
20808 if (!aarch64_mem_pair_operand (mem_1
, GET_MODE (mem_1
))
20809 && !aarch64_mem_pair_operand (mem_2
, GET_MODE (mem_2
)))
20812 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
20813 rclass_1
= FP_REGS
;
20815 rclass_1
= GENERAL_REGS
;
20817 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
20818 rclass_2
= FP_REGS
;
20820 rclass_2
= GENERAL_REGS
;
20822 /* Check if the registers are of same class. */
20823 if (rclass_1
!= rclass_2
)
20829 /* Given OPERANDS of consecutive load/store that can be merged,
20830 swap them if they are not in ascending order. */
20832 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
20834 rtx mem_1
, mem_2
, base_1
, base_2
, offset_1
, offset_2
;
20835 HOST_WIDE_INT offval_1
, offval_2
;
20839 mem_1
= operands
[1];
20840 mem_2
= operands
[3];
20844 mem_1
= operands
[0];
20845 mem_2
= operands
[2];
20848 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
20849 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
20851 offval_1
= INTVAL (offset_1
);
20852 offval_2
= INTVAL (offset_2
);
20854 if (offval_1
> offval_2
)
20856 /* Irrespective of whether this is a load or a store,
20857 we do the same swap. */
20858 std::swap (operands
[0], operands
[2]);
20859 std::swap (operands
[1], operands
[3]);
20863 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
20864 comparison between the two. */
20866 aarch64_host_wide_int_compare (const void *x
, const void *y
)
20868 return wi::cmps (* ((const HOST_WIDE_INT
*) x
),
20869 * ((const HOST_WIDE_INT
*) y
));
20872 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
20873 other pointing to a REG rtx containing an offset, compare the offsets
20878 1 iff offset (X) > offset (Y)
20879 0 iff offset (X) == offset (Y)
20880 -1 iff offset (X) < offset (Y) */
20882 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
20884 const rtx
* operands_1
= (const rtx
*) x
;
20885 const rtx
* operands_2
= (const rtx
*) y
;
20886 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
20888 if (MEM_P (operands_1
[0]))
20889 mem_1
= operands_1
[0];
20891 mem_1
= operands_1
[1];
20893 if (MEM_P (operands_2
[0]))
20894 mem_2
= operands_2
[0];
20896 mem_2
= operands_2
[1];
20898 /* Extract the offsets. */
20899 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
20900 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
20902 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
20904 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
20907 /* Given OPERANDS of consecutive load/store, check if we can merge
20908 them into ldp/stp by adjusting the offset. LOAD is true if they
20909 are load instructions. MODE is the mode of memory operands.
20911 Given below consecutive stores:
20913 str w1, [xb, 0x100]
20914 str w1, [xb, 0x104]
20915 str w1, [xb, 0x108]
20916 str w1, [xb, 0x10c]
20918 Though the offsets are out of the range supported by stp, we can
20919 still pair them after adjusting the offset, like:
20921 add scratch, xb, 0x100
20922 stp w1, w1, [scratch]
20923 stp w1, w1, [scratch, 0x8]
20925 The peephole patterns detecting this opportunity should guarantee
20926 the scratch register is avaliable. */
20929 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
20932 const int num_insns
= 4;
20933 enum reg_class rclass
;
20934 HOST_WIDE_INT offvals
[num_insns
], msize
;
20935 rtx mem
[num_insns
], reg
[num_insns
], base
[num_insns
], offset
[num_insns
];
20939 for (int i
= 0; i
< num_insns
; i
++)
20941 reg
[i
] = operands
[2 * i
];
20942 mem
[i
] = operands
[2 * i
+ 1];
20944 gcc_assert (REG_P (reg
[i
]));
20947 /* Do not attempt to merge the loads if the loads clobber each other. */
20948 for (int i
= 0; i
< 8; i
+= 2)
20949 for (int j
= i
+ 2; j
< 8; j
+= 2)
20950 if (reg_overlap_mentioned_p (operands
[i
], operands
[j
]))
20954 for (int i
= 0; i
< num_insns
; i
++)
20956 mem
[i
] = operands
[2 * i
];
20957 reg
[i
] = operands
[2 * i
+ 1];
20960 /* Skip if memory operand is by itself valid for ldp/stp. */
20961 if (!MEM_P (mem
[0]) || aarch64_mem_pair_operand (mem
[0], mode
))
20964 for (int i
= 0; i
< num_insns
; i
++)
20966 /* The mems cannot be volatile. */
20967 if (MEM_VOLATILE_P (mem
[i
]))
20970 /* Check if the addresses are in the form of [base+offset]. */
20971 extract_base_offset_in_addr (mem
[i
], base
+ i
, offset
+ i
);
20972 if (base
[i
] == NULL_RTX
|| offset
[i
] == NULL_RTX
)
20976 /* Check if the registers are of same class. */
20977 rclass
= REG_P (reg
[0]) && FP_REGNUM_P (REGNO (reg
[0]))
20978 ? FP_REGS
: GENERAL_REGS
;
20980 for (int i
= 1; i
< num_insns
; i
++)
20981 if (REG_P (reg
[i
]) && FP_REGNUM_P (REGNO (reg
[i
])))
20983 if (rclass
!= FP_REGS
)
20988 if (rclass
!= GENERAL_REGS
)
20992 /* Only the last register in the order in which they occur
20993 may be clobbered by the load. */
20994 if (rclass
== GENERAL_REGS
&& load
)
20995 for (int i
= 0; i
< num_insns
- 1; i
++)
20996 if (reg_mentioned_p (reg
[i
], mem
[i
]))
20999 /* Check if the bases are same. */
21000 for (int i
= 0; i
< num_insns
- 1; i
++)
21001 if (!rtx_equal_p (base
[i
], base
[i
+ 1]))
21004 for (int i
= 0; i
< num_insns
; i
++)
21005 offvals
[i
] = INTVAL (offset
[i
]);
21007 msize
= GET_MODE_SIZE (mode
);
21009 /* Check if the offsets can be put in the right order to do a ldp/stp. */
21010 qsort (offvals
, num_insns
, sizeof (HOST_WIDE_INT
),
21011 aarch64_host_wide_int_compare
);
21013 if (!(offvals
[1] == offvals
[0] + msize
21014 && offvals
[3] == offvals
[2] + msize
))
21017 /* Check that offsets are within range of each other. The ldp/stp
21018 instructions have 7 bit immediate offsets, so use 0x80. */
21019 if (offvals
[2] - offvals
[0] >= msize
* 0x80)
21022 /* The offsets must be aligned with respect to each other. */
21023 if (offvals
[0] % msize
!= offvals
[2] % msize
)
21026 /* If we have SImode and slow unaligned ldp,
21027 check the alignment to be at least 8 byte. */
21029 && (aarch64_tune_params
.extra_tuning_flags
21030 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
21032 && MEM_ALIGN (mem
[0]) < 8 * BITS_PER_UNIT
)
21038 /* Given OPERANDS of consecutive load/store, this function pairs them
21039 into LDP/STP after adjusting the offset. It depends on the fact
21040 that the operands can be sorted so the offsets are correct for STP.
21041 MODE is the mode of memory operands. CODE is the rtl operator
21042 which should be applied to all memory operands, it's SIGN_EXTEND,
21043 ZERO_EXTEND or UNKNOWN. */
21046 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
21047 scalar_mode mode
, RTX_CODE code
)
21049 rtx base
, offset_1
, offset_3
, t1
, t2
;
21050 rtx mem_1
, mem_2
, mem_3
, mem_4
;
21051 rtx temp_operands
[8];
21052 HOST_WIDE_INT off_val_1
, off_val_3
, base_off
, new_off_1
, new_off_3
,
21053 stp_off_upper_limit
, stp_off_lower_limit
, msize
;
21055 /* We make changes on a copy as we may still bail out. */
21056 for (int i
= 0; i
< 8; i
++)
21057 temp_operands
[i
] = operands
[i
];
21059 /* Sort the operands. */
21060 qsort (temp_operands
, 4, 2 * sizeof (rtx
*), aarch64_ldrstr_offset_compare
);
21062 /* Copy the memory operands so that if we have to bail for some
21063 reason the original addresses are unchanged. */
21066 mem_1
= copy_rtx (temp_operands
[1]);
21067 mem_2
= copy_rtx (temp_operands
[3]);
21068 mem_3
= copy_rtx (temp_operands
[5]);
21069 mem_4
= copy_rtx (temp_operands
[7]);
21073 mem_1
= copy_rtx (temp_operands
[0]);
21074 mem_2
= copy_rtx (temp_operands
[2]);
21075 mem_3
= copy_rtx (temp_operands
[4]);
21076 mem_4
= copy_rtx (temp_operands
[6]);
21077 gcc_assert (code
== UNKNOWN
);
21080 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
21081 extract_base_offset_in_addr (mem_3
, &base
, &offset_3
);
21082 gcc_assert (base
!= NULL_RTX
&& offset_1
!= NULL_RTX
21083 && offset_3
!= NULL_RTX
);
21085 /* Adjust offset so it can fit in LDP/STP instruction. */
21086 msize
= GET_MODE_SIZE (mode
);
21087 stp_off_upper_limit
= msize
* (0x40 - 1);
21088 stp_off_lower_limit
= - msize
* 0x40;
21090 off_val_1
= INTVAL (offset_1
);
21091 off_val_3
= INTVAL (offset_3
);
21093 /* The base offset is optimally half way between the two STP/LDP offsets. */
21095 base_off
= (off_val_1
+ off_val_3
) / 2;
21097 /* However, due to issues with negative LDP/STP offset generation for
21098 larger modes, for DF, DI and vector modes. we must not use negative
21099 addresses smaller than 9 signed unadjusted bits can store. This
21100 provides the most range in this case. */
21101 base_off
= off_val_1
;
21103 /* Adjust the base so that it is aligned with the addresses but still
21105 if (base_off
% msize
!= off_val_1
% msize
)
21106 /* Fix the offset, bearing in mind we want to make it bigger not
21108 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
21109 else if (msize
<= 4)
21110 /* The negative range of LDP/STP is one larger than the positive range. */
21113 /* Check if base offset is too big or too small. We can attempt to resolve
21114 this issue by setting it to the maximum value and seeing if the offsets
21116 if (base_off
>= 0x1000)
21118 base_off
= 0x1000 - 1;
21119 /* We must still make sure that the base offset is aligned with respect
21120 to the address. But it may may not be made any bigger. */
21121 base_off
-= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
21124 /* Likewise for the case where the base is too small. */
21125 if (base_off
<= -0x1000)
21127 base_off
= -0x1000 + 1;
21128 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
21131 /* Offset of the first STP/LDP. */
21132 new_off_1
= off_val_1
- base_off
;
21134 /* Offset of the second STP/LDP. */
21135 new_off_3
= off_val_3
- base_off
;
21137 /* The offsets must be within the range of the LDP/STP instructions. */
21138 if (new_off_1
> stp_off_upper_limit
|| new_off_1
< stp_off_lower_limit
21139 || new_off_3
> stp_off_upper_limit
|| new_off_3
< stp_off_lower_limit
)
21142 replace_equiv_address_nv (mem_1
, plus_constant (Pmode
, operands
[8],
21144 replace_equiv_address_nv (mem_2
, plus_constant (Pmode
, operands
[8],
21145 new_off_1
+ msize
), true);
21146 replace_equiv_address_nv (mem_3
, plus_constant (Pmode
, operands
[8],
21148 replace_equiv_address_nv (mem_4
, plus_constant (Pmode
, operands
[8],
21149 new_off_3
+ msize
), true);
21151 if (!aarch64_mem_pair_operand (mem_1
, mode
)
21152 || !aarch64_mem_pair_operand (mem_3
, mode
))
21155 if (code
== ZERO_EXTEND
)
21157 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
21158 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
21159 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
21160 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
21162 else if (code
== SIGN_EXTEND
)
21164 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
21165 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
21166 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
21167 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
21172 operands
[0] = temp_operands
[0];
21173 operands
[1] = mem_1
;
21174 operands
[2] = temp_operands
[2];
21175 operands
[3] = mem_2
;
21176 operands
[4] = temp_operands
[4];
21177 operands
[5] = mem_3
;
21178 operands
[6] = temp_operands
[6];
21179 operands
[7] = mem_4
;
21183 operands
[0] = mem_1
;
21184 operands
[1] = temp_operands
[1];
21185 operands
[2] = mem_2
;
21186 operands
[3] = temp_operands
[3];
21187 operands
[4] = mem_3
;
21188 operands
[5] = temp_operands
[5];
21189 operands
[6] = mem_4
;
21190 operands
[7] = temp_operands
[7];
21193 /* Emit adjusting instruction. */
21194 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, base_off
)));
21195 /* Emit ldp/stp instructions. */
21196 t1
= gen_rtx_SET (operands
[0], operands
[1]);
21197 t2
= gen_rtx_SET (operands
[2], operands
[3]);
21198 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
21199 t1
= gen_rtx_SET (operands
[4], operands
[5]);
21200 t2
= gen_rtx_SET (operands
[6], operands
[7]);
21201 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
21205 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
21206 it isn't worth branching around empty masked ops (including masked
21210 aarch64_empty_mask_is_expensive (unsigned)
21215 /* Return 1 if pseudo register should be created and used to hold
21216 GOT address for PIC code. */
21219 aarch64_use_pseudo_pic_reg (void)
21221 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
;
21224 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
21227 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
21229 switch (XINT (x
, 1))
21231 case UNSPEC_GOTSMALLPIC
:
21232 case UNSPEC_GOTSMALLPIC28K
:
21233 case UNSPEC_GOTTINYPIC
:
21239 return default_unspec_may_trap_p (x
, flags
);
21243 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
21244 return the log2 of that value. Otherwise return -1. */
21247 aarch64_fpconst_pow_of_2 (rtx x
)
21249 const REAL_VALUE_TYPE
*r
;
21251 if (!CONST_DOUBLE_P (x
))
21254 r
= CONST_DOUBLE_REAL_VALUE (x
);
21256 if (REAL_VALUE_NEGATIVE (*r
)
21257 || REAL_VALUE_ISNAN (*r
)
21258 || REAL_VALUE_ISINF (*r
)
21259 || !real_isinteger (r
, DFmode
))
21262 return exact_log2 (real_to_integer (r
));
21265 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
21266 power of 2 (i.e 1/2^n) return the number of float bits. e.g. for x==(1/2^n)
21267 return n. Otherwise return -1. */
21270 aarch64_fpconst_pow2_recip (rtx x
)
21272 REAL_VALUE_TYPE r0
;
21274 if (!CONST_DOUBLE_P (x
))
21277 r0
= *CONST_DOUBLE_REAL_VALUE (x
);
21278 if (exact_real_inverse (DFmode
, &r0
)
21279 && !REAL_VALUE_NEGATIVE (r0
))
21281 int ret
= exact_log2 (real_to_integer (&r0
));
21282 if (ret
>= 1 && ret
<= 32)
21288 /* If X is a vector of equal CONST_DOUBLE values and that value is
21289 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
21292 aarch64_vec_fpconst_pow_of_2 (rtx x
)
21295 if (GET_CODE (x
) != CONST_VECTOR
21296 || !CONST_VECTOR_NUNITS (x
).is_constant (&nelts
))
21299 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
21302 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
21306 for (int i
= 1; i
< nelts
; i
++)
21307 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
21313 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
21316 __fp16 always promotes through this hook.
21317 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
21318 through the generic excess precision logic rather than here. */
21321 aarch64_promoted_type (const_tree t
)
21323 if (SCALAR_FLOAT_TYPE_P (t
)
21324 && TYPE_MAIN_VARIANT (t
) == aarch64_fp16_type_node
)
21325 return float_type_node
;
21330 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
21333 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
21334 optimization_type opt_type
)
21339 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
21346 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
21348 static unsigned int
21349 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
21352 /* Polynomial invariant 1 == (VG / 2) - 1. */
21353 gcc_assert (i
== 1);
21356 return AARCH64_DWARF_VG
;
21359 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
21360 if MODE is HFmode, and punt to the generic implementation otherwise. */
21363 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
21365 return (mode
== HFmode
21367 : default_libgcc_floating_mode_supported_p (mode
));
21370 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
21371 if MODE is HFmode, and punt to the generic implementation otherwise. */
21374 aarch64_scalar_mode_supported_p (scalar_mode mode
)
21376 return (mode
== HFmode
21378 : default_scalar_mode_supported_p (mode
));
21381 /* Set the value of FLT_EVAL_METHOD.
21382 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
21384 0: evaluate all operations and constants, whose semantic type has at
21385 most the range and precision of type float, to the range and
21386 precision of float; evaluate all other operations and constants to
21387 the range and precision of the semantic type;
21389 N, where _FloatN is a supported interchange floating type
21390 evaluate all operations and constants, whose semantic type has at
21391 most the range and precision of _FloatN type, to the range and
21392 precision of the _FloatN type; evaluate all other operations and
21393 constants to the range and precision of the semantic type;
21395 If we have the ARMv8.2-A extensions then we support _Float16 in native
21396 precision, so we should set this to 16. Otherwise, we support the type,
21397 but want to evaluate expressions in float precision, so set this to
21400 static enum flt_eval_method
21401 aarch64_excess_precision (enum excess_precision_type type
)
21405 case EXCESS_PRECISION_TYPE_FAST
:
21406 case EXCESS_PRECISION_TYPE_STANDARD
:
21407 /* We can calculate either in 16-bit range and precision or
21408 32-bit range and precision. Make that decision based on whether
21409 we have native support for the ARMv8.2-A 16-bit floating-point
21410 instructions or not. */
21411 return (TARGET_FP_F16INST
21412 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
21413 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
21414 case EXCESS_PRECISION_TYPE_IMPLICIT
:
21415 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
21417 gcc_unreachable ();
21419 return FLT_EVAL_METHOD_UNPREDICTABLE
;
21422 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
21423 scheduled for speculative execution. Reject the long-running division
21424 and square-root instructions. */
21427 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
21429 switch (get_attr_type (insn
))
21437 case TYPE_NEON_FP_SQRT_S
:
21438 case TYPE_NEON_FP_SQRT_D
:
21439 case TYPE_NEON_FP_SQRT_S_Q
:
21440 case TYPE_NEON_FP_SQRT_D_Q
:
21441 case TYPE_NEON_FP_DIV_S
:
21442 case TYPE_NEON_FP_DIV_D
:
21443 case TYPE_NEON_FP_DIV_S_Q
:
21444 case TYPE_NEON_FP_DIV_D_Q
:
21451 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
21454 aarch64_compute_pressure_classes (reg_class
*classes
)
21457 classes
[i
++] = GENERAL_REGS
;
21458 classes
[i
++] = FP_REGS
;
21459 /* PR_REGS isn't a useful pressure class because many predicate pseudo
21460 registers need to go in PR_LO_REGS at some point during their
21461 lifetime. Splitting it into two halves has the effect of making
21462 all predicates count against PR_LO_REGS, so that we try whenever
21463 possible to restrict the number of live predicates to 8. This
21464 greatly reduces the amount of spilling in certain loops. */
21465 classes
[i
++] = PR_LO_REGS
;
21466 classes
[i
++] = PR_HI_REGS
;
21470 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
21473 aarch64_can_change_mode_class (machine_mode from
,
21474 machine_mode to
, reg_class_t
)
21476 unsigned int from_flags
= aarch64_classify_vector_mode (from
);
21477 unsigned int to_flags
= aarch64_classify_vector_mode (to
);
21479 bool from_sve_p
= (from_flags
& VEC_ANY_SVE
);
21480 bool to_sve_p
= (to_flags
& VEC_ANY_SVE
);
21482 bool from_partial_sve_p
= from_sve_p
&& (from_flags
& VEC_PARTIAL
);
21483 bool to_partial_sve_p
= to_sve_p
&& (to_flags
& VEC_PARTIAL
);
21485 /* Don't allow changes between partial SVE modes and other modes.
21486 The contents of partial SVE modes are distributed evenly across
21487 the register, whereas GCC expects them to be clustered together. */
21488 if (from_partial_sve_p
!= to_partial_sve_p
)
21491 /* Similarly reject changes between partial SVE modes that have
21492 different patterns of significant and insignificant bits. */
21493 if (from_partial_sve_p
21494 && (aarch64_sve_container_bits (from
) != aarch64_sve_container_bits (to
)
21495 || GET_MODE_UNIT_SIZE (from
) != GET_MODE_UNIT_SIZE (to
)))
21498 if (BYTES_BIG_ENDIAN
)
21500 /* Don't allow changes between SVE data modes and non-SVE modes.
21501 See the comment at the head of aarch64-sve.md for details. */
21502 if (from_sve_p
!= to_sve_p
)
21505 /* Don't allow changes in element size: lane 0 of the new vector
21506 would not then be lane 0 of the old vector. See the comment
21507 above aarch64_maybe_expand_sve_subreg_move for a more detailed
21510 In the worst case, this forces a register to be spilled in
21511 one mode and reloaded in the other, which handles the
21512 endianness correctly. */
21513 if (from_sve_p
&& GET_MODE_UNIT_SIZE (from
) != GET_MODE_UNIT_SIZE (to
))
21519 /* Implement TARGET_EARLY_REMAT_MODES. */
21522 aarch64_select_early_remat_modes (sbitmap modes
)
21524 /* SVE values are not normally live across a call, so it should be
21525 worth doing early rematerialization even in VL-specific mode. */
21526 for (int i
= 0; i
< NUM_MACHINE_MODES
; ++i
)
21527 if (aarch64_sve_mode_p ((machine_mode
) i
))
21528 bitmap_set_bit (modes
, i
);
21531 /* Override the default target speculation_safe_value. */
21533 aarch64_speculation_safe_value (machine_mode mode
,
21534 rtx result
, rtx val
, rtx failval
)
21536 /* Maybe we should warn if falling back to hard barriers. They are
21537 likely to be noticably more expensive than the alternative below. */
21538 if (!aarch64_track_speculation
)
21539 return default_speculation_safe_value (mode
, result
, val
, failval
);
21542 val
= copy_to_mode_reg (mode
, val
);
21544 if (!aarch64_reg_or_zero (failval
, mode
))
21545 failval
= copy_to_mode_reg (mode
, failval
);
21547 emit_insn (gen_despeculate_copy (mode
, result
, val
, failval
));
21551 /* Implement TARGET_ESTIMATED_POLY_VALUE.
21552 Look into the tuning structure for an estimate.
21553 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
21554 Advanced SIMD 128 bits. */
21556 static HOST_WIDE_INT
21557 aarch64_estimated_poly_value (poly_int64 val
)
21559 enum aarch64_sve_vector_bits_enum width_source
21560 = aarch64_tune_params
.sve_width
;
21562 /* If we still don't have an estimate, use the default. */
21563 if (width_source
== SVE_SCALABLE
)
21564 return default_estimated_poly_value (val
);
21566 HOST_WIDE_INT over_128
= width_source
- 128;
21567 return val
.coeffs
[0] + val
.coeffs
[1] * over_128
/ 128;
21571 /* Return true for types that could be supported as SIMD return or
21575 supported_simd_type (tree t
)
21577 if (SCALAR_FLOAT_TYPE_P (t
) || INTEGRAL_TYPE_P (t
) || POINTER_TYPE_P (t
))
21579 HOST_WIDE_INT s
= tree_to_shwi (TYPE_SIZE_UNIT (t
));
21580 return s
== 1 || s
== 2 || s
== 4 || s
== 8;
21585 /* Return true for types that currently are supported as SIMD return
21586 or argument types. */
21589 currently_supported_simd_type (tree t
, tree b
)
21591 if (COMPLEX_FLOAT_TYPE_P (t
))
21594 if (TYPE_SIZE (t
) != TYPE_SIZE (b
))
21597 return supported_simd_type (t
);
21600 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
21603 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node
*node
,
21604 struct cgraph_simd_clone
*clonei
,
21605 tree base_type
, int num
)
21607 tree t
, ret_type
, arg_type
;
21608 unsigned int elt_bits
, vec_bits
, count
;
21613 if (clonei
->simdlen
21614 && (clonei
->simdlen
< 2
21615 || clonei
->simdlen
> 1024
21616 || (clonei
->simdlen
& (clonei
->simdlen
- 1)) != 0))
21618 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
21619 "unsupported simdlen %d", clonei
->simdlen
);
21623 ret_type
= TREE_TYPE (TREE_TYPE (node
->decl
));
21624 if (TREE_CODE (ret_type
) != VOID_TYPE
21625 && !currently_supported_simd_type (ret_type
, base_type
))
21627 if (TYPE_SIZE (ret_type
) != TYPE_SIZE (base_type
))
21628 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
21629 "GCC does not currently support mixed size types "
21630 "for %<simd%> functions");
21631 else if (supported_simd_type (ret_type
))
21632 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
21633 "GCC does not currently support return type %qT "
21634 "for %<simd%> functions", ret_type
);
21636 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
21637 "unsupported return type %qT for %<simd%> functions",
21642 for (t
= DECL_ARGUMENTS (node
->decl
); t
; t
= DECL_CHAIN (t
))
21644 arg_type
= TREE_TYPE (t
);
21646 if (!currently_supported_simd_type (arg_type
, base_type
))
21648 if (TYPE_SIZE (arg_type
) != TYPE_SIZE (base_type
))
21649 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
21650 "GCC does not currently support mixed size types "
21651 "for %<simd%> functions");
21653 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
21654 "GCC does not currently support argument type %qT "
21655 "for %<simd%> functions", arg_type
);
21660 clonei
->vecsize_mangle
= 'n';
21661 clonei
->mask_mode
= VOIDmode
;
21662 elt_bits
= GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type
));
21663 if (clonei
->simdlen
== 0)
21666 vec_bits
= (num
== 0 ? 64 : 128);
21667 clonei
->simdlen
= vec_bits
/ elt_bits
;
21672 vec_bits
= clonei
->simdlen
* elt_bits
;
21673 if (vec_bits
!= 64 && vec_bits
!= 128)
21675 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
21676 "GCC does not currently support simdlen %d for type %qT",
21677 clonei
->simdlen
, base_type
);
21681 clonei
->vecsize_int
= vec_bits
;
21682 clonei
->vecsize_float
= vec_bits
;
21686 /* Implement TARGET_SIMD_CLONE_ADJUST. */
21689 aarch64_simd_clone_adjust (struct cgraph_node
*node
)
21691 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
21692 use the correct ABI. */
21694 tree t
= TREE_TYPE (node
->decl
);
21695 TYPE_ATTRIBUTES (t
) = make_attribute ("aarch64_vector_pcs", "default",
21696 TYPE_ATTRIBUTES (t
));
21699 /* Implement TARGET_SIMD_CLONE_USABLE. */
21702 aarch64_simd_clone_usable (struct cgraph_node
*node
)
21704 switch (node
->simdclone
->vecsize_mangle
)
21711 gcc_unreachable ();
21715 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
21718 aarch64_comp_type_attributes (const_tree type1
, const_tree type2
)
21720 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1
))
21721 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2
)))
21726 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
21728 static const char *
21729 aarch64_get_multilib_abi_name (void)
21731 if (TARGET_BIG_END
)
21732 return TARGET_ILP32
? "aarch64_be_ilp32" : "aarch64_be";
21733 return TARGET_ILP32
? "aarch64_ilp32" : "aarch64";
21736 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
21737 global variable based guard use the default else
21738 return a null tree. */
21740 aarch64_stack_protect_guard (void)
21742 if (aarch64_stack_protector_guard
== SSP_GLOBAL
)
21743 return default_stack_protect_guard ();
21748 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
21749 section at the end if needed. */
21750 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
21751 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
21752 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
21754 aarch64_file_end_indicate_exec_stack ()
21756 file_end_indicate_exec_stack ();
21758 unsigned feature_1_and
= 0;
21759 if (aarch64_bti_enabled ())
21760 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_BTI
;
21762 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
)
21763 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_PAC
;
21767 /* Generate .note.gnu.property section. */
21768 switch_to_section (get_section (".note.gnu.property",
21769 SECTION_NOTYPE
, NULL
));
21771 /* PT_NOTE header: namesz, descsz, type.
21772 namesz = 4 ("GNU\0")
21773 descsz = 16 (Size of the program property array)
21774 [(12 + padding) * Number of array elements]
21775 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
21776 assemble_align (POINTER_SIZE
);
21777 assemble_integer (GEN_INT (4), 4, 32, 1);
21778 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES
)), 4, 32, 1);
21779 assemble_integer (GEN_INT (5), 4, 32, 1);
21781 /* PT_NOTE name. */
21782 assemble_string ("GNU", 4);
21784 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
21785 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
21787 data = feature_1_and. */
21788 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND
), 4, 32, 1);
21789 assemble_integer (GEN_INT (4), 4, 32, 1);
21790 assemble_integer (GEN_INT (feature_1_and
), 4, 32, 1);
21792 /* Pad the size of the note to the required alignment. */
21793 assemble_align (POINTER_SIZE
);
21796 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
21797 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
21798 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */
21846 #undef TARGET_STACK_PROTECT_GUARD
21847 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
21849 #undef TARGET_ADDRESS_COST
21850 #define TARGET_ADDRESS_COST aarch64_address_cost
21852 /* This hook will determines whether unnamed bitfields affect the alignment
21853 of the containing structure. The hook returns true if the structure
21854 should inherit the alignment requirements of an unnamed bitfield's
21856 #undef TARGET_ALIGN_ANON_BITFIELD
21857 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
21859 #undef TARGET_ASM_ALIGNED_DI_OP
21860 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
21862 #undef TARGET_ASM_ALIGNED_HI_OP
21863 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
21865 #undef TARGET_ASM_ALIGNED_SI_OP
21866 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
21868 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
21869 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
21870 hook_bool_const_tree_hwi_hwi_const_tree_true
21872 #undef TARGET_ASM_FILE_START
21873 #define TARGET_ASM_FILE_START aarch64_start_file
21875 #undef TARGET_ASM_OUTPUT_MI_THUNK
21876 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
21878 #undef TARGET_ASM_SELECT_RTX_SECTION
21879 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
21881 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
21882 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
21884 #undef TARGET_BUILD_BUILTIN_VA_LIST
21885 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
21887 #undef TARGET_CALLEE_COPIES
21888 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
21890 #undef TARGET_CAN_ELIMINATE
21891 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
21893 #undef TARGET_CAN_INLINE_P
21894 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
21896 #undef TARGET_CANNOT_FORCE_CONST_MEM
21897 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
21899 #undef TARGET_CASE_VALUES_THRESHOLD
21900 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
21902 #undef TARGET_CONDITIONAL_REGISTER_USAGE
21903 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
21905 /* Only the least significant bit is used for initialization guard
21907 #undef TARGET_CXX_GUARD_MASK_BIT
21908 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
21910 #undef TARGET_C_MODE_FOR_SUFFIX
21911 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
21913 #ifdef TARGET_BIG_ENDIAN_DEFAULT
21914 #undef TARGET_DEFAULT_TARGET_FLAGS
21915 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
21918 #undef TARGET_CLASS_MAX_NREGS
21919 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
21921 #undef TARGET_BUILTIN_DECL
21922 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
21924 #undef TARGET_BUILTIN_RECIPROCAL
21925 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
21927 #undef TARGET_C_EXCESS_PRECISION
21928 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
21930 #undef TARGET_EXPAND_BUILTIN
21931 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
21933 #undef TARGET_EXPAND_BUILTIN_VA_START
21934 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
21936 #undef TARGET_FOLD_BUILTIN
21937 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
21939 #undef TARGET_FUNCTION_ARG
21940 #define TARGET_FUNCTION_ARG aarch64_function_arg
21942 #undef TARGET_FUNCTION_ARG_ADVANCE
21943 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
21945 #undef TARGET_FUNCTION_ARG_BOUNDARY
21946 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
21948 #undef TARGET_FUNCTION_ARG_PADDING
21949 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
21951 #undef TARGET_GET_RAW_RESULT_MODE
21952 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
21953 #undef TARGET_GET_RAW_ARG_MODE
21954 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
21956 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
21957 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
21959 #undef TARGET_FUNCTION_VALUE
21960 #define TARGET_FUNCTION_VALUE aarch64_function_value
21962 #undef TARGET_FUNCTION_VALUE_REGNO_P
21963 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
21965 #undef TARGET_GIMPLE_FOLD_BUILTIN
21966 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
21968 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
21969 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
21971 #undef TARGET_INIT_BUILTINS
21972 #define TARGET_INIT_BUILTINS aarch64_init_builtins
21974 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
21975 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
21976 aarch64_ira_change_pseudo_allocno_class
21978 #undef TARGET_LEGITIMATE_ADDRESS_P
21979 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
21981 #undef TARGET_LEGITIMATE_CONSTANT_P
21982 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
21984 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
21985 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
21986 aarch64_legitimize_address_displacement
21988 #undef TARGET_LIBGCC_CMP_RETURN_MODE
21989 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
21991 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
21992 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
21993 aarch64_libgcc_floating_mode_supported_p
21995 #undef TARGET_MANGLE_TYPE
21996 #define TARGET_MANGLE_TYPE aarch64_mangle_type
21998 #undef TARGET_VERIFY_TYPE_CONTEXT
21999 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
22001 #undef TARGET_MEMORY_MOVE_COST
22002 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
22004 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
22005 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
22007 #undef TARGET_MUST_PASS_IN_STACK
22008 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
22010 /* This target hook should return true if accesses to volatile bitfields
22011 should use the narrowest mode possible. It should return false if these
22012 accesses should use the bitfield container type. */
22013 #undef TARGET_NARROW_VOLATILE_BITFIELD
22014 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
22016 #undef TARGET_OPTION_OVERRIDE
22017 #define TARGET_OPTION_OVERRIDE aarch64_override_options
22019 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
22020 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
22021 aarch64_override_options_after_change
22023 #undef TARGET_OPTION_SAVE
22024 #define TARGET_OPTION_SAVE aarch64_option_save
22026 #undef TARGET_OPTION_RESTORE
22027 #define TARGET_OPTION_RESTORE aarch64_option_restore
22029 #undef TARGET_OPTION_PRINT
22030 #define TARGET_OPTION_PRINT aarch64_option_print
22032 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
22033 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
22035 #undef TARGET_SET_CURRENT_FUNCTION
22036 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
22038 #undef TARGET_PASS_BY_REFERENCE
22039 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
22041 #undef TARGET_PREFERRED_RELOAD_CLASS
22042 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
22044 #undef TARGET_SCHED_REASSOCIATION_WIDTH
22045 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
22047 #undef TARGET_PROMOTED_TYPE
22048 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
22050 #undef TARGET_SECONDARY_RELOAD
22051 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
22053 #undef TARGET_SHIFT_TRUNCATION_MASK
22054 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
22056 #undef TARGET_SETUP_INCOMING_VARARGS
22057 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
22059 #undef TARGET_STRUCT_VALUE_RTX
22060 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
22062 #undef TARGET_REGISTER_MOVE_COST
22063 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
22065 #undef TARGET_RETURN_IN_MEMORY
22066 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
22068 #undef TARGET_RETURN_IN_MSB
22069 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
22071 #undef TARGET_RTX_COSTS
22072 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
22074 #undef TARGET_SCALAR_MODE_SUPPORTED_P
22075 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
22077 #undef TARGET_SCHED_ISSUE_RATE
22078 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
22080 #undef TARGET_SCHED_VARIABLE_ISSUE
22081 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
22083 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
22084 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
22085 aarch64_sched_first_cycle_multipass_dfa_lookahead
22087 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
22088 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
22089 aarch64_first_cycle_multipass_dfa_lookahead_guard
22091 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
22092 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
22093 aarch64_get_separate_components
22095 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
22096 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
22097 aarch64_components_for_bb
22099 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
22100 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
22101 aarch64_disqualify_components
22103 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
22104 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
22105 aarch64_emit_prologue_components
22107 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
22108 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
22109 aarch64_emit_epilogue_components
22111 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
22112 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
22113 aarch64_set_handled_components
22115 #undef TARGET_TRAMPOLINE_INIT
22116 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
22118 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
22119 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
22121 #undef TARGET_VECTOR_MODE_SUPPORTED_P
22122 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
22124 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
22125 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
22126 aarch64_builtin_support_vector_misalignment
22128 #undef TARGET_ARRAY_MODE
22129 #define TARGET_ARRAY_MODE aarch64_array_mode
22131 #undef TARGET_ARRAY_MODE_SUPPORTED_P
22132 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
22134 #undef TARGET_VECTORIZE_ADD_STMT_COST
22135 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
22137 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
22138 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
22139 aarch64_builtin_vectorization_cost
22141 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
22142 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
22144 #undef TARGET_VECTORIZE_BUILTINS
22145 #define TARGET_VECTORIZE_BUILTINS
22147 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
22148 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
22149 aarch64_builtin_vectorized_function
22151 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
22152 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
22153 aarch64_autovectorize_vector_modes
22155 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
22156 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
22157 aarch64_atomic_assign_expand_fenv
22159 /* Section anchor support. */
22161 #undef TARGET_MIN_ANCHOR_OFFSET
22162 #define TARGET_MIN_ANCHOR_OFFSET -256
22164 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
22165 byte offset; we can do much more for larger data types, but have no way
22166 to determine the size of the access. We assume accesses are aligned. */
22167 #undef TARGET_MAX_ANCHOR_OFFSET
22168 #define TARGET_MAX_ANCHOR_OFFSET 4095
22170 #undef TARGET_VECTOR_ALIGNMENT
22171 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
22173 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
22174 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
22175 aarch64_vectorize_preferred_vector_alignment
22176 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
22177 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
22178 aarch64_simd_vector_alignment_reachable
22180 /* vec_perm support. */
22182 #undef TARGET_VECTORIZE_VEC_PERM_CONST
22183 #define TARGET_VECTORIZE_VEC_PERM_CONST \
22184 aarch64_vectorize_vec_perm_const
22186 #undef TARGET_VECTORIZE_RELATED_MODE
22187 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
22188 #undef TARGET_VECTORIZE_GET_MASK_MODE
22189 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
22190 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
22191 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
22192 aarch64_empty_mask_is_expensive
22193 #undef TARGET_PREFERRED_ELSE_VALUE
22194 #define TARGET_PREFERRED_ELSE_VALUE \
22195 aarch64_preferred_else_value
22197 #undef TARGET_INIT_LIBFUNCS
22198 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
22200 #undef TARGET_FIXED_CONDITION_CODE_REGS
22201 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
22203 #undef TARGET_FLAGS_REGNUM
22204 #define TARGET_FLAGS_REGNUM CC_REGNUM
22206 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
22207 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
22209 #undef TARGET_ASAN_SHADOW_OFFSET
22210 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
22212 #undef TARGET_LEGITIMIZE_ADDRESS
22213 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
22215 #undef TARGET_SCHED_CAN_SPECULATE_INSN
22216 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
22218 #undef TARGET_CAN_USE_DOLOOP_P
22219 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
22221 #undef TARGET_SCHED_ADJUST_PRIORITY
22222 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
22224 #undef TARGET_SCHED_MACRO_FUSION_P
22225 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
22227 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
22228 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
22230 #undef TARGET_SCHED_FUSION_PRIORITY
22231 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
22233 #undef TARGET_UNSPEC_MAY_TRAP_P
22234 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
22236 #undef TARGET_USE_PSEUDO_PIC_REG
22237 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
22239 #undef TARGET_PRINT_OPERAND
22240 #define TARGET_PRINT_OPERAND aarch64_print_operand
22242 #undef TARGET_PRINT_OPERAND_ADDRESS
22243 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
22245 #undef TARGET_OPTAB_SUPPORTED_P
22246 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
22248 #undef TARGET_OMIT_STRUCT_RETURN_REG
22249 #define TARGET_OMIT_STRUCT_RETURN_REG true
22251 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
22252 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
22253 aarch64_dwarf_poly_indeterminate_value
22255 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
22256 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
22257 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
22259 #undef TARGET_HARD_REGNO_NREGS
22260 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
22261 #undef TARGET_HARD_REGNO_MODE_OK
22262 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
22264 #undef TARGET_MODES_TIEABLE_P
22265 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
22267 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
22268 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
22269 aarch64_hard_regno_call_part_clobbered
22271 #undef TARGET_INSN_CALLEE_ABI
22272 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
22274 #undef TARGET_CONSTANT_ALIGNMENT
22275 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
22277 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
22278 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
22279 aarch64_stack_clash_protection_alloca_probe_range
22281 #undef TARGET_COMPUTE_PRESSURE_CLASSES
22282 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
22284 #undef TARGET_CAN_CHANGE_MODE_CLASS
22285 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
22287 #undef TARGET_SELECT_EARLY_REMAT_MODES
22288 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
22290 #undef TARGET_SPECULATION_SAFE_VALUE
22291 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
22293 #undef TARGET_ESTIMATED_POLY_VALUE
22294 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
22296 #undef TARGET_ATTRIBUTE_TABLE
22297 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
22299 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
22300 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
22301 aarch64_simd_clone_compute_vecsize_and_simdlen
22303 #undef TARGET_SIMD_CLONE_ADJUST
22304 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
22306 #undef TARGET_SIMD_CLONE_USABLE
22307 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
22309 #undef TARGET_COMP_TYPE_ATTRIBUTES
22310 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
22312 #undef TARGET_GET_MULTILIB_ABI_NAME
22313 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
22315 #undef TARGET_FNTYPE_ABI
22316 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
22319 #undef TARGET_RUN_TARGET_SELFTESTS
22320 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
22321 #endif /* #if CHECKING_P */
22323 #undef TARGET_ASM_POST_CFI_STARTPROC
22324 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
22326 #undef TARGET_STRICT_ARGUMENT_NAMING
22327 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
22329 #undef TARGET_MD_ASM_ADJUST
22330 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
22332 struct gcc_target targetm
= TARGET_INITIALIZER
;
22334 #include "gt-aarch64.h"