/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2019 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "langhooks.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"

/* This file should be included last.  */
#include "target-def.h"

/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
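/* As a concrete illustration: with the default LP64 ABI, POINTER_SIZE is 64
   and BITS_PER_UNIT is 8, so POINTER_BYTES evaluates to 8; under ILP32 it
   evaluates to 4.  */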
/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN, INDEX, PTRUE };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);
  simd_immediate_info (scalar_int_mode, aarch64_svpattern);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  union
  {
    /* For MOV and MVN.  */
    struct
    {
      /* The value of each element.  */
      rtx value;

      /* The kind of shift modifier to use, and the number of bits to shift.
	 This is (LSL, 0) if no shift is needed.  */
      modifier_type modifier;
      unsigned int shift;
    } mov;

    /* For INDEX.  */
    struct
    {
      /* The value of the first element and the step to be added for each
	 subsequent element.  */
      rtx base, step;
    } index;

    /* For PTRUE.  */
    aarch64_svpattern pattern;
  } u;
};

/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), insn (MOV)
{
  u.mov.value = value_in;
  u.mov.modifier = LSL;
  u.mov.shift = 0;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), insn (insn_in)
{
  u.mov.value = gen_int_mode (value_in, elt_mode_in);
  u.mov.modifier = modifier_in;
  u.mov.shift = shift_in;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to BASE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
  : elt_mode (elt_mode_in), insn (INDEX)
{
  u.index.base = base_in;
  u.index.step = step_in;
}

/* Construct a predicate that controls elements of mode ELT_MODE_IN
   and has PTRUE pattern PATTERN_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       aarch64_svpattern pattern_in)
  : elt_mode (elt_mode_in), insn (PTRUE)
{
  u.pattern = pattern_in;
}
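/* Illustrative usage sketch (not taken from the surrounding code): a V16QI
   splat of the byte 0x2a could be summarized as
   simd_immediate_info (QImode, 0x2a), and an SVE INDEX constant with base 0
   and step 1 as simd_immediate_info (SImode, const0_rtx, const1_rtx).  */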
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
					    aarch64_addr_query_type);
static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
uint64_t aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;

#define BRANCH_PROTECT_STR_MAX 255
char *accepted_branch_protection_string = NULL;

static enum aarch64_parse_opt_result
aarch64_parse_branch_protection (const char*, char**);
/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};
256 /* Tuning parameters. */
258 static const struct cpu_addrcost_table generic_addrcost_table
=
268 0, /* register_offset */
269 0, /* register_sextend */
270 0, /* register_zextend */
274 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
284 1, /* register_offset */
285 1, /* register_sextend */
286 2, /* register_zextend */
290 static const struct cpu_addrcost_table xgene1_addrcost_table
=
300 0, /* register_offset */
301 1, /* register_sextend */
302 1, /* register_zextend */
306 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
316 2, /* register_offset */
317 3, /* register_sextend */
318 3, /* register_zextend */
322 static const struct cpu_addrcost_table tsv110_addrcost_table
=
332 0, /* register_offset */
333 1, /* register_sextend */
334 1, /* register_zextend */
338 static const struct cpu_addrcost_table qdf24xx_addrcost_table
=
348 3, /* register_offset */
349 3, /* register_sextend */
350 3, /* register_zextend */
354 static const struct cpu_regmove_cost generic_regmove_cost
=
357 /* Avoid the use of slow int<->fp moves for spilling by setting
358 their cost higher than memmov_cost. */
364 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
367 /* Avoid the use of slow int<->fp moves for spilling by setting
368 their cost higher than memmov_cost. */
374 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
377 /* Avoid the use of slow int<->fp moves for spilling by setting
378 their cost higher than memmov_cost. */
384 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost (actual, 4 and 9). */
394 static const struct cpu_regmove_cost thunderx_regmove_cost
=
402 static const struct cpu_regmove_cost xgene1_regmove_cost
=
405 /* Avoid the use of slow int<->fp moves for spilling by setting
406 their cost higher than memmov_cost. */
412 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
415 /* Avoid the use of int<->fp moves for spilling. */
421 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
424 /* Avoid the use of int<->fp moves for spilling. */
430 static const struct cpu_regmove_cost tsv110_regmove_cost
=
433 /* Avoid the use of slow int<->fp moves for spilling by setting
434 their cost higher than memmov_cost. */
440 /* Generic costs for vector insn classes. */
441 static const struct cpu_vector_cost generic_vector_cost
=
443 1, /* scalar_int_stmt_cost */
444 1, /* scalar_fp_stmt_cost */
445 1, /* scalar_load_cost */
446 1, /* scalar_store_cost */
447 1, /* vec_int_stmt_cost */
448 1, /* vec_fp_stmt_cost */
449 2, /* vec_permute_cost */
450 1, /* vec_to_scalar_cost */
451 1, /* scalar_to_vec_cost */
452 1, /* vec_align_load_cost */
453 1, /* vec_unalign_load_cost */
454 1, /* vec_unalign_store_cost */
455 1, /* vec_store_cost */
456 3, /* cond_taken_branch_cost */
457 1 /* cond_not_taken_branch_cost */
460 /* QDF24XX costs for vector insn classes. */
461 static const struct cpu_vector_cost qdf24xx_vector_cost
=
463 1, /* scalar_int_stmt_cost */
464 1, /* scalar_fp_stmt_cost */
465 1, /* scalar_load_cost */
466 1, /* scalar_store_cost */
467 1, /* vec_int_stmt_cost */
468 3, /* vec_fp_stmt_cost */
469 2, /* vec_permute_cost */
470 1, /* vec_to_scalar_cost */
471 1, /* scalar_to_vec_cost */
472 1, /* vec_align_load_cost */
473 1, /* vec_unalign_load_cost */
474 1, /* vec_unalign_store_cost */
475 1, /* vec_store_cost */
476 3, /* cond_taken_branch_cost */
477 1 /* cond_not_taken_branch_cost */
480 /* ThunderX costs for vector insn classes. */
481 static const struct cpu_vector_cost thunderx_vector_cost
=
483 1, /* scalar_int_stmt_cost */
484 1, /* scalar_fp_stmt_cost */
485 3, /* scalar_load_cost */
486 1, /* scalar_store_cost */
487 4, /* vec_int_stmt_cost */
488 1, /* vec_fp_stmt_cost */
489 4, /* vec_permute_cost */
490 2, /* vec_to_scalar_cost */
491 2, /* scalar_to_vec_cost */
492 3, /* vec_align_load_cost */
493 5, /* vec_unalign_load_cost */
494 5, /* vec_unalign_store_cost */
495 1, /* vec_store_cost */
496 3, /* cond_taken_branch_cost */
497 3 /* cond_not_taken_branch_cost */
500 static const struct cpu_vector_cost tsv110_vector_cost
=
502 1, /* scalar_int_stmt_cost */
503 1, /* scalar_fp_stmt_cost */
504 5, /* scalar_load_cost */
505 1, /* scalar_store_cost */
506 2, /* vec_int_stmt_cost */
507 2, /* vec_fp_stmt_cost */
508 2, /* vec_permute_cost */
509 3, /* vec_to_scalar_cost */
510 2, /* scalar_to_vec_cost */
511 5, /* vec_align_load_cost */
512 5, /* vec_unalign_load_cost */
513 1, /* vec_unalign_store_cost */
514 1, /* vec_store_cost */
515 1, /* cond_taken_branch_cost */
516 1 /* cond_not_taken_branch_cost */
/* Cortex-A57 costs for vector insn classes.  */
520 static const struct cpu_vector_cost cortexa57_vector_cost
=
522 1, /* scalar_int_stmt_cost */
523 1, /* scalar_fp_stmt_cost */
524 4, /* scalar_load_cost */
525 1, /* scalar_store_cost */
526 2, /* vec_int_stmt_cost */
527 2, /* vec_fp_stmt_cost */
528 3, /* vec_permute_cost */
529 8, /* vec_to_scalar_cost */
530 8, /* scalar_to_vec_cost */
531 4, /* vec_align_load_cost */
532 4, /* vec_unalign_load_cost */
533 1, /* vec_unalign_store_cost */
534 1, /* vec_store_cost */
535 1, /* cond_taken_branch_cost */
536 1 /* cond_not_taken_branch_cost */
539 static const struct cpu_vector_cost exynosm1_vector_cost
=
541 1, /* scalar_int_stmt_cost */
542 1, /* scalar_fp_stmt_cost */
543 5, /* scalar_load_cost */
544 1, /* scalar_store_cost */
545 3, /* vec_int_stmt_cost */
546 3, /* vec_fp_stmt_cost */
547 3, /* vec_permute_cost */
548 3, /* vec_to_scalar_cost */
549 3, /* scalar_to_vec_cost */
550 5, /* vec_align_load_cost */
551 5, /* vec_unalign_load_cost */
552 1, /* vec_unalign_store_cost */
553 1, /* vec_store_cost */
554 1, /* cond_taken_branch_cost */
555 1 /* cond_not_taken_branch_cost */
/* X-Gene 1 costs for vector insn classes.  */
559 static const struct cpu_vector_cost xgene1_vector_cost
=
561 1, /* scalar_int_stmt_cost */
562 1, /* scalar_fp_stmt_cost */
563 5, /* scalar_load_cost */
564 1, /* scalar_store_cost */
565 2, /* vec_int_stmt_cost */
566 2, /* vec_fp_stmt_cost */
567 2, /* vec_permute_cost */
568 4, /* vec_to_scalar_cost */
569 4, /* scalar_to_vec_cost */
570 10, /* vec_align_load_cost */
571 10, /* vec_unalign_load_cost */
572 2, /* vec_unalign_store_cost */
573 2, /* vec_store_cost */
574 2, /* cond_taken_branch_cost */
575 1 /* cond_not_taken_branch_cost */
/* Costs for vector insn classes for Vulcan (ThunderX2 T99).  */
579 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
581 1, /* scalar_int_stmt_cost */
582 6, /* scalar_fp_stmt_cost */
583 4, /* scalar_load_cost */
584 1, /* scalar_store_cost */
585 5, /* vec_int_stmt_cost */
586 6, /* vec_fp_stmt_cost */
587 3, /* vec_permute_cost */
588 6, /* vec_to_scalar_cost */
589 5, /* scalar_to_vec_cost */
590 8, /* vec_align_load_cost */
591 8, /* vec_unalign_load_cost */
592 4, /* vec_unalign_store_cost */
593 4, /* vec_store_cost */
594 2, /* cond_taken_branch_cost */
595 1 /* cond_not_taken_branch_cost */
598 /* Generic costs for branch instructions. */
599 static const struct cpu_branch_cost generic_branch_cost
=
601 1, /* Predictable. */
602 3 /* Unpredictable. */
605 /* Generic approximation modes. */
606 static const cpu_approx_modes generic_approx_modes
=
608 AARCH64_APPROX_NONE
, /* division */
609 AARCH64_APPROX_NONE
, /* sqrt */
610 AARCH64_APPROX_NONE
/* recip_sqrt */
613 /* Approximation modes for Exynos M1. */
614 static const cpu_approx_modes exynosm1_approx_modes
=
616 AARCH64_APPROX_NONE
, /* division */
617 AARCH64_APPROX_ALL
, /* sqrt */
618 AARCH64_APPROX_ALL
/* recip_sqrt */
621 /* Approximation modes for X-Gene 1. */
622 static const cpu_approx_modes xgene1_approx_modes
=
624 AARCH64_APPROX_NONE
, /* division */
625 AARCH64_APPROX_NONE
, /* sqrt */
626 AARCH64_APPROX_ALL
/* recip_sqrt */
629 /* Generic prefetch settings (which disable prefetch). */
630 static const cpu_prefetch_tune generic_prefetch_tune
=
633 -1, /* l1_cache_size */
634 -1, /* l1_cache_line_size */
635 -1, /* l2_cache_size */
636 true, /* prefetch_dynamic_strides */
637 -1, /* minimum_stride */
638 -1 /* default_opt_level */
641 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
644 -1, /* l1_cache_size */
645 64, /* l1_cache_line_size */
646 -1, /* l2_cache_size */
647 true, /* prefetch_dynamic_strides */
648 -1, /* minimum_stride */
649 -1 /* default_opt_level */
652 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
655 32, /* l1_cache_size */
656 64, /* l1_cache_line_size */
657 512, /* l2_cache_size */
658 false, /* prefetch_dynamic_strides */
659 2048, /* minimum_stride */
660 3 /* default_opt_level */
663 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
666 32, /* l1_cache_size */
667 128, /* l1_cache_line_size */
668 16*1024, /* l2_cache_size */
669 true, /* prefetch_dynamic_strides */
670 -1, /* minimum_stride */
671 3 /* default_opt_level */
674 static const cpu_prefetch_tune thunderx_prefetch_tune
=
677 32, /* l1_cache_size */
678 128, /* l1_cache_line_size */
679 -1, /* l2_cache_size */
680 true, /* prefetch_dynamic_strides */
681 -1, /* minimum_stride */
682 -1 /* default_opt_level */
685 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
688 32, /* l1_cache_size */
689 64, /* l1_cache_line_size */
690 256, /* l2_cache_size */
691 true, /* prefetch_dynamic_strides */
692 -1, /* minimum_stride */
693 -1 /* default_opt_level */
696 static const cpu_prefetch_tune tsv110_prefetch_tune
=
699 64, /* l1_cache_size */
700 64, /* l1_cache_line_size */
701 512, /* l2_cache_size */
702 true, /* prefetch_dynamic_strides */
703 -1, /* minimum_stride */
704 -1 /* default_opt_level */
707 static const cpu_prefetch_tune xgene1_prefetch_tune
=
710 32, /* l1_cache_size */
711 64, /* l1_cache_line_size */
712 256, /* l2_cache_size */
713 true, /* prefetch_dynamic_strides */
714 -1, /* minimum_stride */
715 -1 /* default_opt_level */
718 static const struct tune_params generic_tunings
=
720 &cortexa57_extra_costs
,
721 &generic_addrcost_table
,
722 &generic_regmove_cost
,
723 &generic_vector_cost
,
724 &generic_branch_cost
,
725 &generic_approx_modes
,
726 SVE_NOT_IMPLEMENTED
, /* sve_width */
729 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
730 "16:12", /* function_align. */
731 "4", /* jump_align. */
732 "8", /* loop_align. */
733 2, /* int_reassoc_width. */
734 4, /* fp_reassoc_width. */
735 1, /* vec_reassoc_width. */
736 2, /* min_div_recip_mul_sf. */
737 2, /* min_div_recip_mul_df. */
738 0, /* max_case_values. */
739 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
741 &generic_prefetch_tune
744 static const struct tune_params cortexa35_tunings
=
746 &cortexa53_extra_costs
,
747 &generic_addrcost_table
,
748 &cortexa53_regmove_cost
,
749 &generic_vector_cost
,
750 &generic_branch_cost
,
751 &generic_approx_modes
,
752 SVE_NOT_IMPLEMENTED
, /* sve_width */
755 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
756 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
757 "16", /* function_align. */
758 "4", /* jump_align. */
759 "8", /* loop_align. */
760 2, /* int_reassoc_width. */
761 4, /* fp_reassoc_width. */
762 1, /* vec_reassoc_width. */
763 2, /* min_div_recip_mul_sf. */
764 2, /* min_div_recip_mul_df. */
765 0, /* max_case_values. */
766 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
767 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
768 &generic_prefetch_tune
771 static const struct tune_params cortexa53_tunings
=
773 &cortexa53_extra_costs
,
774 &generic_addrcost_table
,
775 &cortexa53_regmove_cost
,
776 &generic_vector_cost
,
777 &generic_branch_cost
,
778 &generic_approx_modes
,
779 SVE_NOT_IMPLEMENTED
, /* sve_width */
782 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
783 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
784 "16", /* function_align. */
785 "4", /* jump_align. */
786 "8", /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
795 &generic_prefetch_tune
798 static const struct tune_params cortexa57_tunings
=
800 &cortexa57_extra_costs
,
801 &generic_addrcost_table
,
802 &cortexa57_regmove_cost
,
803 &cortexa57_vector_cost
,
804 &generic_branch_cost
,
805 &generic_approx_modes
,
806 SVE_NOT_IMPLEMENTED
, /* sve_width */
809 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
811 "16", /* function_align. */
812 "4", /* jump_align. */
813 "8", /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
822 &generic_prefetch_tune
825 static const struct tune_params cortexa72_tunings
=
827 &cortexa57_extra_costs
,
828 &generic_addrcost_table
,
829 &cortexa57_regmove_cost
,
830 &cortexa57_vector_cost
,
831 &generic_branch_cost
,
832 &generic_approx_modes
,
833 SVE_NOT_IMPLEMENTED
, /* sve_width */
836 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
837 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
838 "16", /* function_align. */
839 "4", /* jump_align. */
840 "8", /* loop_align. */
841 2, /* int_reassoc_width. */
842 4, /* fp_reassoc_width. */
843 1, /* vec_reassoc_width. */
844 2, /* min_div_recip_mul_sf. */
845 2, /* min_div_recip_mul_df. */
846 0, /* max_case_values. */
847 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
848 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
849 &generic_prefetch_tune
852 static const struct tune_params cortexa73_tunings
=
854 &cortexa57_extra_costs
,
855 &generic_addrcost_table
,
856 &cortexa57_regmove_cost
,
857 &cortexa57_vector_cost
,
858 &generic_branch_cost
,
859 &generic_approx_modes
,
860 SVE_NOT_IMPLEMENTED
, /* sve_width */
861 4, /* memmov_cost. */
863 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
864 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
865 "16", /* function_align. */
866 "4", /* jump_align. */
867 "8", /* loop_align. */
868 2, /* int_reassoc_width. */
869 4, /* fp_reassoc_width. */
870 1, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
876 &generic_prefetch_tune
881 static const struct tune_params exynosm1_tunings
=
883 &exynosm1_extra_costs
,
884 &exynosm1_addrcost_table
,
885 &exynosm1_regmove_cost
,
886 &exynosm1_vector_cost
,
887 &generic_branch_cost
,
888 &exynosm1_approx_modes
,
889 SVE_NOT_IMPLEMENTED
, /* sve_width */
892 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
893 "4", /* function_align. */
894 "4", /* jump_align. */
895 "4", /* loop_align. */
896 2, /* int_reassoc_width. */
897 4, /* fp_reassoc_width. */
898 1, /* vec_reassoc_width. */
899 2, /* min_div_recip_mul_sf. */
900 2, /* min_div_recip_mul_df. */
901 48, /* max_case_values. */
902 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
903 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
904 &exynosm1_prefetch_tune
907 static const struct tune_params thunderxt88_tunings
=
909 &thunderx_extra_costs
,
910 &generic_addrcost_table
,
911 &thunderx_regmove_cost
,
912 &thunderx_vector_cost
,
913 &generic_branch_cost
,
914 &generic_approx_modes
,
915 SVE_NOT_IMPLEMENTED
, /* sve_width */
918 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
919 "8", /* function_align. */
920 "8", /* jump_align. */
921 "8", /* loop_align. */
922 2, /* int_reassoc_width. */
923 4, /* fp_reassoc_width. */
924 1, /* vec_reassoc_width. */
925 2, /* min_div_recip_mul_sf. */
926 2, /* min_div_recip_mul_df. */
927 0, /* max_case_values. */
928 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
929 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
930 &thunderxt88_prefetch_tune
933 static const struct tune_params thunderx_tunings
=
935 &thunderx_extra_costs
,
936 &generic_addrcost_table
,
937 &thunderx_regmove_cost
,
938 &thunderx_vector_cost
,
939 &generic_branch_cost
,
940 &generic_approx_modes
,
941 SVE_NOT_IMPLEMENTED
, /* sve_width */
944 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
945 "8", /* function_align. */
946 "8", /* jump_align. */
947 "8", /* loop_align. */
948 2, /* int_reassoc_width. */
949 4, /* fp_reassoc_width. */
950 1, /* vec_reassoc_width. */
951 2, /* min_div_recip_mul_sf. */
952 2, /* min_div_recip_mul_df. */
953 0, /* max_case_values. */
954 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
955 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
956 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
957 &thunderx_prefetch_tune
960 static const struct tune_params tsv110_tunings
=
963 &tsv110_addrcost_table
,
964 &tsv110_regmove_cost
,
966 &generic_branch_cost
,
967 &generic_approx_modes
,
968 SVE_NOT_IMPLEMENTED
, /* sve_width */
971 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_CMP_BRANCH
972 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
973 "16", /* function_align. */
974 "4", /* jump_align. */
975 "8", /* loop_align. */
976 2, /* int_reassoc_width. */
977 4, /* fp_reassoc_width. */
978 1, /* vec_reassoc_width. */
979 2, /* min_div_recip_mul_sf. */
980 2, /* min_div_recip_mul_df. */
981 0, /* max_case_values. */
982 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
983 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
984 &tsv110_prefetch_tune
987 static const struct tune_params xgene1_tunings
=
990 &xgene1_addrcost_table
,
991 &xgene1_regmove_cost
,
993 &generic_branch_cost
,
994 &xgene1_approx_modes
,
995 SVE_NOT_IMPLEMENTED
, /* sve_width */
998 AARCH64_FUSE_NOTHING
, /* fusible_ops */
999 "16", /* function_align. */
1000 "16", /* jump_align. */
1001 "16", /* loop_align. */
1002 2, /* int_reassoc_width. */
1003 4, /* fp_reassoc_width. */
1004 1, /* vec_reassoc_width. */
1005 2, /* min_div_recip_mul_sf. */
1006 2, /* min_div_recip_mul_df. */
1007 17, /* max_case_values. */
1008 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1009 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1010 &xgene1_prefetch_tune
1013 static const struct tune_params emag_tunings
=
1015 &xgene1_extra_costs
,
1016 &xgene1_addrcost_table
,
1017 &xgene1_regmove_cost
,
1018 &xgene1_vector_cost
,
1019 &generic_branch_cost
,
1020 &xgene1_approx_modes
,
1021 SVE_NOT_IMPLEMENTED
,
1022 6, /* memmov_cost */
1024 AARCH64_FUSE_NOTHING
, /* fusible_ops */
1025 "16", /* function_align. */
1026 "16", /* jump_align. */
1027 "16", /* loop_align. */
1028 2, /* int_reassoc_width. */
1029 4, /* fp_reassoc_width. */
1030 1, /* vec_reassoc_width. */
1031 2, /* min_div_recip_mul_sf. */
1032 2, /* min_div_recip_mul_df. */
1033 17, /* max_case_values. */
1034 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1035 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1036 &xgene1_prefetch_tune
1039 static const struct tune_params qdf24xx_tunings
=
1041 &qdf24xx_extra_costs
,
1042 &qdf24xx_addrcost_table
,
1043 &qdf24xx_regmove_cost
,
1044 &qdf24xx_vector_cost
,
1045 &generic_branch_cost
,
1046 &generic_approx_modes
,
1047 SVE_NOT_IMPLEMENTED
, /* sve_width */
1048 4, /* memmov_cost */
1050 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1051 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
1052 "16", /* function_align. */
1053 "8", /* jump_align. */
1054 "16", /* loop_align. */
1055 2, /* int_reassoc_width. */
1056 4, /* fp_reassoc_width. */
1057 1, /* vec_reassoc_width. */
1058 2, /* min_div_recip_mul_sf. */
1059 2, /* min_div_recip_mul_df. */
1060 0, /* max_case_values. */
1061 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1062 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS
, /* tune_flags. */
1063 &qdf24xx_prefetch_tune
1066 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1068 static const struct tune_params saphira_tunings
=
1070 &generic_extra_costs
,
1071 &generic_addrcost_table
,
1072 &generic_regmove_cost
,
1073 &generic_vector_cost
,
1074 &generic_branch_cost
,
1075 &generic_approx_modes
,
1076 SVE_NOT_IMPLEMENTED
, /* sve_width */
1077 4, /* memmov_cost */
1079 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1080 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
1081 "16", /* function_align. */
1082 "8", /* jump_align. */
1083 "16", /* loop_align. */
1084 2, /* int_reassoc_width. */
1085 4, /* fp_reassoc_width. */
1086 1, /* vec_reassoc_width. */
1087 2, /* min_div_recip_mul_sf. */
1088 2, /* min_div_recip_mul_df. */
1089 0, /* max_case_values. */
1090 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1091 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1092 &generic_prefetch_tune
1095 static const struct tune_params thunderx2t99_tunings
=
1097 &thunderx2t99_extra_costs
,
1098 &thunderx2t99_addrcost_table
,
1099 &thunderx2t99_regmove_cost
,
1100 &thunderx2t99_vector_cost
,
1101 &generic_branch_cost
,
1102 &generic_approx_modes
,
1103 SVE_NOT_IMPLEMENTED
, /* sve_width */
1104 4, /* memmov_cost. */
1105 4, /* issue_rate. */
1106 (AARCH64_FUSE_CMP_BRANCH
| AARCH64_FUSE_AES_AESMC
1107 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
1108 "16", /* function_align. */
1109 "8", /* jump_align. */
1110 "16", /* loop_align. */
1111 3, /* int_reassoc_width. */
1112 2, /* fp_reassoc_width. */
1113 2, /* vec_reassoc_width. */
1114 2, /* min_div_recip_mul_sf. */
1115 2, /* min_div_recip_mul_df. */
1116 0, /* max_case_values. */
1117 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1118 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1119 &thunderx2t99_prefetch_tune
1122 static const struct tune_params neoversen1_tunings
=
1124 &cortexa57_extra_costs
,
1125 &generic_addrcost_table
,
1126 &generic_regmove_cost
,
1127 &cortexa57_vector_cost
,
1128 &generic_branch_cost
,
1129 &generic_approx_modes
,
1130 SVE_NOT_IMPLEMENTED
, /* sve_width */
1131 4, /* memmov_cost */
1133 AARCH64_FUSE_AES_AESMC
, /* fusible_ops */
1134 "32:16", /* function_align. */
1135 "32:16", /* jump_align. */
1136 "32:16", /* loop_align. */
1137 2, /* int_reassoc_width. */
1138 4, /* fp_reassoc_width. */
1139 2, /* vec_reassoc_width. */
1140 2, /* min_div_recip_mul_sf. */
1141 2, /* min_div_recip_mul_df. */
1142 0, /* max_case_values. */
1143 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1144 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1145 &generic_prefetch_tune
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char* name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);
static void aarch64_parse_sve_width_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { "sve_width", aarch64_parse_sve_width_string },
  { NULL, NULL }
};
/* A processor implementing AArch64.  */
struct processor
{
  const char *const name;
  enum aarch64_processor ident;
  enum aarch64_processor sched_core;
  enum aarch64_arch arch;
  unsigned architecture_version;
  const uint64_t flags;
  const struct tune_params *const tune;
};

/* Architectures implementing AArch64.  */
static const struct processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
   all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
   FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
   AARCH64_FL_FOR_ARCH8, &generic_tunings},
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};
/* Target specification.  These are populated by the -march, -mtune, -mcpu
   handling code or by target attributes.  */
static const struct processor *selected_arch;
static const struct processor *selected_cpu;
static const struct processor *selected_tune;

enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;

/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;

/* Table of machine attributes.  */
static const struct attribute_spec aarch64_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  { "aarch64_vector_pcs", 0, 0, false, true,  true,  true,  NULL, NULL },
  { NULL,                 0, 0, false, false, false, false, NULL, NULL }
};
1223 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1225 /* An ISA extension in the co-processor and main instruction set space. */
1226 struct aarch64_option_extension
1228 const char *const name
;
1229 const unsigned long flags_on
;
1230 const unsigned long flags_off
;
typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
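/* Worked example: the codes above are laid out in inverse pairs whose values
   differ only in the least significant bit, so
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) yields AARCH64_NE and
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) yields AARCH64_LT.  */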
struct aarch64_branch_protect_type
{
  /* The type's name that the user passes to the branch-protection option
     string.  */
  const char* name;
  /* Function to handle the protection type and set global variables.
     First argument is the string token corresponding with this type and the
     second argument is the next token in the option string.
     Return values:
     * AARCH64_PARSE_OK: Handling was successful.
     * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
       should print an error.
     * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
       own error.  */
  enum aarch64_parse_opt_result (*handler)(char*, char*);
  /* A list of types that can follow this type in the option string.  */
  const aarch64_branch_protect_type* subtypes;
  unsigned int num_subtypes;
};
1263 static enum aarch64_parse_opt_result
1264 aarch64_handle_no_branch_protection (char* str
, char* rest
)
1266 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
1267 aarch64_enable_bti
= 0;
1270 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1271 return AARCH64_PARSE_INVALID_FEATURE
;
1273 return AARCH64_PARSE_OK
;
1276 static enum aarch64_parse_opt_result
1277 aarch64_handle_standard_branch_protection (char* str
, char* rest
)
1279 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1280 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1281 aarch64_enable_bti
= 1;
1284 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1285 return AARCH64_PARSE_INVALID_FEATURE
;
1287 return AARCH64_PARSE_OK
;
1290 static enum aarch64_parse_opt_result
1291 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED
,
1292 char* rest ATTRIBUTE_UNUSED
)
1294 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1295 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1296 return AARCH64_PARSE_OK
;
1299 static enum aarch64_parse_opt_result
1300 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED
,
1301 char* rest ATTRIBUTE_UNUSED
)
1303 aarch64_ra_sign_scope
= AARCH64_FUNCTION_ALL
;
1304 return AARCH64_PARSE_OK
;
1307 static enum aarch64_parse_opt_result
1308 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED
,
1309 char* rest ATTRIBUTE_UNUSED
)
1311 aarch64_ra_sign_key
= AARCH64_KEY_B
;
1312 return AARCH64_PARSE_OK
;
1315 static enum aarch64_parse_opt_result
1316 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED
,
1317 char* rest ATTRIBUTE_UNUSED
)
1319 aarch64_enable_bti
= 1;
1320 return AARCH64_PARSE_OK
;
1323 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes
[] = {
1324 { "leaf", aarch64_handle_pac_ret_leaf
, NULL
, 0 },
1325 { "b-key", aarch64_handle_pac_ret_b_key
, NULL
, 0 },
1326 { NULL
, NULL
, NULL
, 0 }
1329 static const struct aarch64_branch_protect_type aarch64_branch_protect_types
[] = {
1330 { "none", aarch64_handle_no_branch_protection
, NULL
, 0 },
1331 { "standard", aarch64_handle_standard_branch_protection
, NULL
, 0 },
1332 { "pac-ret", aarch64_handle_pac_ret_protection
, aarch64_pac_ret_subtypes
,
1333 ARRAY_SIZE (aarch64_pac_ret_subtypes
) },
1334 { "bti", aarch64_handle_bti_protection
, NULL
, 0 },
1335 { NULL
, NULL
, NULL
, 0 }
/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};

/* The preferred condition codes for SVE conditions.  */
static const char *const aarch64_sve_condition_codes[] =
{
  "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
  "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
};
1352 /* Return the assembly token for svpattern value VALUE. */
1355 svpattern_token (enum aarch64_svpattern pattern
)
1359 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1360 AARCH64_FOR_SVPATTERN (CASE
)
1362 case AARCH64_NUM_SVPATTERNS
:
/* Generate code to enable conditional branches in functions over 1 MiB.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
  rtx_code_label * tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[256];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
  return "";
}
void
aarch64_err_no_fpadvsimd (machine_mode mode)
{
  if (TARGET_GENERAL_REGS_ONLY)
    {
      if (FLOAT_MODE_P (mode))
	error ("%qs is incompatible with the use of floating-point types",
	       "-mgeneral-regs-only");
      else
	error ("%qs is incompatible with the use of vector types",
	       "-mgeneral-regs-only");
    }
  else
    {
      if (FLOAT_MODE_P (mode))
	error ("%qs feature modifier is incompatible with the use of"
	       " floating-point types", "+nofp");
      else
	error ("%qs feature modifier is incompatible with the use of"
	       " vector types", "+nofp");
    }
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
   GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
   higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
   and GENERAL_REGS is lower than the memory cost (in this case the best class
   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
   cost results in bad allocations with many redundant int<->FP moves which
   are expensive on various cores.
   To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
   force a decision between FP_REGS and GENERAL_REGS.  We use the allocno
   class if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if
   it isn't POINTER_AND_FP_REGS.  Otherwise set the allocno class depending
   on the mode.  The result of this is that it is no longer inefficient to
   have a higher memory move cost than the register move cost.  */

static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  machine_mode mode;

  if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
      || !reg_class_subset_p (FP_REGS, allocno_class))
    return allocno_class;

  if (!reg_class_subset_p (GENERAL_REGS, best_class)
      || !reg_class_subset_p (FP_REGS, best_class))
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}
1445 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1447 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1448 return aarch64_tune_params
.min_div_recip_mul_sf
;
1449 return aarch64_tune_params
.min_div_recip_mul_df
;
1452 /* Return the reassociation width of treeop OPC with mode MODE. */
1454 aarch64_reassociation_width (unsigned opc
, machine_mode mode
)
1456 if (VECTOR_MODE_P (mode
))
1457 return aarch64_tune_params
.vec_reassoc_width
;
1458 if (INTEGRAL_MODE_P (mode
))
1459 return aarch64_tune_params
.int_reassoc_width
;
1460 /* Avoid reassociating floating point addition so we emit more FMAs. */
1461 if (FLOAT_MODE_P (mode
) && opc
!= PLUS_EXPR
)
1462 return aarch64_tune_params
.fp_reassoc_width
;
1466 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1468 aarch64_dbx_register_number (unsigned regno
)
1470 if (GP_REGNUM_P (regno
))
1471 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1472 else if (regno
== SP_REGNUM
)
1473 return AARCH64_DWARF_SP
;
1474 else if (FP_REGNUM_P (regno
))
1475 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1476 else if (PR_REGNUM_P (regno
))
1477 return AARCH64_DWARF_P0
+ regno
- P0_REGNUM
;
1478 else if (regno
== VG_REGNUM
)
1479 return AARCH64_DWARF_VG
;
1481 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1482 equivalent DWARF register. */
1483 return DWARF_FRAME_REGISTERS
;
/* If X is a CONST_DOUBLE, return its bit representation as a constant
   integer, otherwise return X unmodified.  */
static rtx
aarch64_bit_representation (rtx x)
{
  if (CONST_DOUBLE_P (x))
    x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
  return x;
}
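/* For example, a CONST_DOUBLE holding the SFmode value 1.0 comes back as
   (const_int 0x3f800000), the IEEE single-precision bit pattern of 1.0.  */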
/* Return true if MODE is any of the Advanced SIMD structure modes.  */
static bool
aarch64_advsimd_struct_mode_p (machine_mode mode)
{
  return (TARGET_SIMD
	  && (mode == OImode || mode == CImode || mode == XImode));
}

/* Return true if MODE is an SVE predicate mode.  */
static bool
aarch64_sve_pred_mode_p (machine_mode mode)
{
  return (TARGET_SVE
	  && (mode == VNx16BImode
	      || mode == VNx8BImode
	      || mode == VNx4BImode
	      || mode == VNx2BImode));
}

/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
   a structure of 2, 3 or 4 vectors.  */
const unsigned int VEC_STRUCT   = 8;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1526 /* Return a set of flags describing the vector properties of mode MODE.
1527 Ignore modes that are not supported by the current target. */
1529 aarch64_classify_vector_mode (machine_mode mode
)
1531 if (aarch64_advsimd_struct_mode_p (mode
))
1532 return VEC_ADVSIMD
| VEC_STRUCT
;
1534 if (aarch64_sve_pred_mode_p (mode
))
1535 return VEC_SVE_PRED
;
1537 /* Make the decision based on the mode's enum value rather than its
1538 properties, so that we keep the correct classification regardless
1539 of -msve-vector-bits. */
1542 /* Single SVE vectors. */
1550 return TARGET_SVE
? VEC_SVE_DATA
: 0;
1552 /* x2 SVE vectors. */
1560 /* x3 SVE vectors. */
1568 /* x4 SVE vectors. */
1576 return TARGET_SVE
? VEC_SVE_DATA
| VEC_STRUCT
: 0;
1578 /* 64-bit Advanced SIMD vectors. */
1582 /* ...E_V1DImode doesn't exist. */
1586 /* 128-bit Advanced SIMD vectors. */
1594 return TARGET_SIMD
? VEC_ADVSIMD
: 0;
1601 /* Return true if MODE is any of the data vector modes, including
1604 aarch64_vector_data_mode_p (machine_mode mode
)
1606 return aarch64_classify_vector_mode (mode
) & VEC_ANY_DATA
;
1609 /* Return true if MODE is any form of SVE mode, including predicates,
1610 vectors and structures. */
1612 aarch64_sve_mode_p (machine_mode mode
)
1614 return aarch64_classify_vector_mode (mode
) & VEC_ANY_SVE
;
1617 /* Return true if MODE is an SVE data vector mode; either a single vector
1618 or a structure of vectors. */
1620 aarch64_sve_data_mode_p (machine_mode mode
)
1622 return aarch64_classify_vector_mode (mode
) & VEC_SVE_DATA
;
1625 /* Implement target hook TARGET_ARRAY_MODE. */
1626 static opt_machine_mode
1627 aarch64_array_mode (machine_mode mode
, unsigned HOST_WIDE_INT nelems
)
1629 if (aarch64_classify_vector_mode (mode
) == VEC_SVE_DATA
1630 && IN_RANGE (nelems
, 2, 4))
1631 return mode_for_vector (GET_MODE_INNER (mode
),
1632 GET_MODE_NUNITS (mode
) * nelems
);
1634 return opt_machine_mode ();
1637 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1639 aarch64_array_mode_supported_p (machine_mode mode
,
1640 unsigned HOST_WIDE_INT nelems
)
1643 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1644 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1645 && (nelems
>= 2 && nelems
<= 4))
1651 /* Return the SVE predicate mode to use for elements that have
1652 ELEM_NBYTES bytes, if such a mode exists. */
1655 aarch64_sve_pred_mode (unsigned int elem_nbytes
)
1659 if (elem_nbytes
== 1)
1661 if (elem_nbytes
== 2)
1663 if (elem_nbytes
== 4)
1665 if (elem_nbytes
== 8)
1668 return opt_machine_mode ();
1671 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1673 static opt_machine_mode
1674 aarch64_get_mask_mode (poly_uint64 nunits
, poly_uint64 nbytes
)
1676 if (TARGET_SVE
&& known_eq (nbytes
, BYTES_PER_SVE_VECTOR
))
1678 unsigned int elem_nbytes
= vector_element_size (nbytes
, nunits
);
1679 machine_mode pred_mode
;
1680 if (aarch64_sve_pred_mode (elem_nbytes
).exists (&pred_mode
))
1684 return default_get_mask_mode (nunits
, nbytes
);
1687 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1689 static opt_machine_mode
1690 aarch64_sve_data_mode (scalar_mode inner_mode
, poly_uint64 nunits
)
1692 enum mode_class mclass
= (is_a
<scalar_float_mode
> (inner_mode
)
1693 ? MODE_VECTOR_FLOAT
: MODE_VECTOR_INT
);
1695 FOR_EACH_MODE_IN_CLASS (mode
, mclass
)
1696 if (inner_mode
== GET_MODE_INNER (mode
)
1697 && known_eq (nunits
, GET_MODE_NUNITS (mode
))
1698 && aarch64_sve_data_mode_p (mode
))
1700 return opt_machine_mode ();
1703 /* Return the integer element mode associated with SVE mode MODE. */
1705 static scalar_int_mode
1706 aarch64_sve_element_int_mode (machine_mode mode
)
1708 unsigned int elt_bits
= vector_element_size (BITS_PER_SVE_VECTOR
,
1709 GET_MODE_NUNITS (mode
));
1710 return int_mode_for_size (elt_bits
, 0).require ();
1713 /* Return the integer vector mode associated with SVE mode MODE.
1714 Unlike mode_for_int_vector, this can handle the case in which
1715 MODE is a predicate (and thus has a different total size). */
1718 aarch64_sve_int_mode (machine_mode mode
)
1720 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
1721 return aarch64_sve_data_mode (int_mode
, GET_MODE_NUNITS (mode
)).require ();
1724 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1725 prefer to use the first arithmetic operand as the else value if
1726 the else value doesn't matter, since that exactly matches the SVE
1727 destructive merging form. For ternary operations we could either
1728 pick the first operand and use FMAD-like instructions or the last
1729 operand and use FMLA-like instructions; the latter seems more
1733 aarch64_preferred_else_value (unsigned, tree
, unsigned int nops
, tree
*ops
)
1735 return nops
== 3 ? ops
[2] : ops
[0];
1738 /* Implement TARGET_HARD_REGNO_NREGS. */
1741 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1743 /* ??? Logically we should only need to provide a value when
1744 HARD_REGNO_MODE_OK says that the combination is valid,
1745 but at the moment we need to handle all modes. Just ignore
1746 any runtime parts for registers that can't store them. */
1747 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
1748 switch (aarch64_regno_regclass (regno
))
1753 if (aarch64_sve_data_mode_p (mode
))
1754 return exact_div (GET_MODE_SIZE (mode
),
1755 BYTES_PER_SVE_VECTOR
).to_constant ();
1756 return CEIL (lowest_size
, UNITS_PER_VREG
);
1762 return CEIL (lowest_size
, UNITS_PER_WORD
);
1767 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1770 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
1772 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1773 return regno
== CC_REGNUM
;
1775 if (regno
== VG_REGNUM
)
1776 /* This must have the same size as _Unwind_Word. */
1777 return mode
== DImode
;
1779 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1780 if (vec_flags
& VEC_SVE_PRED
)
1781 return PR_REGNUM_P (regno
);
1783 if (PR_REGNUM_P (regno
))
1786 if (regno
== SP_REGNUM
)
1787 /* The purpose of comparing with ptr_mode is to support the
1788 global register variable associated with the stack pointer
1789 register via the syntax of asm ("wsp") in ILP32. */
1790 return mode
== Pmode
|| mode
== ptr_mode
;
1792 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
1793 return mode
== Pmode
;
1795 if (GP_REGNUM_P (regno
))
1797 if (known_le (GET_MODE_SIZE (mode
), 8))
1799 else if (known_le (GET_MODE_SIZE (mode
), 16))
1800 return (regno
& 1) == 0;
1802 else if (FP_REGNUM_P (regno
))
1804 if (vec_flags
& VEC_STRUCT
)
1805 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
1807 return !VECTOR_MODE_P (mode
) || vec_flags
!= 0;
1813 /* Return true if this is a definition of a vectorized simd function. */
1816 aarch64_simd_decl_p (tree fndecl
)
1822 fntype
= TREE_TYPE (fndecl
);
1826 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1827 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype
)) != NULL
)
1833 /* Return the mode a register save/restore should use. DImode for integer
1834 registers, DFmode for FP registers in non-SIMD functions (they only save
1835 the bottom half of a 128 bit register), or TFmode for FP registers in
1839 aarch64_reg_save_mode (tree fndecl
, unsigned regno
)
1841 return GP_REGNUM_P (regno
)
1843 : (aarch64_simd_decl_p (fndecl
) ? E_TFmode
: E_DFmode
);
1846 /* Return true if the instruction is a call to a SIMD function, false
1847 if it is not a SIMD function or if we do not know anything about
1851 aarch64_simd_call_p (rtx_insn
*insn
)
1857 gcc_assert (CALL_P (insn
));
1858 call
= get_call_rtx_from (insn
);
1859 symbol
= XEXP (XEXP (call
, 0), 0);
1860 if (GET_CODE (symbol
) != SYMBOL_REF
)
1862 fndecl
= SYMBOL_REF_DECL (symbol
);
1866 return aarch64_simd_decl_p (fndecl
);
1869 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1870 a function that uses the SIMD ABI, take advantage of the extra
1871 call-preserved registers that the ABI provides. */
1874 aarch64_remove_extra_call_preserved_regs (rtx_insn
*insn
,
1875 HARD_REG_SET
*return_set
)
1877 if (aarch64_simd_call_p (insn
))
1879 for (int regno
= 0; regno
< FIRST_PSEUDO_REGISTER
; regno
++)
1880 if (FP_SIMD_SAVED_REGNUM_P (regno
))
1881 CLEAR_HARD_REG_BIT (*return_set
, regno
);
1885 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1886 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1887 clobbers the top 64 bits when restoring the bottom 64 bits. */
1890 aarch64_hard_regno_call_part_clobbered (rtx_insn
*insn
, unsigned int regno
,
1893 bool simd_p
= insn
&& CALL_P (insn
) && aarch64_simd_call_p (insn
);
1894 return FP_REGNUM_P (regno
)
1895 && maybe_gt (GET_MODE_SIZE (mode
), simd_p
? 16 : 8);
1898 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1901 aarch64_return_call_with_max_clobbers (rtx_insn
*call_1
, rtx_insn
*call_2
)
1903 gcc_assert (CALL_P (call_1
) && CALL_P (call_2
));
1905 if (!aarch64_simd_call_p (call_1
) || aarch64_simd_call_p (call_2
))
1911 /* Implement REGMODE_NATURAL_SIZE. */
1913 aarch64_regmode_natural_size (machine_mode mode
)
1915 /* The natural size for SVE data modes is one SVE data vector,
1916 and similarly for predicates. We can't independently modify
1917 anything smaller than that. */
1918 /* ??? For now, only do this for variable-width SVE registers.
1919 Doing it for constant-sized registers breaks lower-subreg.c. */
1920 /* ??? And once that's fixed, we should probably have similar
1921 code for Advanced SIMD. */
1922 if (!aarch64_sve_vg
.is_constant ())
1924 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1925 if (vec_flags
& VEC_SVE_PRED
)
1926 return BYTES_PER_SVE_PRED
;
1927 if (vec_flags
& VEC_SVE_DATA
)
1928 return BYTES_PER_SVE_VECTOR
;
1930 return UNITS_PER_WORD
;
1933 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1935 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned,
1938 /* The predicate mode determines which bits are significant and
1939 which are "don't care". Decreasing the number of lanes would
1940 lose data while increasing the number of lanes would make bits
1941 unnecessarily significant. */
1942 if (PR_REGNUM_P (regno
))
1944 if (known_ge (GET_MODE_SIZE (mode
), 4))
/* Return true if I's bits are consecutive ones from the MSB.  */
static bool
aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
{
  return exact_log2 (-i) != HOST_WIDE_INT_M1;
}
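/* For example, 0xffffffffffff0000 passes the test (its negation is 1 << 16),
   whereas 0x00ff000000000000 fails because its run of set bits does not start
   at the most significant bit.  */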
/* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
   that strcpy from constants will be faster.  */
static HOST_WIDE_INT
aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
{
  if (TREE_CODE (exp) == STRING_CST && !optimize_size)
    return MAX (align, BITS_PER_WORD);
  return align;
}
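/* For instance, with BITS_PER_WORD being 64 on AArch64, a short string
   literal that would otherwise get byte alignment is promoted to 64-bit
   alignment unless we are optimizing for size.  */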
1968 /* Return true if calls to DECL should be treated as
1969 long-calls (ie called via a register). */
1971 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
1976 /* Return true if calls to symbol-ref SYM should be treated as
1977 long-calls (ie called via a register). */
1979 aarch64_is_long_call_p (rtx sym
)
1981 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
1984 /* Return true if calls to symbol-ref SYM should not go through
1988 aarch64_is_noplt_call_p (rtx sym
)
1990 const_tree decl
= SYMBOL_REF_DECL (sym
);
1995 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
1996 && !targetm
.binds_local_p (decl
))
/* Return true if the offsets to a zero/sign-extract operation
   represent an expression that matches an extend operation.  The
   operands represent the parameters from

   (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
bool
aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
				rtx extract_imm)
{
  HOST_WIDE_INT mult_val, extract_val;

  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
    return false;

  mult_val = INTVAL (mult_imm);
  extract_val = INTVAL (extract_imm);

  if (extract_val > 8
      && extract_val < GET_MODE_BITSIZE (mode)
      && exact_log2 (extract_val & ~7) > 0
      && (extract_val & 7) <= 4
      && mult_val == (1 << (extract_val & 7)))
    return true;

  return false;
}
2029 /* Emit an insn that's a simple single-set. Both the operands must be
2030 known to be valid. */
2031 inline static rtx_insn
*
2032 emit_set_insn (rtx x
, rtx y
)
2034 return emit_insn (gen_rtx_SET (x
, y
));
2037 /* X and Y are two things to compare using CODE. Emit the compare insn and
2038 return the rtx for register 0 in the proper mode. */
2040 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
2042 machine_mode cmp_mode
= GET_MODE (x
);
2043 machine_mode cc_mode
;
2046 if (cmp_mode
== TImode
)
2048 gcc_assert (code
== NE
);
2051 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2053 rtx x_lo
= operand_subword (x
, 0, 0, TImode
);
2054 rtx y_lo
= operand_subword (y
, 0, 0, TImode
);
2055 emit_set_insn (cc_reg
, gen_rtx_COMPARE (cc_mode
, x_lo
, y_lo
));
2057 rtx x_hi
= operand_subword (x
, 1, 0, TImode
);
2058 rtx y_hi
= operand_subword (y
, 1, 0, TImode
);
2059 emit_insn (gen_ccmpdi (cc_reg
, cc_reg
, x_hi
, y_hi
,
2060 gen_rtx_EQ (cc_mode
, cc_reg
, const0_rtx
),
2061 GEN_INT (AARCH64_EQ
)));
2065 cc_mode
= SELECT_CC_MODE (code
, x
, y
);
2066 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2067 emit_set_insn (cc_reg
, gen_rtx_COMPARE (cc_mode
, x
, y
));
2072 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2075 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code
, rtx x
, rtx y
,
2076 machine_mode y_mode
)
2078 if (y_mode
== E_QImode
|| y_mode
== E_HImode
)
2080 if (CONST_INT_P (y
))
2081 y
= GEN_INT (INTVAL (y
) & GET_MODE_MASK (y_mode
));
2085 machine_mode cc_mode
;
2087 t
= gen_rtx_ZERO_EXTEND (SImode
, y
);
2088 t
= gen_rtx_COMPARE (CC_SWPmode
, t
, x
);
2089 cc_mode
= CC_SWPmode
;
2090 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2091 emit_set_insn (cc_reg
, t
);
2096 if (!aarch64_plus_operand (y
, y_mode
))
2097 y
= force_reg (y_mode
, y
);
2099 return aarch64_gen_compare_reg (code
, x
, y
);
2102 /* Build the SYMBOL_REF for __tls_get_addr. */
2104 static GTY(()) rtx tls_get_addr_libfunc
;
2107 aarch64_tls_get_addr (void)
2109 if (!tls_get_addr_libfunc
)
2110 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
2111 return tls_get_addr_libfunc
;
2114 /* Return the TLS model to use for ADDR. */
2116 static enum tls_model
2117 tls_symbolic_operand_type (rtx addr
)
2119 enum tls_model tls_kind
= TLS_MODEL_NONE
;
2120 if (GET_CODE (addr
) == CONST
)
2123 rtx sym
= strip_offset (addr
, &addend
);
2124 if (GET_CODE (sym
) == SYMBOL_REF
)
2125 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
2127 else if (GET_CODE (addr
) == SYMBOL_REF
)
2128 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
/* We'll allow lo_sum's in addresses in our legitimate addresses
   so that combine would take care of combining addresses where
   necessary, but for generation purposes, we'll generate the address
   as:

   RTL                               Absolute
   tmp = hi (symbol_ref);            adrp  x1, foo
   dest = lo_sum (tmp, symbol_ref);  add   dest, x1, :lo_12:foo

   PIC                               TLS
   adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
   ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
                                     bl   __tls_get_addr
                                     nop

   Load TLS symbol, depending on TLS mechanism and TLS access model.

   Global Dynamic - Traditional TLS:
   adrp tmp, :tlsgd:imm
   add  dest, tmp, #:tlsgd_lo12:imm
   bl   __tls_get_addr

   Global Dynamic - TLS Descriptors:
   adrp dest, :tlsdesc:imm
   ldr  tmp, [dest, #:tlsdesc_lo12:imm]
   add  dest, dest, #:tlsdesc_lo12:imm
   blr  tmp

   Initial Exec:
   mrs  tp, tpidr_el0
   adrp tmp, :gottprel:imm
   ldr  dest, [tmp, #:gottprel_lo12:imm]
   add  dest, dest, tp

   Local Exec:
   mrs  tp, tpidr_el0
   add  t0, tp, #:tprel_hi12:imm, lsl #12
   add  t0, t0, #:tprel_lo12_nc:imm
*/

static void
aarch64_load_symref_appropriately (rtx dest, rtx imm,
				   enum aarch64_symbol_type type)
{
  switch (type)
    {
    case SYMBOL_SMALL_ABSOLUTE:
2183 /* In ILP32, the mode of dest can be either SImode or DImode. */
2185 machine_mode mode
= GET_MODE (dest
);
2187 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2189 if (can_create_pseudo_p ())
2190 tmp_reg
= gen_reg_rtx (mode
);
2192 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2193 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
2197 case SYMBOL_TINY_ABSOLUTE
:
2198 emit_insn (gen_rtx_SET (dest
, imm
));
2201 case SYMBOL_SMALL_GOT_28K
:
2203 machine_mode mode
= GET_MODE (dest
);
2204 rtx gp_rtx
= pic_offset_table_rtx
;
	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
	   here before rtl expand.  Tree IVOPT will generate an rtl pattern
	   to decide rtx costs, in which case pic_offset_table_rtx is not
	   initialized.  For that case there is no need to generate the
	   first adrp instruction, as the final cost for global variable
	   access is amortized.  */

	/* -fpic for -mcmodel=small allows a 32K GOT table size (but because
	   we use the page base as the GOT base, the first page may be
	   wasted; in the worst case only 28K of space is left for the GOT).

	   The generated instruction sequence for accessing a global
	   variable is:

	     ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]

	   Only one instruction is needed, but we must initialize
	   pic_offset_table_rtx properly.  We generate an initializing insn
	   for every global access, and let CSE remove the redundant copies.

	   The final instruction sequence for accessing multiple global
	   variables will look like:

	     adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_

	     ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
	     ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
	     ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
	     ...  */
2239 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
2240 crtl
->uses_pic_offset_table
= 1;
2241 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
2243 if (mode
!= GET_MODE (gp_rtx
))
2244 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
2248 if (mode
== ptr_mode
)
2251 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
2253 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
2255 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2259 gcc_assert (mode
== Pmode
);
2261 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
2262 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
	/* The operand is expected to be a MEM.  Whenever the related insn
	   pattern changes, the code above that computes MEM should be
	   updated accordingly.  */
	gcc_assert (GET_CODE (mem) == MEM);
	MEM_READONLY_P (mem) = 1;
	MEM_NOTRAP_P (mem) = 1;
2275 case SYMBOL_SMALL_GOT_4G
:
      /* In ILP32, the mode of dest can be either SImode or DImode,
	 while the got entry is always of SImode size.  The mode of
	 dest depends on how dest is used: if dest is assigned to a
	 pointer (e.g. in memory), it has SImode; it may have DImode
	 if dest is dereferenced to access memory.  This is why we
	 have to handle three different ldr_got_small patterns here
	 (two patterns for ILP32).  */
2288 machine_mode mode
= GET_MODE (dest
);
2290 if (can_create_pseudo_p ())
2291 tmp_reg
= gen_reg_rtx (mode
);
2293 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2294 if (mode
== ptr_mode
)
2297 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
2299 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
2301 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2305 gcc_assert (mode
== Pmode
);
2307 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
2308 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2311 gcc_assert (GET_CODE (mem
) == MEM
);
2312 MEM_READONLY_P (mem
) = 1;
2313 MEM_NOTRAP_P (mem
) = 1;
2318 case SYMBOL_SMALL_TLSGD
:
2321 machine_mode mode
= GET_MODE (dest
);
2322 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
2326 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
2328 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
2329 insns
= get_insns ();
2332 RTL_CONST_CALL_P (insns
) = 1;
2333 emit_libcall_block (insns
, dest
, result
, imm
);
2337 case SYMBOL_SMALL_TLSDESC
:
2339 machine_mode mode
= GET_MODE (dest
);
2340 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
2343 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2345 /* In ILP32, the got entry is always of SImode size. Unlike
2346 small GOT, the dest is fixed at reg 0. */
2348 emit_insn (gen_tlsdesc_small_si (imm
));
2350 emit_insn (gen_tlsdesc_small_di (imm
));
2351 tp
= aarch64_load_tp (NULL
);
2354 tp
= gen_lowpart (mode
, tp
);
2356 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
2358 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2362 case SYMBOL_SMALL_TLSIE
:
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the got entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in memory), it has SImode; it may have DImode
	   if dest is dereferenced to access memory.  This is why we
	   have to handle three different tlsie_small patterns here
	   (two patterns for ILP32).  */
2371 machine_mode mode
= GET_MODE (dest
);
2372 rtx tmp_reg
= gen_reg_rtx (mode
);
2373 rtx tp
= aarch64_load_tp (NULL
);
2375 if (mode
== ptr_mode
)
2378 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
2381 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
2382 tp
= gen_lowpart (mode
, tp
);
2387 gcc_assert (mode
== Pmode
);
2388 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
2391 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
2393 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2397 case SYMBOL_TLSLE12
:
2398 case SYMBOL_TLSLE24
:
2399 case SYMBOL_TLSLE32
:
2400 case SYMBOL_TLSLE48
:
2402 machine_mode mode
= GET_MODE (dest
);
2403 rtx tp
= aarch64_load_tp (NULL
);
2406 tp
= gen_lowpart (mode
, tp
);
2410 case SYMBOL_TLSLE12
:
2411 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
2414 case SYMBOL_TLSLE24
:
2415 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
2418 case SYMBOL_TLSLE32
:
2419 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
2421 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2424 case SYMBOL_TLSLE48
:
2425 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
2427 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2435 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2439 case SYMBOL_TINY_GOT
:
2440 emit_insn (gen_ldr_got_tiny (dest
, imm
));
2443 case SYMBOL_TINY_TLSIE
:
2445 machine_mode mode
= GET_MODE (dest
);
2446 rtx tp
= aarch64_load_tp (NULL
);
2448 if (mode
== ptr_mode
)
2451 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
2454 tp
= gen_lowpart (mode
, tp
);
2455 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
2460 gcc_assert (mode
== Pmode
);
2461 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
2465 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
/* Emit a move from SRC to DEST.  Assume that the move expanders can
   handle all moves if !can_create_pseudo_p ().  The distinction is
   important because, unlike emit_move_insn, the move expanders know
   how to force Pmode objects into the constant pool even when the
   constant pool address is not itself legitimate.  */
static rtx_insn *
aarch64_emit_move (rtx dest, rtx src)
{
  return (can_create_pseudo_p ()
	  ? emit_move_insn (dest, src)
	  : emit_move_insn_1 (dest, src));
}
/* Apply UNOPTAB to OP and store the result in DEST.  */

static void
aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
{
  rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}

/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */

static void
aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
{
  rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
			  OPTAB_DIRECT);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}
/* Split a 128-bit move operation into two 64-bit move operations,
   taking care to handle partial overlap of register to register
   copies.  Special cases are needed when moving between GP regs and
   FP regs.  SRC can be a register, constant or memory; DST a register
   or memory.  If either operand is memory it must not have any side
   effects.  */
void
aarch64_split_128bit_move (rtx dst, rtx src)
{
  rtx dst_lo, dst_hi;
  rtx src_lo, src_hi;

  machine_mode mode = GET_MODE (dst);

  gcc_assert (mode == TImode || mode == TFmode);
  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);

  if (REG_P (dst) && REG_P (src))
    {
      int src_regno = REGNO (src);
      int dst_regno = REGNO (dst);

      /* Handle FP <-> GP regs.  */
      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
	{
	  src_lo = gen_lowpart (word_mode, src);
	  src_hi = gen_highpart (word_mode, src);

	  emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
	  emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
	  return;
	}
      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
	{
	  dst_lo = gen_lowpart (word_mode, dst);
	  dst_hi = gen_highpart (word_mode, dst);

	  emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
	  emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
	  return;
	}
    }

  dst_lo = gen_lowpart (word_mode, dst);
  dst_hi = gen_highpart (word_mode, dst);
  src_lo = gen_lowpart (word_mode, src);
  src_hi = gen_highpart_mode (word_mode, mode, src);

  /* At most one pairing may overlap.  */
  if (reg_overlap_mentioned_p (dst_lo, src_hi))
    {
      aarch64_emit_move (dst_hi, src_hi);
      aarch64_emit_move (dst_lo, src_lo);
    }
  else
    {
      aarch64_emit_move (dst_lo, src_lo);
      aarch64_emit_move (dst_hi, src_hi);
    }
}

bool
aarch64_split_128bit_move_p (rtx dst, rtx src)
{
  return (! REG_P (src)
	  || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
}
/* Split a complex SIMD combine.  */

void
aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
{
  machine_mode src_mode = GET_MODE (src1);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));
  gcc_assert (register_operand (dst, dst_mode)
	      && register_operand (src1, src_mode)
	      && register_operand (src2, src_mode));

  emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
}

/* Split a complex SIMD move.  */

void
aarch64_split_simd_move (rtx dst, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src))
    {
      gcc_assert (VECTOR_MODE_P (src_mode));
      emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
    }
}
bool
aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
			      machine_mode ymode, rtx y)
{
  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
  gcc_assert (r != NULL);
  return rtx_equal_p (x, r);
}
/* Return TARGET if it is nonnull and a register of mode MODE.
   Otherwise, return a fresh register of mode MODE if we can,
   or TARGET reinterpreted as MODE if we can't.  */

static rtx
aarch64_target_reg (rtx target, machine_mode mode)
{
  if (target && REG_P (target) && GET_MODE (target) == mode)
    return target;
  if (!can_create_pseudo_p ())
    {
      gcc_assert (target);
      return gen_lowpart (mode, target);
    }
  return gen_reg_rtx (mode);
}
/* Return a register that contains the constant in BUILDER, given that
   the constant is a legitimate move operand.  Use TARGET as the register
   if it is nonnull and convenient.  */

static rtx
aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
{
  rtx src = builder.build ();
  target = aarch64_target_reg (target, GET_MODE (src));
  emit_insn (gen_rtx_SET (target, src));
  return target;
}
static rtx
aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
{
  if (can_create_pseudo_p ())
    return force_reg (mode, value);
  else
    {
      gcc_assert (x);
      aarch64_emit_move (x, value);
      return x;
    }
}
/* Return true if predicate value X is a constant in which every element
   is a CONST_INT.  When returning true, describe X in BUILDER as a VNx16BI
   value, i.e. as a predicate in which all bits are significant.  */

static bool
aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
{
  if (GET_CODE (x) != CONST_VECTOR)
    return false;

  unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
					      GET_MODE_NUNITS (GET_MODE (x)));
  unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
  unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
  builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);

  unsigned int nelts = const_vector_encoded_nelts (x);
  for (unsigned int i = 0; i < nelts; ++i)
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
      if (!CONST_INT_P (elt))
	return false;

      builder.quick_push (elt);
      for (unsigned int j = 1; j < factor; ++j)
	builder.quick_push (const0_rtx);
    }
  builder.finalize ();
  return true;
}
/* BUILDER contains a predicate constant of mode VNx16BI.  Return the
   widest predicate element size it can have (that is, the largest size
   for which each element would still be 0 or 1).  */

unsigned int
aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
{
  /* Start with the most optimistic assumption: that we only need
     one bit per pattern.  This is what we will use if only the first
     bit in each pattern is ever set.  */
  unsigned int mask = GET_MODE_SIZE (DImode);
  mask |= builder.npatterns ();

  /* Look for set bits.  */
  unsigned int nelts = builder.encoded_nelts ();
  for (unsigned int i = 1; i < nelts; ++i)
    if (INTVAL (builder.elt (i)) != 0)
      {
	if (i & 1)
	  return 1;
	mask |= i;
      }
  return mask & -mask;
}
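/* As a worked illustration (an editorial sketch, not part of the build;
   the helper name below is hypothetical): collecting the index of every
   set bit into MASK and then taking MASK & -MASK yields the largest power
   of two that divides all of those indices, which is exactly the widest
   usable predicate element size.  */
#if 0
#include <stdint.h>

static unsigned int
widest_pred_elt_size_model (const uint8_t *bits, unsigned int nbits)
{
  unsigned int mask = 8;	/* 8 bytes: one bit per .D element.  */
  for (unsigned int i = 1; i < nbits; ++i)
    if (bits[i])
      mask |= i;
  return mask & -mask;		/* e.g. bits set at 0, 4 and 12 -> 4.  */
}
#endif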
/* BUILDER is a predicate constant of mode VNx16BI.  Consider the value
   that the constant would have with predicate element size ELT_SIZE
   (ignoring the upper bits in each element) and return:

   * -1 if all bits are set
   * N if the predicate has N leading set bits followed by all clear bits
   * 0 if the predicate does not have any of these forms.  */

static int
aarch64_partial_ptrue_length (rtx_vector_builder &builder,
			      unsigned int elt_size)
{
  /* If nelts_per_pattern is 3, we have set bits followed by clear bits
     followed by set bits.  */
  if (builder.nelts_per_pattern () == 3)
    return 0;

  /* Skip over leading set bits.  */
  unsigned int nelts = builder.encoded_nelts ();
  unsigned int i = 0;
  for (; i < nelts; i += elt_size)
    if (INTVAL (builder.elt (i)) == 0)
      break;
  unsigned int vl = i / elt_size;

  /* Check for the all-true case.  */
  if (i == nelts)
    return -1;

  /* If nelts_per_pattern is 1, then either VL is zero, or we have a
     repeating pattern of set bits followed by clear bits.  */
  if (builder.nelts_per_pattern () != 2)
    return vl;

  /* We have a "foreground" value and a duplicated "background" value.
     If the background might repeat and the last set bit belongs to it,
     we might have set bits followed by clear bits followed by set bits.  */
  if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
    return 0;

  /* Make sure that the rest are all clear.  */
  for (; i < nelts; i += elt_size)
    if (INTVAL (builder.elt (i)) != 0)
      return 0;

  return vl;
}
/* See if there is an svpattern that encodes an SVE predicate of mode
   PRED_MODE in which the first VL bits are set and the rest are clear.
   Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
   A VL of -1 indicates an all-true vector.  */

static aarch64_svpattern
aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
{
  if (vl < 0)
    return AARCH64_SV_ALL;

  if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
    return AARCH64_NUM_SVPATTERNS;

  if (vl >= 1 && vl <= 8)
    return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));

  if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
    return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));

  int max_vl;
  if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
    {
      if (vl == (max_vl / 3) * 3)
	return AARCH64_SV_MUL3;
      /* These would only trigger for non-power-of-2 lengths.  */
      if (vl == (max_vl & -4))
	return AARCH64_SV_MUL4;
      if (vl == (1 << floor_log2 (max_vl)))
	return AARCH64_SV_POW2;
      if (vl == max_vl)
	return AARCH64_SV_ALL;
    }
  return AARCH64_NUM_SVPATTERNS;
}
/* Return a VNx16BImode constant in which every sequence of ELT_SIZE
   bits has the lowest bit set and the upper bits clear.  This is the
   VNx16BImode equivalent of a PTRUE for controlling elements of
   ELT_SIZE bytes.  However, because the constant is VNx16BImode,
   all bits are significant, even the upper zeros.  */

rtx
aarch64_ptrue_all (unsigned int elt_size)
{
  rtx_vector_builder builder (VNx16BImode, elt_size, 1);
  builder.quick_push (const1_rtx);
  for (unsigned int i = 1; i < elt_size; ++i)
    builder.quick_push (const0_rtx);
  return builder.build ();
}
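/* Illustrative sketch (not part of the build; the helper below is
   hypothetical): for ELT_SIZE == 4 the constant built above is the
   repeating VNx16BI bit pattern { 1, 0, 0, 0, 1, 0, 0, 0, ... }, i.e.
   the all-bits-significant image of a PTRUE over .S elements.  */
#if 0
#include <stdint.h>

static void
ptrue_all_model (uint8_t *bits, unsigned int nbits, unsigned int elt_size)
{
  /* One significant (low) bit per element; the padding bits are zero.  */
  for (unsigned int i = 0; i < nbits; ++i)
    bits[i] = (i % elt_size) == 0;
}
#endif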
/* Return an all-true predicate register of mode MODE.  */

rtx
aarch64_ptrue_reg (machine_mode mode)
{
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
  rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}

/* Return an all-false predicate register of mode MODE.  */

rtx
aarch64_pfalse_reg (machine_mode mode)
{
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
  rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}
/* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
   true, or alternatively if we know that the operation predicated by
   PRED1[0] is safe to perform whenever PRED2 is true.  PRED1[1] is an
   aarch64_sve_gp_strictness operand that describes the operation
   predicated by PRED1[0].  */

bool
aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
{
  machine_mode mode = GET_MODE (pred2);
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
	      && mode == GET_MODE (pred1[0])
	      && aarch64_sve_gp_strictness (pred1[1], SImode));
  return (pred1[0] == CONSTM1_RTX (mode)
	  || INTVAL (pred1[1]) == SVE_RELAXED_GP
	  || rtx_equal_p (pred1[0], pred2));
}
/* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
   for it.  PRED2[0] is the predicate for the instruction whose result
   is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
   for it.  Return true if we can prove that the two predicates are
   equivalent for PTEST purposes; that is, if we can replace PRED2[0]
   with PRED1[0] without changing behavior.  */

bool
aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
{
  machine_mode mode = GET_MODE (pred1[0]);
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
	      && mode == GET_MODE (pred2[0])
	      && aarch64_sve_ptrue_flag (pred1[1], SImode)
	      && aarch64_sve_ptrue_flag (pred2[1], SImode));

  bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
		   || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
  bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
		   || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
  return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
}
/* Emit a comparison CMP between OP0 and OP1, both of which have mode
   DATA_MODE, and return the result in a predicate of mode PRED_MODE.
   Use TARGET as the target register if nonnull and convenient.  */

static rtx
aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
			  machine_mode data_mode, rtx op1, rtx op2)
{
  insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
  expand_operand ops[5];
  create_output_operand (&ops[0], target, pred_mode);
  create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
  create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
  create_input_operand (&ops[3], op1, data_mode);
  create_input_operand (&ops[4], op2, data_mode);
  expand_insn (icode, 5, ops);
  return ops[0].value;
}
/* Use a comparison to convert integer vector SRC into MODE, which is
   the corresponding SVE predicate mode.  Use TARGET for the result
   if it's nonnull and convenient.  */

static rtx
aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
				   src, CONST0_RTX (src_mode));
}
/* Return true if we can move VALUE into a register using a single
   CNT[BHWD] instruction.  */

bool
aarch64_sve_cnt_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
  return (value.coeffs[1] == factor
	  && IN_RANGE (factor, 2, 16 * 16)
	  && (factor & 1) == 0
	  && factor <= 16 * (factor & -factor));
}

/* Likewise for rtx X.  */

bool
aarch64_sve_cnt_immediate_p (rtx x)
{
  poly_int64 value;
  return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
}
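/* Illustrative sketch (not part of the build; the helper below is
   hypothetical): the check above accepts a value that scales purely with
   the vector length by an even factor of at most 256 that is expressible
   as [1, 16] elements of one size per 128-bit quadword.  */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
sve_cnt_immediate_model (int64_t coeff0, int64_t coeff1)
{
  int64_t factor = coeff0;
  return (coeff1 == factor
	  && factor >= 2 && factor <= 16 * 16
	  && (factor & 1) == 0
	  && factor <= 16 * (factor & -factor));
}

/* Examples: 6 is accepted (CNTD ..., MUL #3), 48 is accepted
   (CNTB ..., MUL #3), 34 is rejected (it would need a multiplier of 17).  */
#endif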
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  PATTERN is the pattern to use.  FACTOR is the
   number of quadwords.  NELTS_PER_VQ, if nonzero, is the number of elements
   in each quadword.  If it is zero, we can use any element size.  */

static char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  aarch64_svpattern pattern,
				  unsigned int factor,
				  unsigned int nelts_per_vq)
{
  static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];

  if (nelts_per_vq == 0)
    /* There is some overlap in the ranges of the four CNT instructions.
       Here we always use the smallest possible element size, so that the
       multiplier is 1 wherever possible.  */
    nelts_per_vq = factor & -factor;
  int shift = std::min (exact_log2 (nelts_per_vq), 4);
  gcc_assert (IN_RANGE (shift, 1, 4));
  char suffix = "dwhb"[shift - 1];

  factor >>= shift;
  unsigned int written;
  if (pattern == AARCH64_SV_ALL && factor == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
			prefix, suffix, operands);
  else if (factor == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
			prefix, suffix, operands, svpattern_token (pattern));
  else
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
			prefix, suffix, operands, svpattern_token (pattern),
			factor);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  X is the value of the vector size operand,
   as a polynomial integer rtx; we need to convert this into an "all"
   pattern with a multiplier.  */

char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  rtx x)
{
  poly_int64 value = rtx_to_poly_int64 (x);
  gcc_assert (aarch64_sve_cnt_immediate_p (value));
  return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
					   value.coeffs[1], 0);
}
/* Return true if we can add X using a single SVE INC or DEC instruction.  */

bool
aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && (aarch64_sve_cnt_immediate_p (value)
	      || aarch64_sve_cnt_immediate_p (-value)));
}

/* Return the asm string for adding SVE INC/DEC immediate OFFSET to
   operand 1 and storing the result in operand 0.  */

char *
aarch64_output_sve_scalar_inc_dec (rtx offset)
{
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
  if (offset_value.coeffs[1] > 0)
    return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
					     offset_value.coeffs[1], 0);
  else
    return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
					     -offset_value.coeffs[1], 0);
}
/* Return true if we can add VALUE to a register using a single ADDVL
   or ADDPL instruction.  */

bool
aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  if (factor == 0 || value.coeffs[1] != factor)
    return false;
  /* FACTOR counts VG / 2, so a value of 2 is one predicate width
     and a value of 16 is one vector width.  */
  return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
	  || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
}

/* Likewise for rtx X.  */

bool
aarch64_sve_addvl_addpl_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && aarch64_sve_addvl_addpl_immediate_p (value));
}
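/* Illustrative sketch (not part of the build; the helper below is
   hypothetical): ADDVL takes a whole number of vectors in [-32, 31]
   (a factor that is a multiple of 16), ADDPL a whole number of
   predicates in [-32, 31] (an even factor).  */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
addvl_addpl_immediate_model (int64_t coeff0, int64_t coeff1)
{
  int64_t factor = coeff0;
  if (factor == 0 || coeff1 != factor)
    return false;
  /* FACTOR counts VG / 2: 16 is one vector, 2 is one predicate.  */
  return ((factor % 16 == 0 && factor >= -32 * 16 && factor <= 31 * 16)
	  || (factor % 2 == 0 && factor >= -32 * 2 && factor <= 31 * 2));
}

/* Examples: 32 is accepted (ADDVL #2), -6 is accepted (ADDPL #-3),
   72 is rejected (not a whole number of vectors, and 36 predicates is
   outside the ADDPL range).  */
#endif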
/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
   to operand 1 and storing the result in operand 0.  */

char *
aarch64_output_sve_addvl_addpl (rtx offset)
{
  static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));

  int factor = offset_value.coeffs[1];
  if ((factor & 15) == 0)
    snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
  else
    snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
  return buffer;
}
/* Return true if X is a valid immediate for an SVE vector INC or DEC
   instruction.  If it is, store the number of elements in each vector
   quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
   factor in *FACTOR_OUT (if nonnull).  */

bool
aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
					unsigned int *nelts_per_vq_out)
{
  rtx elt;
  poly_int64 value;

  if (!const_vec_duplicate_p (x, &elt)
      || !poly_int_rtx_p (elt, &value))
    return false;

  unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
  if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
    /* There's no vector INCB.  */
    return false;

  HOST_WIDE_INT factor = value.coeffs[0];
  if (value.coeffs[1] != factor)
    return false;

  /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
  if ((factor % nelts_per_vq) != 0
      || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
    return false;

  if (factor_out)
    *factor_out = factor;
  if (nelts_per_vq_out)
    *nelts_per_vq_out = nelts_per_vq;
  return true;
}

/* Return true if X is a valid immediate for an SVE vector INC or DEC
   instruction.  */

bool
aarch64_sve_vector_inc_dec_immediate_p (rtx x)
{
  return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
}
/* Return the asm template for an SVE vector INC or DEC instruction.
   OPERANDS gives the operands before the vector count and X is the
   value of the vector count operand itself.  */

char *
aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
{
  int factor;
  unsigned int nelts_per_vq;
  if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
    gcc_unreachable ();
  if (factor < 0)
    return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
					     -factor, nelts_per_vq);
  else
    return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
					     factor, nelts_per_vq);
}
3129 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
3130 scalar_int_mode mode
)
3133 unsigned HOST_WIDE_INT val
, val2
, mask
;
3134 int one_match
, zero_match
;
3139 if (aarch64_move_imm (val
, mode
))
3142 emit_insn (gen_rtx_SET (dest
, imm
));
3146 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3147 (with XXXX non-zero). In that case check to see if the move can be done in
3149 val2
= val
& 0xffffffff;
3151 && aarch64_move_imm (val2
, SImode
)
3152 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
3155 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
3157 /* Check if we have to emit a second instruction by checking to see
3158 if any of the upper 32 bits of the original DI mode value is set. */
3162 i
= (val
>> 48) ? 48 : 32;
3165 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3166 GEN_INT ((val
>> i
) & 0xffff)));
3171 if ((val
>> 32) == 0 || mode
== SImode
)
3175 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
3177 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
3178 GEN_INT ((val
>> 16) & 0xffff)));
3180 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
3181 GEN_INT ((val
>> 16) & 0xffff)));
3186 /* Remaining cases are all for DImode. */
3189 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
3190 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
3191 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
3192 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
3194 if (zero_match
!= 2 && one_match
!= 2)
3196 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3197 For a 64-bit bitmask try whether changing 16 bits to all ones or
3198 zeroes creates a valid bitmask. To check any repeated bitmask,
3199 try using 16 bits from the other 32-bit half of val. */
3201 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
3204 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3207 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3209 val2
= val2
& ~mask
;
3210 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
3211 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3218 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
3219 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3220 GEN_INT ((val
>> i
) & 0xffff)));
3226 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3227 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3228 otherwise skip zero bits. */
3232 val2
= one_match
> zero_match
? ~val
: val
;
3233 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
3236 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
3237 ? (val
| ~(mask
<< i
))
3238 : (val
& (mask
<< i
)))));
3239 for (i
+= 16; i
< 64; i
+= 16)
3241 if ((val2
& (mask
<< i
)) == 0)
3244 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3245 GEN_INT ((val
>> i
) & 0xffff)));
/* Return whether imm is a 128-bit immediate which is simple enough to
   expand inline.  */
bool
aarch64_mov128_immediate (rtx imm)
{
  if (GET_CODE (imm) == CONST_INT)
    return true;

  gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);

  rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
  rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));

  return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
	 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
}
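/* Illustrative sketch (not part of the build; the helper below is
   hypothetical and only models the MOVZ/MOVN + MOVK strategy, so the real
   aarch64_internal_mov_immediate can return smaller counts when a bitmask
   immediate applies): each DImode half costs one MOVZ or MOVN plus one
   MOVK per remaining 16-bit chunk.  */
#if 0
#include <stdint.h>

static int
movz_movk_count_model (uint64_t val)
{
  int zero_chunks = 0, one_chunks = 0;
  for (int i = 0; i < 64; i += 16)
    {
      unsigned int chunk = (val >> i) & 0xffff;
      zero_chunks += (chunk == 0);
      one_chunks += (chunk == 0xffff);
    }
  /* Start from whichever of all-zeros (MOVZ) or all-ones (MOVN) leaves
     the fewest chunks to patch up with MOVK.  */
  int skipped = zero_chunks > one_chunks ? zero_chunks : one_chunks;
  int remaining = 4 - skipped;
  return remaining == 0 ? 1 : remaining;
}

/* Example: 0x0000ffff0000abcd -> 2 (MOVZ #0xabcd; MOVK #0xffff, LSL #32).  */
#endif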
/* Return the number of temporary registers that aarch64_add_offset_1
   would need to add OFFSET to a register.  */

static unsigned int
aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
{
  return abs_hwi (offset) < 0x1000000 ? 0 : 1;
}
3279 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3280 a non-polynomial OFFSET. MODE is the mode of the addition.
3281 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3282 be set and CFA adjustments added to the generated instructions.
3284 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3285 temporary if register allocation is already complete. This temporary
3286 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3287 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3288 the immediate again.
3290 Since this function may be used to adjust the stack pointer, we must
3291 ensure that it cannot cause transient stack deallocation (for example
3292 by first incrementing SP and then decrementing when adjusting by a
3293 large immediate). */
3296 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
3297 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
3298 bool frame_related_p
, bool emit_move_imm
)
3300 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3301 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
3303 HOST_WIDE_INT moffset
= abs_hwi (offset
);
3308 if (!rtx_equal_p (dest
, src
))
3310 insn
= emit_insn (gen_rtx_SET (dest
, src
));
3311 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3316 /* Single instruction adjustment. */
3317 if (aarch64_uimm12_shift (moffset
))
3319 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
3320 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3324 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3327 a) the offset cannot be loaded by a 16-bit move or
3328 b) there is no spare register into which we can move it. */
3329 if (moffset
< 0x1000000
3330 && ((!temp1
&& !can_create_pseudo_p ())
3331 || !aarch64_move_imm (moffset
, mode
)))
3333 HOST_WIDE_INT low_off
= moffset
& 0xfff;
3335 low_off
= offset
< 0 ? -low_off
: low_off
;
3336 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
3337 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3338 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
3339 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3343 /* Emit a move immediate if required and an addition/subtraction. */
3346 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
3347 temp1
= aarch64_force_temporary (mode
, temp1
, GEN_INT (moffset
));
3349 insn
= emit_insn (offset
< 0
3350 ? gen_sub3_insn (dest
, src
, temp1
)
3351 : gen_add3_insn (dest
, src
, temp1
));
3352 if (frame_related_p
)
3354 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3355 rtx adj
= plus_constant (mode
, src
, offset
);
3356 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
/* Return the number of temporary registers that aarch64_add_offset
   would need to move OFFSET into a register or add OFFSET to a register;
   ADD_P is true if we want the latter rather than the former.  */

static unsigned int
aarch64_offset_temporaries (bool add_p, poly_int64 offset)
{
  /* This follows the same structure as aarch64_add_offset.  */
  if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
    return 0;

  unsigned int count = 0;
  HOST_WIDE_INT factor = offset.coeffs[1];
  HOST_WIDE_INT constant = offset.coeffs[0] - factor;
  poly_int64 poly_offset (factor, factor);
  if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
    /* Need one register for the ADDVL/ADDPL result.  */
    count += 1;
  else if (factor != 0)
    {
      factor = abs (factor);
      if (factor > 16 * (factor & -factor))
	/* Need one register for the CNT result and one for the multiplication
	   factor.  If necessary, the second temporary can be reused for the
	   constant part of the offset.  */
	return 2;
      /* Need one register for the CNT result (which might then
	 be shifted).  */
      count += 1;
    }
  return count + aarch64_add_offset_1_temporaries (constant);
}
/* If X can be represented as a poly_int64, return the number
   of temporaries that are required to add it to a register.
   Return -1 otherwise.  */

int
aarch64_add_offset_temporaries (rtx x)
{
  poly_int64 offset;
  if (!poly_int_rtx_p (x, &offset))
    return -1;
  return aarch64_offset_temporaries (true, offset);
}
3406 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3407 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3408 be set and CFA adjustments added to the generated instructions.
3410 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3411 temporary if register allocation is already complete. This temporary
3412 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3413 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3414 false to avoid emitting the immediate again.
3416 TEMP2, if nonnull, is a second temporary register that doesn't
3417 overlap either DEST or REG.
3419 Since this function may be used to adjust the stack pointer, we must
3420 ensure that it cannot cause transient stack deallocation (for example
3421 by first incrementing SP and then decrementing when adjusting by a
3422 large immediate). */
3425 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
3426 poly_int64 offset
, rtx temp1
, rtx temp2
,
3427 bool frame_related_p
, bool emit_move_imm
= true)
3429 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3430 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
3431 gcc_assert (temp1
== NULL_RTX
3433 || !reg_overlap_mentioned_p (temp1
, dest
));
3434 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
3436 /* Try using ADDVL or ADDPL to add the whole value. */
3437 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
3439 rtx offset_rtx
= gen_int_mode (offset
, mode
);
3440 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3441 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3445 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3446 SVE vector register, over and above the minimum size of 128 bits.
3447 This is equivalent to half the value returned by CNTD with a
3448 vector shape of ALL. */
3449 HOST_WIDE_INT factor
= offset
.coeffs
[1];
3450 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
3452 /* Try using ADDVL or ADDPL to add the VG-based part. */
3453 poly_int64
poly_offset (factor
, factor
);
3454 if (src
!= const0_rtx
3455 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
3457 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
3458 if (frame_related_p
)
3460 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3461 RTX_FRAME_RELATED_P (insn
) = true;
3466 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
3467 src
= aarch64_force_temporary (mode
, temp1
, addr
);
3472 /* Otherwise use a CNT-based sequence. */
3473 else if (factor
!= 0)
3475 /* Use a subtraction if we have a negative factor. */
3476 rtx_code code
= PLUS
;
3483 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3484 into the multiplication. */
3488 /* Use a right shift by 1. */
3492 HOST_WIDE_INT low_bit
= factor
& -factor
;
3493 if (factor
<= 16 * low_bit
)
3495 if (factor
> 16 * 8)
3497 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3498 the value with the minimum multiplier and shift it into
3500 int extra_shift
= exact_log2 (low_bit
);
3501 shift
+= extra_shift
;
3502 factor
>>= extra_shift
;
3504 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
3508 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3509 directly, since that should increase the chances of being
3510 able to use a shift and add sequence. If LOW_BIT itself
3511 is out of range, just use CNTD. */
3512 if (low_bit
<= 16 * 8)
3517 val
= gen_int_mode (poly_int64 (low_bit
* 2, low_bit
* 2), mode
);
3518 val
= aarch64_force_temporary (mode
, temp1
, val
);
3520 if (can_create_pseudo_p ())
3522 rtx coeff1
= gen_int_mode (factor
, mode
);
3523 val
= expand_mult (mode
, val
, coeff1
, NULL_RTX
, false, true);
3527 /* Go back to using a negative multiplication factor if we have
3528 no register from which to subtract. */
3529 if (code
== MINUS
&& src
== const0_rtx
)
3534 rtx coeff1
= gen_int_mode (factor
, mode
);
3535 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
3536 val
= gen_rtx_MULT (mode
, val
, coeff1
);
3542 /* Multiply by 1 << SHIFT. */
3543 val
= aarch64_force_temporary (mode
, temp1
, val
);
3544 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
3546 else if (shift
== -1)
3549 val
= aarch64_force_temporary (mode
, temp1
, val
);
3550 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
3553 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3554 if (src
!= const0_rtx
)
3556 val
= aarch64_force_temporary (mode
, temp1
, val
);
3557 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
3559 else if (code
== MINUS
)
3561 val
= aarch64_force_temporary (mode
, temp1
, val
);
3562 val
= gen_rtx_NEG (mode
, val
);
3565 if (constant
== 0 || frame_related_p
)
3567 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
3568 if (frame_related_p
)
3570 RTX_FRAME_RELATED_P (insn
) = true;
3571 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
3572 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
3581 src
= aarch64_force_temporary (mode
, temp1
, val
);
3586 emit_move_imm
= true;
3589 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
3590 frame_related_p
, emit_move_imm
);
/* Like aarch64_add_offset, but the offset is given as an rtx rather
   than a poly_int64.  */

void
aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
			  rtx offset_rtx, rtx temp1, rtx temp2)
{
  aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
		      temp1, temp2, false);
}
/* Add DELTA to the stack pointer, marking the instructions frame-related.
   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
   if TEMP1 already contains abs (DELTA).  */

static inline void
aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
		      temp1, temp2, true, emit_move_imm);
}

/* Subtract DELTA from the stack pointer, marking the instructions
   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
   if nonnull.  */

static inline void
aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
		bool emit_move_imm = true)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
		      temp1, temp2, frame_related_p, emit_move_imm);
}
/* Set DEST to (vec_series BASE STEP).  */

static void
aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
{
  machine_mode mode = GET_MODE (dest);
  scalar_mode inner = GET_MODE_INNER (mode);

  /* Each operand can be a register or an immediate in the range [-16, 15].  */
  if (!aarch64_sve_index_immediate_p (base))
    base = force_reg (inner, base);
  if (!aarch64_sve_index_immediate_p (step))
    step = force_reg (inner, step);

  emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
}
3644 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3645 register of mode MODE. Use TARGET for the result if it's nonnull
3648 The two vector modes must have the same element mode. The behavior
3649 is to duplicate architectural lane N of SRC into architectural lanes
3650 N + I * STEP of the result. On big-endian targets, architectural
3651 lane 0 of an Advanced SIMD vector is the last element of the vector
3652 in memory layout, so for big-endian targets this operation has the
3653 effect of reversing SRC before duplicating it. Callers need to
3654 account for this. */
3657 aarch64_expand_sve_dupq (rtx target
, machine_mode mode
, rtx src
)
3659 machine_mode src_mode
= GET_MODE (src
);
3660 gcc_assert (GET_MODE_INNER (mode
) == GET_MODE_INNER (src_mode
));
3661 insn_code icode
= (BYTES_BIG_ENDIAN
3662 ? code_for_aarch64_vec_duplicate_vq_be (mode
)
3663 : code_for_aarch64_vec_duplicate_vq_le (mode
));
3666 expand_operand ops
[3];
3667 create_output_operand (&ops
[i
++], target
, mode
);
3668 create_output_operand (&ops
[i
++], src
, src_mode
);
3669 if (BYTES_BIG_ENDIAN
)
3671 /* Create a PARALLEL describing the reversal of SRC. */
3672 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (mode
);
3673 rtx sel
= aarch64_gen_stepped_int_parallel (nelts_per_vq
,
3674 nelts_per_vq
- 1, -1);
3675 create_fixed_operand (&ops
[i
++], sel
);
3677 expand_insn (icode
, i
, ops
);
3678 return ops
[0].value
;
/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
   the memory image into DEST.  Return true on success.  */

static bool
aarch64_expand_sve_ld1rq (rtx dest, rtx src)
{
  src = force_const_mem (GET_MODE (src), src);
  if (!src)
    return false;

  /* Make sure that the address is legitimate.  */
  if (!aarch64_sve_ld1rq_operand_p (src))
    {
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      src = replace_equiv_address (src, addr);
    }

  machine_mode mode = GET_MODE (dest);
  unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
  machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
  return true;
}
3706 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3707 SVE data mode and isn't a legitimate constant. Use TARGET for the
3708 result if convenient.
3710 The returned register can have whatever mode seems most natural
3711 given the contents of SRC. */
3714 aarch64_expand_sve_const_vector (rtx target
, rtx src
)
3716 machine_mode mode
= GET_MODE (src
);
3717 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
3718 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
3719 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
3720 unsigned int elt_bits
= GET_MODE_BITSIZE (elt_mode
);
3721 unsigned int encoded_bits
= npatterns
* nelts_per_pattern
* elt_bits
;
3723 if (nelts_per_pattern
== 1 && encoded_bits
== 128)
3725 /* The constant is a duplicated quadword but can't be narrowed
3726 beyond a quadword. Get the memory image of the first quadword
3727 as a 128-bit vector and try using LD1RQ to load it from memory.
3729 The effect for both endiannesses is to load memory lane N into
3730 architectural lanes N + I * STEP of the result. On big-endian
3731 targets, the layout of the 128-bit vector in an Advanced SIMD
3732 register would be different from its layout in an SVE register,
3733 but this 128-bit vector is a memory value only. */
3734 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
3735 rtx vq_value
= simplify_gen_subreg (vq_mode
, src
, mode
, 0);
3736 if (vq_value
&& aarch64_expand_sve_ld1rq (target
, vq_value
))
3740 if (nelts_per_pattern
== 1 && encoded_bits
< 128)
3742 /* The vector is a repeating sequence of 64 bits or fewer.
3743 See if we can load them using an Advanced SIMD move and then
3744 duplicate it to fill a vector. This is better than using a GPR
3745 move because it keeps everything in the same register file. */
3746 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
3747 rtx_vector_builder
builder (vq_mode
, npatterns
, 1);
3748 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3750 /* We want memory lane N to go into architectural lane N,
3751 so reverse for big-endian targets. The DUP .Q pattern
3752 has a compensating reverse built-in. */
3753 unsigned int srci
= BYTES_BIG_ENDIAN
? npatterns
- i
- 1 : i
;
3754 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, srci
));
3756 rtx vq_src
= builder
.build ();
3757 if (aarch64_simd_valid_immediate (vq_src
, NULL
))
3759 vq_src
= force_reg (vq_mode
, vq_src
);
3760 return aarch64_expand_sve_dupq (target
, mode
, vq_src
);
3763 /* Get an integer representation of the repeating part of Advanced
3764 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3765 which for big-endian targets is lane-swapped wrt a normal
3766 Advanced SIMD vector. This means that for both endiannesses,
3767 memory lane N of SVE vector SRC corresponds to architectural
3768 lane N of a register holding VQ_SRC. This in turn means that
3769 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3770 as a single 128-bit value) and thus that memory lane 0 of SRC is
3771 in the lsb of the integer. Duplicating the integer therefore
3772 ensures that memory lane N of SRC goes into architectural lane
3773 N + I * INDEX of the SVE register. */
3774 scalar_mode int_mode
= int_mode_for_size (encoded_bits
, 0).require ();
3775 rtx elt_value
= simplify_gen_subreg (int_mode
, vq_src
, vq_mode
, 0);
3778 /* Pretend that we had a vector of INT_MODE to start with. */
3779 elt_mode
= int_mode
;
3780 mode
= aarch64_full_sve_mode (int_mode
).require ();
3782 /* If the integer can be moved into a general register by a
3783 single instruction, do that and duplicate the result. */
3784 if (CONST_INT_P (elt_value
)
3785 && aarch64_move_imm (INTVAL (elt_value
), elt_mode
))
3787 elt_value
= force_reg (elt_mode
, elt_value
);
3788 return expand_vector_broadcast (mode
, elt_value
);
3791 else if (npatterns
== 1)
3792 /* We're duplicating a single value, but can't do better than
3793 force it to memory and load from there. This handles things
3794 like symbolic constants. */
3795 elt_value
= CONST_VECTOR_ENCODED_ELT (src
, 0);
3799 /* Load the element from memory if we can, otherwise move it into
3800 a register and use a DUP. */
3801 rtx op
= force_const_mem (elt_mode
, elt_value
);
3803 op
= force_reg (elt_mode
, elt_value
);
3804 return expand_vector_broadcast (mode
, op
);
3808 /* Try using INDEX. */
3810 if (const_vec_series_p (src
, &base
, &step
))
3812 aarch64_expand_vec_series (target
, base
, step
);
3816 /* From here on, it's better to force the whole constant to memory
3818 if (GET_MODE_NUNITS (mode
).is_constant ())
3821 /* Expand each pattern individually. */
3822 gcc_assert (npatterns
> 1);
3823 rtx_vector_builder builder
;
3824 auto_vec
<rtx
, 16> vectors (npatterns
);
3825 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3827 builder
.new_vector (mode
, 1, nelts_per_pattern
);
3828 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
3829 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
3830 vectors
.quick_push (force_reg (mode
, builder
.build ()));
3833 /* Use permutes to interleave the separate vectors. */
3834 while (npatterns
> 1)
3837 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3839 rtx tmp
= (npatterns
== 1 ? target
: gen_reg_rtx (mode
));
3840 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
3841 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
3845 gcc_assert (vectors
[0] == target
);
/* Use WHILE to set a predicate register of mode MODE in which the first
   VL bits are set and the rest are clear.  Use TARGET for the register
   if it's nonnull and convenient.  */

static rtx
aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
				 unsigned int vl)
{
  rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
  target = aarch64_target_reg (target, mode);
  emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
  return target;
}

static rtx
aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3866 /* BUILDER is a constant predicate in which the index of every set bit
3867 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3868 by inverting every element at a multiple of ELT_SIZE and EORing the
3869 result with an ELT_SIZE PTRUE.
3871 Return a register that contains the constant on success, otherwise
3872 return null. Use TARGET as the register if it is nonnull and
3876 aarch64_expand_sve_const_pred_eor (rtx target
, rtx_vector_builder
&builder
,
3877 unsigned int elt_size
)
3879 /* Invert every element at a multiple of ELT_SIZE, keeping the
3881 rtx_vector_builder
inv_builder (VNx16BImode
, builder
.npatterns (),
3882 builder
.nelts_per_pattern ());
3883 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
3884 if ((i
& (elt_size
- 1)) == 0 && INTVAL (builder
.elt (i
)) == 0)
3885 inv_builder
.quick_push (const1_rtx
);
3887 inv_builder
.quick_push (const0_rtx
);
3888 inv_builder
.finalize ();
3890 /* See if we can load the constant cheaply. */
3891 rtx inv
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, inv_builder
, false);
3895 /* EOR the result with an ELT_SIZE PTRUE. */
3896 rtx mask
= aarch64_ptrue_all (elt_size
);
3897 mask
= force_reg (VNx16BImode
, mask
);
3898 target
= aarch64_target_reg (target
, VNx16BImode
);
3899 emit_insn (gen_aarch64_pred_z (XOR
, VNx16BImode
, target
, mask
, inv
, mask
));
3903 /* BUILDER is a constant predicate in which the index of every set bit
3904 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3905 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3906 register on success, otherwise return null. Use TARGET as the register
3907 if nonnull and convenient. */
3910 aarch64_expand_sve_const_pred_trn (rtx target
, rtx_vector_builder
&builder
,
3911 unsigned int elt_size
,
3912 unsigned int permute_size
)
3914 /* We're going to split the constant into two new constants A and B,
3915 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3916 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3918 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3919 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3921 where _ indicates elements that will be discarded by the permute.
3923 First calculate the ELT_SIZEs for A and B. */
3924 unsigned int a_elt_size
= GET_MODE_SIZE (DImode
);
3925 unsigned int b_elt_size
= GET_MODE_SIZE (DImode
);
3926 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); i
+= elt_size
)
3927 if (INTVAL (builder
.elt (i
)) != 0)
3929 if (i
& permute_size
)
3930 b_elt_size
|= i
- permute_size
;
3934 a_elt_size
&= -a_elt_size
;
3935 b_elt_size
&= -b_elt_size
;
3937 /* Now construct the vectors themselves. */
3938 rtx_vector_builder
a_builder (VNx16BImode
, builder
.npatterns (),
3939 builder
.nelts_per_pattern ());
3940 rtx_vector_builder
b_builder (VNx16BImode
, builder
.npatterns (),
3941 builder
.nelts_per_pattern ());
3942 unsigned int nelts
= builder
.encoded_nelts ();
3943 for (unsigned int i
= 0; i
< nelts
; ++i
)
3944 if (i
& (elt_size
- 1))
3946 a_builder
.quick_push (const0_rtx
);
3947 b_builder
.quick_push (const0_rtx
);
3949 else if ((i
& permute_size
) == 0)
3951 /* The A and B elements are significant. */
3952 a_builder
.quick_push (builder
.elt (i
));
3953 b_builder
.quick_push (builder
.elt (i
+ permute_size
));
3957 /* The A and B elements are going to be discarded, so pick whatever
3958 is likely to give a nice constant. We are targeting element
3959 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3960 with the aim of each being a sequence of ones followed by
3961 a sequence of zeros. So:
3963 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3964 duplicate the last X_ELT_SIZE element, to extend the
3965 current sequence of ones or zeros.
3967 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3968 zero, so that the constant really does have X_ELT_SIZE and
3969 not a smaller size. */
3970 if (a_elt_size
> permute_size
)
3971 a_builder
.quick_push (const0_rtx
);
3973 a_builder
.quick_push (a_builder
.elt (i
- a_elt_size
));
3974 if (b_elt_size
> permute_size
)
3975 b_builder
.quick_push (const0_rtx
);
3977 b_builder
.quick_push (b_builder
.elt (i
- b_elt_size
));
3979 a_builder
.finalize ();
3980 b_builder
.finalize ();
3982 /* Try loading A into a register. */
3983 rtx_insn
*last
= get_last_insn ();
3984 rtx a
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, a_builder
, false);
3988 /* Try loading B into a register. */
3990 if (a_builder
!= b_builder
)
3992 b
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, b_builder
, false);
3995 delete_insns_since (last
);
4000 /* Emit the TRN1 itself. */
4001 machine_mode mode
= aarch64_sve_pred_mode (permute_size
).require ();
4002 target
= aarch64_target_reg (target
, mode
);
4003 emit_insn (gen_aarch64_sve (UNSPEC_TRN1
, mode
, target
,
4004 gen_lowpart (mode
, a
),
4005 gen_lowpart (mode
, b
)));
/* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
   constant in BUILDER into an SVE predicate register.  Return the register
   on success, otherwise return null.  Use TARGET for the register if
   nonnull and convenient.

   ALLOW_RECURSE_P is true if we can use methods that would call this
   function recursively.  */

static rtx
aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
                                 bool allow_recurse_p)
{
  if (builder.encoded_nelts () == 1)
    /* A PFALSE or a PTRUE .B ALL.  */
    return aarch64_emit_set_immediate (target, builder);

  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
  if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
    {
      /* If we can load the constant using PTRUE, use it as-is.  */
      machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
      if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
        return aarch64_emit_set_immediate (target, builder);

      /* Otherwise use WHILE to set the first VL bits.  */
      return aarch64_sve_move_pred_via_while (target, mode, vl);
    }

  if (!allow_recurse_p)
    return NULL_RTX;

  /* Try inverting the vector in element size ELT_SIZE and then EORing
     the result with an ELT_SIZE PTRUE.  */
  if (INTVAL (builder.elt (0)) == 0)
    if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
                                                     elt_size))
      return res;

  /* Try using TRN1 to permute two simpler constants.  */
  for (unsigned int i = elt_size; i <= 8; i *= 2)
    if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
                                                     elt_size, i))
      return res;

  return NULL_RTX;
}
/* Return an SVE predicate register that contains the VNx16BImode
   constant in BUILDER, without going through the move expanders.

   The returned register can have whatever mode seems most natural
   given the contents of BUILDER.  Use TARGET for the result if
   convenient.  */

static rtx
aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
{
  /* Try loading the constant using pure predicate operations.  */
  if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
    return res;

  /* Try forcing the constant to memory.  */
  if (builder.full_nelts ().is_constant ())
    if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
      {
        target = aarch64_target_reg (target, VNx16BImode);
        emit_move_insn (target, mem);
        return target;
      }

  /* The last resort is to load the constant as an integer and then
     compare it against zero.  Use -1 for set bits in order to increase
     the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
  rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
                                  builder.nelts_per_pattern ());
  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
    int_builder.quick_push (INTVAL (builder.elt (i))
                            ? constm1_rtx : const0_rtx);
  return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
                                           int_builder.build ());
}
/* Set DEST to immediate IMM.  */

void
aarch64_expand_mov_immediate (rtx dest, rtx imm)
{
  machine_mode mode = GET_MODE (dest);

  /* Check on what type of symbol it is.  */
  scalar_int_mode int_mode;
  if ((GET_CODE (imm) == SYMBOL_REF
       || GET_CODE (imm) == LABEL_REF
       || GET_CODE (imm) == CONST
       || GET_CODE (imm) == CONST_POLY_INT)
      && is_a <scalar_int_mode> (mode, &int_mode))
    {
      rtx mem;
      poly_int64 offset;
      HOST_WIDE_INT const_offset;
      enum aarch64_symbol_type sty;

      /* If we have (const (plus symbol offset)), separate out the offset
         before we start classifying the symbol.  */
      rtx base = strip_offset (imm, &offset);

      /* We must always add an offset involving VL separately, rather than
         folding it into the relocation.  */
      if (!offset.is_constant (&const_offset))
        {
          if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
            emit_insn (gen_rtx_SET (dest, imm));
          else
            {
              /* Do arithmetic on 32-bit values if the result is smaller
                 than that.  */
              if (partial_subreg_p (int_mode, SImode))
                {
                  /* It is invalid to do symbol calculations in modes
                     narrower than SImode.  */
                  gcc_assert (base == const0_rtx);
                  dest = gen_lowpart (SImode, dest);
                  int_mode = SImode;
                }
              if (base != const0_rtx)
                {
                  base = aarch64_force_temporary (int_mode, dest, base);
                  aarch64_add_offset (int_mode, dest, base, offset,
                                      NULL_RTX, NULL_RTX, false);
                }
              else
                aarch64_add_offset (int_mode, dest, base, offset,
                                    dest, NULL_RTX, false);
            }
          return;
        }

      sty = aarch64_classify_symbol (base, const_offset);
      switch (sty)
        {
        case SYMBOL_FORCE_TO_MEM:
          if (const_offset != 0
              && targetm.cannot_force_const_mem (int_mode, imm))
            {
              gcc_assert (can_create_pseudo_p ());
              base = aarch64_force_temporary (int_mode, dest, base);
              aarch64_add_offset (int_mode, dest, base, const_offset,
                                  NULL_RTX, NULL_RTX, false);
              return;
            }

          mem = force_const_mem (ptr_mode, imm);
          gcc_assert (mem);

          /* If we aren't generating PC relative literals, then
             we need to expand the literal pool access carefully.
             This is something that needs to be done in a number
             of places, so could well live as a separate function.  */
          if (!aarch64_pcrelative_literal_loads)
            {
              gcc_assert (can_create_pseudo_p ());
              base = gen_reg_rtx (ptr_mode);
              aarch64_expand_mov_immediate (base, XEXP (mem, 0));
              if (ptr_mode != Pmode)
                base = convert_memory_address (Pmode, base);
              mem = gen_rtx_MEM (ptr_mode, base);
            }

          if (int_mode != ptr_mode)
            mem = gen_rtx_ZERO_EXTEND (int_mode, mem);

          emit_insn (gen_rtx_SET (dest, mem));
          return;

        case SYMBOL_SMALL_TLSGD:
        case SYMBOL_SMALL_TLSDESC:
        case SYMBOL_SMALL_TLSIE:
        case SYMBOL_SMALL_GOT_28K:
        case SYMBOL_SMALL_GOT_4G:
        case SYMBOL_TINY_GOT:
        case SYMBOL_TINY_TLSIE:
          if (const_offset != 0)
            {
              gcc_assert(can_create_pseudo_p ());
              base = aarch64_force_temporary (int_mode, dest, base);
              aarch64_add_offset (int_mode, dest, base, const_offset,
                                  NULL_RTX, NULL_RTX, false);
              return;
            }
          /* FALLTHRU */

        case SYMBOL_SMALL_ABSOLUTE:
        case SYMBOL_TINY_ABSOLUTE:
        case SYMBOL_TLSLE12:
        case SYMBOL_TLSLE24:
        case SYMBOL_TLSLE32:
        case SYMBOL_TLSLE48:
          aarch64_load_symref_appropriately (dest, imm, sty);
          return;

        default:
          gcc_unreachable ();
        }
    }

  if (!CONST_INT_P (imm))
    {
      if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
        {
          /* Only the low bit of each .H, .S and .D element is defined,
             so we can set the upper bits to whatever we like.  If the
             predicate is all-true in MODE, prefer to set all the undefined
             bits as well, so that we can share a single .B predicate for
             all modes.  */
          if (imm == CONSTM1_RTX (mode))
            imm = CONSTM1_RTX (VNx16BImode);

          /* All methods for constructing predicate modes wider than VNx16BI
             will set the upper bits of each element to zero.  Expose this
             by moving such constants as a VNx16BI, so that all bits are
             significant and so that constants for different modes can be
             shared.  The wider constant will still be available as a
             REG_EQUAL note.  */
          rtx_vector_builder builder;
          if (aarch64_get_sve_pred_bits (builder, imm))
            {
              rtx res = aarch64_expand_sve_const_pred (dest, builder);
              if (dest != res)
                emit_move_insn (dest, gen_lowpart (mode, res));
              return;
            }
        }

      if (GET_CODE (imm) == HIGH
          || aarch64_simd_valid_immediate (imm, NULL))
        {
          emit_insn (gen_rtx_SET (dest, imm));
          return;
        }

      if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
        if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
          {
            if (dest != res)
              emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
            return;
          }

      rtx mem = force_const_mem (mode, imm);
      gcc_assert (mem);
      emit_move_insn (dest, mem);
      return;
    }

  aarch64_internal_mov_immediate (dest, imm, true,
                                  as_a <scalar_int_mode> (mode));
}
/* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
   that is known to contain PTRUE.  */

void
aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
{
  expand_operand ops[3];
  machine_mode mode = GET_MODE (dest);
  create_output_operand (&ops[0], dest, mode);
  create_input_operand (&ops[1], pred, GET_MODE (pred));
  create_input_operand (&ops[2], src, mode);
  temporary_volatile_ok v (true);
  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
}
/* Expand a pre-RA SVE data move from SRC to DEST in which at least one
   operand is in memory.  In this case we need to use the predicated LD1
   and ST1 instead of LDR and STR, both for correctness on big-endian
   targets and because LD1 and ST1 support a wider range of addressing modes.
   PRED_MODE is the mode of the predicate.

   See the comment at the head of aarch64-sve.md for details about the
   big-endian handling.  */

void
aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
{
  machine_mode mode = GET_MODE (dest);
  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  if (!register_operand (src, mode)
      && !register_operand (dest, mode))
    {
      rtx tmp = gen_reg_rtx (mode);
      if (MEM_P (src))
        aarch64_emit_sve_pred_move (tmp, ptrue, src);
      else
        emit_move_insn (tmp, src);
      src = tmp;
    }
  aarch64_emit_sve_pred_move (dest, ptrue, src);
}
/* Called only on big-endian targets.  See whether an SVE vector move
   from SRC to DEST is effectively a REV[BHW] instruction, because at
   least one operand is a subreg of an SVE vector that has wider or
   narrower elements.  Return true and emit the instruction if so.

   For example:

     (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))

   represents a VIEW_CONVERT between the following vectors, viewed
   in memory order:

     R2: { [0].high, [0].low, [1].high, [1].low, ... }
     R1: { [0], [1], [2], [3], ... }

   The high part of lane X in R2 should therefore correspond to lane X*2
   of R1, but the register representations are:

     R2: ...... [1].high [1].low [0].high [0].low
     R1: ...... [3] [2] [1] [0]

   where the low part of lane X in R2 corresponds to lane X*2 in R1.
   We therefore need a reverse operation to swap the high and low values
   around.

   This is purely an optimization.  Without it we would spill the
   subreg operand to the stack in one mode and reload it in the
   other mode, which has the same effect as the REV.  */

bool
aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
{
  gcc_assert (BYTES_BIG_ENDIAN);
  if (GET_CODE (dest) == SUBREG)
    dest = SUBREG_REG (dest);
  if (GET_CODE (src) == SUBREG)
    src = SUBREG_REG (src);

  /* The optimization handles two single SVE REGs with different element
     sizes.  */
  if (!REG_P (dest)
      || !REG_P (src)
      || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
      || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
      || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
          == GET_MODE_UNIT_SIZE (GET_MODE (src))))
    return false;

  /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
  rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
                               UNSPEC_REV_SUBREG);
  emit_insn (gen_rtx_SET (dest, unspec));
  return true;
}
/* Return a copy of X with mode MODE, without changing its other
   attributes.  Unlike gen_lowpart, this doesn't care whether the
   mode change is valid.  */

static rtx
aarch64_replace_reg_mode (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == mode)
    return x;

  x = shallow_copy_rtx (x);
  set_mode_and_regno (x, mode, REGNO (x));
  return x;
}
/* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
   stored in wider integer containers.  */

static unsigned int
aarch64_sve_rev_unspec (machine_mode mode)
{
  switch (GET_MODE_UNIT_SIZE (mode))
    {
    case 1: return UNSPEC_REVB;
    case 2: return UNSPEC_REVH;
    case 4: return UNSPEC_REVW;
    }
  gcc_unreachable ();
}
/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
   operands.  */

void
aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
{
  /* Decide which REV operation we need.  The mode with wider elements
     determines the mode of the operands and the mode with the narrower
     elements determines the reverse width.  */
  machine_mode mode_with_wider_elts = GET_MODE (dest);
  machine_mode mode_with_narrower_elts = GET_MODE (src);
  if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
      < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
    std::swap (mode_with_wider_elts, mode_with_narrower_elts);

  unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
  unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
  machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();

  /* Get the operands in the appropriate modes and emit the instruction.  */
  ptrue = gen_lowpart (pred_mode, ptrue);
  dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
  src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
  emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
                               dest, ptrue, src));
}
/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */

static bool
aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
                                 tree exp ATTRIBUTE_UNUSED)
{
  if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
    return false;

  return true;
}
/* Implement TARGET_PASS_BY_REFERENCE.  */

static bool
aarch64_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
{
  HOST_WIDE_INT size;
  machine_mode dummymode;
  int nregs;

  /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
  if (arg.mode == BLKmode && arg.type)
    size = int_size_in_bytes (arg.type);
  else
    /* No frontends can create types with variable-sized modes, so we
       shouldn't be asked to pass or return them.  */
    size = GET_MODE_SIZE (arg.mode).to_constant ();

  /* Aggregates are passed by reference based on their size.  */
  if (arg.aggregate_type_p ())
    size = int_size_in_bytes (arg.type);

  /* Variable sized arguments are always returned by reference.  */
  if (size < 0)
    return true;

  /* Can this be a candidate to be passed in fp/simd register(s)?  */
  if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
                                               &dummymode, &nregs, NULL))
    return false;

  /* Arguments which are variable sized or larger than 2 registers are
     passed by reference unless they are a homogenous floating point
     aggregate.  */
  return size > 2 * UNITS_PER_WORD;
}
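
/* A minimal illustration of the rule above (illustrative sketch, assuming
   the usual LP64 type sizes):

     struct big { long x[4]; };          -- 32 bytes, not an HFA
     struct hfa { double a, b, c, d; };  -- 32 bytes, an HFA

   "struct big" exceeds 2 * UNITS_PER_WORD and is passed by reference,
   while "struct hfa" is accepted by aarch64_vfp_is_call_or_return_candidate
   above and therefore remains a candidate for the SIMD/FP registers.  */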
/* Return TRUE if VALTYPE is padded to its least significant bits.  */

static bool
aarch64_return_in_msb (const_tree valtype)
{
  machine_mode dummy_mode;
  int dummy_int;

  /* Never happens in little-endian mode.  */
  if (!BYTES_BIG_ENDIAN)
    return false;

  /* Only composite types smaller than or equal to 16 bytes can
     be potentially returned in registers.  */
  if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
      || int_size_in_bytes (valtype) <= 0
      || int_size_in_bytes (valtype) > 16)
    return false;

  /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
     or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
     is always passed/returned in the least significant bits of fp/simd
     register(s).  */
  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
                                               &dummy_mode, &dummy_int, NULL))
    return false;

  return true;
}
/* Implement TARGET_FUNCTION_VALUE.
   Define how to find the value returned by a function.  */

static rtx
aarch64_function_value (const_tree type, const_tree func,
                        bool outgoing ATTRIBUTE_UNUSED)
{
  machine_mode mode;
  int unsignedp;
  int count;
  machine_mode ag_mode;

  mode = TYPE_MODE (type);
  if (INTEGRAL_TYPE_P (type))
    mode = promote_function_mode (type, mode, &unsignedp, func, 1);

  if (aarch64_return_in_msb (type))
    {
      HOST_WIDE_INT size = int_size_in_bytes (type);

      if (size % UNITS_PER_WORD != 0)
        {
          size += UNITS_PER_WORD - size % UNITS_PER_WORD;
          mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
        }
    }

  if (aarch64_vfp_is_call_or_return_candidate (mode, type,
                                               &ag_mode, &count, NULL))
    {
      if (!aarch64_composite_type_p (type, mode))
        {
          gcc_assert (count == 1 && mode == ag_mode);
          return gen_rtx_REG (mode, V0_REGNUM);
        }
      else
        {
          int i;
          rtx par;

          par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
          for (i = 0; i < count; i++)
            {
              rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
              rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
              tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
              XVECEXP (par, 0, i) = tmp;
            }
          return par;
        }
    }
  else
    return gen_rtx_REG (mode, R0_REGNUM);
}
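
/* Illustrative examples of the RTL returned above (assuming the usual
   AAPCS64 register assignment): a plain "int" comes back as
   (reg:SI R0_REGNUM), while an HFA such as "struct { float a, b; }" comes
   back as a PARALLEL holding (reg:SF V0_REGNUM) at byte offset 0 and
   (reg:SF (V0_REGNUM + 1)) at byte offset 4, matching the loop above.  */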
/* Implements TARGET_FUNCTION_VALUE_REGNO_P.
   Return true if REGNO is the number of a hard register in which the values
   of called function may come back.  */

static bool
aarch64_function_value_regno_p (const unsigned int regno)
{
  /* Maximum of 16 bytes can be returned in the general registers.  Examples
     of 16-byte return values are: 128-bit integers and 16-byte small
     structures (excluding homogeneous floating-point aggregates).  */
  if (regno == R0_REGNUM || regno == R1_REGNUM)
    return true;

  /* Up to four fp/simd registers can return a function value, e.g. a
     homogeneous floating-point aggregate having four members.  */
  if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
    return TARGET_FLOAT;

  return false;
}
/* Implement TARGET_RETURN_IN_MEMORY.

   If the type T of the result of a function is such that
     void func (T arg)
   would require that arg be passed as a value in a register (or set of
   registers) according to the parameter passing rules, then the result
   is returned in the same registers as would be used for such an
   argument.  */

static bool
aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
{
  HOST_WIDE_INT size;
  machine_mode ag_mode;
  int count;

  if (!AGGREGATE_TYPE_P (type)
      && TREE_CODE (type) != COMPLEX_TYPE
      && TREE_CODE (type) != VECTOR_TYPE)
    /* Simple scalar types always returned in registers.  */
    return false;

  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
                                               type,
                                               &ag_mode,
                                               &count,
                                               NULL))
    return false;

  /* Types larger than 2 registers returned in memory.  */
  size = int_size_in_bytes (type);
  return (size < 0 || size > 2 * UNITS_PER_WORD);
}
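
/* For example (illustrative, assuming LP64): a 24-byte aggregate such as
   "struct { void *a, *b, *c; }" gives size == 24 > 2 * UNITS_PER_WORD and
   is returned in memory, whereas the 16-byte "struct { void *a, *b; }" is
   returned in registers x0/x1.  */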
static bool
aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
                               const_tree type, int *nregs)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  return aarch64_vfp_is_call_or_return_candidate (mode,
                                                   type,
                                                   &pcum->aapcs_vfp_rmode,
                                                   nregs,
                                                   NULL);
}
/* Given MODE and TYPE of a function argument, return the alignment in
   bits.  The idea is to suppress any stronger alignment requested by
   the user and opt for the natural alignment (specified in AAPCS64 \S
   4.1).  ABI_BREAK is set to true if the alignment was incorrectly
   calculated in versions of GCC prior to GCC-9.  This is a helper
   function for local use only.  */

static unsigned int
aarch64_function_arg_alignment (machine_mode mode, const_tree type,
                                bool *abi_break)
{
  *abi_break = false;
  if (!type)
    return GET_MODE_ALIGNMENT (mode);

  if (integer_zerop (TYPE_SIZE (type)))
    return 0;

  gcc_assert (TYPE_MODE (type) == mode);

  if (!AGGREGATE_TYPE_P (type))
    return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));

  if (TREE_CODE (type) == ARRAY_TYPE)
    return TYPE_ALIGN (TREE_TYPE (type));

  unsigned int alignment = 0;
  unsigned int bitfield_alignment = 0;
  for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
    if (TREE_CODE (field) == FIELD_DECL)
      {
        alignment = std::max (alignment, DECL_ALIGN (field));
        if (DECL_BIT_FIELD_TYPE (field))
          bitfield_alignment
            = std::max (bitfield_alignment,
                        TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
      }

  if (bitfield_alignment > alignment)
    {
      *abi_break = true;
      return bitfield_alignment;
    }

  return alignment;
}
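
/* Worked example for the helper above (illustrative):

     struct s1 { int a; int b; };   -- every field is 32-bit aligned,
                                       result 32 bits.
     struct s2 { __int128 a; };     -- 128-bit field alignment, result
                                       128 bits, which later triggers
                                       rule C.8 in aarch64_layout_arg.

   When a bit-field's declared type is more strongly aligned than any
   field's DECL_ALIGN, the bit-field type's alignment is used instead and
   *ABI_BREAK records that releases prior to GCC 9.1 computed a smaller
   value for the same type.  */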
/* Layout a function argument according to the AAPCS64 rules.  The rule
   numbers refer to the rule numbers in the AAPCS64.  */

static void
aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
                    const_tree type,
                    bool named ATTRIBUTE_UNUSED)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  int ncrn, nvrn, nregs;
  bool allocate_ncrn, allocate_nvrn;
  HOST_WIDE_INT size;
  bool abi_break;

  /* We need to do this once per argument.  */
  if (pcum->aapcs_arg_processed)
    return;

  pcum->aapcs_arg_processed = true;

  /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
  if (type)
    size = int_size_in_bytes (type);
  else
    /* No frontends can create types with variable-sized modes, so we
       shouldn't be asked to pass or return them.  */
    size = GET_MODE_SIZE (mode).to_constant ();
  size = ROUND_UP (size, UNITS_PER_WORD);

  allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
  allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
                                                 mode,
                                                 type,
                                                 &nregs);

  /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
     The following code thus handles passing by SIMD/FP registers first.  */

  nvrn = pcum->aapcs_nvrn;

  /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
     and homogenous short-vector aggregates (HVA).  */
  if (allocate_nvrn)
    {
      if (!TARGET_FLOAT)
        aarch64_err_no_fpadvsimd (mode);

      if (nvrn + nregs <= NUM_FP_ARG_REGS)
        {
          pcum->aapcs_nextnvrn = nvrn + nregs;
          if (!aarch64_composite_type_p (type, mode))
            {
              gcc_assert (nregs == 1);
              pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
            }
          else
            {
              rtx par;
              int i;

              par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
              for (i = 0; i < nregs; i++)
                {
                  rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
                                         V0_REGNUM + nvrn + i);
                  rtx offset = gen_int_mode
                    (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
                  tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
                  XVECEXP (par, 0, i) = tmp;
                }
              pcum->aapcs_reg = par;
            }
          return;
        }
      else
        {
          /* C.3 NSRN is set to 8.  */
          pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
          goto on_stack;
        }
    }

  ncrn = pcum->aapcs_ncrn;
  nregs = size / UNITS_PER_WORD;

  /* C6 - C9.  though the sign and zero extension semantics are
     handled elsewhere.  This is the case where the argument fits
     entirely general registers.  */
  if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
    {
      gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);

      /* C.8 if the argument has an alignment of 16 then the NGRN is
         rounded up to the next even number.  */
      if (nregs == 2
          && ncrn % 2
          /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
             comparison is there because for > 16 * BITS_PER_UNIT
             alignment nregs should be > 2 and therefore it should be
             passed by reference rather than value.  */
          && (aarch64_function_arg_alignment (mode, type, &abi_break)
              == 16 * BITS_PER_UNIT))
        {
          if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
            inform (input_location, "parameter passing for argument of type "
                    "%qT changed in GCC 9.1", type);
          ++ncrn;
          gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
        }

      /* NREGS can be 0 when e.g. an empty structure is to be passed.
         A reg is still generated for it, but the caller should be smart
         enough not to use it.  */
      if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
        pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
      else
        {
          rtx par;
          int i;

          par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
          for (i = 0; i < nregs; i++)
            {
              rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
              tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
                                       GEN_INT (i * UNITS_PER_WORD));
              XVECEXP (par, 0, i) = tmp;
            }
          pcum->aapcs_reg = par;
        }

      pcum->aapcs_nextncrn = ncrn + nregs;
      return;
    }

  pcum->aapcs_nextncrn = NUM_ARG_REGS;

  /* The argument is passed on stack; record the needed number of words for
     this argument and align the total size if necessary.  */
on_stack:
  pcum->aapcs_stack_words = size / UNITS_PER_WORD;

  if (aarch64_function_arg_alignment (mode, type, &abi_break)
      == 16 * BITS_PER_UNIT)
    {
      int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
      if (pcum->aapcs_stack_size != new_size)
        {
          if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
            inform (input_location, "parameter passing for argument of type "
                    "%qT changed in GCC 9.1", type);
          pcum->aapcs_stack_size = new_size;
        }
    }
}
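
/* Illustrative application of rule C.8 above: for

     void f (int a, __int128 b);

   "a" is allocated to w0 (NGRN becomes 1); "b" has 16-byte alignment, so
   NGRN is rounded up to the next even number and "b" is passed in the
   register pair x2/x3, leaving x1 unused.  */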
/* Implement TARGET_FUNCTION_ARG.  */

static rtx
aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);

  if (arg.end_marker_p ())
    return NULL_RTX;

  aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
  return pcum->aapcs_reg;
}
void
aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
                              const_tree fntype ATTRIBUTE_UNUSED,
                              rtx libname ATTRIBUTE_UNUSED,
                              const_tree fndecl ATTRIBUTE_UNUSED,
                              unsigned n_named ATTRIBUTE_UNUSED)
{
  pcum->aapcs_ncrn = 0;
  pcum->aapcs_nvrn = 0;
  pcum->aapcs_nextncrn = 0;
  pcum->aapcs_nextnvrn = 0;
  pcum->pcs_variant = ARM_PCS_AAPCS64;
  pcum->aapcs_reg = NULL_RTX;
  pcum->aapcs_arg_processed = false;
  pcum->aapcs_stack_words = 0;
  pcum->aapcs_stack_size = 0;

  if (!TARGET_FLOAT
      && fndecl && TREE_PUBLIC (fndecl)
      && fntype && fntype != error_mark_node)
    {
      const_tree type = TREE_TYPE (fntype);
      machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
      int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
      if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
                                                   &mode, &nregs, NULL))
        aarch64_err_no_fpadvsimd (TYPE_MODE (type));
    }
}
/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
aarch64_function_arg_advance (cumulative_args_t pcum_v,
                              const function_arg_info &arg)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  if (pcum->pcs_variant == ARM_PCS_AAPCS64)
    {
      aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
      gcc_assert ((pcum->aapcs_reg != NULL_RTX)
                  != (pcum->aapcs_stack_words != 0));
      pcum->aapcs_arg_processed = false;
      pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
      pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
      pcum->aapcs_stack_size += pcum->aapcs_stack_words;
      pcum->aapcs_stack_words = 0;
      pcum->aapcs_reg = NULL_RTX;
    }
}
bool
aarch64_function_arg_regno_p (unsigned regno)
{
  return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
          || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
}
/* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
   PARM_BOUNDARY bits of alignment, but will be given anything up
   to STACK_BOUNDARY bits if the type requires it.  This makes sure
   that both before and after the layout of each argument, the Next
   Stacked Argument Address (NSAA) will have a minimum alignment of
   8 bytes.  */

static unsigned int
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
  bool abi_break;
  unsigned int alignment = aarch64_function_arg_alignment (mode, type,
                                                            &abi_break);
  if (abi_break && warn_psabi)
    inform (input_location, "parameter passing for argument of type "
            "%qT changed in GCC 9.1", type);

  return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
}
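
/* Worked example (illustrative, using this port's PARM_BOUNDARY of 64 and
   STACK_BOUNDARY of 128): a "char" argument reports a boundary of 64 bits,
   an "__int128" reports 128 bits, and a type over-aligned to 256 bits is
   clamped back down to 128 bits by the MIN/MAX above.  */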
/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */

static fixed_size_mode
aarch64_get_reg_raw_mode (int regno)
{
  if (TARGET_SVE && FP_REGNUM_P (regno))
    /* Don't use the SVE part of the register for __builtin_apply and
       __builtin_return.  The SVE registers aren't used by the normal PCS,
       so using them there would be a waste of time.  The PCS extensions
       for SVE types are fundamentally incompatible with the
       __builtin_return/__builtin_apply interface.  */
    return as_a <fixed_size_mode> (V16QImode);
  return default_get_reg_raw_mode (regno);
}
/* Implement TARGET_FUNCTION_ARG_PADDING.

   Small aggregate types are placed in the lowest memory address.

   The related parameter passing rules are B.4, C.3, C.5 and C.14.  */

static pad_direction
aarch64_function_arg_padding (machine_mode mode, const_tree type)
{
  /* On little-endian targets, the least significant byte of every stack
     argument is passed at the lowest byte address of the stack slot.  */
  if (!BYTES_BIG_ENDIAN)
    return PAD_UPWARD;

  /* Otherwise, integral, floating-point and pointer types are padded downward:
     the least significant byte of a stack argument is passed at the highest
     byte address of the stack slot.  */
  if (type
      ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
         || POINTER_TYPE_P (type))
      : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
    return PAD_DOWNWARD;

  /* Everything else padded upward, i.e. data in first byte of stack slot.  */
  return PAD_UPWARD;
}
/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).

   It specifies padding for the last (may also be the only)
   element of a block move between registers and memory.  If
   assuming the block is in the memory, padding upward means that
   the last element is padded after its highest significant byte,
   while in downward padding, the last element is padded at its
   least significant byte side.

   Small aggregates and small complex types are always padded
   upwards.

   We don't need to worry about homogeneous floating-point or
   short-vector aggregates; their move is not affected by the
   padding direction determined here.  Regardless of endianness,
   each element of such an aggregate is put in the least
   significant bits of a fp/simd register.

   Return !BYTES_BIG_ENDIAN if the least significant byte of the
   register has useful data, and return the opposite if the most
   significant byte does.  */

bool
aarch64_pad_reg_upward (machine_mode mode, const_tree type,
                        bool first ATTRIBUTE_UNUSED)
{
  /* Small composite types are always padded upward.  */
  if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
    {
      HOST_WIDE_INT size;
      if (type)
        size = int_size_in_bytes (type);
      else
        /* No frontends can create types with variable-sized modes, so we
           shouldn't be asked to pass or return them.  */
        size = GET_MODE_SIZE (mode).to_constant ();
      if (size < 2 * UNITS_PER_WORD)
        return true;
    }

  /* Otherwise, use the default padding.  */
  return !BYTES_BIG_ENDIAN;
}
static scalar_int_mode
aarch64_libgcc_cmp_return_mode (void)
{
  return SImode;
}

#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)

/* We use the 12-bit shifted immediate arithmetic instructions so values
   must be multiple of (1 << 12), i.e. 4096.  */
#define ARITH_FACTOR 4096

#if (PROBE_INTERVAL % ARITH_FACTOR) != 0
#error Cannot use simple address calculation for stack probing
#endif

/* The pair of scratch registers used for stack probing.  */
#define PROBE_STACK_FIRST_REG  R9_REGNUM
#define PROBE_STACK_SECOND_REG R10_REGNUM
/* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
   inclusive.  These are offsets from the current stack pointer.  */

static void
aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
{
  HOST_WIDE_INT size;
  if (!poly_size.is_constant (&size))
    {
      sorry ("stack probes for SVE frames");
      return;
    }

  rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);

  /* See the same assertion on PROBE_INTERVAL above.  */
  gcc_assert ((first % ARITH_FACTOR) == 0);

  /* See if we have a constant small number of probes to generate.  If so,
     that's the easy case.  */
  if (size <= PROBE_INTERVAL)
    {
      const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);

      emit_set_insn (reg1,
                     plus_constant (Pmode,
                                    stack_pointer_rtx, -(first + base)));
      emit_stack_probe (plus_constant (Pmode, reg1, base - size));
    }

  /* The run-time loop is made up of 8 insns in the generic case while the
     compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
  else if (size <= 4 * PROBE_INTERVAL)
    {
      HOST_WIDE_INT i, rem;

      emit_set_insn (reg1,
                     plus_constant (Pmode,
                                    stack_pointer_rtx,
                                    -(first + PROBE_INTERVAL)));
      emit_stack_probe (reg1);

      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
         it exceeds SIZE.  If only two probes are needed, this will not
         generate any code.  Then probe at FIRST + SIZE.  */
      for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
        {
          emit_set_insn (reg1,
                         plus_constant (Pmode, reg1, -PROBE_INTERVAL));
          emit_stack_probe (reg1);
        }

      rem = size - (i - PROBE_INTERVAL);
      if (rem > 256)
        {
          const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);

          emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
          emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
        }
      else
        emit_stack_probe (plus_constant (Pmode, reg1, -rem));
    }

  /* Otherwise, do the same as above, but in a loop.  Note that we must be
     extra careful with variables wrapping around because we might be at
     the very top (or the very bottom) of the address space and we have
     to be able to handle this case properly; in particular, we use an
     equality test for the loop condition.  */
  else
    {
      rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);

      /* Step 1: round SIZE to the previous multiple of the interval.  */

      HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;

      /* Step 2: compute initial and final value of the loop counter.  */

      /* TEST_ADDR = SP + FIRST.  */
      emit_set_insn (reg1,
                     plus_constant (Pmode, stack_pointer_rtx, -first));

      /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
      HOST_WIDE_INT adjustment = - (first + rounded_size);
      if (! aarch64_uimm12_shift (adjustment))
        {
          aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
                                          true, Pmode);
          emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
        }
      else
        emit_set_insn (reg2,
                       plus_constant (Pmode, stack_pointer_rtx, adjustment));

      /* Step 3: the loop

         do
           {
             TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
             probe at TEST_ADDR
           }
         while (TEST_ADDR != LAST_ADDR)

         probes at FIRST + N * PROBE_INTERVAL for values of N from 1
         until it is equal to ROUNDED_SIZE.  */

      emit_insn (gen_probe_stack_range (reg1, reg1, reg2));

      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
         that SIZE is equal to ROUNDED_SIZE.  */

      if (size != rounded_size)
        {
          HOST_WIDE_INT rem = size - rounded_size;

          if (rem > 256)
            {
              const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);

              emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
              emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
            }
          else
            emit_stack_probe (plus_constant (Pmode, reg2, -rem));
        }
    }

  /* Make sure nothing is scheduled before we are done.  */
  emit_insn (gen_blockage ());
}
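
/* Illustrative trace of the code above (with the default PROBE_INTERVAL of
   4096): aarch64_emit_probe_stack_range (0, 12288) takes the second branch
   and emits probes at offsets 4096, 8192 and finally 12288 below the
   incoming stack pointer, matching the "probe at FIRST + N * PROBE_INTERVAL
   ... then probe at FIRST + SIZE" description in the comments.  */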
/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
   absolute addresses.  */

const char *
aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
{
  static int labelno = 0;
  char loop_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);

  /* Loop.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);

  HOST_WIDE_INT stack_clash_probe_interval
    = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);

  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
  xops[0] = reg1;
  HOST_WIDE_INT interval;
  if (flag_stack_clash_protection)
    interval = stack_clash_probe_interval;
  else
    interval = PROBE_INTERVAL;

  gcc_assert (aarch64_uimm12_shift (interval));
  xops[1] = GEN_INT (interval);

  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* If doing stack clash protection then we probe up by the ABI specified
     amount.  We do this because we're dropping full pages at a time in the
     loop.  But if we're doing non-stack clash probing, probe at SP 0.  */
  if (flag_stack_clash_protection)
    xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
  else
    xops[1] = CONST0_RTX (GET_MODE (xops[1]));

  /* Probe at TEST_ADDR.  If we're inside the loop it is always safe to probe
     by this amount for each iteration.  */
  output_asm_insn ("str\txzr, [%0, %1]", xops);

  /* Test if TEST_ADDR == LAST_ADDR.  */
  xops[1] = reg2;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch.  */
  fputs ("\tb.ne\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_lab);
  fputc ('\n', asm_out_file);

  return "";
}
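
/* The emitted loop has roughly this shape (illustrative, for the default
   4096-byte interval and without -fstack-clash-protection; x9/x10 stand for
   PROBE_STACK_FIRST_REG and PROBE_STACK_SECOND_REG):

     .LPSRL0:
             sub     x9, x9, 4096
             str     xzr, [x9, 0]
             cmp     x9, x10
             b.ne    .LPSRL0
*/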
/* Emit the probe loop for doing stack clash probes and stack adjustments for
   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
   of GUARD_SIZE.  When a probe is emitted it is done at most
   MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
   at most MIN_PROBE_THRESHOLD.  By the end of this function
   BASE = BASE - ADJUSTMENT.  */

const char *
aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
                                      rtx min_probe_threshold, rtx guard_size)
{
  /* This function is not allowed to use any instruction generation function
     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
     so instead emit the code you want using output_asm_insn.  */
  gcc_assert (flag_stack_clash_protection);
  gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
  gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));

  /* The minimum required allocation before the residual requires probing.  */
  HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);

  /* Clamp the value down to the nearest value that can be used with a cmp.  */
  residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
  rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);

  gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
  gcc_assert (aarch64_uimm12_shift (residual_probe_guard));

  static int labelno = 0;
  char loop_start_lab[32];
  char loop_end_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);

  /* Emit loop start label.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);

  /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
  xops[0] = adjustment;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch to end if not enough adjustment to probe.  */
  fputs ("\tb.lt\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_end_lab);
  fputc ('\n', asm_out_file);

  /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
  xops[0] = base;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Probe at BASE.  */
  xops[1] = const0_rtx;
  output_asm_insn ("str\txzr, [%0, %1]", xops);

  /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
  xops[0] = adjustment;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Branch to start if still more bytes to allocate.  */
  fputs ("\tb\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_start_lab);
  fputc ('\n', asm_out_file);

  /* No probe leave.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);

  /* BASE = BASE - ADJUSTMENT.  */
  xops[0] = base;
  xops[1] = adjustment;
  output_asm_insn ("sub\t%0, %0, %1", xops);
  return "";
}
/* Determine whether a frame chain needs to be generated.  */
static bool
aarch64_needs_frame_chain (void)
{
  /* Force a frame chain for EH returns so the return address is at FP+8.  */
  if (frame_pointer_needed || crtl->calls_eh_return)
    return true;

  /* A leaf function cannot have calls or write LR.  */
  bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);

  /* Don't use a frame chain in leaf functions if leaf frame pointers
     are disabled.  */
  if (flag_omit_leaf_frame_pointer && is_leaf)
    return false;

  return aarch64_use_frame_pointer;
}
/* Mark the registers that need to be saved by the callee and calculate
   the size of the callee-saved registers area and frame record (both FP
   and LR may be omitted).  */
static void
aarch64_layout_frame (void)
{
  HOST_WIDE_INT offset = 0;
  int regno, last_fp_reg = INVALID_REGNUM;
  bool simd_function = aarch64_simd_decl_p (cfun->decl);

  cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();

  /* Adjust the outgoing arguments size if required.  Keep it in sync with what
     the mid-end is doing.  */
  crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);

#define SLOT_NOT_REQUIRED (-2)
#define SLOT_REQUIRED     (-1)

  cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
  cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;

  /* If this is a non-leaf simd function with calls we assume that
     at least one of those calls is to a non-simd function and thus
     we must save V8 to V23 in the prologue.  */

  if (simd_function && !crtl->is_leaf)
    for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
      if (FP_SIMD_SAVED_REGNUM_P (regno))
        df_set_regs_ever_live (regno, true);

  /* First mark all the registers that really need to be saved...  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;

  /* ... that includes the eh data registers (if needed)...  */
  if (crtl->calls_eh_return)
    for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
      cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
        = SLOT_REQUIRED;

  /* ... and any callee saved register that dataflow says is live.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
        && (regno == R30_REGNUM
            || !call_used_or_fixed_reg_p (regno)))
      cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
        && (!call_used_or_fixed_reg_p (regno)
            || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
      {
        cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
        last_fp_reg = regno;
      }

  if (cfun->machine->frame.emit_frame_chain)
    {
      /* FP and LR are placed in the linkage record.  */
      cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
      cfun->machine->frame.wb_candidate1 = R29_REGNUM;
      cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
      cfun->machine->frame.wb_candidate2 = R30_REGNUM;
      offset = 2 * UNITS_PER_WORD;
    }

  /* With stack-clash, LR must be saved in non-leaf functions.  */
  gcc_assert (crtl->is_leaf
              || (cfun->machine->frame.reg_offset[R30_REGNUM]
                  != SLOT_NOT_REQUIRED));

  /* Now assign stack slots for them.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
      {
        cfun->machine->frame.reg_offset[regno] = offset;
        if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
          cfun->machine->frame.wb_candidate1 = regno;
        else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
          cfun->machine->frame.wb_candidate2 = regno;
        offset += UNITS_PER_WORD;
      }

  HOST_WIDE_INT max_int_offset = offset;
  offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
  bool has_align_gap = offset != max_int_offset;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
      {
        /* If there is an alignment gap between integer and fp callee-saves,
           allocate the last fp register to it if possible.  */
        if (regno == last_fp_reg
            && has_align_gap
            && !simd_function
            && (offset & 8) == 0)
          {
            cfun->machine->frame.reg_offset[regno] = max_int_offset;
            break;
          }

        cfun->machine->frame.reg_offset[regno] = offset;
        if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
          cfun->machine->frame.wb_candidate1 = regno;
        else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
                 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
          cfun->machine->frame.wb_candidate2 = regno;
        offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
      }

  offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);

  cfun->machine->frame.saved_regs_size = offset;

  HOST_WIDE_INT varargs_and_saved_regs_size
    = offset + cfun->machine->frame.saved_varargs_size;

  cfun->machine->frame.hard_fp_offset
    = aligned_upper_bound (varargs_and_saved_regs_size
                           + get_frame_size (),
                           STACK_BOUNDARY / BITS_PER_UNIT);

  /* Both these values are already aligned.  */
  gcc_assert (multiple_p (crtl->outgoing_args_size,
                          STACK_BOUNDARY / BITS_PER_UNIT));
  cfun->machine->frame.frame_size
    = (cfun->machine->frame.hard_fp_offset
       + crtl->outgoing_args_size);

  cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;

  cfun->machine->frame.initial_adjust = 0;
  cfun->machine->frame.final_adjust = 0;
  cfun->machine->frame.callee_adjust = 0;
  cfun->machine->frame.callee_offset = 0;

  HOST_WIDE_INT max_push_offset = 0;
  if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
    max_push_offset = 512;
  else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
    max_push_offset = 256;

  HOST_WIDE_INT const_size, const_fp_offset;
  if (cfun->machine->frame.frame_size.is_constant (&const_size)
      && const_size < max_push_offset
      && known_eq (crtl->outgoing_args_size, 0))
    {
      /* Simple, small frame with no outgoing arguments:
         stp reg1, reg2, [sp, -frame_size]!
         stp reg3, reg4, [sp, 16]  */
      cfun->machine->frame.callee_adjust = const_size;
    }
  else if (known_lt (crtl->outgoing_args_size
                     + cfun->machine->frame.saved_regs_size, 512)
           && !(cfun->calls_alloca
                && known_lt (cfun->machine->frame.hard_fp_offset,
                             max_push_offset)))
    {
      /* Frame with small outgoing arguments:
         sub sp, sp, frame_size
         stp reg1, reg2, [sp, outgoing_args_size]
         stp reg3, reg4, [sp, outgoing_args_size + 16]  */
      cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
      cfun->machine->frame.callee_offset
        = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
    }
  else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
           && const_fp_offset < max_push_offset)
    {
      /* Frame with large outgoing arguments but a small local area:
         stp reg1, reg2, [sp, -hard_fp_offset]!
         stp reg3, reg4, [sp, 16]
         sub sp, sp, outgoing_args_size  */
      cfun->machine->frame.callee_adjust = const_fp_offset;
      cfun->machine->frame.final_adjust
        = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
    }
  else
    {
      /* Frame with large local area and outgoing arguments using frame pointer:
         sub sp, sp, hard_fp_offset
         stp x29, x30, [sp, 0]
         add x29, sp, 0
         stp reg3, reg4, [sp, 16]
         sub sp, sp, outgoing_args_size  */
      cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
      cfun->machine->frame.final_adjust
        = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
    }

  cfun->machine->frame.laid_out = true;
}
/* Return true if the register REGNO is saved on entry to
   the current function.  */

static bool
aarch64_register_saved_on_entry (int regno)
{
  return cfun->machine->frame.reg_offset[regno] >= 0;
}
/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  */

static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}
/* Push the register number REGNO of mode MODE to the stack with write-back
   adjusting the stack by ADJUSTMENT.  */

static void
aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
                           HOST_WIDE_INT adjustment)
{
  rtx base_rtx = stack_pointer_rtx;
  rtx insn, reg, mem;

  reg = gen_rtx_REG (mode, regno);
  mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
                            plus_constant (Pmode, base_rtx, -adjustment));
  mem = gen_frame_mem (mode, mem);

  insn = emit_move_insn (mem, reg);
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Generate and return an instruction to store the pair of registers
   REG and REG2 of mode MODE to location BASE with write-back adjusting
   the stack location BASE by ADJUSTMENT.  */

static rtx
aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
                          HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case E_DImode:
      return gen_storewb_pairdi_di (base, base, reg, reg2,
                                    GEN_INT (-adjustment),
                                    GEN_INT (UNITS_PER_WORD - adjustment));
    case E_DFmode:
      return gen_storewb_pairdf_di (base, base, reg, reg2,
                                    GEN_INT (-adjustment),
                                    GEN_INT (UNITS_PER_WORD - adjustment));
    case E_TFmode:
      return gen_storewb_pairtf_di (base, base, reg, reg2,
                                    GEN_INT (-adjustment),
                                    GEN_INT (UNITS_PER_VREG - adjustment));
    default:
      gcc_unreachable ();
    }
}
/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
   stack pointer by ADJUSTMENT.  */

static void
aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
{
  rtx_insn *insn;
  machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);

  if (regno2 == INVALID_REGNUM)
    return aarch64_pushwb_single_reg (mode, regno1, adjustment);

  rtx reg1 = gen_rtx_REG (mode, regno1);
  rtx reg2 = gen_rtx_REG (mode, regno2);

  insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
                                              reg2, adjustment));
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
   adjusting it by ADJUSTMENT afterwards.  */

static rtx
aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
                         HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case E_DImode:
      return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
                                   GEN_INT (UNITS_PER_WORD));
    case E_DFmode:
      return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
                                   GEN_INT (UNITS_PER_WORD));
    case E_TFmode:
      return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
                                   GEN_INT (UNITS_PER_VREG));
    default:
      gcc_unreachable ();
    }
}
/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
   afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
   into CFI_OPS.  */

static void
aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
                  rtx *cfi_ops)
{
  machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
  rtx reg1 = gen_rtx_REG (mode, regno1);

  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);

  if (regno2 == INVALID_REGNUM)
    {
      rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
      mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
      emit_move_insn (reg1, gen_frame_mem (mode, mem));
    }
  else
    {
      rtx reg2 = gen_rtx_REG (mode, regno2);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
      emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
                                          reg2, adjustment));
    }
}
/* Generate and return a store pair instruction of mode MODE to store
   register REG1 to MEM1 and register REG2 to MEM2.  */

static rtx
aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
                        rtx reg2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);

    case E_DFmode:
      return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);

    case E_TFmode:
      return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);

    default:
      gcc_unreachable ();
    }
}
/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */

static rtx
aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
                       rtx mem2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);

    case E_DFmode:
      return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);

    case E_TFmode:
      return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);

    default:
      gcc_unreachable ();
    }
}
/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */

bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after frame laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
     if its LR is pushed onto stack.  */
  return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
          || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
              && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
}

/* Return TRUE if Branch Target Identification Mechanism is enabled.  */
bool
aarch64_bti_enabled (void)
{
  return (aarch64_enable_bti == 1);
}
/* Emit code to save the callee-saved registers from register number START
   to LIMIT to the stack at the location starting at offset START_OFFSET,
   skipping any write-back candidates if SKIP_WB is true.  */

static void
aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
                           unsigned start, unsigned limit, bool skip_wb)
{
  rtx_insn *insn;
  unsigned regno;
  unsigned regno2;
  HOST_WIDE_INT offset_diff;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      rtx reg, mem;
      poly_int64 offset;

      if (skip_wb
          && (regno == cfun->machine->frame.wb_candidate1
              || regno == cfun->machine->frame.wb_candidate2))
        continue;

      if (cfun->machine->reg_is_wrapped_separately[regno])
        continue;

      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
                                                offset));

      regno2 = aarch64_next_callee_save (regno + 1, limit);
      offset_diff = cfun->machine->frame.reg_offset[regno2]
                    - cfun->machine->frame.reg_offset[regno];

      if (regno2 <= limit
          && !cfun->machine->reg_is_wrapped_separately[regno2]
          && known_eq (GET_MODE_SIZE (mode), offset_diff))
        {
          rtx reg2 = gen_rtx_REG (mode, regno2);
          rtx mem2;

          offset = start_offset + cfun->machine->frame.reg_offset[regno2];
          mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
                                                     offset));
          insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
                                                    reg2));

          /* The first part of a frame-related parallel insn is
             always assumed to be relevant to the frame
             calculations; subsequent parts, are only
             frame-related if explicitly marked.  */
          RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
          regno = regno2;
        }
      else
        insn = emit_move_insn (mem, reg);

      RTX_FRAME_RELATED_P (insn) = 1;
    }
}
/* Emit code to restore the callee registers of mode MODE from register
   number START up to and including LIMIT.  Restore from the stack offset
   START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
   Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */

static void
aarch64_restore_callee_saves (machine_mode mode,
                              poly_int64 start_offset, unsigned start,
                              unsigned limit, bool skip_wb, rtx *cfi_ops)
{
  rtx base_rtx = stack_pointer_rtx;
  unsigned regno;
  unsigned regno2;
  poly_int64 offset;
  HOST_WIDE_INT offset_diff;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      if (cfun->machine->reg_is_wrapped_separately[regno])
        continue;

      rtx reg, mem;

      if (skip_wb
          && (regno == cfun->machine->frame.wb_candidate1
              || regno == cfun->machine->frame.wb_candidate2))
        continue;

      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));

      regno2 = aarch64_next_callee_save (regno + 1, limit);
      offset_diff = cfun->machine->frame.reg_offset[regno2]
                    - cfun->machine->frame.reg_offset[regno];

      if (regno2 <= limit
          && !cfun->machine->reg_is_wrapped_separately[regno2]
          && known_eq (GET_MODE_SIZE (mode), offset_diff))
        {
          rtx reg2 = gen_rtx_REG (mode, regno2);
          rtx mem2;

          offset = start_offset + cfun->machine->frame.reg_offset[regno2];
          mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
          emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

          *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
          regno = regno2;
        }
      else
        emit_move_insn (reg, mem);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
    }
}
/* Return true if OFFSET is a signed 4-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, -8, 7));
}

/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, 0, 63));
}

/* Return true if OFFSET is a signed 7-bit value multiplied by the size
   of MODE.  */

bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, -64, 63));
}

/* Return true if OFFSET is a signed 9-bit value.  */

bool
aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
                                       poly_int64 offset)
{
  HOST_WIDE_INT const_offset;
  return (offset.is_constant (&const_offset)
          && IN_RANGE (const_offset, -256, 255));
}

/* Return true if OFFSET is a signed 9-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, -256, 255));
}

/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, 0, 4095));
}
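
/* Worked example (illustrative): offset_12bit_unsigned_scaled_p (DImode,
   32760) is true because 32760 == 4095 * 8, while 32768 == 4096 * 8 is out
   of range.  aarch64_offset_9bit_signed_unscaled_p ignores the mode and
   simply accepts byte offsets in [-256, 255].  */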
/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */

static sbitmap
aarch64_get_separate_components (void)
{
  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* The registers we need saved to the frame.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (aarch64_register_saved_on_entry (regno))
      {
        poly_int64 offset = cfun->machine->frame.reg_offset[regno];
        if (!frame_pointer_needed)
          offset += cfun->machine->frame.frame_size
                    - cfun->machine->frame.hard_fp_offset;
        /* Check that we can access the stack slot of the register with one
           direct load with no adjustments needed.  */
        if (offset_12bit_unsigned_scaled_p (DImode, offset))
          bitmap_set_bit (components, regno);
      }

  /* Don't mess with the hard frame pointer.  */
  if (frame_pointer_needed)
    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);

  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  /* If registers have been chosen to be stored/restored with
     writeback don't interfere with them to avoid having to output explicit
     stack adjustment instructions.  */
  if (reg2 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg2);
  if (reg1 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg1);

  bitmap_clear_bit (components, LR_REGNUM);
  bitmap_clear_bit (components, SP_REGNUM);

  return components;
}
/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */

static sbitmap
aarch64_components_for_bb (basic_block bb)
{
  bitmap in = DF_LIVE_IN (bb);
  bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
  bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
  bool simd_function = aarch64_simd_decl_p (cfun->decl);

  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if ((!call_used_or_fixed_reg_p (regno)
	 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
	&& (bitmap_bit_p (in, regno)
	    || bitmap_bit_p (gen, regno)
	    || bitmap_bit_p (kill, regno)))
      {
	unsigned regno2, offset, offset2;
	bitmap_set_bit (components, regno);

	/* If there is a callee-save at an adjacent offset, add it too
	   to increase the use of LDP/STP.  */
	offset = cfun->machine->frame.reg_offset[regno];
	regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;

	if (regno2 <= LAST_SAVED_REGNUM)
	  {
	    offset2 = cfun->machine->frame.reg_offset[regno2];
	    if ((offset & ~8) == (offset2 & ~8))
	      bitmap_set_bit (components, regno2);
	  }
      }

  return components;
}
/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
   Nothing to do for aarch64.  */

static void
aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
{
}
/* Return the next set bit in BMP from START onwards.  Return the total number
   of bits in BMP if no set bit is found at or after START.  */

static unsigned int
aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
{
  unsigned int nbits = SBITMAP_SIZE (bmp);
  if (start == nbits)
    return start;

  gcc_assert (start < nbits);
  for (unsigned int i = start; i < nbits; i++)
    if (bitmap_bit_p (bmp, i))
      return i;

  return nbits;
}
/* Do the work for aarch64_emit_prologue_components and
   aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
   to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
   for these components or the epilogue sequence.  That is, it determines
   whether we should emit stores or loads and what kind of CFA notes to attach
   to the insns.  Otherwise the logic for the two sequences is very
   similar.  */

static void
aarch64_process_components (sbitmap components, bool prologue_p)
{
  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
			     ? HARD_FRAME_POINTER_REGNUM
			     : STACK_POINTER_REGNUM);

  unsigned last_regno = SBITMAP_SIZE (components);
  unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
  rtx_insn *insn = NULL;

  while (regno != last_regno)
    {
      /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
	 so DFmode for the vector registers is enough.  For simd functions
	 we want to save the low 128 bits.  */
      machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);

      rtx reg = gen_rtx_REG (mode, regno);
      poly_int64 offset = cfun->machine->frame.reg_offset[regno];
      if (!frame_pointer_needed)
	offset += cfun->machine->frame.frame_size
		  - cfun->machine->frame.hard_fp_offset;
      rtx addr = plus_constant (Pmode, ptr_reg, offset);
      rtx mem = gen_frame_mem (mode, addr);

      rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
      unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
      /* No more registers to handle after REGNO.
	 Emit a single save/restore and exit.  */
      if (regno2 == last_regno)
	{
	  insn = emit_insn (set);
	  RTX_FRAME_RELATED_P (insn) = 1;
	  if (prologue_p)
	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	  else
	    add_reg_note (insn, REG_CFA_RESTORE, reg);
	  break;
	}

      poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
      /* The next register is not of the same class or its offset is not
	 mergeable with the current one into a pair.  */
      if (!satisfies_constraint_Ump (mem)
	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
	  || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
		       GET_MODE_SIZE (mode)))
	{
	  insn = emit_insn (set);
	  RTX_FRAME_RELATED_P (insn) = 1;
	  if (prologue_p)
	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	  else
	    add_reg_note (insn, REG_CFA_RESTORE, reg);

	  regno = regno2;
	  continue;
	}

      /* REGNO2 can be saved/restored in a pair with REGNO.  */
      rtx reg2 = gen_rtx_REG (mode, regno2);
      if (!frame_pointer_needed)
	offset2 += cfun->machine->frame.frame_size
		   - cfun->machine->frame.hard_fp_offset;
      rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
      rtx mem2 = gen_frame_mem (mode, addr2);
      rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
			    : gen_rtx_SET (reg2, mem2);

      if (prologue_p)
	insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
      else
	insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

      RTX_FRAME_RELATED_P (insn) = 1;
      if (prologue_p)
	{
	  add_reg_note (insn, REG_CFA_OFFSET, set);
	  add_reg_note (insn, REG_CFA_OFFSET, set2);
	}
      else
	{
	  add_reg_note (insn, REG_CFA_RESTORE, reg);
	  add_reg_note (insn, REG_CFA_RESTORE, reg2);
	}

      regno = aarch64_get_next_set_bit (components, regno2 + 1);
    }
}
/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */

static void
aarch64_emit_prologue_components (sbitmap components)
{
  aarch64_process_components (components, true);
}

/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */

static void
aarch64_emit_epilogue_components (sbitmap components)
{
  aarch64_process_components (components, false);
}

/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */

static void
aarch64_set_handled_components (sbitmap components)
{
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (bitmap_bit_p (components, regno))
      cfun->machine->reg_is_wrapped_separately[regno] = true;
}
/* On AArch64 we have an ABI defined safe buffer.  This constant is used to
   determine the probe offset for alloca.  */

static HOST_WIDE_INT
aarch64_stack_clash_protection_alloca_probe_range (void)
{
  return STACK_CLASH_CALLER_GUARD;
}
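/* Illustrative note (not part of the original sources): the value returned
   here is the 1KB ABI buffer, so the generic -fstack-clash-protection code
   places its probes for variable-size (alloca) allocations at that offset
   from the adjusted stack pointer, keeping the callee-visible 1KB buffer
   valid.  */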
/* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
   registers.  If POLY_SIZE is not large enough to require a probe this
   function will only adjust the stack.  When allocating the stack space
   FRAME_RELATED_P is then used to indicate if the allocation is frame
   related.  FINAL_ADJUSTMENT_P indicates whether we are allocating the
   outgoing arguments.  If we are then we ensure that any allocation larger
   than the ABI defined buffer needs a probe so that the invariant of having
   a 1KB buffer is maintained.

   We emit barriers after each stack adjustment to prevent optimizations from
   breaking the invariant that we never drop the stack more than a page.  This
   invariant is needed to make it easier to correctly handle asynchronous
   events, e.g. if we were to allow the stack to be dropped by more than a page
   and then have multiple probes up and we take a signal somewhere in between
   then the signal handler doesn't know the state of the stack and can make no
   assumptions about which pages have been probed.  */

static void
aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
					poly_int64 poly_size,
					bool frame_related_p,
					bool final_adjustment_p)
{
  HOST_WIDE_INT guard_size
    = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
  HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
  /* When doing the final adjustment for the outgoing argument size we can't
     assume that LR was saved at position 0.  So subtract its offset from the
     ABI safe buffer so that we don't accidentally allow an adjustment that
     would result in an allocation larger than the ABI buffer without
     probing.  */
  HOST_WIDE_INT min_probe_threshold
    = final_adjustment_p
      ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
      : guard_size - guard_used_by_caller;

  poly_int64 frame_size = cfun->machine->frame.frame_size;

  /* We should always have a positive probe threshold.  */
  gcc_assert (min_probe_threshold > 0);

  if (flag_stack_clash_protection && !final_adjustment_p)
    {
      poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
      poly_int64 final_adjust = cfun->machine->frame.final_adjust;

      if (known_eq (frame_size, 0))
	dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
      else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
	       && known_lt (final_adjust, guard_used_by_caller))
	dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
    }

  /* If SIZE is not large enough to require probing, just adjust the stack
     and exit.  */
  if (known_lt (poly_size, min_probe_threshold)
      || !flag_stack_clash_protection)
    {
      aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
      return;
    }

  HOST_WIDE_INT size;
  /* Handle the SVE non-constant case first.  */
  if (!poly_size.is_constant (&size))
    {
      if (dump_file)
	{
	  fprintf (dump_file, "Stack clash SVE prologue: ");
	  print_dec (poly_size, dump_file);
	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
	}

      /* First calculate the amount of bytes we're actually spilling.  */
      aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
			  poly_size, temp1, temp2, false, true);

      rtx_insn *insn = get_last_insn ();

      if (frame_related_p)
	{
	  /* This is done to provide unwinding information for the stack
	     adjustments we're about to do, however to prevent the optimizers
	     from removing the R11 move and leaving the CFA note (which would
	     be very wrong) we tie the old and new stack pointer together.
	     The tie will expand to nothing but the optimizers will not touch
	     the instruction.  */
	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));

	  /* We want the CFA independent of the stack pointer for the
	     duration of the loop.  */
	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
      rtx guard_const = gen_int_mode (guard_size, Pmode);

      insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
						   stack_pointer_rtx, temp1,
						   probe_const, guard_const));

      /* Now reset the CFA register if needed.  */
      if (frame_related_p)
	{
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
				      gen_int_mode (poly_size, Pmode)));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
	     " bytes, probing will be required.\n", size);

  /* Round size to the nearest multiple of guard_size, and calculate the
     residual as the difference between the original size and the rounded
     size.  */
  HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
  HOST_WIDE_INT residual = size - rounded_size;

  /* We can handle a small number of allocations/probes inline.  Otherwise
     punt to a loop.  */
  if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
    {
      for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
	{
	  aarch64_sub_sp (NULL, temp2, guard_size, true);
	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
					   guard_used_by_caller));
	  emit_insn (gen_blockage ());
	}
      dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
    }
  else
    {
      /* Compute the ending address.  */
      aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
			  temp1, NULL, false, true);
      rtx_insn *insn = get_last_insn ();

      /* For the initial allocation, we don't have a frame pointer
	 set up, so we always need CFI notes.  If we're doing the
	 final allocation, then we may have a frame pointer, in which
	 case it is the CFA, otherwise we need CFI notes.

	 We can determine which allocation we are doing by looking at
	 the value of FRAME_RELATED_P since the final allocations are not
	 frame related.  */
      if (frame_related_p)
	{
	  /* We want the CFA independent of the stack pointer for the
	     duration of the loop.  */
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			plus_constant (Pmode, temp1, rounded_size));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      /* This allocates and probes the stack.  Note that this re-uses some of
	 the existing Ada stack protection code.  However we are guaranteed
	 not to enter the non loop or residual branches of that code.

	 The non-loop part won't be entered because if our allocation amount
	 doesn't require a loop, the case above would handle it.

	 The residual amount won't be entered because TEMP1 is a multiple of
	 the allocation size.  The residual will always be 0.  As such, the
	 only part we are actually using from that code is the loop setup.
	 The actual probing is done in aarch64_output_probe_stack_range.  */
      insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
					       stack_pointer_rtx, temp1));

      /* Now reset the CFA register if needed.  */
      if (frame_related_p)
	{
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			plus_constant (Pmode, stack_pointer_rtx,
				       rounded_size));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      emit_insn (gen_blockage ());
      dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
    }

  /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
     be probed.  This maintains the requirement that each page is probed at
     least once.  For initial probing we probe only if the allocation is
     more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
     if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
     GUARD_SIZE.  This works that for any allocation that is large enough to
     trigger a probe here, we'll have at least one, and if they're not large
     enough for this code to emit anything for them, the page would have been
     probed by the saving of FP/LR either by this function or any callees.  If
     we don't have any callees then we won't have more stack adjustments and
     so are still safe.  */
  if (residual)
    {
      HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
      /* If we're doing final adjustments, and we've done any full page
	 allocations then any residual needs to be probed.  */
      if (final_adjustment_p && rounded_size != 0)
	min_probe_threshold = 0;
      /* If doing a small final adjustment, we always probe at offset 0.
	 This is done to avoid issues when LR is not at position 0 or when
	 the final adjustment is smaller than the probing offset.  */
      else if (final_adjustment_p && rounded_size == 0)
	residual_probe_offset = 0;

      aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
      if (residual >= min_probe_threshold)
	{
	  if (dump_file)
	    fprintf (dump_file,
		     "Stack clash AArch64 prologue residuals: "
		     HOST_WIDE_INT_PRINT_DEC " bytes, probing will be "
		     "required.\n", residual);

	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
					   residual_probe_offset));
	  emit_insn (gen_blockage ());
	}
    }
}
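/* Worked example (illustrative, not from the original sources): with the
   default 64KB guard and the 1KB caller buffer, min_probe_threshold is
   64KB - 1KB = 63KB for the initial (non-final) allocation.  A constant
   200KB initial adjustment is therefore rounded down to 192KB of probed
   pages (three 64KB steps, emitted inline or as a loop depending on
   STACK_CLASH_MAX_UNROLL_PAGES), leaving an 8KB residual that is simply
   allocated without an extra probe because 8KB < 63KB; the subsequent
   FP/LR saves act as the implicit probe for that page.  */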
/* Return 1 if the register is used by the epilogue.  We need to say the
   return register is used, but only after epilogue generation is complete.
   Note that in the case of sibcalls, the values "used by the epilogue" are
   considered live at the start of the called function.

   For SIMD functions we need to return 1 for FP registers that are saved and
   restored by a function but are not zero in call_used_regs.  If we do not do
   this optimizations may remove the restore of the register.  */

int
aarch64_epilogue_uses (int regno)
{
  if (epilogue_completed)
    {
      if (regno == LR_REGNUM)
	return 1;
      if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
	return 1;
    }
  return 0;
}
/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
   is saved at BASE + OFFSET.  */

static void
aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
			    rtx base, poly_int64 offset)
{
  rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
  add_reg_note (insn, REG_CFA_EXPRESSION,
		gen_rtx_SET (mem, regno_reg_rtx[reg]));
}
/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|                               |
	|  incoming stack arguments     |
	|                               |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	|                               |
	+-------------------------------+
	|  local variables              | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+
	|  padding                      | \
	+-------------------------------+  |
	|  callee-saved registers       |  | frame.saved_regs_size
	+-------------------------------+  |
	|  LR'                          |  |
	+-------------------------------+  |
	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  padding                      |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	|                               |
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx
   unchanged.

   By default for stack-clash we assume the guard is at least 64KB, but this
   value is configurable to either 4KB or 64KB.  We also force the guard size
   to be the same as the probing interval and both values are kept in sync.

   With those assumptions the callee can allocate up to 63KB (or 3KB depending
   on the guard size) of stack space without probing.

   When probing is needed, we emit a probe at the start of the prologue
   and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.

   We have to track how much space has been allocated and the only stores
   to the stack we track as implicit probes are the FP/LR stores.

   For outgoing arguments we probe if the size is larger than 1KB, such that
   the ABI specified buffer is maintained for the next callee.

   The following registers are reserved during frame layout and should not be
   used for any other purpose:

   - r11: Used by stack clash protection when SVE is enabled.
   - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
   - r14 and r15: Used for speculation tracking.
   - r16(IP0), r17(IP1): Used by indirect tailcalls.
   - r30(LR), r29(FP): Used by standard frame layout.

   These registers must be avoided in frame layout related code unless the
   explicit intention is to interact with one of the features listed above.  */
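/* Illustrative example (not part of the original sources): for a small
   non-leaf function with 16 bytes of locals and no outgoing stack
   arguments, the layout above typically reduces to one callee-save region:
   "stp x29, x30, [sp, -32]!" allocates the frame and stores the frame
   record, "mov x29, sp" establishes the frame pointer, and the locals
   occupy the 16 bytes above the frame record, below the incoming SP.  */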
/* Generate the prologue instructions for entry into a function.
   Establish the stack frame by decreasing the stack pointer with a
   properly calculated size and, if necessary, create a frame record
   filled with the values of LR and previous frame pointer.  The
   current FP is also set up if it is in use.  */

void
aarch64_expand_prologue (void)
{
  poly_int64 frame_size = cfun->machine->frame.frame_size;
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
  rtx_insn *insn;

  /* Sign return address for functions.  */
  if (aarch64_return_address_signing_enabled ())
    {
      switch (aarch64_ra_sign_key)
	{
	case AARCH64_KEY_A:
	  insn = emit_insn (gen_paciasp ());
	  break;
	case AARCH64_KEY_B:
	  insn = emit_insn (gen_pacibsp ());
	  break;
	default:
	  gcc_unreachable ();
	}
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  if (flag_stack_usage_info)
    current_function_static_stack_size = constant_lower_bound (frame_size);

  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
    {
      if (crtl->is_leaf && !cfun->calls_alloca)
	{
	  if (maybe_gt (frame_size, PROBE_INTERVAL)
	      && maybe_gt (frame_size, get_stack_check_protect ()))
	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
					    (frame_size
					     - get_stack_check_protect ()));
	}
      else if (maybe_gt (frame_size, 0))
	aarch64_emit_probe_stack_range (get_stack_check_protect (),
					frame_size);
    }

  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);

  /* In theory we should never have both an initial adjustment
     and a callee save adjustment.  Verify that is the case since the
     code below does not handle it for -fstack-clash-protection.  */
  gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);

  /* Will only probe if the initial adjustment is larger than the guard
     less the amount of the guard reserved for use by the caller's
     outgoing args.  */
  aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
					  true, false);

  if (callee_adjust != 0)
    aarch64_push_regs (reg1, reg2, callee_adjust);

  if (emit_frame_chain)
    {
      poly_int64 reg_offset = callee_adjust;
      if (callee_adjust == 0)
	{
	  reg1 = R29_REGNUM;
	  reg2 = R30_REGNUM;
	  reg_offset = callee_offset;
	  aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
	}
      aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
			  stack_pointer_rtx, callee_offset,
			  tmp1_rtx, tmp0_rtx, frame_pointer_needed);
      if (frame_pointer_needed && !frame_size.is_constant ())
	{
	  /* Variable-sized frames need to describe the save slot
	     address using DW_CFA_expression rather than DW_CFA_offset.
	     This means that, without taking further action, the
	     locations of the registers that we've already saved would
	     remain based on the stack pointer even after we redefine
	     the CFA based on the frame pointer.  We therefore need new
	     DW_CFA_expressions to re-express the save slots with addresses
	     based on the frame pointer.  */
	  rtx_insn *insn = get_last_insn ();
	  gcc_assert (RTX_FRAME_RELATED_P (insn));

	  /* Add an explicit CFA definition if this was previously
	     implicit.  */
	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
	    {
	      rtx src = plus_constant (Pmode, stack_pointer_rtx,
				       callee_offset);
	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
			    gen_rtx_SET (hard_frame_pointer_rtx, src));
	    }

	  /* Change the save slot expressions for the registers that
	     we've already saved.  */
	  reg_offset -= callee_offset;
	  aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
				      reg_offset + UNITS_PER_WORD);
	  aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
				      reg_offset);
	}
      emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
    }

  aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
			     callee_adjust != 0 || emit_frame_chain);
  if (aarch64_simd_decl_p (cfun->decl))
    aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
			       callee_adjust != 0 || emit_frame_chain);
  else
    aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
			       callee_adjust != 0 || emit_frame_chain);

  /* We may need to probe the final adjustment if it is larger than the guard
     that is assumed by the callee.  */
  aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
					  !frame_pointer_needed, true);
}
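/* Illustrative example (not part of the original sources): for a small
   constant-size frame the sequence emitted here commonly looks like
	stp	x29, x30, [sp, -FRAME]!    (callee_adjust via writeback)
	mov	x29, sp                    (emit_frame_chain)
	stp	x19, x20, [sp, 16]         (remaining callee saves)
   with final_adjust == 0, so no separate outgoing-argument allocation or
   probe is required.  */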
/* Return TRUE if we can use a simple_return insn.

   This function checks whether the callee saved stack is empty, which
   means no restore actions are needed.  The pro_and_epilogue will use
   this to check whether shrink-wrapping opt is feasible.  */

bool
aarch64_use_return_insn_p (void)
{
  if (!reload_completed)
    return false;

  if (crtl->profile)
    return false;

  return known_eq (cfun->machine->frame.frame_size, 0);
}

/* Return false for non-leaf SIMD functions in order to avoid
   shrink-wrapping them.  Doing this will lose the necessary
   save/restore of FP registers.  */

bool
aarch64_use_simple_return_insn_p (void)
{
  if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
    return false;

  return true;
}
/* Generate the epilogue instructions for returning from a function.
   This is almost exactly the reverse of the prolog sequence, except
   that we need to insert barriers to avoid scheduling loads that read
   from a deallocated stack, and we optimize the unwind records by
   emitting them all together if possible.  */
void
aarch64_expand_epilogue (bool for_sibcall)
{
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  rtx cfi_ops = NULL;
  rtx_insn *insn;
  /* A stack clash protection prologue may not have left EP0_REGNUM or
     EP1_REGNUM in a usable state.  The same is true for allocations
     with an SVE component, since we then need both temporary registers
     for each allocation.  For stack clash we are in a usable state if
     the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
  HOST_WIDE_INT guard_size
    = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
  HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;

  /* We can re-use the registers when the allocation amount is smaller than
     guard_size - guard_used_by_caller because we won't be doing any probes
     then.  In such situations the register should remain live with the
     correct value.  */
  bool can_inherit_p = (initial_adjust.is_constant ()
			&& final_adjust.is_constant ())
			&& (!flag_stack_clash_protection
			    || known_lt (initial_adjust,
					 guard_size - guard_used_by_caller));

  /* We need to add a memory barrier to prevent reads from a deallocated
     stack.  */
  bool need_barrier_p
    = maybe_ne (get_frame_size ()
		+ cfun->machine->frame.saved_varargs_size, 0);

  /* Emit a barrier to prevent loads from a deallocated stack.  */
  if (maybe_gt (final_adjust, crtl->outgoing_args_size)
      || cfun->calls_alloca
      || crtl->calls_eh_return)
    {
      emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
      need_barrier_p = false;
    }

  /* Restore the stack pointer from the frame pointer if it may not
     be the same as the stack pointer.  */
  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
  if (frame_pointer_needed
      && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
    /* If writeback is used when restoring callee-saves, the CFA
       is restored on the instruction doing the writeback.  */
    aarch64_add_offset (Pmode, stack_pointer_rtx,
			hard_frame_pointer_rtx, -callee_offset,
			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
  else
    /* The case where we need to re-use the register here is very rare, so
       avoid the complicated condition and just always emit a move if the
       immediate doesn't fit.  */
    aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);

  aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
				callee_adjust != 0, &cfi_ops);
  if (aarch64_simd_decl_p (cfun->decl))
    aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM,
				  V31_REGNUM, callee_adjust != 0, &cfi_ops);
  else
    aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM,
				  V31_REGNUM, callee_adjust != 0, &cfi_ops);

  if (need_barrier_p)
    emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));

  if (callee_adjust != 0)
    aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);

  if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
    {
      /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
      insn = get_last_insn ();
      rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
      REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
      RTX_FRAME_RELATED_P (insn) = 1;
      cfi_ops = NULL;
    }

  /* Liveness of EP0_REGNUM can not be trusted across function calls either,
     so add restriction on emit_move optimization to leaf functions.  */
  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
		  (!can_inherit_p || !crtl->is_leaf
		   || df_regs_ever_live_p (EP0_REGNUM)));

  if (cfi_ops)
    {
      /* Emit delayed restores and reset the CFA to be SP.  */
      insn = get_last_insn ();
      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
      REG_NOTES (insn) = cfi_ops;
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* We prefer to emit the combined return/authenticate instruction RETAA,
     however there are three cases in which we must instead emit an explicit
     authentication instruction.

	1) Sibcalls don't return in a normal way, so if we're about to call
	   one we must authenticate.

	2) The RETAA instruction is not available before ARMv8.3-A, so if we
	   are generating code for !TARGET_ARMV8_3 we can't use it and must
	   explicitly authenticate.

	3) On an eh_return path we make extra stack adjustments to update the
	   canonical frame address to be the exception handler's CFA.  We want
	   to authenticate using the CFA of the function which calls
	   eh_return.  */
  if (aarch64_return_address_signing_enabled ()
      && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
    {
      switch (aarch64_ra_sign_key)
	{
	case AARCH64_KEY_A:
	  insn = emit_insn (gen_autiasp ());
	  break;
	case AARCH64_KEY_B:
	  insn = emit_insn (gen_autibsp ());
	  break;
	default:
	  gcc_unreachable ();
	}
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Stack adjustment for exception handler.  */
  if (crtl->calls_eh_return && !for_sibcall)
    {
      /* We need to unwind the stack by the offset computed by
	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
	 to be SP; letting the CFA move during this adjustment
	 is just as correct as retaining the CFA from the body
	 of the function.  Therefore, do nothing special.  */
      emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
    }

  emit_use (gen_rtx_REG (DImode, LR_REGNUM));
  if (!for_sibcall)
    emit_jump_insn (ret_rtx);
}
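/* Illustrative example (not part of the original sources): the reverse of
   the small-frame prologue shown earlier is commonly
	ldp	x19, x20, [sp, 16]
	ldp	x29, x30, [sp], FRAME      (pop with writeback restores SP)
	ret
   with the REG_CFA_RESTORE notes accumulated in CFI_OPS attached to the
   instruction that finally re-establishes the CFA as SP.  */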
/* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
   normally or return to a previous frame after unwinding.

   An EH return uses a single shared return sequence.  The epilogue is
   exactly like a normal epilogue except that it has an extra input
   register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
   that must be applied after the frame has been destroyed.  An extra label
   is inserted before the epilogue which initializes this register to zero,
   and this is the entry point for a normal return.

   An actual EH return updates the return address, initializes the stack
   adjustment and jumps directly into the epilogue (bypassing the zeroing
   of the adjustment).  Since the return address is typically saved on the
   stack when a function makes a call, the saved LR must be updated outside
   the epilogue.

   This poses problems as the store is generated well before the epilogue,
   so the offset of LR is not known yet.  Also optimizations will remove the
   store as it appears dead, even after the epilogue is generated (as the
   base or offset for loading LR is different in many cases).

   To avoid these problems this implementation forces the frame pointer
   in eh_return functions so that the location of LR is fixed and known early.
   It also marks the store volatile, so no optimization is permitted to
   remove the store.  */
rtx
aarch64_eh_return_handler_rtx (void)
{
  rtx tmp = gen_frame_mem (Pmode,
    plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));

  /* Mark the store volatile, so no optimization is permitted to remove it.  */
  MEM_VOLATILE_P (tmp) = true;
  return tmp;
}
/* Output code to add DELTA to the first argument, and then jump
   to FUNCTION.  Used for C++ multiple inheritance.  */
static void
aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
			 HOST_WIDE_INT delta,
			 HOST_WIDE_INT vcall_offset,
			 tree function)
{
  /* The this pointer is always in x0.  Note that this differs from
     Arm where the this pointer may be bumped to r1 if r0 is required
     to return a pointer to an aggregate.  On AArch64 a result value
     pointer will be in x8.  */
  int this_regno = R0_REGNUM;
  rtx this_rtx, temp0, temp1, addr, funexp;
  rtx_insn *insn;
  const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));

  if (aarch64_bti_enabled ())
    emit_insn (gen_bti_c());

  reload_completed = 1;
  emit_note (NOTE_INSN_PROLOGUE_END);

  this_rtx = gen_rtx_REG (Pmode, this_regno);
  temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
  temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);

  if (vcall_offset == 0)
    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
  else
    {
      gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);

      addr = this_rtx;
      if (delta != 0)
	{
	  if (delta >= -256 && delta < 256)
	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
				       plus_constant (Pmode, this_rtx, delta));
	  else
	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
				temp1, temp0, false);
	}

      if (Pmode == ptr_mode)
	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
      else
	aarch64_emit_move (temp0,
			   gen_rtx_ZERO_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
	addr = plus_constant (Pmode, temp0, vcall_offset);
      else
	{
	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
					  Pmode);
	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
	}

      if (Pmode == ptr_mode)
	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
      else
	aarch64_emit_move (temp1,
			   gen_rtx_SIGN_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      emit_insn (gen_add2_insn (this_rtx, temp1));
    }

  /* Generate a tail call to the target function.  */
  if (!TREE_USED (function))
    {
      assemble_external (function);
      TREE_USED (function) = 1;
    }
  funexp = XEXP (DECL_RTL (function), 0);
  funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
  insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
  SIBLING_CALL_P (insn) = 1;

  insn = get_insns ();
  shorten_branches (insn);

  assemble_start_function (thunk, fnname);
  final_start_function (insn, file, 1);
  final (insn, file, 1);
  final_end_function ();
  assemble_end_function (thunk, fnname);

  /* Stop pretending to be a post-reload pass.  */
  reload_completed = 0;
}
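/* Illustrative example (not part of the original sources): for a thunk
   with VCALL_OFFSET == 0 and a small DELTA this typically emits just
	add	x0, x0, #DELTA
	b	<function>
   i.e. the this pointer is adjusted in place and control tail-calls the
   target without creating a frame.  */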
/* Return true if X contains a reference to a thread-local symbol.  */

static bool
aarch64_tls_referenced_p (rtx x)
{
  if (!TARGET_HAVE_TLS)
    return false;
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    {
      const_rtx x = *iter;
      if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
	return true;
      /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
	 TLS offsets, not real symbol references.  */
      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
	iter.skip_subrtxes ();
    }
  return false;
}
/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
bool
aarch64_uimm12_shift (HOST_WIDE_INT val)
{
  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
}

/* Returns the nearest value to VAL that will fit as a 12-bit unsigned
   immediate that can be created with a left shift of 0 or 12.  */
static HOST_WIDE_INT
aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
{
  /* Check to see if the value fits in 24 bits, as that is the maximum we can
     handle correctly.  */
  gcc_assert ((val & 0xffffff) == val);

  if (((val & 0xfff) << 0) == val)
    return val;

  return val & (0xfff << 12);
}

/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction.  */
static bool
aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
	return 1;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
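/* Illustrative note (not part of the original sources): the test above
   accepts exactly the values with a single non-zero 16-bit chunk, e.g.
   0x0000000000ab0000 (MOVZ Xd, #0xab, LSL #16), while a value such as
   0x12340001 is rejected because it has two non-zero chunks and would
   need a MOVZ/MOVK sequence, handled elsewhere.  */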
/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
   64-bit (DImode) integer.  */

static unsigned HOST_WIDE_INT
aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
{
  unsigned int size = GET_MODE_UNIT_PRECISION (mode);

  while (size < 64)
    {
      val &= (HOST_WIDE_INT_1U << size) - 1;
      val |= val << size;
      size *= 2;
    }

  return val;
}

/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,
    0x0001000100010001ull,
    0x0101010101010101ull,
    0x1111111111111111ull,
    0x5555555555555555ull,
  };


/* Return true if val is a valid bitmask immediate.  */

bool
aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
{
  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
  int bits;

  /* Check for a single sequence of one bits and return quickly if so.
     The special cases of all ones and all zeroes returns false.  */
  val = aarch64_replicate_bitmask_imm (val_in, mode);
  tmp = val + (val & -val);

  if (tmp == (tmp & -tmp))
    return (val + 1) > 1;

  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
  if (mode == SImode)
    val = (val << 32) | (val & 0xffffffff);

  /* Invert if the immediate doesn't start with a zero bit - this means we
     only need to search for sequences of one bits.  */
  if (val & 1)
    val = ~val;

  /* Find the first set bit and set tmp to val with the first sequence of one
     bits removed.  Return success if there is a single sequence of ones.  */
  first_one = val & -val;
  tmp = val & (val + first_one);

  if (tmp == 0)
    return true;

  /* Find the next set bit and compute the difference in bit position.  */
  next_one = tmp & -tmp;
  bits = clz_hwi (first_one) - clz_hwi (next_one);
  mask = val ^ tmp;

  /* Check the bit position difference is a power of 2, and that the first
     sequence of one bits fits within 'bits' bits.  */
  if ((mask >> bits) != 0 || bits != (bits & -bits))
    return false;

  /* Check the sequence of one bits is repeated 64/bits times.  */
  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
}
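/* Worked example (illustrative, not from the original sources):
   0x00ff00ff00ff00ff is accepted because it is a repetition of a 16-bit
   element with a single contiguous run of ones, whereas 0 and all-ones are
   rejected by the early exit, matching the architecture's rule that those
   two values cannot be encoded as bitmask immediates.  */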
/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
   Assumed precondition: VAL_IN is not zero.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
{
  int lowest_bit_set = ctz_hwi (val_in);
  int highest_bit_set = floor_log2 (val_in);
  gcc_assert (val_in != 0);

  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
	  (HOST_WIDE_INT_1U << lowest_bit_set));
}

/* Create constant where bits outside of lowest bit set to highest bit set
   are set to 1.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
{
  return val_in | ~aarch64_and_split_imm1 (val_in);
}

/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */

bool
aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_bitmask_imm (val_in, int_mode))
    return false;

  if (aarch64_move_imm (val_in, int_mode))
    return false;

  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);

  return aarch64_bitmask_imm (imm2, int_mode);
}
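/* Worked example (illustrative, not from the original sources): the
   constant 0x0000ffff0000ff00 has two separate runs of ones, so it is
   neither a bitmask immediate nor a MOV immediate.  Splitting it gives
   imm1 = 0x0000ffffffffff00 (all bits between the lowest and highest set
   bit) and imm2 = 0xffffffff0000ffff, both valid bitmask immediates, and
   (x AND imm1) AND imm2 equals x AND 0x0000ffff0000ff00.  */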
/* Return true if val is an immediate that can be loaded into a
   register in a single instruction.  */
bool
aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
    return 1;
  return aarch64_bitmask_imm (val, int_mode);
}
static bool
aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
  rtx base, offset;

  if (GET_CODE (x) == HIGH)
    return true;

  /* There's no way to calculate VL-based values using relocations.  */
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    if (GET_CODE (*iter) == CONST_POLY_INT)
      return true;

  split_const (x, &base, &offset);
  if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
    {
      if (aarch64_classify_symbol (base, INTVAL (offset))
	  != SYMBOL_FORCE_TO_MEM)
	return true;
      else
	/* Avoid generating a 64-bit relocation in ILP32; leave
	   to aarch64_expand_mov_immediate to handle it properly.  */
	return mode != ptr_mode;
    }

  return aarch64_tls_referenced_p (x);
}
/* Implement TARGET_CASE_VALUES_THRESHOLD.
   The expansion for a table switch is quite expensive due to the number
   of instructions, the table lookup and hard to predict indirect jump.
   When optimizing for speed, and -O3 enabled, use the per-core tuning if
   set, otherwise use tables for > 16 cases as a tradeoff between size and
   performance.  When optimizing for size, use the default setting.  */

static unsigned int
aarch64_case_values_threshold (void)
{
  /* Use the specified limit for the number of cases before using jump
     tables at higher optimization levels.  */
  if (optimize > 2
      && selected_cpu->tune->max_case_values != 0)
    return selected_cpu->tune->max_case_values;
  else
    return optimize_size ? default_case_values_threshold () : 17;
}
/* Return true if register REGNO is a valid index register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_index_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }
  return GP_REGNUM_P (regno);
}

/* Return true if register REGNO is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_base_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  /* The fake registers will be eliminated to either the stack or
     hard frame pointer, both of which are usually valid base registers.
     Reload deals with the cases where the eliminated form isn't valid.  */
  return (GP_REGNUM_P (regno)
	  || regno == SP_REGNUM
	  || regno == FRAME_POINTER_REGNUM
	  || regno == ARG_POINTER_REGNUM);
}

/* Return true if X is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_base_register_rtx_p (rtx x, bool strict_p)
{
  if (!strict_p
      && GET_CODE (x) == SUBREG
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
    x = SUBREG_REG (x);

  return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
}
/* Return true if address offset is a valid index.  If it is, fill in INFO
   appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_classify_index (struct aarch64_address_info *info, rtx x,
			machine_mode mode, bool strict_p)
{
  enum aarch64_address_type type;
  rtx index;
  int shift;

  /* (reg:P) */
  if ((REG_P (x) || GET_CODE (x) == SUBREG)
      && GET_MODE (x) == Pmode)
    {
      type = ADDRESS_REG_REG;
      index = x;
      shift = 0;
    }
  /* (sign_extend:DI (reg:SI)) */
  else if ((GET_CODE (x) == SIGN_EXTEND
	    || GET_CODE (x) == ZERO_EXTEND)
	   && GET_MODE (x) == DImode
	   && GET_MODE (XEXP (x, 0)) == SImode)
    {
      type = (GET_CODE (x) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (x, 0);
      shift = 0;
    }
  /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
  else if (GET_CODE (x) == MULT
	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
	   && GET_MODE (XEXP (x, 0)) == DImode
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
	   && GET_MODE (XEXP (x, 0)) == DImode
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (x, 1));
    }
  /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
	    || GET_CODE (x) == ZERO_EXTRACT)
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == MULT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      if (INTVAL (XEXP (x, 1)) != 32 + shift
	  || INTVAL (XEXP (x, 2)) != 0)
	return false;
    }
  /* (and:DI (mult:DI (reg:DI) (const_int scale))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == MULT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
	return false;
    }
  /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
	    || GET_CODE (x) == ZERO_EXTRACT)
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == ASHIFT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != 32 + shift
	  || INTVAL (XEXP (x, 2)) != 0)
	return false;
    }
  /* (and:DI (ashift:DI (reg:DI) (const_int shift))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == ASHIFT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
	return false;
    }
  /* (mult:P (reg:P) (const_int scale)) */
  else if (GET_CODE (x) == MULT
	   && GET_MODE (x) == Pmode
	   && GET_MODE (XEXP (x, 0)) == Pmode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:P (reg:P) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
	   && GET_MODE (x) == Pmode
	   && GET_MODE (XEXP (x, 0)) == Pmode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = INTVAL (XEXP (x, 1));
    }
  else
    return false;

  if (!strict_p
      && GET_CODE (index) == SUBREG
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
    index = SUBREG_REG (index);

  if (aarch64_sve_data_mode_p (mode))
    {
      if (type != ADDRESS_REG_REG
	  || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
	return false;
    }
  else
    {
      if (shift != 0
	  && !(IN_RANGE (shift, 1, 3)
	       && known_eq (1 << shift, GET_MODE_SIZE (mode))))
	return false;
    }

  if (REG_P (index)
      && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
    {
      info->type = type;
      info->offset = index;
      info->shift = shift;
      return true;
    }

  return false;
}
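/* Illustrative note (not part of the original sources): for a DImode
   access, [x0, x1, lsl #3] classifies as ADDRESS_REG_REG with shift == 3,
   and [x0, w1, sxtw #2] for an SImode access becomes ADDRESS_REG_SXTW with
   shift == 2; a shift that does not match the access size (e.g. lsl #2 on
   a DImode load) is rejected by the range check above.  */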
/* Return true if MODE is one of the modes for which we
   support LDP/STP operations.  */

static bool
aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
{
  return mode == SImode || mode == DImode
	 || mode == SFmode || mode == DFmode
	 || (aarch64_vector_mode_supported_p (mode)
	     && (known_eq (GET_MODE_SIZE (mode), 8)
		 || (known_eq (GET_MODE_SIZE (mode), 16)
		     && (aarch64_tune_params.extra_tuning_flags
			 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
}
/* Return true if REGNO is a virtual pointer register, or an eliminable
   "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
   include stack_pointer or hard_frame_pointer.  */
static bool
virt_or_elim_regno_p (unsigned regno)
{
  return ((regno >= FIRST_VIRTUAL_REGISTER
	   && regno <= LAST_VIRTUAL_POINTER_REGISTER)
	  || regno == FRAME_POINTER_REGNUM
	  || regno == ARG_POINTER_REGNUM);
}
7434 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7435 If it is, fill in INFO appropriately. STRICT_P is true if
7436 REG_OK_STRICT is in effect. */
7439 aarch64_classify_address (struct aarch64_address_info
*info
,
7440 rtx x
, machine_mode mode
, bool strict_p
,
7441 aarch64_addr_query_type type
)
7443 enum rtx_code code
= GET_CODE (x
);
7447 HOST_WIDE_INT const_size
;
7449 /* On BE, we use load/store pair for all large int mode load/stores.
7450 TI/TFmode may also use a load/store pair. */
7451 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
7452 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
7453 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
7454 || type
== ADDR_QUERY_LDP_STP_N
7457 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
7459 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
7460 corresponds to the actual size of the memory being loaded/stored and the
7461 mode of the corresponding addressing mode is half of that. */
7462 if (type
== ADDR_QUERY_LDP_STP_N
7463 && known_eq (GET_MODE_SIZE (mode
), 16))
7466 bool allow_reg_index_p
= (!load_store_pair_p
7467 && (known_lt (GET_MODE_SIZE (mode
), 16)
7468 || vec_flags
== VEC_ADVSIMD
7469 || vec_flags
& VEC_SVE_DATA
));
7471 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7472 [Rn, #offset, MUL VL]. */
7473 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
7474 && (code
!= REG
&& code
!= PLUS
))
7477 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7479 if (advsimd_struct_p
7480 && !BYTES_BIG_ENDIAN
7481 && (code
!= POST_INC
&& code
!= REG
))
7484 gcc_checking_assert (GET_MODE (x
) == VOIDmode
7485 || SCALAR_INT_MODE_P (GET_MODE (x
)));
7491 info
->type
= ADDRESS_REG_IMM
;
7493 info
->offset
= const0_rtx
;
7494 info
->const_offset
= 0;
7495 return aarch64_base_register_rtx_p (x
, strict_p
);
7503 && virt_or_elim_regno_p (REGNO (op0
))
7504 && poly_int_rtx_p (op1
, &offset
))
7506 info
->type
= ADDRESS_REG_IMM
;
7509 info
->const_offset
= offset
;
7514 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
7515 && aarch64_base_register_rtx_p (op0
, strict_p
)
7516 && poly_int_rtx_p (op1
, &offset
))
7518 info
->type
= ADDRESS_REG_IMM
;
7521 info
->const_offset
= offset
;
7523 /* TImode and TFmode values are allowed in both pairs of X
7524 registers and individual Q registers. The available
7526 X,X: 7-bit signed scaled offset
7527 Q: 9-bit signed offset
7528 We conservatively require an offset representable in either mode.
7529 When performing the check for pairs of X registers i.e. LDP/STP
7530 pass down DImode since that is the natural size of the LDP/STP
7531 instruction memory accesses. */
7532 if (mode
== TImode
|| mode
== TFmode
)
7533 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
7534 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
7535 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
7537 /* A 7bit offset check because OImode will emit a ldp/stp
7538 instruction (only big endian will get here).
7539 For ldp/stp instructions, the offset is scaled for the size of a
7540 single element of the pair. */
7542 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
7544 /* Three 9/12 bit offsets checks because CImode will emit three
7545 ldr/str instructions (only big endian will get here). */
7547 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
7548 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode
,
7550 || offset_12bit_unsigned_scaled_p (V16QImode
,
7553 /* Two 7bit offsets checks because XImode will emit two ldp/stp
7554 instructions (only big endian will get here). */
7556 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
7557 && aarch64_offset_7bit_signed_scaled_p (TImode
,
7560 /* Make "m" use the LD1 offset range for SVE data modes, so
7561 that pre-RTL optimizers like ivopts will work to that
7562 instead of the wider LDR/STR range. */
7563 if (vec_flags
== VEC_SVE_DATA
)
7564 return (type
== ADDR_QUERY_M
7565 ? offset_4bit_signed_scaled_p (mode
, offset
)
7566 : offset_9bit_signed_scaled_p (mode
, offset
));
7568 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
7570 poly_int64 end_offset
= (offset
7571 + GET_MODE_SIZE (mode
)
7572 - BYTES_PER_SVE_VECTOR
);
7573 return (type
== ADDR_QUERY_M
7574 ? offset_4bit_signed_scaled_p (mode
, offset
)
7575 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
7576 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
7580 if (vec_flags
== VEC_SVE_PRED
)
7581 return offset_9bit_signed_scaled_p (mode
, offset
);
7583 if (load_store_pair_p
)
7584 return ((known_eq (GET_MODE_SIZE (mode
), 4)
7585 || known_eq (GET_MODE_SIZE (mode
), 8)
7586 || known_eq (GET_MODE_SIZE (mode
), 16))
7587 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
7589 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
7590 || offset_12bit_unsigned_scaled_p (mode
, offset
));
7593 if (allow_reg_index_p
)
7595 /* Look for base + (scaled/extended) index register. */
7596 if (aarch64_base_register_rtx_p (op0
, strict_p
)
7597 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
7602 if (aarch64_base_register_rtx_p (op1
, strict_p
)
7603 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
7616 info
->type
= ADDRESS_REG_WB
;
7617 info
->base
= XEXP (x
, 0);
7618 info
->offset
= NULL_RTX
;
7619 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
7623 info
->type
= ADDRESS_REG_WB
;
7624 info
->base
= XEXP (x
, 0);
7625 if (GET_CODE (XEXP (x
, 1)) == PLUS
7626 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
7627 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
7628 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
7630 info
->offset
= XEXP (XEXP (x
, 1), 1);
7631 info
->const_offset
= offset
;
7633 /* TImode and TFmode values are allowed in both pairs of X
7634 registers and individual Q registers. The available
7636 X,X: 7-bit signed scaled offset
7637 Q: 9-bit signed offset
7638 We conservatively require an offset representable in either mode.
7640 if (mode
== TImode
|| mode
== TFmode
)
7641 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
7642 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
7644 if (load_store_pair_p
)
7645 return ((known_eq (GET_MODE_SIZE (mode
), 4)
7646 || known_eq (GET_MODE_SIZE (mode
), 8)
7647 || known_eq (GET_MODE_SIZE (mode
), 16))
7648 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
7650 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
7657 /* load literal: pc-relative constant pool entry. Only supported
7658 for SI mode or larger. */
7659 info
->type
= ADDRESS_SYMBOLIC
;
7661 if (!load_store_pair_p
7662 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
7667 split_const (x
, &sym
, &addend
);
7668 return ((GET_CODE (sym
) == LABEL_REF
7669 || (GET_CODE (sym
) == SYMBOL_REF
7670 && CONSTANT_POOL_ADDRESS_P (sym
)
7671 && aarch64_pcrelative_literal_loads
)));
7676 info
->type
= ADDRESS_LO_SUM
;
7677 info
->base
= XEXP (x
, 0);
7678 info
->offset
= XEXP (x
, 1);
7679 if (allow_reg_index_p
7680 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
7683 split_const (info
->offset
, &sym
, &offs
);
7684 if (GET_CODE (sym
) == SYMBOL_REF
7685 && (aarch64_classify_symbol (sym
, INTVAL (offs
))
7686 == SYMBOL_SMALL_ABSOLUTE
))
7688 /* The symbol and offset must be aligned to the access size. */
7691 if (CONSTANT_POOL_ADDRESS_P (sym
))
7692 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
7693 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
7695 tree exp
= SYMBOL_REF_DECL (sym
);
7696 align
= TYPE_ALIGN (TREE_TYPE (exp
));
7697 align
= aarch64_constant_alignment (exp
, align
);
7699 else if (SYMBOL_REF_DECL (sym
))
7700 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
7701 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
7702 && SYMBOL_REF_BLOCK (sym
) != NULL
)
7703 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
7705 align
= BITS_PER_UNIT
;
7707 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
7708 if (known_eq (ref_size
, 0))
7709 ref_size
= GET_MODE_SIZE (DImode
);
7711 return (multiple_p (INTVAL (offs
), ref_size
)
7712 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
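	  /* Worked example: for a DImode access through an ADRP/ADD
	     low-part pair, the checks above require both the constant
	     addend and the symbol's alignment to be multiples of the
	     8-byte access size; otherwise this LO_SUM form is not
	     treated as a valid address.  */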
/* Return true if the address X is valid for a PRFM instruction.
   STRICT_P is true if we should do strict checking with
   aarch64_classify_address.  */

bool
aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  /* PRFM accepts the same addresses as DImode...  */
  bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
  if (!res)
    return false;

  /* ... except writeback forms.  */
  return addr.type != ADDRESS_REG_WB;
}

bool
aarch64_symbolic_address_p (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
}

/* Classify the base of symbolic expression X.  */

enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return aarch64_classify_symbol (x, INTVAL (offset));
}
/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  */
static bool
aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p);
}

/* Return TRUE if X is a legitimate address of type TYPE for accessing
   memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
bool
aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
                              aarch64_addr_query_type type)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p, type);
}
7782 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7785 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
7786 poly_int64 orig_offset
,
7790 if (GET_MODE_SIZE (mode
).is_constant (&size
))
7792 HOST_WIDE_INT const_offset
, second_offset
;
7794 /* A general SVE offset is A * VQ + B. Remove the A component from
7795 coefficient 0 in order to get the constant B. */
7796 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
7798 /* Split an out-of-range address displacement into a base and
7799 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7800 range otherwise to increase opportunities for sharing the base
7801 address of different sizes. Unaligned accesses use the signed
7802 9-bit range, TImode/TFmode use the intersection of signed
7803 scaled 7-bit and signed 9-bit offset. */
7804 if (mode
== TImode
|| mode
== TFmode
)
7805 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
7806 else if ((const_offset
& (size
- 1)) != 0)
7807 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
7809 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
7811 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
7814 /* Split the offset into second_offset and the rest. */
7815 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
7816 *offset2
= gen_int_mode (second_offset
, Pmode
);
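      /* Illustrative split for the constant-size case: a 4-byte aligned
	 access at offset 0x10010 gives second_offset = 0x10010 & 0x3ffc
	 = 0x10, so the address becomes (base + 0x10000) + 0x10 and nearby
	 accesses can share the (base + 0x10000) anchor.  */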
7821 /* Get the mode we should use as the basis of the range. For structure
7822 modes this is the mode of one vector. */
7823 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
7824 machine_mode step_mode
7825 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
7827 /* Get the "mul vl" multiplier we'd like to use. */
7828 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
7829 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
7830 if (vec_flags
& VEC_SVE_DATA
)
7831 /* LDR supports a 9-bit range, but the move patterns for
7832 structure modes require all vectors to be in range of the
	 same base.  The simplest way of accommodating that while still
7834 promoting reuse of anchor points between different modes is
7835 to use an 8-bit range unconditionally. */
7836 vnum
= ((vnum
+ 128) & 255) - 128;
7838 /* Predicates are only handled singly, so we might as well use
7840 vnum
= ((vnum
+ 256) & 511) - 256;
7844 /* Convert the "mul vl" multiplier into a byte offset. */
7845 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
7846 if (known_eq (second_offset
, orig_offset
))
7849 /* Split the offset into second_offset and the rest. */
7850 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
7851 *offset2
= gen_int_mode (second_offset
, Pmode
);
7856 /* Return the binary representation of floating point constant VALUE in INTVAL.
7857 If the value cannot be converted, return false without setting INTVAL.
7858 The conversion is done in the given MODE. */
7860 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
7863 /* We make a general exception for 0. */
7864 if (aarch64_float_const_zero_rtx_p (value
))
7870 scalar_float_mode mode
;
7871 if (GET_CODE (value
) != CONST_DOUBLE
7872 || !is_a
<scalar_float_mode
> (GET_MODE (value
), &mode
)
7873 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
7874 /* Only support up to DF mode. */
7875 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
7878 unsigned HOST_WIDE_INT ival
= 0;
7881 real_to_target (res
,
7882 CONST_DOUBLE_REAL_VALUE (value
),
7883 REAL_MODE_FORMAT (mode
));
7887 int order
= BYTES_BIG_ENDIAN
? 1 : 0;
7888 ival
= zext_hwi (res
[order
], 32);
7889 ival
|= (zext_hwi (res
[1 - order
], 32) << 32);
7892 ival
= zext_hwi (res
[0], 32);
7898 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7899 single MOV(+MOVK) followed by an FMOV. */
7901 aarch64_float_const_rtx_p (rtx x
)
7903 machine_mode mode
= GET_MODE (x
);
7904 if (mode
== VOIDmode
)
7907 /* Determine whether it's cheaper to write float constants as
7908 mov/movk pairs over ldr/adrp pairs. */
7909 unsigned HOST_WIDE_INT ival
;
7911 if (GET_CODE (x
) == CONST_DOUBLE
7912 && SCALAR_FLOAT_MODE_P (mode
)
7913 && aarch64_reinterpret_float_as_int (x
, &ival
))
7915 scalar_int_mode imode
= (mode
== HFmode
7917 : int_mode_for_mode (mode
).require ());
7918 int num_instr
= aarch64_internal_mov_immediate
7919 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
7920 return num_instr
< 3;
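      /* For example, the SFmode constant 1.0f has bit pattern 0x3f800000,
	 which a single MOVZ (immediate shifted left by 16) plus an FMOV
	 can build, so it is considered cheaper here than an ADRP/LDR
	 literal-pool load.  */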
/* Return TRUE if rtx X is immediate constant 0.0  */
bool
aarch64_float_const_zero_rtx_p (rtx x)
{
  if (GET_MODE (x) == VOIDmode)
    return false;

  if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
    return !HONOR_SIGNED_ZEROS (GET_MODE (x));
  return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
}
7938 /* Return TRUE if rtx X is immediate constant that fits in a single
7939 MOVI immediate operation. */
7941 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
7947 scalar_int_mode imode
;
7948 unsigned HOST_WIDE_INT ival
;
7950 if (GET_CODE (x
) == CONST_DOUBLE
7951 && SCALAR_FLOAT_MODE_P (mode
))
7953 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
7956 /* We make a general exception for 0. */
7957 if (aarch64_float_const_zero_rtx_p (x
))
7960 imode
= int_mode_for_mode (mode
).require ();
7962 else if (GET_CODE (x
) == CONST_INT
7963 && is_a
<scalar_int_mode
> (mode
, &imode
))
7968 /* use a 64 bit mode for everything except for DI/DF mode, where we use
7969 a 128 bit vector mode. */
7970 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
7972 vmode
= aarch64_simd_container_mode (imode
, width
);
7973 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
7975 return aarch64_simd_valid_immediate (v_op
, NULL
);
/* Return the fixed registers used for condition codes.  */

static bool
aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
{
  *p1 = CC_REGNUM;
  *p2 = INVALID_REGNUM;
  return true;
}
7989 /* This function is used by the call expanders of the machine description.
7990 RESULT is the register in which the result is returned. It's NULL for
7991 "call" and "sibcall".
7992 MEM is the location of the function call.
7993 SIBCALL indicates whether this function call is normal call or sibling call.
7994 It will generate different pattern accordingly. */
7997 aarch64_expand_call (rtx result
, rtx mem
, bool sibcall
)
7999 rtx call
, callee
, tmp
;
8003 gcc_assert (MEM_P (mem
));
8004 callee
= XEXP (mem
, 0);
8005 mode
= GET_MODE (callee
);
8006 gcc_assert (mode
== Pmode
);
8008 /* Decide if we should generate indirect calls by loading the
8009 address of the callee into a register before performing
8010 the branch-and-link. */
8011 if (SYMBOL_REF_P (callee
)
8012 ? (aarch64_is_long_call_p (callee
)
8013 || aarch64_is_noplt_call_p (callee
))
8015 XEXP (mem
, 0) = force_reg (mode
, callee
);
8017 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
8019 if (result
!= NULL_RTX
)
8020 call
= gen_rtx_SET (result
, call
);
8025 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
8027 vec
= gen_rtvec (2, call
, tmp
);
8028 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
8030 aarch64_emit_call_insn (call
);
/* Emit call insn with PAT and do aarch64-specific handling.  */

void
aarch64_emit_call_insn (rtx pat)
{
  rtx insn = emit_call_insn (pat);

  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
}
8046 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
8048 machine_mode mode_x
= GET_MODE (x
);
8049 rtx_code code_x
= GET_CODE (x
);
8051 /* All floating point compares return CCFP if it is an equality
8052 comparison, and CCFPE otherwise. */
8053 if (GET_MODE_CLASS (mode_x
) == MODE_FLOAT
)
8080 /* Equality comparisons of short modes against zero can be performed
8081 using the TST instruction with the appropriate bitmask. */
8082 if (y
== const0_rtx
&& (REG_P (x
) || SUBREG_P (x
))
8083 && (code
== EQ
|| code
== NE
)
8084 && (mode_x
== HImode
|| mode_x
== QImode
))
8087 /* Similarly, comparisons of zero_extends from shorter modes can
8088 be performed using an ANDS with an immediate mask. */
8089 if (y
== const0_rtx
&& code_x
== ZERO_EXTEND
8090 && (mode_x
== SImode
|| mode_x
== DImode
)
8091 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
8092 && (code
== EQ
|| code
== NE
))
8095 if ((mode_x
== SImode
|| mode_x
== DImode
)
8097 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
8098 && (code_x
== PLUS
|| code_x
== MINUS
|| code_x
== AND
8100 || (code_x
== ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
8101 && CONST_INT_P (XEXP (x
, 2)))))
8104 /* A compare with a shifted operand. Because of canonicalization,
8105 the comparison will have to be swapped when we emit the assembly
8107 if ((mode_x
== SImode
|| mode_x
== DImode
)
8108 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
8109 && (code_x
== ASHIFT
|| code_x
== ASHIFTRT
8110 || code_x
== LSHIFTRT
8111 || code_x
== ZERO_EXTEND
|| code_x
== SIGN_EXTEND
))
8114 /* Similarly for a negated operand, but we can only do this for
8116 if ((mode_x
== SImode
|| mode_x
== DImode
)
8117 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
8118 && (code
== EQ
|| code
== NE
)
8122 /* A test for unsigned overflow from an addition. */
8123 if ((mode_x
== DImode
|| mode_x
== TImode
)
8124 && (code
== LTU
|| code
== GEU
)
8126 && rtx_equal_p (XEXP (x
, 0), y
))
8129 /* A test for unsigned overflow from an add with carry. */
8130 if ((mode_x
== DImode
|| mode_x
== TImode
)
8131 && (code
== LTU
|| code
== GEU
)
8133 && CONST_SCALAR_INT_P (y
)
8134 && (rtx_mode_t (y
, mode_x
)
8135 == (wi::shwi (1, mode_x
)
8136 << (GET_MODE_BITSIZE (mode_x
).to_constant () / 2))))
8139 /* A test for signed overflow. */
8140 if ((mode_x
== DImode
|| mode_x
== TImode
)
8143 && GET_CODE (y
) == SIGN_EXTEND
)
8146 /* For everything else, return CCmode. */
8151 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
8154 aarch64_get_condition_code (rtx x
)
8156 machine_mode mode
= GET_MODE (XEXP (x
, 0));
8157 enum rtx_code comp_code
= GET_CODE (x
);
8159 if (GET_MODE_CLASS (mode
) != MODE_CC
)
8160 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
8161 return aarch64_get_condition_code_1 (mode
, comp_code
);
8165 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
8173 case GE
: return AARCH64_GE
;
8174 case GT
: return AARCH64_GT
;
8175 case LE
: return AARCH64_LS
;
8176 case LT
: return AARCH64_MI
;
8177 case NE
: return AARCH64_NE
;
8178 case EQ
: return AARCH64_EQ
;
8179 case ORDERED
: return AARCH64_VC
;
8180 case UNORDERED
: return AARCH64_VS
;
8181 case UNLT
: return AARCH64_LT
;
8182 case UNLE
: return AARCH64_LE
;
8183 case UNGT
: return AARCH64_HI
;
8184 case UNGE
: return AARCH64_PL
;
8192 case NE
: return AARCH64_NE
;
8193 case EQ
: return AARCH64_EQ
;
8194 case GE
: return AARCH64_GE
;
8195 case GT
: return AARCH64_GT
;
8196 case LE
: return AARCH64_LE
;
8197 case LT
: return AARCH64_LT
;
8198 case GEU
: return AARCH64_CS
;
8199 case GTU
: return AARCH64_HI
;
8200 case LEU
: return AARCH64_LS
;
8201 case LTU
: return AARCH64_CC
;
8209 case NE
: return AARCH64_NE
;
8210 case EQ
: return AARCH64_EQ
;
8211 case GE
: return AARCH64_LE
;
8212 case GT
: return AARCH64_LT
;
8213 case LE
: return AARCH64_GE
;
8214 case LT
: return AARCH64_GT
;
8215 case GEU
: return AARCH64_LS
;
8216 case GTU
: return AARCH64_CC
;
8217 case LEU
: return AARCH64_CS
;
8218 case LTU
: return AARCH64_HI
;
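    /* The mapping above is for the swapped-operand compare mode: the
       operands were interchanged when the flags were set, so e.g. a
       source-level GE is tested as LE and GTU as CC (unsigned lower).  */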
8226 case NE
: return AARCH64_NE
; /* = any */
8227 case EQ
: return AARCH64_EQ
; /* = none */
8228 case GE
: return AARCH64_PL
; /* = nfrst */
8229 case LT
: return AARCH64_MI
; /* = first */
8230 case GEU
: return AARCH64_CS
; /* = nlast */
8231 case GTU
: return AARCH64_HI
; /* = pmore */
8232 case LEU
: return AARCH64_LS
; /* = plast */
8233 case LTU
: return AARCH64_CC
; /* = last */
8241 case NE
: return AARCH64_NE
;
8242 case EQ
: return AARCH64_EQ
;
8243 case GE
: return AARCH64_PL
;
8244 case LT
: return AARCH64_MI
;
8252 case NE
: return AARCH64_NE
;
8253 case EQ
: return AARCH64_EQ
;
8261 case LTU
: return AARCH64_CS
;
8262 case GEU
: return AARCH64_CC
;
8270 case GEU
: return AARCH64_CS
;
8271 case LTU
: return AARCH64_CC
;
8279 case NE
: return AARCH64_VS
;
8280 case EQ
: return AARCH64_VC
;
8293 aarch64_const_vec_all_same_in_range_p (rtx x
,
8294 HOST_WIDE_INT minval
,
8295 HOST_WIDE_INT maxval
)
8298 return (const_vec_duplicate_p (x
, &elt
)
8299 && CONST_INT_P (elt
)
8300 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
8304 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
8306 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
8309 /* Return true if VEC is a constant in which every element is in the range
8310 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8313 aarch64_const_vec_all_in_range_p (rtx vec
,
8314 HOST_WIDE_INT minval
,
8315 HOST_WIDE_INT maxval
)
8317 if (GET_CODE (vec
) != CONST_VECTOR
8318 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
8322 if (!CONST_VECTOR_STEPPED_P (vec
))
8323 nunits
= const_vector_encoded_nelts (vec
);
8324 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
8327 for (int i
= 0; i
< nunits
; i
++)
8329 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
8330 if (!CONST_INT_P (vec_elem
)
8331 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,             /* EQ, Z == 1.  */
  AARCH64_CC_Z,  /* NE, Z == 0.  */
  0,             /* CS, C == 1.  */
  AARCH64_CC_C,  /* CC, C == 0.  */
  0,             /* MI, N == 1.  */
  AARCH64_CC_N,  /* PL, N == 0.  */
  0,             /* VS, V == 1.  */
  AARCH64_CC_V,  /* VC, V == 0.  */
  0,             /* HI, C == 1 && Z == 0.  */
  AARCH64_CC_C,  /* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,  /* GE, N == V.  */
  0,             /* LT, N != V.  */
  AARCH64_CC_Z,  /* GT, Z == 0 && N == V.  */
  0,             /* LE, !(Z == 0 && N == V).  */
  0,             /* AL, Any.  */
  0              /* NV, Any.  */
};
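/* Each entry gives N/Z/C/V settings under which the condition with that
   index is false: e.g. the NE entry sets Z (so NE fails) and the GE entry
   sets V with N clear (N != V, so GE fails).  This is the immediate
   printed by the 'k' operand modifier for conditional-compare (CCMP)
   instructions.  */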
8364 /* Print floating-point vector immediate operand X to F, negating it
8365 first if NEGATE is true. Return true on success, false if it isn't
8366 a constant we can handle. */
8369 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
8373 if (!const_vec_duplicate_p (x
, &elt
))
8376 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
8378 r
= real_value_negate (&r
);
8380 /* Handle the SVE single-bit immediates specially, since they have a
8381 fixed form in the assembly syntax. */
8382 if (real_equal (&r
, &dconst0
))
8383 asm_fprintf (f
, "0.0");
8384 else if (real_equal (&r
, &dconst2
))
8385 asm_fprintf (f
, "2.0");
8386 else if (real_equal (&r
, &dconst1
))
8387 asm_fprintf (f
, "1.0");
8388 else if (real_equal (&r
, &dconsthalf
))
8389 asm_fprintf (f
, "0.5");
8392 const int buf_size
= 20;
8393 char float_buf
[buf_size
] = {'\0'};
8394 real_to_decimal_for_mode (float_buf
, &r
, buf_size
, buf_size
,
8396 asm_fprintf (f
, "%s", float_buf
);
/* Return the equivalent letter for size.  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
8416 /* Print operand X to file F in a target specific manner according to CODE.
8417 The acceptable formatting commands given by CODE are:
8418 'c': An integer or symbol address without a preceding #
8420 'C': Take the duplicated element in a vector constant
8421 and print it in hex.
8422 'D': Take the duplicated element in a vector constant
8423 and print it as an unsigned integer, in decimal.
8424 'e': Print the sign/zero-extend size as a character 8->b,
8425 16->h, 32->w. Can also be used for masks:
8426 0xff->b, 0xffff->h, 0xffffffff->w.
8427 'I': If the operand is a duplicated vector constant,
8428 replace it with the duplicated scalar. If the
8429 operand is then a floating-point constant, replace
8430 it with the integer bit representation. Print the
8431 transformed constant as a signed decimal number.
8432 'p': Prints N such that 2^N == X (X must be power of 2 and
8434 'P': Print the number of non-zero bits in X (a const_int).
8435 'H': Print the higher numbered register of a pair (TImode)
8437 'm': Print a condition (eq, ne, etc).
8438 'M': Same as 'm', but invert condition.
8439 'N': Take the duplicated element in a vector constant
8440 and print the negative of it in decimal.
8441 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8442 'S/T/U/V': Print a FP/SIMD register name for a register list.
8443 The register printed is the FP/SIMD register name
8444 of X + 0/1/2/3 for S/T/U/V.
8445 'R': Print a scalar Integer/FP/SIMD register name + 1.
8446 'X': Print bottom 16 bits of integer constant in hex.
8447 'w/x': Print a general register name or the zero register
8449 '0': Print a normal operand, if it's a general register,
8450 then we assume DImode.
8451 'k': Print NZCV for conditional compare instructions.
8452 'A': Output address constant representing the first
8453 argument of X, specifying a relocation offset
8455 'L': Output constant address specified by X
8456 with a relocation offset if appropriate.
8457 'G': Prints address of X, specifying a PC relative
8458 relocation mode if appropriate.
8459 'y': Output address of LDP or STP - this is used for
8460 some LDP/STPs which don't use a PARALLEL in their
8461 pattern (so the mode needs to be adjusted).
8462 'z': Output address of a typical LDP or STP. */
8465 aarch64_print_operand (FILE *f
, rtx x
, int code
)
8471 switch (GET_CODE (x
))
8474 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
8478 output_addr_const (f
, x
);
8482 if (GET_CODE (XEXP (x
, 0)) == PLUS
8483 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
8485 output_addr_const (f
, x
);
8491 output_operand_lossage ("unsupported operand for code '%c'", code
);
8497 x
= unwrap_const_vec_duplicate (x
);
8498 if (!CONST_INT_P (x
))
8500 output_operand_lossage ("invalid operand for '%%%c'", code
);
8504 HOST_WIDE_INT val
= INTVAL (x
);
8505 if ((val
& ~7) == 8 || val
== 0xff)
8507 else if ((val
& ~7) == 16 || val
== 0xffff)
8509 else if ((val
& ~7) == 32 || val
== 0xffffffff)
8513 output_operand_lossage ("invalid operand for '%%%c'", code
);
8523 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
8525 output_operand_lossage ("invalid operand for '%%%c'", code
);
8529 asm_fprintf (f
, "%d", n
);
8534 if (!CONST_INT_P (x
))
8536 output_operand_lossage ("invalid operand for '%%%c'", code
);
8540 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
8544 if (x
== const0_rtx
)
8546 asm_fprintf (f
, "xzr");
8550 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
8552 output_operand_lossage ("invalid operand for '%%%c'", code
);
8556 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
8561 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
8562 if (CONST_INT_P (x
))
8563 asm_fprintf (f
, "%wd", INTVAL (x
));
8566 output_operand_lossage ("invalid operand for '%%%c'", code
);
8576 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8577 if (x
== const_true_rtx
)
8584 if (!COMPARISON_P (x
))
8586 output_operand_lossage ("invalid operand for '%%%c'", code
);
8590 cond_code
= aarch64_get_condition_code (x
);
8591 gcc_assert (cond_code
>= 0);
8593 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
8594 if (GET_MODE (XEXP (x
, 0)) == CC_NZCmode
)
8595 fputs (aarch64_sve_condition_codes
[cond_code
], f
);
8597 fputs (aarch64_condition_codes
[cond_code
], f
);
8602 if (!const_vec_duplicate_p (x
, &elt
))
8604 output_operand_lossage ("invalid vector constant");
8608 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
8609 asm_fprintf (f
, "%wd", -INTVAL (elt
));
8610 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
8611 && aarch64_print_vector_float_operand (f
, x
, true))
8615 output_operand_lossage ("invalid vector constant");
8625 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
8627 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
8630 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
8637 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
8639 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
8642 asm_fprintf (f
, "%c%d",
8643 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
8644 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
8648 if (REG_P (x
) && FP_REGNUM_P (REGNO (x
)))
8649 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
8650 else if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
8651 asm_fprintf (f
, "x%d", REGNO (x
) - R0_REGNUM
+ 1);
8653 output_operand_lossage ("incompatible register operand for '%%%c'",
8658 if (!CONST_INT_P (x
))
8660 output_operand_lossage ("invalid operand for '%%%c'", code
);
8663 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
8668 /* Print a replicated constant in hex. */
8669 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
8671 output_operand_lossage ("invalid operand for '%%%c'", code
);
8674 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
8675 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
8681 /* Print a replicated constant in decimal, treating it as
8683 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
8685 output_operand_lossage ("invalid operand for '%%%c'", code
);
8688 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
8689 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
8696 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
8698 asm_fprintf (f
, "%czr", code
);
8702 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
8704 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
8708 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
8710 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
8719 output_operand_lossage ("missing operand");
8723 switch (GET_CODE (x
))
8726 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
8728 if (REG_NREGS (x
) == 1)
8729 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
8733 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
8734 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
8735 REGNO (x
) - V0_REGNUM
, suffix
,
8736 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
8740 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
8744 output_address (GET_MODE (x
), XEXP (x
, 0));
8749 output_addr_const (asm_out_file
, x
);
8753 asm_fprintf (f
, "%wd", INTVAL (x
));
8757 if (!VECTOR_MODE_P (GET_MODE (x
)))
8759 output_addr_const (asm_out_file
, x
);
8765 if (!const_vec_duplicate_p (x
, &elt
))
8767 output_operand_lossage ("invalid vector constant");
8771 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
8772 asm_fprintf (f
, "%wd", INTVAL (elt
));
8773 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
8774 && aarch64_print_vector_float_operand (f
, x
, false))
8778 output_operand_lossage ("invalid vector constant");
8784 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8785 be getting CONST_DOUBLEs holding integers. */
8786 gcc_assert (GET_MODE (x
) != VOIDmode
);
8787 if (aarch64_float_const_zero_rtx_p (x
))
8792 else if (aarch64_float_const_representable_p (x
))
8795 char float_buf
[buf_size
] = {'\0'};
8796 real_to_decimal_for_mode (float_buf
,
8797 CONST_DOUBLE_REAL_VALUE (x
),
8800 asm_fprintf (asm_out_file
, "%s", float_buf
);
8804 output_operand_lossage ("invalid constant");
8807 output_operand_lossage ("invalid operand");
8813 if (GET_CODE (x
) == HIGH
)
8816 switch (aarch64_classify_symbolic_expression (x
))
8818 case SYMBOL_SMALL_GOT_4G
:
8819 asm_fprintf (asm_out_file
, ":got:");
8822 case SYMBOL_SMALL_TLSGD
:
8823 asm_fprintf (asm_out_file
, ":tlsgd:");
8826 case SYMBOL_SMALL_TLSDESC
:
8827 asm_fprintf (asm_out_file
, ":tlsdesc:");
8830 case SYMBOL_SMALL_TLSIE
:
8831 asm_fprintf (asm_out_file
, ":gottprel:");
8834 case SYMBOL_TLSLE24
:
8835 asm_fprintf (asm_out_file
, ":tprel:");
8838 case SYMBOL_TINY_GOT
:
8845 output_addr_const (asm_out_file
, x
);
8849 switch (aarch64_classify_symbolic_expression (x
))
8851 case SYMBOL_SMALL_GOT_4G
:
8852 asm_fprintf (asm_out_file
, ":lo12:");
8855 case SYMBOL_SMALL_TLSGD
:
8856 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
8859 case SYMBOL_SMALL_TLSDESC
:
8860 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
8863 case SYMBOL_SMALL_TLSIE
:
8864 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
8867 case SYMBOL_TLSLE12
:
8868 asm_fprintf (asm_out_file
, ":tprel_lo12:");
8871 case SYMBOL_TLSLE24
:
8872 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
8875 case SYMBOL_TINY_GOT
:
8876 asm_fprintf (asm_out_file
, ":got:");
8879 case SYMBOL_TINY_TLSIE
:
8880 asm_fprintf (asm_out_file
, ":gottprel:");
8886 output_addr_const (asm_out_file
, x
);
8890 switch (aarch64_classify_symbolic_expression (x
))
8892 case SYMBOL_TLSLE24
:
8893 asm_fprintf (asm_out_file
, ":tprel_hi12:");
8898 output_addr_const (asm_out_file
, x
);
8903 HOST_WIDE_INT cond_code
;
8905 if (!CONST_INT_P (x
))
8907 output_operand_lossage ("invalid operand for '%%%c'", code
);
8911 cond_code
= INTVAL (x
);
8912 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
8913 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
8920 machine_mode mode
= GET_MODE (x
);
8922 if (GET_CODE (x
) != MEM
8923 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
8925 output_operand_lossage ("invalid operand for '%%%c'", code
);
8929 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
8931 ? ADDR_QUERY_LDP_STP_N
8932 : ADDR_QUERY_LDP_STP
))
8933 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
8938 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
8943 /* Print address 'x' of a memory access with mode 'mode'.
8944 'op' is the context required by aarch64_classify_address. It can either be
8945 MEM for a normal memory access or PARALLEL for LDP/STP. */
8947 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
8948 aarch64_addr_query_type type
)
8950 struct aarch64_address_info addr
;
8953 /* Check all addresses are Pmode - including ILP32. */
8954 if (GET_MODE (x
) != Pmode
8955 && (!CONST_INT_P (x
)
8956 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
8958 output_operand_lossage ("invalid address mode");
8962 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
8965 case ADDRESS_REG_IMM
:
8966 if (known_eq (addr
.const_offset
, 0))
8967 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
8968 else if (aarch64_sve_data_mode_p (mode
))
8971 = exact_div (addr
.const_offset
,
8972 BYTES_PER_SVE_VECTOR
).to_constant ();
8973 asm_fprintf (f
, "[%s, #%wd, mul vl]",
8974 reg_names
[REGNO (addr
.base
)], vnum
);
8976 else if (aarch64_sve_pred_mode_p (mode
))
8979 = exact_div (addr
.const_offset
,
8980 BYTES_PER_SVE_PRED
).to_constant ();
8981 asm_fprintf (f
, "[%s, #%wd, mul vl]",
8982 reg_names
[REGNO (addr
.base
)], vnum
);
8985 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
8986 INTVAL (addr
.offset
));
8989 case ADDRESS_REG_REG
:
8990 if (addr
.shift
== 0)
8991 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
8992 reg_names
[REGNO (addr
.offset
)]);
8994 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
8995 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
8998 case ADDRESS_REG_UXTW
:
8999 if (addr
.shift
== 0)
9000 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
9001 REGNO (addr
.offset
) - R0_REGNUM
);
9003 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
9004 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
9007 case ADDRESS_REG_SXTW
:
9008 if (addr
.shift
== 0)
9009 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
9010 REGNO (addr
.offset
) - R0_REGNUM
);
9012 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
9013 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
9016 case ADDRESS_REG_WB
:
9017 /* Writeback is only supported for fixed-width modes. */
9018 size
= GET_MODE_SIZE (mode
).to_constant ();
9019 switch (GET_CODE (x
))
9022 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
9025 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
9028 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
9031 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
9034 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
9035 INTVAL (addr
.offset
));
9038 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
9039 INTVAL (addr
.offset
));
9046 case ADDRESS_LO_SUM
:
9047 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
9048 output_addr_const (f
, addr
.offset
);
9049 asm_fprintf (f
, "]");
9052 case ADDRESS_SYMBOLIC
:
9053 output_addr_const (f
, x
);
9060 /* Print address 'x' of a memory access with mode 'mode'. */
9062 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
9064 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
9065 output_addr_const (f
, x
);
9069 aarch64_label_mentioned_p (rtx x
)
9074 if (GET_CODE (x
) == LABEL_REF
)
9077 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9078 referencing instruction, but they are constant offsets, not
9080 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
9083 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
9084 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
9090 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
9091 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
9094 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
9101 /* Implement REGNO_REG_CLASS. */
9104 aarch64_regno_regclass (unsigned regno
)
9106 if (GP_REGNUM_P (regno
))
9107 return GENERAL_REGS
;
9109 if (regno
== SP_REGNUM
)
9112 if (regno
== FRAME_POINTER_REGNUM
9113 || regno
== ARG_POINTER_REGNUM
)
9114 return POINTER_REGS
;
9116 if (FP_REGNUM_P (regno
))
9117 return (FP_LO8_REGNUM_P (regno
) ? FP_LO8_REGS
9118 : FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
);
9120 if (PR_REGNUM_P (regno
))
9121 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
   If OFFSET is out of range, return an offset of an anchor point
   that is in range.  Return 0 otherwise.  */

static HOST_WIDE_INT
aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
                       machine_mode mode)
{
  /* Does it look like we'll need a 16-byte load/store-pair operation?  */
  if (size > 16)
    return (offset + 0x400) & ~0x7f0;

  /* For offsets that aren't a multiple of the access size, the limit is
     -256...255.  */
  if (offset & (size - 1))
    {
      /* BLKmode typically uses LDP of X-registers.  */
      if (mode == BLKmode)
        return (offset + 512) & ~0x3ff;
      return (offset + 0x100) & ~0x1ff;
    }

  /* Small negative offsets are supported.  */
  if (IN_RANGE (offset, -256, 0))
    return 0;

  if (mode == TImode || mode == TFmode)
    return (offset + 0x100) & ~0x1ff;

  /* Use 12-bit offset by access size.  */
  return offset & (~0xfff * size);
}
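/* Worked example: a 4-byte access at offset 0x12344 is aligned and
   beyond the unsigned 12-bit scaled range, so the final case returns
   0x12344 & (~0xfff * 4) = 0x10000 as the anchor; the remaining 0x2344
   then fits a 12-bit scaled LDR/STR offset.  */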
9160 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
9162 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9163 where mask is selected by alignment and size of the offset.
9164 We try to pick as large a range for the offset as possible to
9165 maximize the chance of a CSE. However, for aligned addresses
9166 we limit the range to 4k so that structures with different sized
9167 elements are likely to use the same base. We need to be careful
9168 not to split a CONST for some forms of address expression, otherwise
9169 it will generate sub-optimal code. */
9171 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
9173 rtx base
= XEXP (x
, 0);
9174 rtx offset_rtx
= XEXP (x
, 1);
9175 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
9177 if (GET_CODE (base
) == PLUS
)
9179 rtx op0
= XEXP (base
, 0);
9180 rtx op1
= XEXP (base
, 1);
9182 /* Force any scaling into a temp for CSE. */
9183 op0
= force_reg (Pmode
, op0
);
9184 op1
= force_reg (Pmode
, op1
);
9186 /* Let the pointer register be in op0. */
9187 if (REG_POINTER (op1
))
9188 std::swap (op0
, op1
);
9190 /* If the pointer is virtual or frame related, then we know that
9191 virtual register instantiation or register elimination is going
9192 to apply a second constant. We want the two constants folded
9193 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9194 if (virt_or_elim_regno_p (REGNO (op0
)))
9196 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
9197 NULL_RTX
, true, OPTAB_DIRECT
);
9198 return gen_rtx_PLUS (Pmode
, base
, op1
);
9201 /* Otherwise, in order to encourage CSE (and thence loop strength
9202 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9203 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
9204 NULL_RTX
, true, OPTAB_DIRECT
);
9205 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
9209 if (GET_MODE_SIZE (mode
).is_constant (&size
))
9211 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
9213 if (base_offset
!= 0)
9215 base
= plus_constant (Pmode
, base
, base_offset
);
9216 base
= force_operand (base
, NULL_RTX
);
9217 return plus_constant (Pmode
, base
, offset
- base_offset
);
9226 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
9229 secondary_reload_info
*sri
)
9231 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9232 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9233 comment at the head of aarch64-sve.md for more details about the
9234 big-endian handling. */
9235 if (BYTES_BIG_ENDIAN
9236 && reg_class_subset_p (rclass
, FP_REGS
)
9237 && !((REG_P (x
) && HARD_REGISTER_P (x
))
9238 || aarch64_simd_valid_immediate (x
, NULL
))
9239 && aarch64_sve_data_mode_p (mode
))
9241 sri
->icode
= CODE_FOR_aarch64_sve_reload_be
;
9245 /* If we have to disable direct literal pool loads and stores because the
9246 function is too big, then we need a scratch register. */
9247 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
9248 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
9249 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
9250 && !aarch64_pcrelative_literal_loads
)
9252 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
9256 /* Without the TARGET_SIMD instructions we cannot move a Q register
9257 to a Q register directly. We need a scratch. */
9258 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
9259 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
9260 && reg_class_subset_p (rclass
, FP_REGS
))
9262 sri
->icode
= code_for_aarch64_reload_mov (mode
);
9266 /* A TFmode or TImode memory access should be handled via an FP_REGS
9267 because AArch64 has richer addressing modes for LDR/STR instructions
9268 than LDP/STP instructions. */
9269 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
9270 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
9273 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
9274 return GENERAL_REGS
;
9280 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED
, const int to
)
9282 gcc_assert (from
== ARG_POINTER_REGNUM
|| from
== FRAME_POINTER_REGNUM
);
9284 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9285 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9286 if (frame_pointer_needed
)
9287 return to
== HARD_FRAME_POINTER_REGNUM
;
9292 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
9294 if (to
== HARD_FRAME_POINTER_REGNUM
)
9296 if (from
== ARG_POINTER_REGNUM
)
9297 return cfun
->machine
->frame
.hard_fp_offset
;
9299 if (from
== FRAME_POINTER_REGNUM
)
9300 return cfun
->machine
->frame
.hard_fp_offset
9301 - cfun
->machine
->frame
.locals_offset
;
9304 if (to
== STACK_POINTER_REGNUM
)
9306 if (from
== FRAME_POINTER_REGNUM
)
9307 return cfun
->machine
->frame
.frame_size
9308 - cfun
->machine
->frame
.locals_offset
;
9311 return cfun
->machine
->frame
.frame_size
;
9314 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9318 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
9322 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
9327 aarch64_asm_trampoline_template (FILE *f
)
9332 if (aarch64_bti_enabled ())
9334 asm_fprintf (f
, "\thint\t34 // bti c\n");
9341 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", IP1_REGNUM
- R0_REGNUM
, offset1
);
9342 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
,
9347 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[IP1_REGNUM
], offset1
);
9348 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[STATIC_CHAIN_REGNUM
],
9351 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
  /* The trampoline needs an extra padding instruction.  If BTI is
     enabled, the padding instruction is replaced by the BTI instruction
     at the beginning.  */
9356 if (!aarch64_bti_enabled ())
9357 assemble_aligned_integer (4, const0_rtx
);
9359 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
9360 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
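  /* Sketch of the resulting layout: 16 bytes of code (two loads of the
     data words below, a BR, and either a BTI or a padding word) followed
     by two pointer-sized slots that aarch64_trampoline_init fills with
     the target function address and the static chain value.  */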
9364 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
9366 rtx fnaddr
, mem
, a_tramp
;
9367 const int tramp_code_sz
= 16;
9369 /* Don't need to copy the trailing D-words, we fill those in below. */
9370 emit_block_move (m_tramp
, assemble_trampoline_template (),
9371 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
9372 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
9373 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
9374 if (GET_MODE (fnaddr
) != ptr_mode
)
9375 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
9376 emit_move_insn (mem
, fnaddr
);
9378 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
9379 emit_move_insn (mem
, chain_value
);
9381 /* XXX We should really define a "clear_cache" pattern and use
9382 gen_clear_cache(). */
9383 a_tramp
= XEXP (m_tramp
, 0);
9384 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
9385 LCT_NORMAL
, VOIDmode
, a_tramp
, ptr_mode
,
9386 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
9390 static unsigned char
9391 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
9393 /* ??? Logically we should only need to provide a value when
9394 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9395 can hold MODE, but at the moment we need to handle all modes.
9396 Just ignore any runtime parts for registers that can't store them. */
9397 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
9401 case TAILCALL_ADDR_REGS
:
9405 case POINTER_AND_FP_REGS
:
9409 if (aarch64_sve_data_mode_p (mode
)
9410 && constant_multiple_p (GET_MODE_SIZE (mode
),
9411 BYTES_PER_SVE_VECTOR
, &nregs
))
9413 return (aarch64_vector_data_mode_p (mode
)
9414 ? CEIL (lowest_size
, UNITS_PER_VREG
)
9415 : CEIL (lowest_size
, UNITS_PER_WORD
));
9432 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
9434 if (regclass
== POINTER_REGS
)
9435 return GENERAL_REGS
;
9437 if (regclass
== STACK_REG
)
9440 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
  /* Register elimination can result in a request for
9447 SP+constant->FP_REGS. We cannot support such operations which
9448 use SP as source and an FP_REG as destination, so reject out
9450 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
9452 rtx lhs
= XEXP (x
, 0);
9454 /* Look through a possible SUBREG introduced by ILP32. */
9455 if (GET_CODE (lhs
) == SUBREG
)
9456 lhs
= SUBREG_REG (lhs
);
9458 gcc_assert (REG_P (lhs
));
9459 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
9468 aarch64_asm_output_labelref (FILE* f
, const char *name
)
9470 asm_fprintf (f
, "%U%s", name
);
9474 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
9476 if (priority
== DEFAULT_INIT_PRIORITY
)
9477 default_ctor_section_asm_out_constructor (symbol
, priority
);
9481 /* While priority is known to be in range [0, 65535], so 18 bytes
9482 would be enough, the compiler might not know that. To avoid
9483 -Wformat-truncation false positive, use a larger size. */
9485 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
9486 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
9487 switch_to_section (s
);
9488 assemble_align (POINTER_SIZE
);
9489 assemble_aligned_integer (POINTER_BYTES
, symbol
);
9494 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
9496 if (priority
== DEFAULT_INIT_PRIORITY
)
9497 default_dtor_section_asm_out_destructor (symbol
, priority
);
9501 /* While priority is known to be in range [0, 65535], so 18 bytes
9502 would be enough, the compiler might not know that. To avoid
9503 -Wformat-truncation false positive, use a larger size. */
9505 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
9506 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
9507 switch_to_section (s
);
9508 assemble_align (POINTER_SIZE
);
9509 assemble_aligned_integer (POINTER_BYTES
, symbol
);
9514 aarch64_output_casesi (rtx
*operands
)
9518 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
9520 static const char *const patterns
[4][2] =
9523 "ldrb\t%w3, [%0,%w1,uxtw]",
9524 "add\t%3, %4, %w3, sxtb #2"
9527 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9528 "add\t%3, %4, %w3, sxth #2"
9531 "ldr\t%w3, [%0,%w1,uxtw #2]",
9532 "add\t%3, %4, %w3, sxtw #2"
9534 /* We assume that DImode is only generated when not optimizing and
9535 that we don't really need 64-bit address offsets. That would
9536 imply an object file with 8GB of code in a single function! */
9538 "ldr\t%w3, [%0,%w1,uxtw #2]",
9539 "add\t%3, %4, %w3, sxtw #2"
9543 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
9545 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
9546 index
= exact_log2 (GET_MODE_SIZE (mode
));
9548 gcc_assert (index
>= 0 && index
<= 3);
  /* Need to implement table size reduction, by changing the code below.  */
9551 output_asm_insn (patterns
[index
][0], operands
);
9552 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
9553 snprintf (buf
, sizeof (buf
),
9554 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
9555 output_asm_insn (buf
, operands
);
9556 output_asm_insn (patterns
[index
][1], operands
);
9557 output_asm_insn ("br\t%3", operands
);
9558 assemble_label (asm_out_file
, label
);
/* Return size in bits of an arithmetic operand which is shifted/scaled and
   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
   operator.  */

static int
aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
{
  if (shift >= 0 && shift <= 3)
    {
      int size;
      for (size = 8; size <= 32; size *= 2)
        {
          HOST_WIDE_INT bits = ((HOST_WIDE_INT) 1U << size) - 1;
          if (mask == bits << shift)
            return size;
        }
    }
  return 0;
}
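/* Example: aarch64_uxt_size (1, 0x1fe) returns 8, since 0x1fe is 0xff
   shifted left by one position (a UXTB pattern); masks that are not a
   shifted 0xff, 0xffff or 0xffffffff yield 0.  */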
/* Constant pools are per function only when PC relative
   literal loads are true or we are in the large memory
   model.  */

static inline bool
aarch64_can_use_per_function_literal_pools_p (void)
{
  return (aarch64_pcrelative_literal_loads
          || aarch64_cmodel == AARCH64_CMODEL_LARGE);
}

static bool
aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
{
  /* We can't use blocks for constants when we're using a per-function
     literal pool.  */
  return !aarch64_can_use_per_function_literal_pools_p ();
}
9602 /* Select appropriate section for constants depending
9603 on where we place literal pools. */
9606 aarch64_select_rtx_section (machine_mode mode
,
9608 unsigned HOST_WIDE_INT align
)
9610 if (aarch64_can_use_per_function_literal_pools_p ())
9611 return function_section (current_function_decl
);
9613 return default_elf_select_rtx_section (mode
, x
, align
);
9616 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9618 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
9619 HOST_WIDE_INT offset
)
9621 /* When using per-function literal pools, we must ensure that any code
9622 section is aligned to the minimal instruction length, lest we get
9623 errors from the assembler re "unaligned instructions". */
9624 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
9625 ASM_OUTPUT_ALIGN (f
, 2);
9630 /* Helper function for rtx cost calculation. Strip a shift expression
9631 from X. Returns the inner operand if successful, or the original
9632 expression on failure. */
9634 aarch64_strip_shift (rtx x
)
9638 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9639 we can convert both to ROR during final output. */
9640 if ((GET_CODE (op
) == ASHIFT
9641 || GET_CODE (op
) == ASHIFTRT
9642 || GET_CODE (op
) == LSHIFTRT
9643 || GET_CODE (op
) == ROTATERT
9644 || GET_CODE (op
) == ROTATE
)
9645 && CONST_INT_P (XEXP (op
, 1)))
9646 return XEXP (op
, 0);
9648 if (GET_CODE (op
) == MULT
9649 && CONST_INT_P (XEXP (op
, 1))
9650 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
9651 return XEXP (op
, 0);
9656 /* Helper function for rtx cost calculation. Strip an extend
9657 expression from X. Returns the inner operand if successful, or the
9658 original expression on failure. We deal with a number of possible
9659 canonicalization variations here. If STRIP_SHIFT is true, then
9660 we can strip off a shift also. */
9662 aarch64_strip_extend (rtx x
, bool strip_shift
)
9664 scalar_int_mode mode
;
9667 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
9670 /* Zero and sign extraction of a widened value. */
9671 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
9672 && XEXP (op
, 2) == const0_rtx
9673 && GET_CODE (XEXP (op
, 0)) == MULT
9674 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
9676 return XEXP (XEXP (op
, 0), 0);
9678 /* It can also be represented (for zero-extend) as an AND with an
9680 if (GET_CODE (op
) == AND
9681 && GET_CODE (XEXP (op
, 0)) == MULT
9682 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
9683 && CONST_INT_P (XEXP (op
, 1))
9684 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
9685 INTVAL (XEXP (op
, 1))) != 0)
9686 return XEXP (XEXP (op
, 0), 0);
9688 /* Now handle extended register, as this may also have an optional
9689 left shift by 1..4. */
9691 && GET_CODE (op
) == ASHIFT
9692 && CONST_INT_P (XEXP (op
, 1))
9693 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
9696 if (GET_CODE (op
) == ZERO_EXTEND
9697 || GET_CODE (op
) == SIGN_EXTEND
)
/* Return true iff CODE is a shift supported in combination
   with arithmetic instructions.  */

static bool
aarch64_shift_p (enum rtx_code code)
{
  return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
}
9716 /* Return true iff X is a cheap shift without a sign extend. */
9719 aarch64_cheap_mult_shift_p (rtx x
)
9726 if (!(aarch64_tune_params
.extra_tuning_flags
9727 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
9730 if (GET_CODE (op0
) == SIGN_EXTEND
)
9733 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
9734 && UINTVAL (op1
) <= 4)
9737 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
9740 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
9742 if (l2
> 0 && l2
<= 4)
9748 /* Helper function for rtx cost calculation. Calculate the cost of
9749 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9750 Return the calculated cost of the expression, recursing manually in to
9751 operands where needed. */
9754 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
9757 const struct cpu_cost_table
*extra_cost
9758 = aarch64_tune_params
.insn_extra_cost
;
9760 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
9761 machine_mode mode
= GET_MODE (x
);
9763 gcc_checking_assert (code
== MULT
);
9768 if (VECTOR_MODE_P (mode
))
9769 mode
= GET_MODE_INNER (mode
);
9771 /* Integer multiply/fma. */
9772 if (GET_MODE_CLASS (mode
) == MODE_INT
)
9774 /* The multiply will be canonicalized as a shift, cost it as such. */
9775 if (aarch64_shift_p (GET_CODE (x
))
9776 || (CONST_INT_P (op1
)
9777 && exact_log2 (INTVAL (op1
)) > 0))
9779 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
9780 || GET_CODE (op0
) == SIGN_EXTEND
;
9785 /* If the shift is considered cheap,
9786 then don't add any cost. */
9787 if (aarch64_cheap_mult_shift_p (x
))
9789 else if (REG_P (op1
))
9790 /* ARITH + shift-by-register. */
9791 cost
+= extra_cost
->alu
.arith_shift_reg
;
9793 /* ARITH + extended register. We don't have a cost field
9794 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9795 cost
+= extra_cost
->alu
.extend_arith
;
9797 /* ARITH + shift-by-immediate. */
9798 cost
+= extra_cost
->alu
.arith_shift
;
9801 /* LSL (immediate). */
9802 cost
+= extra_cost
->alu
.shift
;
9805 /* Strip extends as we will have costed them in the case above. */
9807 op0
= aarch64_strip_extend (op0
, true);
9809 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
9814 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9815 compound and let the below cases handle it. After all, MNEG is a
9816 special-case alias of MSUB. */
9817 if (GET_CODE (op0
) == NEG
)
9819 op0
= XEXP (op0
, 0);
9823 /* Integer multiplies or FMAs have zero/sign extending variants. */
9824 if ((GET_CODE (op0
) == ZERO_EXTEND
9825 && GET_CODE (op1
) == ZERO_EXTEND
)
9826 || (GET_CODE (op0
) == SIGN_EXTEND
9827 && GET_CODE (op1
) == SIGN_EXTEND
))
9829 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
9830 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
9835 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9836 cost
+= extra_cost
->mult
[0].extend_add
;
9838 /* MUL/SMULL/UMULL. */
9839 cost
+= extra_cost
->mult
[0].extend
;
9845 /* This is either an integer multiply or a MADD. In both cases
9846 we want to recurse and cost the operands. */
9847 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
9848 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
9854 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
9857 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
9866 /* Floating-point FMA/FMUL can also support negations of the
9867 operands, unless the rounding mode is upward or downward in
9868 which case FNMUL is different than FMUL with operand negation. */
9869 bool neg0
= GET_CODE (op0
) == NEG
;
9870 bool neg1
= GET_CODE (op1
) == NEG
;
9871 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
9874 op0
= XEXP (op0
, 0);
9876 op1
= XEXP (op1
, 0);
9880 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9881 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
9884 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
9887 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
9888 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
9894 aarch64_address_cost (rtx x
,
9896 addr_space_t as ATTRIBUTE_UNUSED
,
9899 enum rtx_code c
= GET_CODE (x
);
9900 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
9901 struct aarch64_address_info info
;
9905 if (!aarch64_classify_address (&info
, x
, mode
, false))
9907 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
9909 /* This is a CONST or SYMBOL ref which will be split
9910 in a different way depending on the code model in use.
9911 Cost it through the generic infrastructure. */
9912 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
9913 /* Divide through by the cost of one instruction to
9914 bring it to the same units as the address costs. */
9915 cost_symbol_ref
/= COSTS_N_INSNS (1);
9916 /* The cost is then the cost of preparing the address,
9917 followed by an immediate (possibly 0) offset. */
9918 return cost_symbol_ref
+ addr_cost
->imm_offset
;
9922 /* This is most likely a jump table from a case
9924 return addr_cost
->register_offset
;
9930 case ADDRESS_LO_SUM
:
9931 case ADDRESS_SYMBOLIC
:
9932 case ADDRESS_REG_IMM
:
9933 cost
+= addr_cost
->imm_offset
;
9936 case ADDRESS_REG_WB
:
9937 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
9938 cost
+= addr_cost
->pre_modify
;
9939 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
9940 cost
+= addr_cost
->post_modify
;
9946 case ADDRESS_REG_REG
:
9947 cost
+= addr_cost
->register_offset
;
9950 case ADDRESS_REG_SXTW
:
9951 cost
+= addr_cost
->register_sextend
;
9954 case ADDRESS_REG_UXTW
:
9955 cost
+= addr_cost
->register_zextend
;
9965 /* For the sake of calculating the cost of the shifted register
9966 component, we can treat same sized modes in the same way. */
9967 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
9968 cost
+= addr_cost
->addr_scale_costs
.hi
;
9969 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
9970 cost
+= addr_cost
->addr_scale_costs
.si
;
9971 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
9972 cost
+= addr_cost
->addr_scale_costs
.di
;
9974 /* We can't tell, or this is a 128-bit vector. */
9975 cost
+= addr_cost
->addr_scale_costs
.ti
;
/* Return the cost of a branch.  If SPEED_P is true then the compiler is
   optimizing for speed.  If PREDICTABLE_P is true then the branch is
   predicted to be taken.  */

int
aarch64_branch_cost (bool speed_p, bool predictable_p)
{
  /* When optimizing for speed, use the cost of unpredictable branches.  */
  const struct cpu_branch_cost *branch_costs =
    aarch64_tune_params.branch_costs;

  if (!speed_p || predictable_p)
    return branch_costs->predictable;
  else
    return branch_costs->unpredictable;
}
9998 /* Return true if the RTX X in mode MODE is a zero or sign extract
9999 usable in an ADD or SUB (extended register) instruction. */
10001 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
10003 /* Catch add with a sign extract.
10004 This is add_<optab><mode>_multp2. */
10005 if (GET_CODE (x
) == SIGN_EXTRACT
10006 || GET_CODE (x
) == ZERO_EXTRACT
)
10008 rtx op0
= XEXP (x
, 0);
10009 rtx op1
= XEXP (x
, 1);
10010 rtx op2
= XEXP (x
, 2);
10012 if (GET_CODE (op0
) == MULT
10013 && CONST_INT_P (op1
)
10014 && op2
== const0_rtx
10015 && CONST_INT_P (XEXP (op0
, 1))
10016 && aarch64_is_extend_from_extract (mode
,
10023 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10025 else if (GET_CODE (x
) == SIGN_EXTEND
10026 || GET_CODE (x
) == ZERO_EXTEND
)
10027 return REG_P (XEXP (x
, 0));
10033 aarch64_frint_unspec_p (unsigned int u
)
10037 case UNSPEC_FRINTZ
:
10038 case UNSPEC_FRINTP
:
10039 case UNSPEC_FRINTM
:
10040 case UNSPEC_FRINTA
:
10041 case UNSPEC_FRINTN
:
10042 case UNSPEC_FRINTX
:
10043 case UNSPEC_FRINTI
:
/* Return true iff X is an rtx that will match an extr instruction
   i.e. as described in the *extr<mode>5_insn family of patterns.
   OP0 and OP1 will be set to the operands of the shifts involved
   on success and will be NULL_RTX otherwise.  */

static bool
aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
{
  rtx op0, op1;
  scalar_int_mode mode;
  if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
    return false;

  *res_op0 = NULL_RTX;
  *res_op1 = NULL_RTX;

  if (GET_CODE (x) != IOR)
    return false;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
      || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
    {
      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
      if (GET_CODE (op1) == ASHIFT)
	std::swap (op0, op1);

      if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
	return false;

      unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
      unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));

      if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
	  && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
	{
	  *res_op0 = XEXP (op0, 0);
	  *res_op1 = XEXP (op1, 0);
	  return true;
	}
    }

  return false;
}
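/* Illustrative example (assumed operands): in SImode,
   (ior (ashift X (const_int 24)) (lshiftrt Y (const_int 8))) has shift
   amounts 24 + 8 == 32 == GET_MODE_BITSIZE (SImode), so the function
   succeeds with *RES_OP0 = X and *RES_OP1 = Y; the insn then maps to
   EXTR Wd, Wx, Wy, #8, which extracts the same bits from the
   concatenation X:Y.  */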
10098 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10099 storing it in *COST. Result is true if the total cost of the operation
10100 has now been calculated. */
10102 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
10106 enum rtx_code cmpcode
;
10108 if (COMPARISON_P (op0
))
10110 inner
= XEXP (op0
, 0);
10111 comparator
= XEXP (op0
, 1);
10112 cmpcode
= GET_CODE (op0
);
10117 comparator
= const0_rtx
;
10121 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
10123 /* Conditional branch. */
10124 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
10128 if (cmpcode
== NE
|| cmpcode
== EQ
)
10130 if (comparator
== const0_rtx
)
10132 /* TBZ/TBNZ/CBZ/CBNZ. */
10133 if (GET_CODE (inner
) == ZERO_EXTRACT
)
10135 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
10136 ZERO_EXTRACT
, 0, speed
);
10139 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
10144 else if (cmpcode
== LT
|| cmpcode
== GE
)
10147 if (comparator
== const0_rtx
)
10152 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
10155 if (GET_CODE (op1
) == COMPARE
)
10157 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10158 if (XEXP (op1
, 1) == const0_rtx
)
10162 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
10163 const struct cpu_cost_table
*extra_cost
10164 = aarch64_tune_params
.insn_extra_cost
;
10166 if (GET_MODE_CLASS (mode
) == MODE_INT
)
10167 *cost
+= extra_cost
->alu
.arith
;
10169 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
10174 /* It's a conditional operation based on the status flags,
10175 so it must be some flavor of CSEL. */
10177 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10178 if (GET_CODE (op1
) == NEG
10179 || GET_CODE (op1
) == NOT
10180 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
10181 op1
= XEXP (op1
, 0);
10182 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
10184 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10185 op1
= XEXP (op1
, 0);
10186 op2
= XEXP (op2
, 0);
10189 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
10190 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
10194 /* We don't know what this is, cost all operands. */
/* Check whether X is a bitfield operation of the form shift + extend that
   maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
   operand to which the bitfield operation is applied.  Otherwise return
   NULL_RTX.  */

static rtx
aarch64_extend_bitfield_pattern_p (rtx x)
{
  rtx_code outer_code = GET_CODE (x);
  machine_mode outer_mode = GET_MODE (x);

  if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
      && outer_mode != SImode && outer_mode != DImode)
    return NULL_RTX;

  rtx inner = XEXP (x, 0);
  rtx_code inner_code = GET_CODE (inner);
  machine_mode inner_mode = GET_MODE (inner);
  rtx op = NULL_RTX;

  switch (inner_code)
    {
      case ASHIFT:
	if (CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      case LSHIFTRT:
	if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      case ASHIFTRT:
	if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      default:
	break;
    }

  return op;
}
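/* Illustrative example (assumed RTL):
   (zero_extend:SI (lshiftrt:HI (reg:HI R) (const_int 3))) has outer code
   ZERO_EXTEND, an HImode inner operand and a constant shift, so the inner
   register R is returned and the combination matches a single
   UBFX Wd, Wr, #3, #13.  */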
/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */

static bool
aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
				    rtx shft_amnt)
{
  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
	 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
	 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
	 && (INTVAL (mask)
	     & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
}
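/* Worked example of the predicate above (illustrative values): with SImode,
   MASK == 0xff0 and SHFT_AMNT == 4 we have 4 < 32,
   (0xff0 >> 4) + 1 == 0x100 so exact_log2 returns 8 >= 0, and
   0xff0 & ((1 << 4) - 1) == 0, hence (x << 4) & 0xff0 is accepted and
   corresponds to UBFIZ Wd, Wx, #4, #8.  */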
/* Return true if the masks and a shift amount from an RTX of the form
   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
   a BFI instruction of mode MODE.  See *arch64_bfi patterns.  */

static bool
aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
				   unsigned HOST_WIDE_INT mask1,
				   unsigned HOST_WIDE_INT shft_amnt,
				   unsigned HOST_WIDE_INT mask2)
{
  unsigned HOST_WIDE_INT t;

  /* Verify that there is no overlap in what bits are set in the two masks.  */
  if (mask1 != ~mask2)
    return false;

  /* Verify that mask2 is not all zeros or ones.  */
  if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
    return false;

  /* The shift amount should always be less than the mode size.  */
  gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));

  /* Verify that the mask being shifted is contiguous and would be in the
     least significant bits after shifting by shft_amnt.  */
  t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
  return (t == (t & -t));
}
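/* Worked example of the contiguity test above (illustrative values): with
   SHFT_AMNT == 8 and MASK2 == 0xff00, MASK1 must equal ~0xff00, and
   t = 0xff00 + (1 << 8) == 0x10000 is a power of two, so t == (t & -t)
   holds and ((x & ~0xff00) | ((y << 8) & 0xff00)) can become
   BFI Xx, Xy, #8, #8.  */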
/* Calculate the cost of calculating X, storing it in *COST.  Result
   is true if the total cost of the operation has now been calculated.  */
static bool
aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
		   int param ATTRIBUTE_UNUSED, int *cost, bool speed)
{
  rtx op0, op1, op2;
  const struct cpu_cost_table *extra_cost
    = aarch64_tune_params.insn_extra_cost;
  int code = GET_CODE (x);
  scalar_int_mode int_mode;

  /* By default, assume that everything has equivalent cost to the
     cheapest instruction.  Any additional costs are applied as a delta
     above this default.  */
  *cost = COSTS_N_INSNS (1);
10306 /* The cost depends entirely on the operands to SET. */
10308 op0
= SET_DEST (x
);
10311 switch (GET_CODE (op0
))
10316 rtx address
= XEXP (op0
, 0);
10317 if (VECTOR_MODE_P (mode
))
10318 *cost
+= extra_cost
->ldst
.storev
;
10319 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10320 *cost
+= extra_cost
->ldst
.store
;
10321 else if (mode
== SFmode
)
10322 *cost
+= extra_cost
->ldst
.storef
;
10323 else if (mode
== DFmode
)
10324 *cost
+= extra_cost
->ldst
.stored
;
10327 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10331 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
10335 if (! REG_P (SUBREG_REG (op0
)))
10336 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
10338 /* Fall through. */
10340 /* The cost is one per vector-register copied. */
10341 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
10343 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
10344 *cost
= COSTS_N_INSNS (nregs
);
10346 /* const0_rtx is in general free, but we will use an
10347 instruction to set a register to 0. */
10348 else if (REG_P (op1
) || op1
== const0_rtx
)
10350 /* The cost is 1 per register copied. */
10351 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
10352 *cost
= COSTS_N_INSNS (nregs
);
10355 /* Cost is just the cost of the RHS of the set. */
10356 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
10361 /* Bit-field insertion. Strip any redundant widening of
10362 the RHS to meet the width of the target. */
10363 if (GET_CODE (op1
) == SUBREG
)
10364 op1
= SUBREG_REG (op1
);
10365 if ((GET_CODE (op1
) == ZERO_EXTEND
10366 || GET_CODE (op1
) == SIGN_EXTEND
)
10367 && CONST_INT_P (XEXP (op0
, 1))
10368 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
10369 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
10370 op1
= XEXP (op1
, 0);
10372 if (CONST_INT_P (op1
))
10374 /* MOV immediate is assumed to always be cheap. */
10375 *cost
= COSTS_N_INSNS (1);
10381 *cost
+= extra_cost
->alu
.bfi
;
10382 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
10388 /* We can't make sense of this, assume default cost. */
10389 *cost
= COSTS_N_INSNS (1);
10395 /* If an instruction can incorporate a constant within the
10396 instruction, the instruction's expression avoids calling
10397 rtx_cost() on the constant. If rtx_cost() is called on a
10398 constant, then it is usually because the constant must be
10399 moved into a register by one or more instructions.
10401 The exception is constant 0, which can be expressed
10402 as XZR/WZR and is therefore free. The exception to this is
10403 if we have (set (reg) (const0_rtx)) in which case we must cost
10404 the move. However, we can catch that when we cost the SET, so
10405 we don't need to consider that here. */
10406 if (x
== const0_rtx
)
10410 /* To an approximation, building any other constant is
10411 proportionally expensive to the number of instructions
10412 required to build that constant. This is true whether we
10413 are compiling for SPEED or otherwise. */
10414 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
10415 int_mode
= word_mode
;
10416 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
10417 (NULL_RTX
, x
, false, int_mode
));
10423 /* First determine number of instructions to do the move
10424 as an integer constant. */
10425 if (!aarch64_float_const_representable_p (x
)
10426 && !aarch64_can_const_movi_rtx_p (x
, mode
)
10427 && aarch64_float_const_rtx_p (x
))
10429 unsigned HOST_WIDE_INT ival
;
10430 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
10431 gcc_assert (succeed
);
10433 scalar_int_mode imode
= (mode
== HFmode
10435 : int_mode_for_mode (mode
).require ());
10436 int ncost
= aarch64_internal_mov_immediate
10437 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
10438 *cost
+= COSTS_N_INSNS (ncost
);
10444 /* mov[df,sf]_aarch64. */
10445 if (aarch64_float_const_representable_p (x
))
10446 /* FMOV (scalar immediate). */
10447 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
10448 else if (!aarch64_float_const_zero_rtx_p (x
))
10450 /* This will be a load from memory. */
10451 if (mode
== DFmode
)
10452 *cost
+= extra_cost
->ldst
.loadd
;
10454 *cost
+= extra_cost
->ldst
.loadf
;
10457 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10458 or MOV v0.s[0], wzr - neither of which are modeled by the
10459 cost tables. Just use the default cost. */
10469 /* For loads we want the base cost of a load, plus an
10470 approximation for the additional cost of the addressing
10472 rtx address
= XEXP (x
, 0);
10473 if (VECTOR_MODE_P (mode
))
10474 *cost
+= extra_cost
->ldst
.loadv
;
10475 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10476 *cost
+= extra_cost
->ldst
.load
;
10477 else if (mode
== SFmode
)
10478 *cost
+= extra_cost
->ldst
.loadf
;
10479 else if (mode
== DFmode
)
10480 *cost
+= extra_cost
->ldst
.loadd
;
10483 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10492 if (VECTOR_MODE_P (mode
))
10497 *cost
+= extra_cost
->vect
.alu
;
10502 if (GET_MODE_CLASS (mode
) == MODE_INT
)
10504 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
10505 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
10508 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
10512 /* Cost this as SUB wzr, X. */
10513 op0
= CONST0_RTX (mode
);
10518 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10520 /* Support (neg(fma...)) as a single instruction only if
10521 sign of zeros is unimportant. This matches the decision
10522 making in aarch64.md. */
10523 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
10526 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
10529 if (GET_CODE (op0
) == MULT
)
10532 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
10537 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
10547 if (VECTOR_MODE_P (mode
))
10548 *cost
+= extra_cost
->vect
.alu
;
10550 *cost
+= extra_cost
->alu
.clz
;
10559 if (op1
== const0_rtx
10560 && GET_CODE (op0
) == AND
)
10563 mode
= GET_MODE (op0
);
10567 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
10569 /* TODO: A write to the CC flags possibly costs extra, this
10570 needs encoding in the cost tables. */
10572 mode
= GET_MODE (op0
);
10574 if (GET_CODE (op0
) == AND
)
10580 if (GET_CODE (op0
) == PLUS
)
10582 /* ADDS (and CMN alias). */
10587 if (GET_CODE (op0
) == MINUS
)
10594 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
10595 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
10596 && CONST_INT_P (XEXP (op0
, 2)))
10598 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10599 Handle it here directly rather than going to cost_logic
10600 since we know the immediate generated for the TST is valid
10601 so we can avoid creating an intermediate rtx for it only
10602 for costing purposes. */
10604 *cost
+= extra_cost
->alu
.logical
;
10606 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
10607 ZERO_EXTRACT
, 0, speed
);
10611 if (GET_CODE (op1
) == NEG
)
10615 *cost
+= extra_cost
->alu
.arith
;
10617 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
10618 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
10624 Compare can freely swap the order of operands, and
10625 canonicalization puts the more complex operation first.
10626 But the integer MINUS logic expects the shift/extend
10627 operation in op1. */
10629 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
10637 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
10641 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
10643 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
10645 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
10646 /* FCMP supports constant 0.0 for no extra cost. */
10652 if (VECTOR_MODE_P (mode
))
10654 /* Vector compare. */
10656 *cost
+= extra_cost
->vect
.alu
;
10658 if (aarch64_float_const_zero_rtx_p (op1
))
10660 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10674 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
10676 /* Detect valid immediates. */
10677 if ((GET_MODE_CLASS (mode
) == MODE_INT
10678 || (GET_MODE_CLASS (mode
) == MODE_CC
10679 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
10680 && CONST_INT_P (op1
)
10681 && aarch64_uimm12_shift (INTVAL (op1
)))
10684 /* SUB(S) (immediate). */
10685 *cost
+= extra_cost
->alu
.arith
;
10689 /* Look for SUB (extended register). */
10690 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
10691 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
10694 *cost
+= extra_cost
->alu
.extend_arith
;
10696 op1
= aarch64_strip_extend (op1
, true);
10697 *cost
+= rtx_cost (op1
, VOIDmode
,
10698 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
10702 rtx new_op1
= aarch64_strip_extend (op1
, false);
10704 /* Cost this as an FMA-alike operation. */
10705 if ((GET_CODE (new_op1
) == MULT
10706 || aarch64_shift_p (GET_CODE (new_op1
)))
10707 && code
!= COMPARE
)
10709 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
10710 (enum rtx_code
) code
,
10715 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
10719 if (VECTOR_MODE_P (mode
))
10722 *cost
+= extra_cost
->vect
.alu
;
10724 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10727 *cost
+= extra_cost
->alu
.arith
;
10729 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10732 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10746 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
10747 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
10750 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
10751 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
10755 if (GET_MODE_CLASS (mode
) == MODE_INT
10756 && (aarch64_plus_immediate (op1
, mode
)
10757 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
10759 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
10762 /* ADD (immediate). */
10763 *cost
+= extra_cost
->alu
.arith
;
10767 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
10769 /* Look for ADD (extended register). */
10770 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
10771 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
10774 *cost
+= extra_cost
->alu
.extend_arith
;
10776 op0
= aarch64_strip_extend (op0
, true);
10777 *cost
+= rtx_cost (op0
, VOIDmode
,
10778 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
10782 /* Strip any extend, leave shifts behind as we will
10783 cost them through mult_cost. */
10784 new_op0
= aarch64_strip_extend (op0
, false);
10786 if (GET_CODE (new_op0
) == MULT
10787 || aarch64_shift_p (GET_CODE (new_op0
)))
10789 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
10794 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
10798 if (VECTOR_MODE_P (mode
))
10801 *cost
+= extra_cost
->vect
.alu
;
10803 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10806 *cost
+= extra_cost
->alu
.arith
;
10808 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10811 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10818 *cost
= COSTS_N_INSNS (1);
10822 if (VECTOR_MODE_P (mode
))
10823 *cost
+= extra_cost
->vect
.alu
;
10825 *cost
+= extra_cost
->alu
.rev
;
10830 if (aarch_rev16_p (x
))
10832 *cost
= COSTS_N_INSNS (1);
10836 if (VECTOR_MODE_P (mode
))
10837 *cost
+= extra_cost
->vect
.alu
;
10839 *cost
+= extra_cost
->alu
.rev
;
10844 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
10846 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
10847 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
10849 *cost
+= extra_cost
->alu
.shift
;
10853 /* Fall through. */
10860 if (VECTOR_MODE_P (mode
))
10863 *cost
+= extra_cost
->vect
.alu
;
10868 && GET_CODE (op0
) == MULT
10869 && CONST_INT_P (XEXP (op0
, 1))
10870 && CONST_INT_P (op1
)
10871 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
10872 INTVAL (op1
)) != 0)
10874 /* This is a UBFM/SBFM. */
10875 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
10877 *cost
+= extra_cost
->alu
.bfx
;
10881 if (is_int_mode (mode
, &int_mode
))
10883 if (CONST_INT_P (op1
))
10885 /* We have a mask + shift version of a UBFIZ
10886 i.e. the *andim_ashift<mode>_bfiz pattern. */
10887 if (GET_CODE (op0
) == ASHIFT
10888 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
10891 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
10892 (enum rtx_code
) code
, 0, speed
);
10894 *cost
+= extra_cost
->alu
.bfx
;
10898 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
10900 /* We possibly get the immediate for free, this is not
10902 *cost
+= rtx_cost (op0
, int_mode
,
10903 (enum rtx_code
) code
, 0, speed
);
10905 *cost
+= extra_cost
->alu
.logical
;
10914 /* Handle ORN, EON, or BIC. */
10915 if (GET_CODE (op0
) == NOT
)
10916 op0
= XEXP (op0
, 0);
10918 new_op0
= aarch64_strip_shift (op0
);
10920 /* If we had a shift on op0 then this is a logical-shift-
10921 by-register/immediate operation. Otherwise, this is just
10922 a logical operation. */
10925 if (new_op0
!= op0
)
10927 /* Shift by immediate. */
10928 if (CONST_INT_P (XEXP (op0
, 1)))
10929 *cost
+= extra_cost
->alu
.log_shift
;
10931 *cost
+= extra_cost
->alu
.log_shift_reg
;
10934 *cost
+= extra_cost
->alu
.logical
;
10937 /* In both cases we want to cost both operands. */
10938 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
10940 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
10950 op0
= aarch64_strip_shift (x
);
10952 if (VECTOR_MODE_P (mode
))
10955 *cost
+= extra_cost
->vect
.alu
;
10959 /* MVN-shifted-reg. */
10962 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
10965 *cost
+= extra_cost
->alu
.log_shift
;
10969 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10970 Handle the second form here taking care that 'a' in the above can
10972 else if (GET_CODE (op0
) == XOR
)
10974 rtx newop0
= XEXP (op0
, 0);
10975 rtx newop1
= XEXP (op0
, 1);
10976 rtx op0_stripped
= aarch64_strip_shift (newop0
);
10978 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
10979 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
10983 if (op0_stripped
!= newop0
)
10984 *cost
+= extra_cost
->alu
.log_shift
;
10986 *cost
+= extra_cost
->alu
.logical
;
10993 *cost
+= extra_cost
->alu
.logical
;
11000 /* If a value is written in SI mode, then zero extended to DI
11001 mode, the operation will in general be free as a write to
11002 a 'w' register implicitly zeroes the upper bits of an 'x'
11003 register. However, if this is
11005 (set (reg) (zero_extend (reg)))
11007 we must cost the explicit register move. */
11009 && GET_MODE (op0
) == SImode
11012 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
11014 /* If OP_COST is non-zero, then the cost of the zero extend
11015 is effectively the cost of the inner operation. Otherwise
11016 we have a MOV instruction and we take the cost from the MOV
11017 itself. This is true independently of whether we are
11018 optimizing for space or time. */
11024 else if (MEM_P (op0
))
11026 /* All loads can zero extend to any size for free. */
11027 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
11031 op0
= aarch64_extend_bitfield_pattern_p (x
);
11034 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
11036 *cost
+= extra_cost
->alu
.bfx
;
11042 if (VECTOR_MODE_P (mode
))
11045 *cost
+= extra_cost
->vect
.alu
;
11049 /* We generate an AND instead of UXTB/UXTH. */
11050 *cost
+= extra_cost
->alu
.logical
;
11056 if (MEM_P (XEXP (x
, 0)))
11061 rtx address
= XEXP (XEXP (x
, 0), 0);
11062 *cost
+= extra_cost
->ldst
.load_sign_extend
;
11065 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
11071 op0
= aarch64_extend_bitfield_pattern_p (x
);
11074 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
11076 *cost
+= extra_cost
->alu
.bfx
;
11082 if (VECTOR_MODE_P (mode
))
11083 *cost
+= extra_cost
->vect
.alu
;
11085 *cost
+= extra_cost
->alu
.extend
;
11093 if (CONST_INT_P (op1
))
11097 if (VECTOR_MODE_P (mode
))
11099 /* Vector shift (immediate). */
11100 *cost
+= extra_cost
->vect
.alu
;
11104 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
11106 *cost
+= extra_cost
->alu
.shift
;
11110 /* We can incorporate zero/sign extend for free. */
11111 if (GET_CODE (op0
) == ZERO_EXTEND
11112 || GET_CODE (op0
) == SIGN_EXTEND
)
11113 op0
= XEXP (op0
, 0);
11115 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
11120 if (VECTOR_MODE_P (mode
))
11123 /* Vector shift (register). */
11124 *cost
+= extra_cost
->vect
.alu
;
11130 *cost
+= extra_cost
->alu
.shift_reg
;
11132 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
11133 && CONST_INT_P (XEXP (op1
, 1))
11134 && known_eq (INTVAL (XEXP (op1
, 1)),
11135 GET_MODE_BITSIZE (mode
) - 1))
11137 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
11138 /* We already demanded XEXP (op1, 0) to be REG_P, so
11139 don't recurse into it. */
11143 return false; /* All arguments need to be in registers. */
11153 if (CONST_INT_P (op1
))
11155 /* ASR (immediate) and friends. */
11158 if (VECTOR_MODE_P (mode
))
11159 *cost
+= extra_cost
->vect
.alu
;
11161 *cost
+= extra_cost
->alu
.shift
;
11164 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
11169 if (VECTOR_MODE_P (mode
))
11172 /* Vector shift (register). */
11173 *cost
+= extra_cost
->vect
.alu
;
11178 /* ASR (register) and friends. */
11179 *cost
+= extra_cost
->alu
.shift_reg
;
11181 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
11182 && CONST_INT_P (XEXP (op1
, 1))
11183 && known_eq (INTVAL (XEXP (op1
, 1)),
11184 GET_MODE_BITSIZE (mode
) - 1))
11186 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
11187 /* We already demanded XEXP (op1, 0) to be REG_P, so
11188 don't recurse into it. */
11192 return false; /* All arguments need to be in registers. */
11197 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
11198 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
11202 *cost
+= extra_cost
->ldst
.load
;
11204 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
11205 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
11207 /* ADRP, followed by ADD. */
11208 *cost
+= COSTS_N_INSNS (1);
11210 *cost
+= 2 * extra_cost
->alu
.arith
;
11212 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
11213 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
11217 *cost
+= extra_cost
->alu
.arith
;
11222 /* One extra load instruction, after accessing the GOT. */
11223 *cost
+= COSTS_N_INSNS (1);
11225 *cost
+= extra_cost
->ldst
.load
;
11231 /* ADRP/ADD (immediate). */
11233 *cost
+= extra_cost
->alu
.arith
;
11241 if (VECTOR_MODE_P (mode
))
11242 *cost
+= extra_cost
->vect
.alu
;
11244 *cost
+= extra_cost
->alu
.bfx
;
11247 /* We can trust that the immediates used will be correct (there
11248 are no by-register forms), so we need only cost op0. */
11249 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
11253 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
11254 /* aarch64_rtx_mult_cost always handles recursion to its
11259 /* We can expand signed mod by power of 2 using a NEGS, two parallel
11260 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
11261 an unconditional negate. This case should only ever be reached through
11262 the set_smod_pow2_cheap check in expmed.c. */
11263 if (CONST_INT_P (XEXP (x
, 1))
11264 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
11265 && (mode
== SImode
|| mode
== DImode
))
11267 /* We expand to 4 instructions. Reset the baseline. */
11268 *cost
= COSTS_N_INSNS (4);
11271 *cost
+= 2 * extra_cost
->alu
.logical
11272 + 2 * extra_cost
->alu
.arith
;
11277 /* Fall-through. */
/* Slightly prefer UMOD over SMOD.  */
11282 if (VECTOR_MODE_P (mode
))
11283 *cost
+= extra_cost
->vect
.alu
;
11284 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11285 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
11286 + extra_cost
->mult
[mode
== DImode
].idiv
11287 + (code
== MOD
? 1 : 0));
11289 return false; /* All arguments need to be in registers. */
11296 if (VECTOR_MODE_P (mode
))
11297 *cost
+= extra_cost
->vect
.alu
;
11298 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11299 /* There is no integer SQRT, so only DIV and UDIV can get
11301 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
/* Slightly prefer UDIV over SDIV.  */
11303 + (code
== DIV
? 1 : 0));
11305 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
11307 return false; /* All arguments need to be in registers. */
11310 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
11311 XEXP (x
, 2), cost
, speed
);
11324 return false; /* All arguments must be in registers. */
11333 if (VECTOR_MODE_P (mode
))
11334 *cost
+= extra_cost
->vect
.alu
;
11336 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
11339 /* FMSUB, FNMADD, and FNMSUB are free. */
11340 if (GET_CODE (op0
) == NEG
)
11341 op0
= XEXP (op0
, 0);
11343 if (GET_CODE (op2
) == NEG
)
11344 op2
= XEXP (op2
, 0);
11346 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11347 and the by-element operand as operand 0. */
11348 if (GET_CODE (op1
) == NEG
)
11349 op1
= XEXP (op1
, 0);
11351 /* Catch vector-by-element operations. The by-element operand can
11352 either be (vec_duplicate (vec_select (x))) or just
11353 (vec_select (x)), depending on whether we are multiplying by
11354 a vector or a scalar.
11356 Canonicalization is not very good in these cases, FMA4 will put the
11357 by-element operand as operand 0, FNMA4 will have it as operand 1. */
11358 if (GET_CODE (op0
) == VEC_DUPLICATE
)
11359 op0
= XEXP (op0
, 0);
11360 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
11361 op1
= XEXP (op1
, 0);
11363 if (GET_CODE (op0
) == VEC_SELECT
)
11364 op0
= XEXP (op0
, 0);
11365 else if (GET_CODE (op1
) == VEC_SELECT
)
11366 op1
= XEXP (op1
, 0);
11368 /* If the remaining parameters are not registers,
11369 get the cost to put them into registers. */
11370 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
11371 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
11372 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
11376 case UNSIGNED_FLOAT
:
11378 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
11384 if (VECTOR_MODE_P (mode
))
/* Vector truncate.  */
11387 *cost
+= extra_cost
->vect
.alu
;
11390 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
11394 case FLOAT_TRUNCATE
:
11397 if (VECTOR_MODE_P (mode
))
/* Vector conversion.  */
11400 *cost
+= extra_cost
->vect
.alu
;
11403 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
11410 /* Strip the rounding part. They will all be implemented
11411 by the fcvt* family of instructions anyway. */
11412 if (GET_CODE (x
) == UNSPEC
)
11414 unsigned int uns_code
= XINT (x
, 1);
11416 if (uns_code
== UNSPEC_FRINTA
11417 || uns_code
== UNSPEC_FRINTM
11418 || uns_code
== UNSPEC_FRINTN
11419 || uns_code
== UNSPEC_FRINTP
11420 || uns_code
== UNSPEC_FRINTZ
)
11421 x
= XVECEXP (x
, 0, 0);
11426 if (VECTOR_MODE_P (mode
))
11427 *cost
+= extra_cost
->vect
.alu
;
11429 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
11432 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11433 fixed-point fcvt. */
11434 if (GET_CODE (x
) == MULT
11435 && ((VECTOR_MODE_P (mode
)
11436 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
11437 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
11439 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
11444 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
11448 if (VECTOR_MODE_P (mode
))
11450 /* ABS (vector). */
11452 *cost
+= extra_cost
->vect
.alu
;
11454 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11458 /* FABD, which is analogous to FADD. */
11459 if (GET_CODE (op0
) == MINUS
)
11461 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
11462 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
11464 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11468 /* Simple FABS is analogous to FNEG. */
11470 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
11474 /* Integer ABS will either be split to
11475 two arithmetic instructions, or will be an ABS
11476 (scalar), which we don't model. */
11477 *cost
= COSTS_N_INSNS (2);
11479 *cost
+= 2 * extra_cost
->alu
.arith
;
11487 if (VECTOR_MODE_P (mode
))
11488 *cost
+= extra_cost
->vect
.alu
;
11491 /* FMAXNM/FMINNM/FMAX/FMIN.
11492 TODO: This may not be accurate for all implementations, but
11493 we do not model this in the cost tables. */
11494 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11500 /* The floating point round to integer frint* instructions. */
11501 if (aarch64_frint_unspec_p (XINT (x
, 1)))
11504 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
11509 if (XINT (x
, 1) == UNSPEC_RBIT
)
11512 *cost
+= extra_cost
->alu
.rev
;
11520 /* Decompose <su>muldi3_highpart. */
11521 if (/* (truncate:DI */
11524 && GET_MODE (XEXP (x
, 0)) == TImode
11525 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
11527 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
11528 /* (ANY_EXTEND:TI (reg:DI))
11529 (ANY_EXTEND:TI (reg:DI))) */
11530 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
11531 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
11532 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
11533 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
11534 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
11535 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
11536 /* (const_int 64) */
11537 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
11538 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
11542 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
11543 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
11544 mode
, MULT
, 0, speed
);
11545 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
11546 mode
, MULT
, 1, speed
);
11550 /* Fall through. */
  if (dump_file
      && flag_aarch64_verbose_cost)
    fprintf (dump_file,
	     "\nFailed to cost RTX.  Assuming default cost.\n");

  return true;
}

/* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
   calculated for X.  This cost is stored in *COST.  Returns true
   if the total cost of X was calculated.  */
static bool
aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
			   int param, int *cost, bool speed)
{
  bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);

  if (dump_file
      && flag_aarch64_verbose_cost)
    {
      print_rtl_single (dump_file, x);
      fprintf (dump_file, "\n%s cost: %d (%s)\n",
	       speed ? "Hot" : "Cold",
	       *cost, result ? "final" : "partial");
    }

  return result;
}
static int
aarch64_register_move_cost (machine_mode mode,
			    reg_class_t from_i, reg_class_t to_i)
{
  enum reg_class from = (enum reg_class) from_i;
  enum reg_class to = (enum reg_class) to_i;
  const struct cpu_regmove_cost *regmove_cost
    = aarch64_tune_params.regmove_cost;

  /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
  if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
    to = GENERAL_REGS;

  if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
    from = GENERAL_REGS;

  /* Moving between GPR and stack cost is the same as GP2GP.  */
  if ((from == GENERAL_REGS && to == STACK_REG)
      || (to == GENERAL_REGS && from == STACK_REG))
    return regmove_cost->GP2GP;

  /* To/From the stack register, we move via the gprs.  */
  if (to == STACK_REG || from == STACK_REG)
    return aarch64_register_move_cost (mode, from, GENERAL_REGS)
	   + aarch64_register_move_cost (mode, GENERAL_REGS, to);

  if (known_eq (GET_MODE_SIZE (mode), 16))
    {
      /* 128-bit operations on general registers require 2 instructions.  */
      if (from == GENERAL_REGS && to == GENERAL_REGS)
	return regmove_cost->GP2GP * 2;
      else if (from == GENERAL_REGS)
	return regmove_cost->GP2FP * 2;
      else if (to == GENERAL_REGS)
	return regmove_cost->FP2GP * 2;

      /* When AdvSIMD instructions are disabled it is not possible to move
	 a 128-bit value directly between Q registers.  This is handled in
	 secondary reload.  A general register is used as a scratch to move
	 the upper DI value and the lower DI value is moved directly,
	 hence the cost is the sum of three moves.  */
      if (!TARGET_SIMD)
	return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;

      return regmove_cost->FP2FP;
    }

  if (from == GENERAL_REGS && to == GENERAL_REGS)
    return regmove_cost->GP2GP;
  else if (from == GENERAL_REGS)
    return regmove_cost->GP2FP;
  else if (to == GENERAL_REGS)
    return regmove_cost->FP2GP;

  return regmove_cost->FP2FP;
}
static int
aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
			  reg_class_t rclass ATTRIBUTE_UNUSED,
			  bool in ATTRIBUTE_UNUSED)
{
  return aarch64_tune_params.memmov_cost;
}
/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
   to optimize 1.0/sqrt.  */

static bool
use_rsqrt_p (machine_mode mode)
{
  return (!flag_trapping_math
	  && flag_unsafe_math_optimizations
	  && ((aarch64_tune_params.approx_modes->recip_sqrt
	       & AARCH64_APPROX_MODE (mode))
	      || flag_mrecip_low_precision_sqrt));
}

/* Function to decide when to use the approximate reciprocal square root
   builtin.  */

static tree
aarch64_builtin_reciprocal (tree fndecl)
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));

  if (!use_rsqrt_p (mode))
    return NULL_TREE;
  return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
}
/* Emit instruction sequence to compute either the approximate square root
   or its approximate reciprocal, depending on the flag RECP, and return
   whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
{
  machine_mode mode = GET_MODE (dst);

  if (GET_MODE_INNER (mode) == HFmode)
    {
      gcc_assert (!recp);
      return false;
    }

  if (!recp)
    {
      if (!(flag_mlow_precision_sqrt
	    || (aarch64_tune_params.approx_modes->sqrt
		& AARCH64_APPROX_MODE (mode))))
	return false;

      if (flag_finite_math_only
	  || flag_trapping_math
	  || !flag_unsafe_math_optimizations
	  || optimize_function_for_size_p (cfun))
	return false;
    }
  else
    /* Caller assumes we cannot fail.  */
    gcc_assert (use_rsqrt_p (mode));

  machine_mode mmsk = mode_for_int_vector (mode).require ();
  rtx xmsk = gen_reg_rtx (mmsk);
  if (!recp)
    /* When calculating the approximate square root, compare the
       argument with 0.0 and create a mask.  */
    emit_insn (gen_rtx_SET (xmsk,
			    gen_rtx_NEG (mmsk,
					 gen_rtx_EQ (mmsk, src,
						     CONST0_RTX (mode)))));

  /* Estimate the approximate reciprocal square root.  */
  rtx xdst = gen_reg_rtx (mode);
  emit_insn (gen_aarch64_rsqrte (mode, xdst, src));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance
     while sacrificing the accuracy.  */
  if ((recp && flag_mrecip_low_precision_sqrt)
      || (!recp && flag_mlow_precision_sqrt))
    iterations--;

  /* Iterate over the series to calculate the approximate reciprocal square
     root.  */
  rtx x1 = gen_reg_rtx (mode);
  while (iterations--)
    {
      rtx x2 = gen_reg_rtx (mode);
      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));

      emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));

      if (iterations > 0)
	emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
    }

  if (!recp)
    {
      /* Qualify the approximate reciprocal square root when the argument is
	 0.0 by squashing the intermediary result to 0.0.  */
      rtx xtmp = gen_reg_rtx (mmsk);
      emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
					gen_rtx_SUBREG (mmsk, xdst, 0)));
      emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));

      /* Calculate the approximate square root.  */
      emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
    }

  /* Finalize the approximation.  */
  emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));

  return true;
}
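/* The loop above is the standard Newton-Raphson refinement of the FRSQRTE
   estimate: FRSQRTS computes the fused step (3 - a * x_n^2) / 2, so each
   pass forms x_{n+1} = x_n * (3 - a * x_n^2) / 2 for x ~= 1/sqrt(a).  Two
   steps are used for SF and three for DF, matching ITERATIONS above.
   (Illustrative summary; the precision trade-off is the conventional one
   for this scheme.)  */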
/* Emit the instruction sequence to compute the approximation for the division
   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
{
  machine_mode mode = GET_MODE (quo);

  if (GET_MODE_INNER (mode) == HFmode)
    return false;

  bool use_approx_division_p = (flag_mlow_precision_div
				|| (aarch64_tune_params.approx_modes->division
				    & AARCH64_APPROX_MODE (mode)));

  if (!flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun)
      || !use_approx_division_p)
    return false;

  if (!TARGET_SIMD && VECTOR_MODE_P (mode))
    return false;

  /* Estimate the approximate reciprocal.  */
  rtx xrcp = gen_reg_rtx (mode);
  emit_insn (gen_aarch64_frecpe (mode, xrcp, den));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance,
     while sacrificing the accuracy.  */
  if (flag_mlow_precision_div)
    iterations--;

  /* Iterate over the series to calculate the approximate reciprocal.  */
  rtx xtmp = gen_reg_rtx (mode);
  while (iterations--)
    {
      emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));

      if (iterations > 0)
	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
    }

  if (num != CONST1_RTX (mode))
    {
      /* As the approximate reciprocal of DEN is already calculated, only
	 calculate the approximate division when NUM is not 1.0.  */
      rtx xnum = force_reg (mode, num);
      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
    }

  /* Finalize the approximation.  */
  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
  return true;
}
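/* Likewise, the loop above is Newton-Raphson refinement of the FRECPE
   estimate of 1/DEN: FRECPS computes the fused step (2 - den * x_n), so
   x_{n+1} = x_n * (2 - den * x_n) converges quadratically towards 1/den,
   and the quotient is finally formed as num * (1/den) unless NUM is
   exactly 1.0.  (Illustrative summary.)  */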
/* Return the number of instructions that can be issued per cycle.  */
static int
aarch64_sched_issue_rate (void)
{
  return aarch64_tune_params.issue_rate;
}

/* Implement TARGET_SCHED_VARIABLE_ISSUE.  */
static int
aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
{
  if (DEBUG_INSN_P (insn))
    return more;

  rtx_code code = GET_CODE (PATTERN (insn));
  if (code == USE || code == CLOBBER)
    return more;

  if (get_attr_type (insn) == TYPE_NO_INSN)
    return more;

  return more - 1;
}

static int
aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
{
  int issue_rate = aarch64_sched_issue_rate ();

  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
}

/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
   autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
   has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
static int
aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
						   int ready_index)
{
  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
}
11868 /* Vectorizer cost model target hooks. */
11870 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11872 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
11874 int misalign ATTRIBUTE_UNUSED
)
11877 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
11880 if (vectype
!= NULL
)
11881 fp
= FLOAT_TYPE_P (vectype
);
11883 switch (type_of_cost
)
11886 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
11889 return costs
->scalar_load_cost
;
11892 return costs
->scalar_store_cost
;
11895 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
11898 return costs
->vec_align_load_cost
;
11901 return costs
->vec_store_cost
;
11903 case vec_to_scalar
:
11904 return costs
->vec_to_scalar_cost
;
11906 case scalar_to_vec
:
11907 return costs
->scalar_to_vec_cost
;
11909 case unaligned_load
:
11910 case vector_gather_load
:
11911 return costs
->vec_unalign_load_cost
;
11913 case unaligned_store
:
11914 case vector_scatter_store
:
11915 return costs
->vec_unalign_store_cost
;
11917 case cond_branch_taken
:
11918 return costs
->cond_taken_branch_cost
;
11920 case cond_branch_not_taken
:
11921 return costs
->cond_not_taken_branch_cost
;
11924 return costs
->vec_permute_cost
;
11926 case vec_promote_demote
:
11927 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
11929 case vec_construct
:
11930 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
11931 return elements
/ 2 + 1;
11934 gcc_unreachable ();
11938 /* Implement targetm.vectorize.add_stmt_cost. */
11940 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
11941 struct _stmt_vec_info
*stmt_info
, int misalign
,
11942 enum vect_cost_model_location where
)
11944 unsigned *cost
= (unsigned *) data
;
11945 unsigned retval
= 0;
11947 if (flag_vect_cost_model
)
11949 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
11951 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
11953 /* Statements in an inner loop relative to the loop being
11954 vectorized are weighted more heavily. The value here is
11955 arbitrary and could potentially be improved with analysis. */
11956 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
11957 count
*= 50; /* FIXME */
11959 retval
= (unsigned) (count
* stmt_cost
);
11960 cost
[where
] += retval
;
11966 static void initialize_aarch64_code_model (struct gcc_options
*);
11968 /* Parse the TO_PARSE string and put the architecture struct that it
11969 selects into RES and the architectural features into ISA_FLAGS.
11970 Return an aarch64_parse_opt_result describing the parse result.
11971 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11972 When the TO_PARSE string contains an invalid extension,
11973 a copy of the string is created and stored to INVALID_EXTENSION. */
11975 static enum aarch64_parse_opt_result
11976 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
11977 uint64_t *isa_flags
, std::string
*invalid_extension
)
11980 const struct processor
*arch
;
11983 ext
= strchr (to_parse
, '+');
11986 len
= ext
- to_parse
;
11988 len
= strlen (to_parse
);
11991 return AARCH64_PARSE_MISSING_ARG
;
11994 /* Loop through the list of supported ARCHes to find a match. */
11995 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
11997 if (strlen (arch
->name
) == len
11998 && strncmp (arch
->name
, to_parse
, len
) == 0)
12000 uint64_t isa_temp
= arch
->flags
;
12004 /* TO_PARSE string contains at least one extension. */
12005 enum aarch64_parse_opt_result ext_res
12006 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
12008 if (ext_res
!= AARCH64_PARSE_OK
)
12011 /* Extension parsing was successful. Confirm the result
12012 arch and ISA flags. */
12014 *isa_flags
= isa_temp
;
12015 return AARCH64_PARSE_OK
;
12019 /* ARCH name not found in list. */
12020 return AARCH64_PARSE_INVALID_ARG
;
12023 /* Parse the TO_PARSE string and put the result tuning in RES and the
12024 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
12025 describing the parse result. If there is an error parsing, RES and
12026 ISA_FLAGS are left unchanged.
12027 When the TO_PARSE string contains an invalid extension,
12028 a copy of the string is created and stored to INVALID_EXTENSION. */
12030 static enum aarch64_parse_opt_result
12031 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
12032 uint64_t *isa_flags
, std::string
*invalid_extension
)
12035 const struct processor
*cpu
;
12038 ext
= strchr (to_parse
, '+');
12041 len
= ext
- to_parse
;
12043 len
= strlen (to_parse
);
12046 return AARCH64_PARSE_MISSING_ARG
;
12049 /* Loop through the list of supported CPUs to find a match. */
12050 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
12052 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
12054 uint64_t isa_temp
= cpu
->flags
;
12059 /* TO_PARSE string contains at least one extension. */
12060 enum aarch64_parse_opt_result ext_res
12061 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
12063 if (ext_res
!= AARCH64_PARSE_OK
)
	  /* Extension parsing was successful.  Confirm the result
	     cpu and ISA flags.  */
12069 *isa_flags
= isa_temp
;
12070 return AARCH64_PARSE_OK
;
12074 /* CPU name not found in list. */
12075 return AARCH64_PARSE_INVALID_ARG
;
12078 /* Parse the TO_PARSE string and put the cpu it selects into RES.
12079 Return an aarch64_parse_opt_result describing the parse result.
12080 If the parsing fails the RES does not change. */
12082 static enum aarch64_parse_opt_result
12083 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
12085 const struct processor
*cpu
;
12087 /* Loop through the list of supported CPUs to find a match. */
12088 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
12090 if (strcmp (cpu
->name
, to_parse
) == 0)
12093 return AARCH64_PARSE_OK
;
12097 /* CPU name not found in list. */
12098 return AARCH64_PARSE_INVALID_ARG
;
12101 /* Parse TOKEN, which has length LENGTH to see if it is an option
12102 described in FLAG. If it is, return the index bit for that fusion type.
12103 If not, error (printing OPTION_NAME) and return zero. */
12105 static unsigned int
12106 aarch64_parse_one_option_token (const char *token
,
12108 const struct aarch64_flag_desc
*flag
,
12109 const char *option_name
)
12111 for (; flag
->name
!= NULL
; flag
++)
12113 if (length
== strlen (flag
->name
)
12114 && !strncmp (flag
->name
, token
, length
))
12118 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name
, token
);
12122 /* Parse OPTION which is a comma-separated list of flags to enable.
12123 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12124 default state we inherit from the CPU tuning structures. OPTION_NAME
12125 gives the top-level option we are parsing in the -moverride string,
12126 for use in error messages. */
12128 static unsigned int
12129 aarch64_parse_boolean_options (const char *option
,
12130 const struct aarch64_flag_desc
*flags
,
12131 unsigned int initial_state
,
12132 const char *option_name
)
12134 const char separator
= '.';
12135 const char* specs
= option
;
12136 const char* ntoken
= option
;
12137 unsigned int found_flags
= initial_state
;
12139 while ((ntoken
= strchr (specs
, separator
)))
12141 size_t token_length
= ntoken
- specs
;
12142 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
12146 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12147 in the token stream, reset the supported operations. So:
12149 adrp+add.cmp+branch.none.adrp+add
12151 would have the result of turning on only adrp+add fusion. */
12155 found_flags
|= token_ops
;
12159 /* We ended with a comma, print something. */
12162 error ("%s string ill-formed\n", option_name
);
12166 /* We still have one more token to parse. */
12167 size_t token_length
= strlen (specs
);
12168 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
12175 found_flags
|= token_ops
;
12176 return found_flags
;
12179 /* Support for overriding instruction fusion. */
12182 aarch64_parse_fuse_string (const char *fuse_string
,
12183 struct tune_params
*tune
)
12185 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
12186 aarch64_fusible_pairs
,
12191 /* Support for overriding other tuning flags. */
12194 aarch64_parse_tune_string (const char *tune_string
,
12195 struct tune_params
*tune
)
12197 tune
->extra_tuning_flags
12198 = aarch64_parse_boolean_options (tune_string
,
12199 aarch64_tuning_flags
,
12200 tune
->extra_tuning_flags
,
12204 /* Parse the sve_width tuning moverride string in TUNE_STRING.
12205 Accept the valid SVE vector widths allowed by
12206 aarch64_sve_vector_bits_enum and use it to override sve_width
12210 aarch64_parse_sve_width_string (const char *tune_string
,
12211 struct tune_params
*tune
)
12215 int n
= sscanf (tune_string
, "%d", &width
);
12218 error ("invalid format for sve_width");
12230 error ("invalid sve_width value: %d", width
);
12232 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
12235 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
12236 we understand. If it is, extract the option string and handoff to
12237 the appropriate function. */
12240 aarch64_parse_one_override_token (const char* token
,
12242 struct tune_params
*tune
)
12244 const struct aarch64_tuning_override_function
*fn
12245 = aarch64_tuning_override_functions
;
12247 const char *option_part
= strchr (token
, '=');
12250 error ("tuning string missing in option (%s)", token
);
12254 /* Get the length of the option name. */
12255 length
= option_part
- token
;
12256 /* Skip the '=' to get to the option string. */
12259 for (; fn
->name
!= NULL
; fn
++)
12261 if (!strncmp (fn
->name
, token
, length
))
12263 fn
->parse_override (option_part
, tune
);
12268 error ("unknown tuning option (%s)",token
);
12272 /* A checking mechanism for the implementation of the tls size. */
12275 initialize_aarch64_tls_size (struct gcc_options
*opts
)
12277 if (aarch64_tls_size
== 0)
12278 aarch64_tls_size
= 24;
12280 switch (opts
->x_aarch64_cmodel_var
)
12282 case AARCH64_CMODEL_TINY
:
12283 /* Both the default and maximum TLS size allowed under tiny is 1M which
12284 needs two instructions to address, so we clamp the size to 24. */
12285 if (aarch64_tls_size
> 24)
12286 aarch64_tls_size
= 24;
12288 case AARCH64_CMODEL_SMALL
:
12289 /* The maximum TLS size allowed under small is 4G. */
12290 if (aarch64_tls_size
> 32)
12291 aarch64_tls_size
= 32;
12293 case AARCH64_CMODEL_LARGE
:
12294 /* The maximum TLS size allowed under large is 16E.
12295 FIXME: 16E should be 64bit, we only support 48bit offset now. */
12296 if (aarch64_tls_size
> 48)
12297 aarch64_tls_size
= 48;
12300 gcc_unreachable ();
12306 /* Parse STRING looking for options in the format:
12307 string :: option:string
12308 option :: name=substring
12310 substring :: defined by option. */
12313 aarch64_parse_override_string (const char* input_string
,
12314 struct tune_params
* tune
)
12316 const char separator
= ':';
12317 size_t string_length
= strlen (input_string
) + 1;
12318 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
12319 char *string
= string_root
;
12320 strncpy (string
, input_string
, string_length
);
12321 string
[string_length
- 1] = '\0';
12323 char* ntoken
= string
;
12325 while ((ntoken
= strchr (string
, separator
)))
12327 size_t token_length
= ntoken
- string
;
12328 /* Make this substring look like a string. */
12330 aarch64_parse_one_override_token (string
, token_length
, tune
);
12334 /* One last option to parse. */
12335 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
12336 free (string_root
);
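/* Illustrative example of the syntax accepted above (the option names shown
   are assumptions based on the parsers in this file, not an exhaustive
   list):
     -moverride=fuse=adrp+add.cmp+branch:sve_width=256
   is split at ':' into "fuse=adrp+add.cmp+branch" and "sve_width=256", and
   each token is then dispatched to the matching parse routine.  */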
12341 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
12343 if (accepted_branch_protection_string
)
12345 opts
->x_aarch64_branch_protection_string
12346 = xstrdup (accepted_branch_protection_string
);
12349 /* PR 70044: We have to be careful about being called multiple times for the
12350 same function. This means all changes should be repeatable. */
12352 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12353 Disable the frame pointer flag so the mid-end will not use a frame
12354 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12355 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12356 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12357 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
12358 if (opts
->x_flag_omit_frame_pointer
== 0)
12359 opts
->x_flag_omit_frame_pointer
= 2;
12361 /* If not optimizing for size, set the default
12362 alignment to what the target wants. */
12363 if (!opts
->x_optimize_size
)
12365 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
12366 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
12367 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
12368 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
12369 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
12370 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
12373 /* We default to no pc-relative literal loads. */
12375 aarch64_pcrelative_literal_loads
= false;
12377 /* If -mpc-relative-literal-loads is set on the command line, this
12378 implies that the user asked for PC relative literal loads. */
12379 if (opts
->x_pcrelative_literal_loads
== 1)
12380 aarch64_pcrelative_literal_loads
= true;
12382 /* In the tiny memory model it makes no sense to disallow PC relative
12383 literal pool loads. */
12384 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
12385 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
12386 aarch64_pcrelative_literal_loads
= true;
12388 /* When enabling the lower precision Newton series for the square root, also
12389 enable it for the reciprocal square root, since the latter is an
12390 intermediary step for the former. */
12391 if (flag_mlow_precision_sqrt
)
12392 flag_mrecip_low_precision_sqrt
= true;
/* 'Unpack' up the internal tuning structs and update the options
    in OPTS.  The caller must have set up selected_tune and selected_arch
    as all the other target-specific codegen decisions are
    derived from them.  */

void
aarch64_override_options_internal (struct gcc_options *opts)
{
  aarch64_tune_flags = selected_tune->flags;
  aarch64_tune = selected_tune->sched_core;
  /* Make a copy of the tuning parameters attached to the core, which
     we may later overwrite.  */
  aarch64_tune_params = *(selected_tune->tune);
  aarch64_architecture_version = selected_arch->architecture_version;

  if (opts->x_aarch64_override_tune_string)
    aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
				   &aarch64_tune_params);

  /* This target defaults to strict volatile bitfields.  */
  if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
    opts->x_flag_strict_volatile_bitfields = 1;

  if (aarch64_stack_protector_guard == SSP_GLOBAL
      && opts->x_aarch64_stack_protector_guard_offset_str)
    {
      error ("incompatible options %<-mstack-protector-guard=global%> and "
	     "%<-mstack-protector-guard-offset=%s%>",
	     aarch64_stack_protector_guard_offset_str);
    }

  if (aarch64_stack_protector_guard == SSP_SYSREG
      && !(opts->x_aarch64_stack_protector_guard_offset_str
	   && opts->x_aarch64_stack_protector_guard_reg_str))
    {
      error ("both %<-mstack-protector-guard-offset%> and "
	     "%<-mstack-protector-guard-reg%> must be used "
	     "with %<-mstack-protector-guard=sysreg%>");
    }

  if (opts->x_aarch64_stack_protector_guard_reg_str)
    {
      if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
	error ("specify a system register with a small string length.");
    }

  if (opts->x_aarch64_stack_protector_guard_offset_str)
    {
      char *end;
      const char *str = aarch64_stack_protector_guard_offset_str;
      errno = 0;
      long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
      if (!*str || *end || errno)
	error ("%qs is not a valid offset in %qs", str,
	       "-mstack-protector-guard-offset=");
      aarch64_stack_protector_guard_offset = offs;
    }

  initialize_aarch64_code_model (opts);
  initialize_aarch64_tls_size (opts);

  int queue_depth = 0;
  switch (aarch64_tune_params.autoprefetcher_model)
    {
      case tune_params::AUTOPREFETCHER_OFF:
	queue_depth = -1;
	break;
      case tune_params::AUTOPREFETCHER_WEAK:
	queue_depth = 0;
	break;
      case tune_params::AUTOPREFETCHER_STRONG:
	queue_depth = max_insn_queue_index + 1;
	break;
      default:
	gcc_unreachable ();
    }

  /* We don't mind passing in global_options_set here as we don't use
     the *options_set structs anyway.  */
  maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
			 queue_depth,
			 opts->x_param_values,
			 global_options_set.x_param_values);

  /* Set up parameters to be used in prefetching algorithm.  Do not
     override the defaults unless we are tuning for a core we have
     researched values for.  */
  if (aarch64_tune_params.prefetch->num_slots > 0)
    maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
			   aarch64_tune_params.prefetch->num_slots,
			   opts->x_param_values,
			   global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
    maybe_set_param_value (PARAM_L1_CACHE_SIZE,
			   aarch64_tune_params.prefetch->l1_cache_size,
			   opts->x_param_values,
			   global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
    maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
			   aarch64_tune_params.prefetch->l1_cache_line_size,
			   opts->x_param_values,
			   global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
    maybe_set_param_value (PARAM_L2_CACHE_SIZE,
			   aarch64_tune_params.prefetch->l2_cache_size,
			   opts->x_param_values,
			   global_options_set.x_param_values);
  if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
    maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
			   0,
			   opts->x_param_values,
			   global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->minimum_stride >= 0)
    maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
			   aarch64_tune_params.prefetch->minimum_stride,
			   opts->x_param_values,
			   global_options_set.x_param_values);

  /* Use the alternative scheduling-pressure algorithm by default.  */
  maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
			 opts->x_param_values,
			 global_options_set.x_param_values);

  /* If the user hasn't changed it via configure then set the default to 64 KB
     for the backend.  */
  maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
			 DEFAULT_STK_CLASH_GUARD_SIZE == 0
			   ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
			 opts->x_param_values,
			 global_options_set.x_param_values);

  /* Validate the guard size.  */
  int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);

  /* Enforce that interval is the same size as size so the mid-end does the
     right thing.  */
  maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
			 guard_size,
			 opts->x_param_values,
			 global_options_set.x_param_values);

  /* The maybe_set calls won't update the value if the user has explicitly set
     one.  Which means we need to validate that probing interval and guard size
     are equal.  */
  int probe_interval
    = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
  if (guard_size != probe_interval)
    error ("stack clash guard size %<%d%> must be equal to probing interval "
	   "%<%d%>", guard_size, probe_interval);

  /* Enable sw prefetching at specified optimization level for
     CPUS that have prefetch.  Lower optimization level threshold by 1
     when profiling is enabled.  */
  if (opts->x_flag_prefetch_loop_arrays < 0
      && !opts->x_optimize_size
      && aarch64_tune_params.prefetch->default_opt_level >= 0
      && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
    opts->x_flag_prefetch_loop_arrays = 1;

  if (opts->x_aarch64_arch_string == NULL)
    opts->x_aarch64_arch_string = selected_arch->name;
  if (opts->x_aarch64_cpu_string == NULL)
    opts->x_aarch64_cpu_string = selected_cpu->name;
  if (opts->x_aarch64_tune_string == NULL)
    opts->x_aarch64_tune_string = selected_tune->name;

  aarch64_override_options_after_change_1 (opts);
}
/* Print a hint with a suggestion for a core or architecture name that
   most closely resembles what the user passed in STR.  ARCH is true if
   the user is asking for an architecture name.  ARCH is false if the user
   is asking for a core name.  */

static void
aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
{
  auto_vec<const char *> candidates;
  const struct processor *entry = arch ? all_architectures : all_cores;
  for (; entry->name != NULL; entry++)
    candidates.safe_push (entry->name);

#ifdef HAVE_LOCAL_CPU_DETECT
  /* Add also "native" as possible value.  */
  if (arch)
    candidates.safe_push ("native");
#endif

  char *s;
  const char *hint = candidates_list_and_hint (str, s, candidates);
  if (hint)
    inform (input_location, "valid arguments are: %s;"
	    " did you mean %qs?", s, hint);
  else
    inform (input_location, "valid arguments are: %s", s);

  XDELETEVEC (s);
}

/* Print a hint with a suggestion for a core name that most closely resembles
   what the user passed in STR.  */

inline static void
aarch64_print_hint_for_core (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, false);
}

/* Print a hint with a suggestion for an architecture name that most closely
   resembles what the user passed in STR.  */

inline static void
aarch64_print_hint_for_arch (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, true);
}


/* Print a hint with a suggestion for an extension name
   that most closely resembles what the user passed in STR.  */

static void
aarch64_print_hint_for_extensions (const std::string &str)
{
  auto_vec<const char *> candidates;
  aarch64_get_all_extension_candidates (&candidates);
  char *s;
  const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
  if (hint)
    inform (input_location, "valid arguments are: %s;"
	    " did you mean %qs?", s, hint);
  else
    inform (input_location, "valid arguments are: %s;", s);

  XDELETEVEC (s);
}
/* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
   specified in STR and throw errors if appropriate.  Put the results if
   they are valid in RES and ISA_FLAGS.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mcpu (const char *str, const struct processor **res,
		       uint64_t *isa_flags)
{
  std::string invalid_extension;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing cpu name in %<-mcpu=%s%>", str);
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for %<-mcpu%>", str);
      aarch64_print_hint_for_core (str);
      break;
    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %qs in %<-mcpu=%s%>",
	     invalid_extension.c_str (), str);
      aarch64_print_hint_for_extensions (invalid_extension);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
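/* Illustrative -mcpu strings accepted by the parser above (exposition only,
   not from the upstream sources): "-mcpu=cortex-a57" selects the Cortex-A57
   tuning and its default ISA flags, and "-mcpu=cortex-a57+crypto" additionally
   toggles the named feature modifier; an unknown name falls into
   AARCH64_PARSE_INVALID_ARG and triggers a spelling hint via
   aarch64_print_hint_for_core.  */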
/* Parses CONST_STR for branch protection features specified in
   aarch64_branch_protect_types, and set any global variables required.  Returns
   the parsing result and assigns LAST_STR to the last processed token from
   CONST_STR so that it can be used for error reporting.  */

static enum
aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
							   char** last_str)
{
  char *str_root = xstrdup (const_str);
  char* token_save = NULL;
  char *str = strtok_r (str_root, "+", &token_save);
  enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
  if (!str)
    res = AARCH64_PARSE_MISSING_ARG;
  else
    {
      char *next_str = strtok_r (NULL, "+", &token_save);
      /* Reset the branch protection features to their defaults.  */
      aarch64_handle_no_branch_protection (NULL, NULL);

      while (str && res == AARCH64_PARSE_OK)
	{
	  const aarch64_branch_protect_type *type = aarch64_branch_protect_types;
	  bool found = false;
	  /* Search for this type.  */
	  while (type && type->name && !found && res == AARCH64_PARSE_OK)
	    {
	      if (strcmp (str, type->name) == 0)
		{
		  found = true;
		  res = type->handler (str, next_str);
		  str = next_str;
		  next_str = strtok_r (NULL, "+", &token_save);
		}
	      else
		type++;
	    }
	  if (found && res == AARCH64_PARSE_OK)
	    {
	      bool found_subtype = true;
	      /* Loop through each token until we find one that isn't a
		 subtype.  */
	      while (found_subtype)
		{
		  found_subtype = false;
		  const aarch64_branch_protect_type *subtype = type->subtypes;
		  /* Search for the subtype.  */
		  while (str && subtype && subtype->name && !found_subtype
			 && res == AARCH64_PARSE_OK)
		    {
		      if (strcmp (str, subtype->name) == 0)
			{
			  found_subtype = true;
			  res = subtype->handler (str, next_str);
			  str = next_str;
			  next_str = strtok_r (NULL, "+", &token_save);
			}
		      else
			subtype++;
		    }
		}
	    }
	  else if (!found)
	    res = AARCH64_PARSE_INVALID_ARG;
	}
    }
  /* Copy the last processed token into the argument to pass it back.
     Used by option and attribute validation to print the offending token.  */
  if (last_str)
    {
      if (str) strcpy (*last_str, str);
      else *last_str = NULL;
    }
  if (res == AARCH64_PARSE_OK)
    {
      /* If needed, alloc the accepted string then copy in const_str.
	 Used by override_option_after_change_1.  */
      if (!accepted_branch_protection_string)
	accepted_branch_protection_string = (char *) xmalloc (
						      BRANCH_PROTECT_STR_MAX
							+ 1);
      strncpy (accepted_branch_protection_string, const_str,
	       BRANCH_PROTECT_STR_MAX + 1);
      /* Forcibly null-terminate.  */
      accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
    }
  return res;
}

static bool
aarch64_validate_mbranch_protection (const char *const_str)
{
  char *str = (char *) xmalloc (strlen (const_str));
  enum aarch64_parse_opt_result res =
    aarch64_parse_branch_protection (const_str, &str);
  if (res == AARCH64_PARSE_INVALID_ARG)
    error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
  else if (res == AARCH64_PARSE_MISSING_ARG)
    error ("missing argument for %<-mbranch-protection=%>");
  return res == AARCH64_PARSE_OK;
}
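/* Illustrative -mbranch-protection strings handled above (exposition only,
   not from the upstream sources): "none" resets everything, "standard"
   enables the full set, and a composite such as "pac-ret+leaf+bti" is
   tokenized on '+' so that "pac-ret" matches a top-level type, "leaf" matches
   one of its subtypes, and "bti" then matches the next top-level type.  */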
/* Validate a command-line -march option.  Parse the arch and extensions
   (if any) specified in STR and throw errors if appropriate.  Put the
   results, if they are valid, in RES and ISA_FLAGS.  Return whether the
   option is valid.  */

static bool
aarch64_validate_march (const char *str, const struct processor **res,
			uint64_t *isa_flags)
{
  std::string invalid_extension;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing arch name in %<-march=%s%>", str);
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for %<-march%>", str);
      aarch64_print_hint_for_arch (str);
      break;
    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %qs in %<-march=%s%>",
	     invalid_extension.c_str (), str);
      aarch64_print_hint_for_extensions (invalid_extension);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}

/* Validate a command-line -mtune option.  Parse the cpu
   specified in STR and throw errors if appropriate.  Put the
   result, if it is valid, in RES.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mtune (const char *str, const struct processor **res)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, res);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing cpu name in %<-mtune=%s%>", str);
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for %<-mtune%>", str);
      aarch64_print_hint_for_core (str);
      break;
    default:
      gcc_unreachable ();
    }
  return false;
}
/* Return the CPU corresponding to the enum CPU.
   If it doesn't specify a cpu, return the default.  */

static const struct processor *
aarch64_get_tune_cpu (enum aarch64_processor cpu)
{
  if (cpu != aarch64_none)
    return &all_cores[cpu];

  /* The & 0x3f is to extract the bottom 6 bits that encode the
     default cpu as selected by the --with-cpu GCC configure option
     in config.gcc.
     ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
     flags mechanism should be reworked to make it more sane.  */
  return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
}

/* Return the architecture corresponding to the enum ARCH.
   If it doesn't specify a valid architecture, return the default.  */

static const struct processor *
aarch64_get_arch (enum aarch64_arch arch)
{
  if (arch != aarch64_no_arch)
    return &all_architectures[arch];

  const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];

  return &all_architectures[cpu->arch];
}

/* Return the VG value associated with -msve-vector-bits= value VALUE.  */

static poly_uint16
aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
{
  /* For now generate vector-length agnostic code for -msve-vector-bits=128.
     This ensures we can clearly distinguish SVE and Advanced SIMD modes when
     deciding which .md file patterns to use and when deciding whether
     something is a legitimate address or constant.  */
  if (value == SVE_SCALABLE || value == SVE_128)
    return poly_uint16 (2, 2);

  return (int) value / 64;
}
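/* Worked examples for the conversion above (exposition only, derived from the
   code itself): -msve-vector-bits=256 yields 256 / 64 = 4, i.e. a constant VG
   of 4; -msve-vector-bits=512 yields VG 8; and both "scalable" and (for now)
   128 yield the runtime-variable poly_uint16 (2, 2).  */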
/* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
   and is used to parse the -m{cpu,tune,arch} strings and setup the initial
   tuning structs.  In particular it must set selected_tune and
   aarch64_isa_flags that define the available ISA features and tuning
   decisions.  It must also set selected_arch as this will be used to
   output the .arch asm tags for each function.  */

static void
aarch64_override_options (void)
{
  uint64_t cpu_isa = 0;
  uint64_t arch_isa = 0;
  aarch64_isa_flags = 0;

  bool valid_cpu = true;
  bool valid_tune = true;
  bool valid_arch = true;

  selected_cpu = NULL;
  selected_arch = NULL;
  selected_tune = NULL;

  if (aarch64_branch_protection_string)
    aarch64_validate_mbranch_protection (aarch64_branch_protection_string);

  /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
     If either of -march or -mtune is given, they override their
     respective component of -mcpu.  */
  if (aarch64_cpu_string)
    valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
				       &cpu_isa);

  if (aarch64_arch_string)
    valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
					 &arch_isa);

  if (aarch64_tune_string)
    valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);

#ifdef SUBTARGET_OVERRIDE_OPTIONS
  SUBTARGET_OVERRIDE_OPTIONS;
#endif

  /* If the user did not specify a processor, choose the default
     one for them.  This will be the CPU set during configuration using
     --with-cpu, otherwise it is "generic".  */
  if (!selected_cpu)
    {
      if (selected_arch)
	{
	  selected_cpu = &all_cores[selected_arch->ident];
	  aarch64_isa_flags = arch_isa;
	  explicit_arch = selected_arch->arch;
	}
      else
	{
	  /* Get default configure-time CPU.  */
	  selected_cpu = aarch64_get_tune_cpu (aarch64_none);
	  aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
	}

      if (selected_tune)
	explicit_tune_core = selected_tune->ident;
    }
  /* If both -mcpu and -march are specified check that they are architecturally
     compatible, warn if they're not and prefer the -march ISA flags.  */
  else if (selected_arch)
    {
      if (selected_arch->arch != selected_cpu->arch)
	{
	  warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
		   all_architectures[selected_cpu->arch].name,
		   selected_arch->name);
	}
      aarch64_isa_flags = arch_isa;
      explicit_arch = selected_arch->arch;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;
    }
  else
    {
      /* -mcpu but no -march.  */
      aarch64_isa_flags = cpu_isa;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];
      explicit_arch = selected_arch->arch;
    }

  /* Set the arch as well as we will need it when outputing
     the .arch directive in assembly.  */
  if (!selected_arch)
    {
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];
    }

  if (!selected_tune)
    selected_tune = selected_cpu;

  if (aarch64_enable_bti == 2)
    {
#ifdef TARGET_ENABLE_BTI
      aarch64_enable_bti = 1;
#else
      aarch64_enable_bti = 0;
#endif
    }

  /* Return address signing is currently not supported for ILP32 targets.  For
     LP64 targets use the configured option in the absence of a command-line
     option for -mbranch-protection.  */
  if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
    {
#ifdef TARGET_ENABLE_PAC_RET
      aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
#else
      aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
#endif
    }

#ifndef HAVE_AS_MABI_OPTION
  /* The compiler may have been configured with 2.23.* binutils, which does
     not have support for ILP32.  */
  if (TARGET_ILP32)
    error ("assembler does not support %<-mabi=ilp32%>");
#endif

  /* Convert -msve-vector-bits to a VG count.  */
  aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);

  if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
    sorry ("return address signing is only supported for %<-mabi=lp64%>");

  /* Make sure we properly set up the explicit options.  */
  if ((aarch64_cpu_string && valid_cpu)
       || (aarch64_tune_string && valid_tune))
    gcc_assert (explicit_tune_core != aarch64_none);

  if ((aarch64_cpu_string && valid_cpu)
       || (aarch64_arch_string && valid_arch))
    gcc_assert (explicit_arch != aarch64_no_arch);

  /* The pass to insert speculation tracking runs before
     shrink-wrapping and the latter does not know how to update the
     tracking status.  So disable it in this case.  */
  if (aarch64_track_speculation)
    flag_shrink_wrap = 0;

  aarch64_override_options_internal (&global_options);

  /* Save these options as the default ones in case we push and pop them later
     while processing functions with potential target attributes.  */
  target_option_default_node = target_option_current_node
    = build_target_option_node (&global_options);
}
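/* Illustrative command lines and the selections made above (exposition only,
   not from the upstream sources):
     -mcpu=cortex-a72                    arch, tuning and ISA flags all come
					 from the CPU entry
     -mcpu=cortex-a72 -march=armv8.2-a   the -march ISA flags win, a warning is
					 issued because the architectures
					 differ, tuning stays cortex-a72
     (nothing given)                     the configure-time --with-cpu default,
					 otherwise "generic".  */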
/* Implement targetm.override_options_after_change.  */

static void
aarch64_override_options_after_change (void)
{
  aarch64_override_options_after_change_1 (&global_options);
}

static struct machine_function *
aarch64_init_machine_status (void)
{
  struct machine_function *machine;
  machine = ggc_cleared_alloc<machine_function> ();
  return machine;
}

void
aarch64_init_expanders (void)
{
  init_machine_status = aarch64_init_machine_status;
}

/* A checking mechanism for the implementation of the various code models.  */
static void
initialize_aarch64_code_model (struct gcc_options *opts)
{
  if (opts->x_flag_pic)
    {
      switch (opts->x_aarch64_cmodel_var)
	{
	case AARCH64_CMODEL_TINY:
	  aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
	  break;
	case AARCH64_CMODEL_SMALL:
#ifdef HAVE_AS_SMALL_PIC_RELOCS
	  aarch64_cmodel = (flag_pic == 2
			    ? AARCH64_CMODEL_SMALL_PIC
			    : AARCH64_CMODEL_SMALL_SPIC);
#else
	  aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
#endif
	  break;
	case AARCH64_CMODEL_LARGE:
	  sorry ("code model %qs with %<-f%s%>", "large",
		 opts->x_flag_pic > 1 ? "PIC" : "pic");
	  break;
	default:
	  gcc_unreachable ();
	}
    }
  else
    aarch64_cmodel = opts->x_aarch64_cmodel_var;
}
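/* Illustrative mapping performed above (exposition only, derived from the
   code itself): with -fpic/-fPIC, -mcmodel=tiny becomes
   AARCH64_CMODEL_TINY_PIC; -mcmodel=small becomes AARCH64_CMODEL_SMALL_PIC
   under -fPIC (flag_pic == 2) or AARCH64_CMODEL_SMALL_SPIC under -fpic when
   HAVE_AS_SMALL_PIC_RELOCS is defined; and -mcmodel=large is rejected with a
   "sorry".  Without -fpic the user's choice is used unchanged.  */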
13097 /* Implement TARGET_OPTION_SAVE. */
13100 aarch64_option_save (struct cl_target_option
*ptr
, struct gcc_options
*opts
)
13102 ptr
->x_aarch64_override_tune_string
= opts
->x_aarch64_override_tune_string
;
13103 ptr
->x_aarch64_branch_protection_string
13104 = opts
->x_aarch64_branch_protection_string
;
13107 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13108 using the information saved in PTR. */
13111 aarch64_option_restore (struct gcc_options
*opts
, struct cl_target_option
*ptr
)
13113 opts
->x_explicit_tune_core
= ptr
->x_explicit_tune_core
;
13114 selected_tune
= aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
13115 opts
->x_explicit_arch
= ptr
->x_explicit_arch
;
13116 selected_arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
13117 opts
->x_aarch64_override_tune_string
= ptr
->x_aarch64_override_tune_string
;
13118 opts
->x_aarch64_branch_protection_string
13119 = ptr
->x_aarch64_branch_protection_string
;
13120 if (opts
->x_aarch64_branch_protection_string
)
13122 aarch64_parse_branch_protection (opts
->x_aarch64_branch_protection_string
,
13126 aarch64_override_options_internal (opts
);
13129 /* Implement TARGET_OPTION_PRINT. */
13132 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
13134 const struct processor
*cpu
13135 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
13136 uint64_t isa_flags
= ptr
->x_aarch64_isa_flags
;
13137 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
13138 std::string extension
13139 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
13141 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
13142 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
13143 arch
->name
, extension
.c_str ());
13146 static GTY(()) tree aarch64_previous_fndecl
;
13149 aarch64_reset_previous_fndecl (void)
13151 aarch64_previous_fndecl
= NULL
;
13154 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13155 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13156 make sure optab availability predicates are recomputed when necessary. */
13159 aarch64_save_restore_target_globals (tree new_tree
)
13161 if (TREE_TARGET_GLOBALS (new_tree
))
13162 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
13163 else if (new_tree
== target_option_default_node
)
13164 restore_target_globals (&default_target_globals
);
13166 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
13169 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13170 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13171 of the function, if such exists. This function may be called multiple
13172 times on a single function so use aarch64_previous_fndecl to avoid
13173 setting up identical state. */
13176 aarch64_set_current_function (tree fndecl
)
13178 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
13181 tree old_tree
= (aarch64_previous_fndecl
13182 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
13185 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
13187 /* If current function has no attributes but the previous one did,
13188 use the default node. */
13189 if (!new_tree
&& old_tree
)
13190 new_tree
= target_option_default_node
;
13192 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13193 the default have been handled by aarch64_save_restore_target_globals from
13194 aarch64_pragma_target_parse. */
13195 if (old_tree
== new_tree
)
13198 aarch64_previous_fndecl
= fndecl
;
13200 /* First set the target options. */
13201 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
13203 aarch64_save_restore_target_globals (new_tree
);
13206 /* Enum describing the various ways we can handle attributes.
13207 In many cases we can reuse the generic option handling machinery. */
13209 enum aarch64_attr_opt_type
13211 aarch64_attr_mask
, /* Attribute should set a bit in target_flags. */
13212 aarch64_attr_bool
, /* Attribute sets or unsets a boolean variable. */
13213 aarch64_attr_enum
, /* Attribute sets an enum variable. */
13214 aarch64_attr_custom
/* Attribute requires a custom handling function. */
13217 /* All the information needed to handle a target attribute.
13218 NAME is the name of the attribute.
13219 ATTR_TYPE specifies the type of behavior of the attribute as described
13220 in the definition of enum aarch64_attr_opt_type.
13221 ALLOW_NEG is true if the attribute supports a "no-" form.
13222 HANDLER is the function that takes the attribute string as an argument
13223 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13224 OPT_NUM is the enum specifying the option that the attribute modifies.
13225 This is needed for attributes that mirror the behavior of a command-line
13226 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
13227 aarch64_attr_enum. */
13229 struct aarch64_attribute_info
13232 enum aarch64_attr_opt_type attr_type
;
13234 bool (*handler
) (const char *);
13235 enum opt_code opt_num
;
13238 /* Handle the ARCH_STR argument to the arch= target attribute. */
13241 aarch64_handle_attr_arch (const char *str
)
13243 const struct processor
*tmp_arch
= NULL
;
13244 std::string invalid_extension
;
13245 enum aarch64_parse_opt_result parse_res
13246 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
, &invalid_extension
);
13248 if (parse_res
== AARCH64_PARSE_OK
)
13250 gcc_assert (tmp_arch
);
13251 selected_arch
= tmp_arch
;
13252 explicit_arch
= selected_arch
->arch
;
13258 case AARCH64_PARSE_MISSING_ARG
:
13259 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13261 case AARCH64_PARSE_INVALID_ARG
:
13262 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str
);
13263 aarch64_print_hint_for_arch (str
);
13265 case AARCH64_PARSE_INVALID_FEATURE
:
13266 error ("invalid feature modifier %s of value (\"%s\") in "
13267 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
13268 aarch64_print_hint_for_extensions (invalid_extension
);
13271 gcc_unreachable ();
13277 /* Handle the argument CPU_STR to the cpu= target attribute. */
13280 aarch64_handle_attr_cpu (const char *str
)
13282 const struct processor
*tmp_cpu
= NULL
;
13283 std::string invalid_extension
;
13284 enum aarch64_parse_opt_result parse_res
13285 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
, &invalid_extension
);
13287 if (parse_res
== AARCH64_PARSE_OK
)
13289 gcc_assert (tmp_cpu
);
13290 selected_tune
= tmp_cpu
;
13291 explicit_tune_core
= selected_tune
->ident
;
13293 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
13294 explicit_arch
= selected_arch
->arch
;
13300 case AARCH64_PARSE_MISSING_ARG
:
13301 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13303 case AARCH64_PARSE_INVALID_ARG
:
13304 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str
);
13305 aarch64_print_hint_for_core (str
);
13307 case AARCH64_PARSE_INVALID_FEATURE
:
13308 error ("invalid feature modifier %s of value (\"%s\") in "
13309 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
13310 aarch64_print_hint_for_extensions (invalid_extension
);
13313 gcc_unreachable ();
13319 /* Handle the argument STR to the branch-protection= attribute. */
13322 aarch64_handle_attr_branch_protection (const char* str
)
13324 char *err_str
= (char *) xmalloc (strlen (str
));
13325 enum aarch64_parse_opt_result res
= aarch64_parse_branch_protection (str
,
13327 bool success
= false;
13330 case AARCH64_PARSE_MISSING_ARG
:
13331 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13334 case AARCH64_PARSE_INVALID_ARG
:
13335 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13336 "=\")%> pragma or attribute", err_str
);
13338 case AARCH64_PARSE_OK
:
13340 /* Fall through. */
13341 case AARCH64_PARSE_INVALID_FEATURE
:
13344 gcc_unreachable ();
13350 /* Handle the argument STR to the tune= target attribute. */
13353 aarch64_handle_attr_tune (const char *str
)
13355 const struct processor
*tmp_tune
= NULL
;
13356 enum aarch64_parse_opt_result parse_res
13357 = aarch64_parse_tune (str
, &tmp_tune
);
13359 if (parse_res
== AARCH64_PARSE_OK
)
13361 gcc_assert (tmp_tune
);
13362 selected_tune
= tmp_tune
;
13363 explicit_tune_core
= selected_tune
->ident
;
13369 case AARCH64_PARSE_INVALID_ARG
:
13370 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str
);
13371 aarch64_print_hint_for_core (str
);
13374 gcc_unreachable ();
13380 /* Parse an architecture extensions target attribute string specified in STR.
13381 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13382 if successful. Update aarch64_isa_flags to reflect the ISA features
13386 aarch64_handle_attr_isa_flags (char *str
)
13388 enum aarch64_parse_opt_result parse_res
;
13389 uint64_t isa_flags
= aarch64_isa_flags
;
13391 /* We allow "+nothing" in the beginning to clear out all architectural
13392 features if the user wants to handpick specific features. */
13393 if (strncmp ("+nothing", str
, 8) == 0)
13399 std::string invalid_extension
;
13400 parse_res
= aarch64_parse_extension (str
, &isa_flags
, &invalid_extension
);
13402 if (parse_res
== AARCH64_PARSE_OK
)
13404 aarch64_isa_flags
= isa_flags
;
13410 case AARCH64_PARSE_MISSING_ARG
:
13411 error ("missing value in %<target()%> pragma or attribute");
13414 case AARCH64_PARSE_INVALID_FEATURE
:
13415 error ("invalid feature modifier %s of value (\"%s\") in "
13416 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
13420 gcc_unreachable ();
/* The target attributes that we support.  On top of these we also support just
   ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
   handled explicitly in aarch64_process_one_target_attr.  */

static const struct aarch64_attribute_info aarch64_attributes[] =
{
  { "general-regs-only", aarch64_attr_mask, false, NULL,
     OPT_mgeneral_regs_only },
  { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_835769 },
  { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_843419 },
  { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
  { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
  { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
     OPT_momit_leaf_frame_pointer },
  { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
  { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
     OPT_march_ },
  { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
  { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
     OPT_mtune_ },
  { "branch-protection", aarch64_attr_custom, false,
     aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
  { "sign-return-address", aarch64_attr_enum, false, NULL,
     OPT_msign_return_address_ },
  { NULL, aarch64_attr_custom, false, NULL, OPT____ }
};
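/* Illustrative uses of the table above (exposition only, not from the
   upstream sources); the spellings are the ones dispatched by
   aarch64_process_one_target_attr:

     __attribute__ ((target ("arch=armv8.2-a+fp16")))         custom handler
     __attribute__ ((target ("no-omit-leaf-frame-pointer")))  boolean, negated
     __attribute__ ((target ("cmodel=tiny")))                 enum-valued option
     #pragma GCC target ("tune=cortex-a72")                   custom handler  */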
13455 /* Parse ARG_STR which contains the definition of one target attribute.
13456 Show appropriate errors if any or return true if the attribute is valid. */
13459 aarch64_process_one_target_attr (char *arg_str
)
13461 bool invert
= false;
13463 size_t len
= strlen (arg_str
);
13467 error ("malformed %<target()%> pragma or attribute");
13471 char *str_to_check
= (char *) alloca (len
+ 1);
13472 strcpy (str_to_check
, arg_str
);
13474 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13475 It is easier to detect and handle it explicitly here rather than going
13476 through the machinery for the rest of the target attributes in this
13478 if (*str_to_check
== '+')
13479 return aarch64_handle_attr_isa_flags (str_to_check
);
13481 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
13486 char *arg
= strchr (str_to_check
, '=');
13488 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13489 and point ARG to "foo". */
13495 const struct aarch64_attribute_info
*p_attr
;
13496 bool found
= false;
13497 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
13499 /* If the names don't match up, or the user has given an argument
13500 to an attribute that doesn't accept one, or didn't give an argument
13501 to an attribute that expects one, fail to match. */
13502 if (strcmp (str_to_check
, p_attr
->name
) != 0)
13506 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
13507 || p_attr
->attr_type
== aarch64_attr_enum
;
13509 if (attr_need_arg_p
^ (arg
!= NULL
))
13511 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
13515 /* If the name matches but the attribute does not allow "no-" versions
13516 then we can't match. */
13517 if (invert
&& !p_attr
->allow_neg
)
13519 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
13523 switch (p_attr
->attr_type
)
13525 /* Has a custom handler registered.
13526 For example, cpu=, arch=, tune=. */
13527 case aarch64_attr_custom
:
13528 gcc_assert (p_attr
->handler
);
13529 if (!p_attr
->handler (arg
))
13533 /* Either set or unset a boolean option. */
13534 case aarch64_attr_bool
:
13536 struct cl_decoded_option decoded
;
13538 generate_option (p_attr
->opt_num
, NULL
, !invert
,
13539 CL_TARGET
, &decoded
);
13540 aarch64_handle_option (&global_options
, &global_options_set
,
13541 &decoded
, input_location
);
13544 /* Set or unset a bit in the target_flags. aarch64_handle_option
13545 should know what mask to apply given the option number. */
13546 case aarch64_attr_mask
:
13548 struct cl_decoded_option decoded
;
13549 /* We only need to specify the option number.
13550 aarch64_handle_option will know which mask to apply. */
13551 decoded
.opt_index
= p_attr
->opt_num
;
13552 decoded
.value
= !invert
;
13553 aarch64_handle_option (&global_options
, &global_options_set
,
13554 &decoded
, input_location
);
13557 /* Use the option setting machinery to set an option to an enum. */
13558 case aarch64_attr_enum
:
13563 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
13564 &value
, CL_TARGET
);
13567 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
13568 NULL
, DK_UNSPECIFIED
, input_location
,
13573 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
13578 gcc_unreachable ();
13582 /* If we reached here we either have found an attribute and validated
13583 it or didn't match any. If we matched an attribute but its arguments
13584 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
13606 /* Parse the tree in ARGS that contains the target attribute information
13607 and update the global target options space. */
13610 aarch64_process_target_attr (tree args
)
13612 if (TREE_CODE (args
) == TREE_LIST
)
13616 tree head
= TREE_VALUE (args
);
13619 if (!aarch64_process_target_attr (head
))
13622 args
= TREE_CHAIN (args
);
13628 if (TREE_CODE (args
) != STRING_CST
)
13630 error ("attribute %<target%> argument not a string");
13634 size_t len
= strlen (TREE_STRING_POINTER (args
));
13635 char *str_to_check
= (char *) alloca (len
+ 1);
13636 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
13640 error ("malformed %<target()%> pragma or attribute");
13644 /* Used to catch empty spaces between commas i.e.
13645 attribute ((target ("attr1,,attr2"))). */
13646 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
13648 /* Handle multiple target attributes separated by ','. */
13649 char *token
= strtok_r (str_to_check
, ",", &str_to_check
);
13651 unsigned int num_attrs
= 0;
13655 if (!aarch64_process_one_target_attr (token
))
13657 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
13661 token
= strtok_r (NULL
, ",", &str_to_check
);
13664 if (num_attrs
!= num_commas
+ 1)
13666 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
13673 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13674 process attribute ((target ("..."))). */
13677 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
13679 struct cl_target_option cur_target
;
13682 tree new_target
, new_optimize
;
13683 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
13685 /* If what we're processing is the current pragma string then the
13686 target option node is already stored in target_option_current_node
13687 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13688 having to re-parse the string. This is especially useful to keep
13689 arm_neon.h compile times down since that header contains a lot
13690 of intrinsics enclosed in pragmas. */
13691 if (!existing_target
&& args
== current_target_pragma
)
13693 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
13696 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
13698 old_optimize
= build_optimization_node (&global_options
);
13699 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
13701 /* If the function changed the optimization levels as well as setting
13702 target options, start with the optimizations specified. */
13703 if (func_optimize
&& func_optimize
!= old_optimize
)
13704 cl_optimization_restore (&global_options
,
13705 TREE_OPTIMIZATION (func_optimize
));
13707 /* Save the current target options to restore at the end. */
13708 cl_target_option_save (&cur_target
, &global_options
);
13710 /* If fndecl already has some target attributes applied to it, unpack
13711 them so that we add this attribute on top of them, rather than
13712 overwriting them. */
13713 if (existing_target
)
13715 struct cl_target_option
*existing_options
13716 = TREE_TARGET_OPTION (existing_target
);
13718 if (existing_options
)
13719 cl_target_option_restore (&global_options
, existing_options
);
13722 cl_target_option_restore (&global_options
,
13723 TREE_TARGET_OPTION (target_option_current_node
));
13725 ret
= aarch64_process_target_attr (args
);
13727 /* Set up any additional state. */
13730 aarch64_override_options_internal (&global_options
);
13731 /* Initialize SIMD builtins if we haven't already.
13732 Set current_target_pragma to NULL for the duration so that
13733 the builtin initialization code doesn't try to tag the functions
13734 being built with the attributes specified by any current pragma, thus
13735 going into an infinite recursion. */
13738 tree saved_current_target_pragma
= current_target_pragma
;
13739 current_target_pragma
= NULL
;
13740 aarch64_init_simd_builtins ();
13741 current_target_pragma
= saved_current_target_pragma
;
13743 new_target
= build_target_option_node (&global_options
);
13748 new_optimize
= build_optimization_node (&global_options
);
13752 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
13754 if (old_optimize
!= new_optimize
)
13755 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
13758 cl_target_option_restore (&global_options
, &cur_target
);
13760 if (old_optimize
!= new_optimize
)
13761 cl_optimization_restore (&global_options
,
13762 TREE_OPTIMIZATION (old_optimize
));
13766 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13767 tri-bool options (yes, no, don't care) and the default value is
13768 DEF, determine whether to reject inlining. */
13771 aarch64_tribools_ok_for_inlining_p (int caller
, int callee
,
13772 int dont_care
, int def
)
13774 /* If the callee doesn't care, always allow inlining. */
13775 if (callee
== dont_care
)
13778 /* If the caller doesn't care, always allow inlining. */
13779 if (caller
== dont_care
)
13782 /* Otherwise, allow inlining if either the callee and caller values
13783 agree, or if the callee is using the default value. */
13784 return (callee
== caller
|| callee
== def
);
13787 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13788 to inline CALLEE into CALLER based on target-specific info.
13789 Make sure that the caller and callee have compatible architectural
13790 features. Then go through the other possible target attributes
13791 and see if they can block inlining. Try not to reject always_inline
13792 callees unless they are incompatible architecturally. */
13795 aarch64_can_inline_p (tree caller
, tree callee
)
13797 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
13798 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
13800 struct cl_target_option
*caller_opts
13801 = TREE_TARGET_OPTION (caller_tree
? caller_tree
13802 : target_option_default_node
);
13804 struct cl_target_option
*callee_opts
13805 = TREE_TARGET_OPTION (callee_tree
? callee_tree
13806 : target_option_default_node
);
13808 /* Callee's ISA flags should be a subset of the caller's. */
13809 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
13810 != callee_opts
->x_aarch64_isa_flags
)
13813 /* Allow non-strict aligned functions inlining into strict
13815 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
13816 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
13817 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
13818 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
13821 bool always_inline
= lookup_attribute ("always_inline",
13822 DECL_ATTRIBUTES (callee
));
13824 /* If the architectural features match up and the callee is always_inline
13825 then the other attributes don't matter. */
13829 if (caller_opts
->x_aarch64_cmodel_var
13830 != callee_opts
->x_aarch64_cmodel_var
)
13833 if (caller_opts
->x_aarch64_tls_dialect
13834 != callee_opts
->x_aarch64_tls_dialect
)
13837 /* Honour explicit requests to workaround errata. */
13838 if (!aarch64_tribools_ok_for_inlining_p (
13839 caller_opts
->x_aarch64_fix_a53_err835769
,
13840 callee_opts
->x_aarch64_fix_a53_err835769
,
13841 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
13844 if (!aarch64_tribools_ok_for_inlining_p (
13845 caller_opts
->x_aarch64_fix_a53_err843419
,
13846 callee_opts
->x_aarch64_fix_a53_err843419
,
13847 2, TARGET_FIX_ERR_A53_843419
))
13850 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13851 caller and calle and they don't match up, reject inlining. */
13852 if (!aarch64_tribools_ok_for_inlining_p (
13853 caller_opts
->x_flag_omit_leaf_frame_pointer
,
13854 callee_opts
->x_flag_omit_leaf_frame_pointer
,
13858 /* If the callee has specific tuning overrides, respect them. */
13859 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
13860 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
13863 /* If the user specified tuning override strings for the
13864 caller and callee and they don't match up, reject inlining.
13865 We just do a string compare here, we don't analyze the meaning
13866 of the string, as it would be too costly for little gain. */
13867 if (callee_opts
->x_aarch64_override_tune_string
13868 && caller_opts
->x_aarch64_override_tune_string
13869 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
13870 caller_opts
->x_aarch64_override_tune_string
) != 0))
13876 /* Return true if SYMBOL_REF X binds locally. */
13879 aarch64_symbol_binds_local_p (const_rtx x
)
13881 return (SYMBOL_REF_DECL (x
)
13882 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
13883 : SYMBOL_REF_LOCAL_P (x
));
13886 /* Return true if SYMBOL_REF X is thread local */
13888 aarch64_tls_symbol_p (rtx x
)
13890 if (! TARGET_HAVE_TLS
)
13893 if (GET_CODE (x
) != SYMBOL_REF
)
13896 return SYMBOL_REF_TLS_MODEL (x
) != 0;
13899 /* Classify a TLS symbol into one of the TLS kinds. */
13900 enum aarch64_symbol_type
13901 aarch64_classify_tls_symbol (rtx x
)
13903 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
13907 case TLS_MODEL_GLOBAL_DYNAMIC
:
13908 case TLS_MODEL_LOCAL_DYNAMIC
:
13909 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
13911 case TLS_MODEL_INITIAL_EXEC
:
13912 switch (aarch64_cmodel
)
13914 case AARCH64_CMODEL_TINY
:
13915 case AARCH64_CMODEL_TINY_PIC
:
13916 return SYMBOL_TINY_TLSIE
;
13918 return SYMBOL_SMALL_TLSIE
;
13921 case TLS_MODEL_LOCAL_EXEC
:
13922 if (aarch64_tls_size
== 12)
13923 return SYMBOL_TLSLE12
;
13924 else if (aarch64_tls_size
== 24)
13925 return SYMBOL_TLSLE24
;
13926 else if (aarch64_tls_size
== 32)
13927 return SYMBOL_TLSLE32
;
13928 else if (aarch64_tls_size
== 48)
13929 return SYMBOL_TLSLE48
;
13931 gcc_unreachable ();
13933 case TLS_MODEL_EMULATED
:
13934 case TLS_MODEL_NONE
:
13935 return SYMBOL_FORCE_TO_MEM
;
13938 gcc_unreachable ();
13942 /* Return the correct method for accessing X + OFFSET, where X is either
13943 a SYMBOL_REF or LABEL_REF. */
13945 enum aarch64_symbol_type
13946 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
13948 if (GET_CODE (x
) == LABEL_REF
)
13950 switch (aarch64_cmodel
)
13952 case AARCH64_CMODEL_LARGE
:
13953 return SYMBOL_FORCE_TO_MEM
;
13955 case AARCH64_CMODEL_TINY_PIC
:
13956 case AARCH64_CMODEL_TINY
:
13957 return SYMBOL_TINY_ABSOLUTE
;
13959 case AARCH64_CMODEL_SMALL_SPIC
:
13960 case AARCH64_CMODEL_SMALL_PIC
:
13961 case AARCH64_CMODEL_SMALL
:
13962 return SYMBOL_SMALL_ABSOLUTE
;
13965 gcc_unreachable ();
13969 if (GET_CODE (x
) == SYMBOL_REF
)
13971 if (aarch64_tls_symbol_p (x
))
13972 return aarch64_classify_tls_symbol (x
);
13974 switch (aarch64_cmodel
)
13976 case AARCH64_CMODEL_TINY
:
13977 /* When we retrieve symbol + offset address, we have to make sure
13978 the offset does not cause overflow of the final address. But
13979 we have no way of knowing the address of symbol at compile time
13980 so we can't accurately say if the distance between the PC and
13981 symbol + offset is outside the addressible range of +/-1M in the
13982 TINY code model. So we rely on images not being greater than
13983 1M and cap the offset at 1M and anything beyond 1M will have to
13984 be loaded using an alternative mechanism. Furthermore if the
13985 symbol is a weak reference to something that isn't known to
13986 resolve to a symbol in this module, then force to memory. */
13987 if ((SYMBOL_REF_WEAK (x
)
13988 && !aarch64_symbol_binds_local_p (x
))
13989 || !IN_RANGE (offset
, -1048575, 1048575))
13990 return SYMBOL_FORCE_TO_MEM
;
13991 return SYMBOL_TINY_ABSOLUTE
;
13993 case AARCH64_CMODEL_SMALL
:
13994 /* Same reasoning as the tiny code model, but the offset cap here is
13996 if ((SYMBOL_REF_WEAK (x
)
13997 && !aarch64_symbol_binds_local_p (x
))
13998 || !IN_RANGE (offset
, HOST_WIDE_INT_C (-4294967263),
13999 HOST_WIDE_INT_C (4294967264)))
14000 return SYMBOL_FORCE_TO_MEM
;
14001 return SYMBOL_SMALL_ABSOLUTE
;
14003 case AARCH64_CMODEL_TINY_PIC
:
14004 if (!aarch64_symbol_binds_local_p (x
))
14005 return SYMBOL_TINY_GOT
;
14006 return SYMBOL_TINY_ABSOLUTE
;
14008 case AARCH64_CMODEL_SMALL_SPIC
:
14009 case AARCH64_CMODEL_SMALL_PIC
:
14010 if (!aarch64_symbol_binds_local_p (x
))
14011 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
14012 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
14013 return SYMBOL_SMALL_ABSOLUTE
;
14015 case AARCH64_CMODEL_LARGE
:
14016 /* This is alright even in PIC code as the constant
14017 pool reference is always PC relative and within
14018 the same translation unit. */
14019 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
14020 return SYMBOL_SMALL_ABSOLUTE
;
14022 return SYMBOL_FORCE_TO_MEM
;
14025 gcc_unreachable ();
14029 /* By default push everything into the constant pool. */
14030 return SYMBOL_FORCE_TO_MEM
;
14034 aarch64_constant_address_p (rtx x
)
14036 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
14040 aarch64_legitimate_pic_operand_p (rtx x
)
14042 if (GET_CODE (x
) == SYMBOL_REF
14043 || (GET_CODE (x
) == CONST
14044 && GET_CODE (XEXP (x
, 0)) == PLUS
14045 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
14051 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
14052 that should be rematerialized rather than spilled. */
14055 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
14057 /* Support CSE and rematerialization of common constants. */
14058 if (CONST_INT_P (x
)
14059 || (CONST_DOUBLE_P (x
) && GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14060 || GET_CODE (x
) == CONST_VECTOR
)
14063 /* Do not allow vector struct mode constants for Advanced SIMD.
14064 We could support 0 and -1 easily, but they need support in
14065 aarch64-simd.md. */
14066 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14067 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
14070 /* Only accept variable-length vector constants if they can be
14073 ??? It would be possible to handle rematerialization of other
14074 constants via secondary reloads. */
14075 if (vec_flags
& VEC_ANY_SVE
)
14076 return aarch64_simd_valid_immediate (x
, NULL
);
14078 if (GET_CODE (x
) == HIGH
)
14081 /* Accept polynomial constants that can be calculated by using the
14082 destination of a move as the sole temporary. Constants that
14083 require a second temporary cannot be rematerialized (they can't be
14084 forced to memory and also aren't legitimate constants). */
14086 if (poly_int_rtx_p (x
, &offset
))
14087 return aarch64_offset_temporaries (false, offset
) <= 1;
14089 /* If an offset is being added to something else, we need to allow the
14090 base to be moved into the destination register, meaning that there
14091 are no free temporaries for the offset. */
14092 x
= strip_offset (x
, &offset
);
14093 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
14096 /* Do not allow const (plus (anchor_symbol, const_int)). */
14097 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
14100 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
14101 so spilling them is better than rematerialization. */
14102 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
14105 /* Label references are always constant. */
14106 if (GET_CODE (x
) == LABEL_REF
)
14113 aarch64_load_tp (rtx target
)
14116 || GET_MODE (target
) != Pmode
14117 || !register_operand (target
, Pmode
))
14118 target
= gen_reg_rtx (Pmode
);
14120 /* Can return in any reg. */
14121 emit_insn (gen_aarch64_load_tp_hard (target
));
/* On AAPCS systems, this is the "struct __va_list".  */
static GTY(()) tree va_list_type;

/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
   Return the type to use as __builtin_va_list.

   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:

   struct __va_list
   {
     void *__stack;
     void *__gr_top;
     void *__vr_top;
     int   __gr_offs;
     int   __vr_offs;
   };  */

static tree
aarch64_build_builtin_va_list (void)
{
  tree va_list_name;
14146 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
14148 /* Create the type. */
14149 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
14150 /* Give it the required name. */
14151 va_list_name
= build_decl (BUILTINS_LOCATION
,
14153 get_identifier ("__va_list"),
14155 DECL_ARTIFICIAL (va_list_name
) = 1;
14156 TYPE_NAME (va_list_type
) = va_list_name
;
14157 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
14159 /* Create the fields. */
14160 f_stack
= build_decl (BUILTINS_LOCATION
,
14161 FIELD_DECL
, get_identifier ("__stack"),
14163 f_grtop
= build_decl (BUILTINS_LOCATION
,
14164 FIELD_DECL
, get_identifier ("__gr_top"),
14166 f_vrtop
= build_decl (BUILTINS_LOCATION
,
14167 FIELD_DECL
, get_identifier ("__vr_top"),
14169 f_groff
= build_decl (BUILTINS_LOCATION
,
14170 FIELD_DECL
, get_identifier ("__gr_offs"),
14171 integer_type_node
);
14172 f_vroff
= build_decl (BUILTINS_LOCATION
,
14173 FIELD_DECL
, get_identifier ("__vr_offs"),
14174 integer_type_node
);
14176 /* Tell tree-stdarg pass about our internal offset fields.
14177 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
14178 purpose to identify whether the code is updating va_list internal
14179 offset fields through irregular way. */
14180 va_list_gpr_counter_field
= f_groff
;
14181 va_list_fpr_counter_field
= f_vroff
;
14183 DECL_ARTIFICIAL (f_stack
) = 1;
14184 DECL_ARTIFICIAL (f_grtop
) = 1;
14185 DECL_ARTIFICIAL (f_vrtop
) = 1;
14186 DECL_ARTIFICIAL (f_groff
) = 1;
14187 DECL_ARTIFICIAL (f_vroff
) = 1;
14189 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
14190 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
14191 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
14192 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
14193 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
14195 TYPE_FIELDS (va_list_type
) = f_stack
;
14196 DECL_CHAIN (f_stack
) = f_grtop
;
14197 DECL_CHAIN (f_grtop
) = f_vrtop
;
14198 DECL_CHAIN (f_vrtop
) = f_groff
;
14199 DECL_CHAIN (f_groff
) = f_vroff
;
14201 /* Compute its layout. */
14202 layout_type (va_list_type
);
14204 return va_list_type
;
14207 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14209 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
14211 const CUMULATIVE_ARGS
*cum
;
14212 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
14213 tree stack
, grtop
, vrtop
, groff
, vroff
;
14215 int gr_save_area_size
= cfun
->va_list_gpr_size
;
14216 int vr_save_area_size
= cfun
->va_list_fpr_size
;
14219 cum
= &crtl
->args
.info
;
14220 if (cfun
->va_list_gpr_size
)
14221 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
14222 cfun
->va_list_gpr_size
);
14223 if (cfun
->va_list_fpr_size
)
14224 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
14225 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
14229 gcc_assert (cum
->aapcs_nvrn
== 0);
14230 vr_save_area_size
= 0;
14233 f_stack
= TYPE_FIELDS (va_list_type_node
);
14234 f_grtop
= DECL_CHAIN (f_stack
);
14235 f_vrtop
= DECL_CHAIN (f_grtop
);
14236 f_groff
= DECL_CHAIN (f_vrtop
);
14237 f_vroff
= DECL_CHAIN (f_groff
);
14239 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
14241 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
14243 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
14245 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
14247 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
14250 /* Emit code to initialize STACK, which points to the next varargs stack
14251 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14252 by named arguments. STACK is 8-byte aligned. */
14253 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
14254 if (cum
->aapcs_stack_size
> 0)
14255 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
14256 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
14257 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14259 /* Emit code to initialize GRTOP, the top of the GR save area.
14260 virtual_incoming_args_rtx should have been 16 byte aligned. */
14261 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
14262 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
14263 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14265 /* Emit code to initialize VRTOP, the top of the VR save area.
14266 This address is gr_save_area_bytes below GRTOP, rounded
14267 down to the next 16-byte boundary. */
14268 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
14269 vr_offset
= ROUND_UP (gr_save_area_size
,
14270 STACK_BOUNDARY
/ BITS_PER_UNIT
);
14273 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
14274 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
14275 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14277 /* Emit code to initialize GROFF, the offset from GRTOP of the
14278 next GPR argument. */
14279 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
14280 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
14281 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14283 /* Likewise emit code to initialize VROFF, the offset from FTOP
14284 of the next VR argument. */
14285 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
14286 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
14287 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14290 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14293 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
14294 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
14298 bool is_ha
; /* is HFA or HVA. */
14299 bool dw_align
; /* double-word align. */
14300 machine_mode ag_mode
= VOIDmode
;
14304 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
14305 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
14306 HOST_WIDE_INT size
, rsize
, adjust
, align
;
14307 tree t
, u
, cond1
, cond2
;
14309 indirect_p
= pass_va_arg_by_reference (type
);
14311 type
= build_pointer_type (type
);
14313 mode
= TYPE_MODE (type
);
14315 f_stack
= TYPE_FIELDS (va_list_type_node
);
14316 f_grtop
= DECL_CHAIN (f_stack
);
14317 f_vrtop
= DECL_CHAIN (f_grtop
);
14318 f_groff
= DECL_CHAIN (f_vrtop
);
14319 f_vroff
= DECL_CHAIN (f_groff
);
14321 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
14322 f_stack
, NULL_TREE
);
14323 size
= int_size_in_bytes (type
);
14327 = aarch64_function_arg_alignment (mode
, type
, &abi_break
) / BITS_PER_UNIT
;
14331 if (aarch64_vfp_is_call_or_return_candidate (mode
,
14337 /* No frontends can create types with variable-sized modes, so we
14338 shouldn't be asked to pass or return them. */
14339 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
14341 /* TYPE passed in fp/simd registers. */
14343 aarch64_err_no_fpadvsimd (mode
);
14345 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
14346 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
14347 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
14348 unshare_expr (valist
), f_vroff
, NULL_TREE
);
14350 rsize
= nregs
* UNITS_PER_VREG
;
14354 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
14355 adjust
= UNITS_PER_VREG
- ag_size
;
14357 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
14358 && size
< UNITS_PER_VREG
)
14360 adjust
= UNITS_PER_VREG
- size
;
14365 /* TYPE passed in general registers. */
14366 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
14367 unshare_expr (valist
), f_grtop
, NULL_TREE
);
14368 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
14369 unshare_expr (valist
), f_groff
, NULL_TREE
);
14370 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
14371 nregs
= rsize
/ UNITS_PER_WORD
;
14375 if (abi_break
&& warn_psabi
)
14376 inform (input_location
, "parameter passing for argument of type "
14377 "%qT changed in GCC 9.1", type
);
14381 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
14382 && size
< UNITS_PER_WORD
)
14384 adjust
= UNITS_PER_WORD
- size
;
14388 /* Get a local temporary for the field value. */
14389 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
14391 /* Emit code to branch if off >= 0. */
14392 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
14393 build_int_cst (TREE_TYPE (off
), 0));
14394 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
14398 /* Emit: offs = (offs + 15) & -16. */
14399 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
14400 build_int_cst (TREE_TYPE (off
), 15));
14401 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
14402 build_int_cst (TREE_TYPE (off
), -16));
14403 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
14408 /* Update ap.__[g|v]r_offs */
14409 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
14410 build_int_cst (TREE_TYPE (off
), rsize
));
14411 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
14415 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
14417 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14418 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
14419 build_int_cst (TREE_TYPE (f_off
), 0));
14420 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
14422 /* String up: make sure the assignment happens before the use. */
14423 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
14424 COND_EXPR_ELSE (cond1
) = t
;
14426 /* Prepare the trees handling the argument that is passed on the stack;
14427 the top level node will store in ON_STACK. */
14428 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
14431 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14432 t
= fold_build_pointer_plus_hwi (arg
, 15);
14433 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
14434 build_int_cst (TREE_TYPE (t
), -16));
14435 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
14439 /* Advance ap.__stack */
14440 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
14441 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
14442 build_int_cst (TREE_TYPE (t
), -8));
14443 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
14444 /* String up roundup and advance. */
14446 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
14447 /* String up with arg */
14448 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
14449 /* Big-endianness related address adjustment. */
14450 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
14451 && size
< UNITS_PER_WORD
)
14453 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
14454 size_int (UNITS_PER_WORD
- size
));
14455 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
14458 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
14459 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
14461 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14464 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
14465 build_int_cst (TREE_TYPE (off
), adjust
));
14467 t
= fold_convert (sizetype
, t
);
14468 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
14472 /* type ha; // treat as "struct {ftype field[n];}"
14473 ... [computing offs]
14474 for (i = 0; i <nregs; ++i, offs += 16)
14475 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14478 tree tmp_ha
, field_t
, field_ptr_t
;
14480 /* Declare a local variable. */
14481 tmp_ha
= create_tmp_var_raw (type
, "ha");
14482 gimple_add_tmp_var (tmp_ha
);
14484 /* Establish the base type. */
14488 field_t
= float_type_node
;
14489 field_ptr_t
= float_ptr_type_node
;
14492 field_t
= double_type_node
;
14493 field_ptr_t
= double_ptr_type_node
;
14496 field_t
= long_double_type_node
;
14497 field_ptr_t
= long_double_ptr_type_node
;
14500 field_t
= aarch64_fp16_type_node
;
14501 field_ptr_t
= aarch64_fp16_ptr_type_node
;
14506 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
14507 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
14508 field_ptr_t
= build_pointer_type (field_t
);
14515 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
14516 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
14518 t
= fold_convert (field_ptr_t
, addr
);
14519 t
= build2 (MODIFY_EXPR
, field_t
,
14520 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
14521 build1 (INDIRECT_REF
, field_t
, t
));
14523 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14524 for (i
= 1; i
< nregs
; ++i
)
14526 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
14527 u
= fold_convert (field_ptr_t
, addr
);
14528 u
= build2 (MODIFY_EXPR
, field_t
,
14529 build2 (MEM_REF
, field_t
, tmp_ha
,
14530 build_int_cst (field_ptr_t
,
14532 int_size_in_bytes (field_t
)))),
14533 build1 (INDIRECT_REF
, field_t
, u
));
14534 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
14537 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
14538 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
14541 COND_EXPR_ELSE (cond2
) = t
;
14542 addr
= fold_convert (build_pointer_type (type
), cond1
);
14543 addr
= build_va_arg_indirect_ref (addr
);
14546 addr
= build_va_arg_indirect_ref (addr
);
14551 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14554 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
,
14555 const function_arg_info
&arg
,
14556 int *pretend_size ATTRIBUTE_UNUSED
, int no_rtl
)
14558 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
14559 CUMULATIVE_ARGS local_cum
;
14560 int gr_saved
= cfun
->va_list_gpr_size
;
14561 int vr_saved
= cfun
->va_list_fpr_size
;
14563 /* The caller has advanced CUM up to, but not beyond, the last named
14564 argument. Advance a local copy of CUM past the last "real" named
14565 argument, to find out how many registers are left over. */
14567 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), arg
);
  /* Find out how many registers we need to save.
     Honor the tree-stdarg analysis results.  */
14571 if (cfun
->va_list_gpr_size
)
14572 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
14573 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
14574 if (cfun
->va_list_fpr_size
)
14575 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
14576 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
14580 gcc_assert (local_cum
.aapcs_nvrn
== 0);
14590 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14591 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
14592 - gr_saved
* UNITS_PER_WORD
);
14593 mem
= gen_frame_mem (BLKmode
, ptr
);
14594 set_mem_alias_set (mem
, get_varargs_alias_set ());
14596 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
14601 /* We can't use move_block_from_reg, because it will use
14602 the wrong mode, storing D regs only. */
14603 machine_mode mode
= TImode
;
14604 int off
, i
, vr_start
;
14606 /* Set OFF to the offset from virtual_incoming_args_rtx of
14607 the first vector register. The VR save area lies below
14608 the GR one, and is aligned to 16 bytes. */
14609 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
14610 STACK_BOUNDARY
/ BITS_PER_UNIT
);
14611 off
-= vr_saved
* UNITS_PER_VREG
;
14613 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
14614 for (i
= 0; i
< vr_saved
; ++i
)
14618 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
14619 mem
= gen_frame_mem (mode
, ptr
);
14620 set_mem_alias_set (mem
, get_varargs_alias_set ());
14621 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
14622 off
+= UNITS_PER_VREG
;
14627 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14628 any complication of having crtl->args.pretend_args_size changed. */
14629 cfun
->machine
->frame
.saved_varargs_size
14630 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
14631 STACK_BOUNDARY
/ BITS_PER_UNIT
)
14632 + vr_saved
* UNITS_PER_VREG
);
static void
aarch64_conditional_register_usage (void)
{
  int i;
  if (!TARGET_FLOAT)
    {
      for (i = V0_REGNUM; i <= V31_REGNUM; i++)
	{
	  fixed_regs[i] = 1;
	  call_used_regs[i] = 1;
	}
    }
  if (!TARGET_SVE)
    for (i = P0_REGNUM; i <= P15_REGNUM; i++)
      {
	fixed_regs[i] = 1;
	call_used_regs[i] = 1;
      }

  /* When tracking speculation, we need a couple of call-clobbered registers
     to track the speculation state.  It would be nice to just use
     IP0 and IP1, but currently there are numerous places that just
     assume these registers are free for other uses (eg pointer
     authentication).  */
  if (aarch64_track_speculation)
    {
      fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
      call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
      fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
      call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
    }
}
14668 /* Walk down the type tree of TYPE counting consecutive base elements.
14669 If *MODEP is VOIDmode, then set it to the first valid floating point
14670 type. If a non-floating point type is found, or if a floating point
14671 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14672 otherwise return the count in the sub-tree. */
14674 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
14677 HOST_WIDE_INT size
;
14679 switch (TREE_CODE (type
))
14682 mode
= TYPE_MODE (type
);
14683 if (mode
!= DFmode
&& mode
!= SFmode
14684 && mode
!= TFmode
&& mode
!= HFmode
)
14687 if (*modep
== VOIDmode
)
14690 if (*modep
== mode
)
14696 mode
= TYPE_MODE (TREE_TYPE (type
));
14697 if (mode
!= DFmode
&& mode
!= SFmode
14698 && mode
!= TFmode
&& mode
!= HFmode
)
14701 if (*modep
== VOIDmode
)
14704 if (*modep
== mode
)
14710 /* Use V2SImode and V4SImode as representatives of all 64-bit
14711 and 128-bit vector types. */
14712 size
= int_size_in_bytes (type
);
14725 if (*modep
== VOIDmode
)
14728 /* Vector modes are considered to be opaque: two vectors are
14729 equivalent for the purposes of being homogeneous aggregates
14730 if they are the same size. */
14731 if (*modep
== mode
)
14739 tree index
= TYPE_DOMAIN (type
);
14741 /* Can't handle incomplete types nor sizes that are not
14743 if (!COMPLETE_TYPE_P (type
)
14744 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14747 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
14750 || !TYPE_MAX_VALUE (index
)
14751 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
14752 || !TYPE_MIN_VALUE (index
)
14753 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
14757 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
14758 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
14760 /* There must be no padding. */
14761 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
14762 count
* GET_MODE_BITSIZE (*modep
)))
14774 /* Can't handle incomplete types nor sizes that are not
14776 if (!COMPLETE_TYPE_P (type
)
14777 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14780 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
14782 if (TREE_CODE (field
) != FIELD_DECL
)
14785 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
14788 count
+= sub_count
;
14791 /* There must be no padding. */
14792 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
14793 count
* GET_MODE_BITSIZE (*modep
)))
14800 case QUAL_UNION_TYPE
:
14802 /* These aren't very interesting except in a degenerate case. */
14807 /* Can't handle incomplete types nor sizes that are not
14809 if (!COMPLETE_TYPE_P (type
)
14810 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14813 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
14815 if (TREE_CODE (field
) != FIELD_DECL
)
14818 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
14821 count
= count
> sub_count
? count
: sub_count
;
14824 /* There must be no padding. */
14825 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
14826 count
* GET_MODE_BITSIZE (*modep
)))
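
/* For example, a type such as struct { float x, y, z; } walks to three
   consecutive SFmode elements and so counts as a homogeneous
   floating-point aggregate, whereas struct { float f; double d; }
   mixes element modes and is rejected with -1.  */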
/* Return TRUE if the type, as described by TYPE and MODE, is a short vector
   type as described in AAPCS64 \S 4.1.2.

   See the comment above aarch64_composite_type_p for the notes on MODE.  */

static bool
aarch64_short_vector_p (const_tree type,
			machine_mode mode)
{
  poly_int64 size = -1;

  if (type && TREE_CODE (type) == VECTOR_TYPE)
    size = int_size_in_bytes (type);
  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
	   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
    size = GET_MODE_SIZE (mode);

  return known_eq (size, 8) || known_eq (size, 16);
}
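
/* For example, the 64-bit and 128-bit Advanced SIMD types (int32x2_t,
   float32x4_t, ...) are short vectors in this sense; variable-length
   SVE vectors are not, since their size is not known to equal 8 or 16
   bytes at compile time.  */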
/* Return TRUE if the type, as described by TYPE and MODE, is a composite
   type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
   array types.  The C99 floating-point complex types are also considered
   as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
   types, which are GCC extensions and out of the scope of AAPCS64, are
   treated as composite types here as well.

   Note that MODE itself is not sufficient in determining whether a type
   is such a composite type or not.  This is because
   stor-layout.c:compute_record_mode may have already changed the MODE
   (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
   structure with only one field may have its MODE set to the mode of the
   field.  Also an integer mode whose size matches the size of the
   RECORD_TYPE type may be used to substitute the original mode
   (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
   solely relied on.  */

static bool
aarch64_composite_type_p (const_tree type,
			  machine_mode mode)
{
  if (aarch64_short_vector_p (type, mode))
    return false;

  if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
    return true;

  if (mode == BLKmode
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
    return true;

  return false;
}
/* Return TRUE if an argument, whose type is described by TYPE and MODE,
   shall be passed or returned in simd/fp register(s) (providing these
   parameter passing registers are available).

   Upon successful return, *COUNT returns the number of needed registers,
   *BASE_MODE returns the mode of the individual register and, when IS_HA
   is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
   floating-point aggregate or a homogeneous short-vector aggregate.  */
14904 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
14906 machine_mode
*base_mode
,
14910 machine_mode new_mode
= VOIDmode
;
14911 bool composite_p
= aarch64_composite_type_p (type
, mode
);
14913 if (is_ha
!= NULL
) *is_ha
= false;
14915 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14916 || aarch64_short_vector_p (type
, mode
))
14921 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
14923 if (is_ha
!= NULL
) *is_ha
= true;
14925 new_mode
= GET_MODE_INNER (mode
);
14927 else if (type
&& composite_p
)
14929 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
14931 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
14933 if (is_ha
!= NULL
) *is_ha
= true;
14942 *base_mode
= new_mode
;
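
/* Note that AAPCS64 limits a homogeneous aggregate to at most four
   members, which is what the HA_MAX_NUM_FLDS check above enforces, so a
   candidate accepted here needs between one and four SIMD/FP
   registers.  */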
/* Implement TARGET_STRUCT_VALUE_RTX.  */

static rtx
aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
			  int incoming ATTRIBUTE_UNUSED)
{
  return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
}

/* Implements target hook vector_mode_supported_p.  */
static bool
aarch64_vector_mode_supported_p (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
}
14963 /* Return the full-width SVE vector mode for element mode MODE, if one
14966 aarch64_full_sve_mode (scalar_mode mode
)
14983 return VNx16QImode
;
14985 return opt_machine_mode ();
14989 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14992 aarch64_vq_mode (scalar_mode mode
)
15011 return opt_machine_mode ();
15015 /* Return appropriate SIMD container
15016 for MODE within a vector of WIDTH bits. */
15017 static machine_mode
15018 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
15020 if (TARGET_SVE
&& known_eq (width
, BITS_PER_SVE_VECTOR
))
15021 return aarch64_full_sve_mode (mode
).else_mode (word_mode
);
15023 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
15026 if (known_eq (width
, 128))
15027 return aarch64_vq_mode (mode
).else_mode (word_mode
);
/* Return 128-bit container as the preferred SIMD mode for MODE.  */
static machine_mode
aarch64_preferred_simd_mode (scalar_mode mode)
{
  poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
  return aarch64_simd_container_mode (mode, bits);
}
/* Return a list of possible vector sizes for the vectorizer
   to iterate over.  */
static void
aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
{
  if (TARGET_SVE)
    sizes->safe_push (BYTES_PER_SVE_VECTOR);
  sizes->safe_push (16);
  sizes->safe_push (8);
}
/* Implement TARGET_MANGLE_TYPE.  */

static const char *
aarch64_mangle_type (const_tree type)
{
  /* The AArch64 ABI documents say that "__va_list" has to be
     mangled as if it is in the "std" namespace.  */
  if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
    return "St9__va_list";

  /* Half-precision float.  */
  if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
    return "Dh";

  /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
     builtin types.  */
  if (TYPE_NAME (type) != NULL)
    return aarch64_mangle_builtin_type (type);

  /* Use the default mangling.  */
  return NULL;
}
/* Find the first rtx_insn before insn that will generate an assembly
   instruction.  */

static rtx_insn *
aarch64_prev_real_insn (rtx_insn *insn)
{
  if (!insn)
    return NULL;

  do
    {
      insn = prev_real_insn (insn);
    }
  while (insn && recog_memoized (insn) < 0);

  return insn;
}
static bool
is_madd_op (enum attr_type t1)
{
  unsigned int i;
  /* A number of these may be AArch32 only.  */
  enum attr_type mlatypes[] = {
    TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
    TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
    TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
  };

  for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
    {
      if (t1 == mlatypes[i])
	return true;
    }

  return false;
}
15128 /* Check if there is a register dependency between a load and the insn
15129 for which we hold recog_data. */
15132 dep_between_memop_and_curr (rtx memop
)
15137 gcc_assert (GET_CODE (memop
) == SET
);
15139 if (!REG_P (SET_DEST (memop
)))
15142 load_reg
= SET_DEST (memop
);
15143 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
15145 rtx operand
= recog_data
.operand
[opno
];
15146 if (REG_P (operand
)
15147 && reg_overlap_mentioned_p (load_reg
, operand
))
15155 /* When working around the Cortex-A53 erratum 835769,
15156 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15157 instruction and has a preceding memory instruction such that a NOP
15158 should be inserted between them. */
15161 aarch64_madd_needs_nop (rtx_insn
* insn
)
15163 enum attr_type attr_type
;
15167 if (!TARGET_FIX_ERR_A53_835769
)
15170 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
15173 attr_type
= get_attr_type (insn
);
15174 if (!is_madd_op (attr_type
))
15177 prev
= aarch64_prev_real_insn (insn
);
15178 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15179 Restore recog state to INSN to avoid state corruption. */
15180 extract_constrain_insn_cached (insn
);
15182 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
15185 body
= single_set (prev
);
15187 /* If the previous insn is a memory op and there is no dependency between
15188 it and the DImode madd, emit a NOP between them. If body is NULL then we
15189 have a complex memory operation, probably a load/store pair.
15190 Be conservative for now and emit a NOP. */
15191 if (GET_MODE (recog_data
.operand
[0]) == DImode
15192 && (!body
|| !dep_between_memop_and_curr (body
)))
/* Implement FINAL_PRESCAN_INSN.  */

void
aarch64_final_prescan_insn (rtx_insn *insn)
{
  if (aarch64_madd_needs_nop (insn))
    fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
}
/* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
   instruction.  */

bool
aarch64_sve_index_immediate_p (rtx base_or_step)
{
  return (CONST_INT_P (base_or_step)
	  && IN_RANGE (INTVAL (base_or_step), -16, 15));
}
15220 /* Return true if X is a valid immediate for the SVE ADD and SUB
15221 instructions. Negate X first if NEGATE_P is true. */
15224 aarch64_sve_arith_immediate_p (rtx x
, bool negate_p
)
15228 if (!const_vec_duplicate_p (x
, &elt
)
15229 || !CONST_INT_P (elt
))
15232 HOST_WIDE_INT val
= INTVAL (elt
);
15235 val
&= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x
)));
15238 return IN_RANGE (val
, 0, 0xff);
15239 return IN_RANGE (val
, 0, 0xff00);
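
/* In other words, the immediate accepted above is an unsigned 8-bit
   value, optionally shifted left by 8 bits (giving multiples of 256 up
   to 0xff00), matching the SVE ADD/SUB immediate encoding.  */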
/* Return true if X is a valid immediate operand for an SVE logical
   instruction such as AND.  */

bool
aarch64_sve_bitmask_immediate_p (rtx x)
{
  rtx elt;

  return (const_vec_duplicate_p (x, &elt)
	  && CONST_INT_P (elt)
	  && aarch64_bitmask_imm (INTVAL (elt),
				  GET_MODE_INNER (GET_MODE (x))));
}
15256 /* Return true if X is a valid immediate for the SVE DUP and CPY
15260 aarch64_sve_dup_immediate_p (rtx x
)
15262 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
15263 if (!CONST_INT_P (x
))
15266 HOST_WIDE_INT val
= INTVAL (x
);
15268 return IN_RANGE (val
, -0x80, 0x7f);
15269 return IN_RANGE (val
, -0x8000, 0x7f00);
/* Return true if X is a valid immediate operand for an SVE CMP instruction.
   SIGNED_P says whether the operand is signed rather than unsigned.  */

bool
aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
{
  rtx elt;

  return (const_vec_duplicate_p (x, &elt)
	  && CONST_INT_P (elt)
	  && (signed_p
	      ? IN_RANGE (INTVAL (elt), -16, 15)
	      : IN_RANGE (INTVAL (elt), 0, 127)));
}
15287 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15288 instruction. Negate X first if NEGATE_P is true. */
15291 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
15296 if (!const_vec_duplicate_p (x
, &elt
)
15297 || GET_CODE (elt
) != CONST_DOUBLE
)
15300 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
15303 r
= real_value_negate (&r
);
15305 if (real_equal (&r
, &dconst1
))
15307 if (real_equal (&r
, &dconsthalf
))
/* Return true if X is a valid immediate operand for an SVE FMUL
   instruction.  */

bool
aarch64_sve_float_mul_immediate_p (rtx x)
{
  rtx elt;

  return (const_vec_duplicate_p (x, &elt)
	  && GET_CODE (elt) == CONST_DOUBLE
	  && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
	      || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
}
15326 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15327 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15328 is nonnull, use it to describe valid immediates. */
15330 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
15331 simd_immediate_info
*info
,
15332 enum simd_immediate_check which
,
15333 simd_immediate_info::insn_type insn
)
15335 /* Try a 4-byte immediate with LSL. */
15336 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
15337 if ((val32
& (0xff << shift
)) == val32
)
15340 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
15341 simd_immediate_info::LSL
, shift
);
15345 /* Try a 2-byte immediate with LSL. */
15346 unsigned int imm16
= val32
& 0xffff;
15347 if (imm16
== (val32
>> 16))
15348 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
15349 if ((imm16
& (0xff << shift
)) == imm16
)
15352 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
15353 simd_immediate_info::LSL
, shift
);
15357 /* Try a 4-byte immediate with MSL, except for cases that MVN
15359 if (which
== AARCH64_CHECK_MOV
)
15360 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
15362 unsigned int low
= (1 << shift
) - 1;
15363 if (((val32
& (0xff << shift
)) | low
) == val32
)
15366 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
15367 simd_immediate_info::MSL
, shift
);
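
/* For example, in the MSL ("shifting ones") form checked above, a value
   such as 0x0012ffff matches at shift 16: the byte of interest is 0x12
   and every bit below the shift is one.  */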
15375 /* Return true if replicating VAL64 is a valid immediate for the
15376 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15377 use it to describe valid immediates. */
15379 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
15380 simd_immediate_info
*info
,
15381 enum simd_immediate_check which
)
15383 unsigned int val32
= val64
& 0xffffffff;
15384 unsigned int val16
= val64
& 0xffff;
15385 unsigned int val8
= val64
& 0xff;
15387 if (val32
== (val64
>> 32))
15389 if ((which
& AARCH64_CHECK_ORR
) != 0
15390 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
15391 simd_immediate_info::MOV
))
15394 if ((which
& AARCH64_CHECK_BIC
) != 0
15395 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
15396 simd_immediate_info::MVN
))
15399 /* Try using a replicated byte. */
15400 if (which
== AARCH64_CHECK_MOV
15401 && val16
== (val32
>> 16)
15402 && val8
== (val16
>> 8))
15405 *info
= simd_immediate_info (QImode
, val8
);
15410 /* Try using a bit-to-bytemask. */
15411 if (which
== AARCH64_CHECK_MOV
)
15414 for (i
= 0; i
< 64; i
+= 8)
15416 unsigned char byte
= (val64
>> i
) & 0xff;
15417 if (byte
!= 0 && byte
!= 0xff)
15423 *info
= simd_immediate_info (DImode
, val64
);
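
/* The final case above is the 64-bit Advanced SIMD "byte mask"
   immediate: each byte of VAL64 must be either 0x00 or 0xff, as in
   0x00ff00ffff0000ff for example.  */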
15430 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15431 instruction. If INFO is nonnull, use it to describe valid immediates. */
15434 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
15435 simd_immediate_info
*info
)
15437 scalar_int_mode mode
= DImode
;
15438 unsigned int val32
= val64
& 0xffffffff;
15439 if (val32
== (val64
>> 32))
15442 unsigned int val16
= val32
& 0xffff;
15443 if (val16
== (val32
>> 16))
15446 unsigned int val8
= val16
& 0xff;
15447 if (val8
== (val16
>> 8))
15451 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
15452 if (IN_RANGE (val
, -0x80, 0x7f))
15454 /* DUP with no shift. */
15456 *info
= simd_immediate_info (mode
, val
);
15459 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
15461 /* DUP with LSL #8. */
15463 *info
= simd_immediate_info (mode
, val
);
15466 if (aarch64_bitmask_imm (val64
, mode
))
15470 *info
= simd_immediate_info (mode
, val
);
15476 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15477 it to describe valid immediates. */
15480 aarch64_sve_pred_valid_immediate (rtx x
, simd_immediate_info
*info
)
15482 if (x
== CONST0_RTX (GET_MODE (x
)))
15485 *info
= simd_immediate_info (DImode
, 0);
15489 /* Analyze the value as a VNx16BImode. This should be relatively
15490 efficient, since rtx_vector_builder has enough built-in capacity
15491 to store all VLA predicate constants without needing the heap. */
15492 rtx_vector_builder builder
;
15493 if (!aarch64_get_sve_pred_bits (builder
, x
))
15496 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
15497 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
15499 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
15500 aarch64_svpattern pattern
= aarch64_svpattern_for_vl (mode
, vl
);
15501 if (pattern
!= AARCH64_NUM_SVPATTERNS
)
15505 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
15506 *info
= simd_immediate_info (int_mode
, pattern
);
15514 /* Return true if OP is a valid SIMD immediate for the operation
15515 described by WHICH. If INFO is nonnull, use it to describe valid
15518 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
15519 enum simd_immediate_check which
)
15521 machine_mode mode
= GET_MODE (op
);
15522 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
15523 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
15526 if (vec_flags
& VEC_SVE_PRED
)
15527 return aarch64_sve_pred_valid_immediate (op
, info
);
15529 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
15531 unsigned int n_elts
;
15532 if (GET_CODE (op
) == CONST_VECTOR
15533 && CONST_VECTOR_DUPLICATE_P (op
))
15534 n_elts
= CONST_VECTOR_NPATTERNS (op
);
15535 else if ((vec_flags
& VEC_SVE_DATA
)
15536 && const_vec_series_p (op
, &base
, &step
))
15538 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
15539 if (!aarch64_sve_index_immediate_p (base
)
15540 || !aarch64_sve_index_immediate_p (step
))
15544 *info
= simd_immediate_info (elt_mode
, base
, step
);
15547 else if (GET_CODE (op
) == CONST_VECTOR
15548 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
15549 /* N_ELTS set above. */;
15553 scalar_float_mode elt_float_mode
;
15555 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
15557 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
15558 if (aarch64_float_const_zero_rtx_p (elt
)
15559 || aarch64_float_const_representable_p (elt
))
15562 *info
= simd_immediate_info (elt_float_mode
, elt
);
15567 unsigned int elt_size
= GET_MODE_SIZE (elt_mode
);
15571 scalar_int_mode elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
15573 /* Expand the vector constant out into a byte vector, with the least
15574 significant byte of the register first. */
15575 auto_vec
<unsigned char, 16> bytes
;
15576 bytes
.reserve (n_elts
* elt_size
);
15577 for (unsigned int i
= 0; i
< n_elts
; i
++)
15579 /* The vector is provided in gcc endian-neutral fashion.
15580 For aarch64_be Advanced SIMD, it must be laid out in the vector
15581 register in reverse order. */
15582 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
15583 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
15585 if (elt_mode
!= elt_int_mode
)
15586 elt
= gen_lowpart (elt_int_mode
, elt
);
15588 if (!CONST_INT_P (elt
))
15591 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
15592 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
15594 bytes
.quick_push (elt_val
& 0xff);
15595 elt_val
>>= BITS_PER_UNIT
;
15599 /* The immediate must repeat every eight bytes. */
15600 unsigned int nbytes
= bytes
.length ();
15601 for (unsigned i
= 8; i
< nbytes
; ++i
)
15602 if (bytes
[i
] != bytes
[i
- 8])
15605 /* Get the repeating 8-byte value as an integer. No endian correction
15606 is needed here because bytes is already in lsb-first order. */
15607 unsigned HOST_WIDE_INT val64
= 0;
15608 for (unsigned int i
= 0; i
< 8; i
++)
15609 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
15610 << (i
* BITS_PER_UNIT
));
15612 if (vec_flags
& VEC_SVE_DATA
)
15613 return aarch64_sve_valid_immediate (val64
, info
);
15615 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
15618 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15619 has a step in the range of INDEX. Return the index expression if so,
15620 otherwise return null. */
15622 aarch64_check_zero_based_sve_index_immediate (rtx x
)
15625 if (const_vec_series_p (x
, &base
, &step
)
15626 && base
== const0_rtx
15627 && aarch64_sve_index_immediate_p (step
))
/* Check whether immediate shift constants are within range.  */
bool
aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
{
  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
  if (left)
    return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
  else
    return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
}
/* Return the bitmask CONST_INT to select the bits required by a zero extract
   operation of width WIDTH at bit position POS.  */

rtx
aarch64_mask_from_zextract_ops (rtx width, rtx pos)
{
  gcc_assert (CONST_INT_P (width));
  gcc_assert (CONST_INT_P (pos));

  unsigned HOST_WIDE_INT mask
    = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
  return GEN_INT (mask << UINTVAL (pos));
}
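
/* For example, WIDTH = 8 and POS = 16 give the mask 0x00ff0000.  */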
15658 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
15660 if (GET_CODE (x
) == HIGH
15661 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
15664 if (CONST_INT_P (x
))
15667 if (VECTOR_MODE_P (GET_MODE (x
)))
15669 /* Require predicate constants to be VNx16BI before RA, so that we
15670 force everything to have a canonical form. */
15671 if (!lra_in_progress
15672 && !reload_completed
15673 && GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_BOOL
15674 && GET_MODE (x
) != VNx16BImode
)
15677 return aarch64_simd_valid_immediate (x
, NULL
);
15680 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
15683 if (aarch64_sve_cnt_immediate_p (x
))
15686 return aarch64_classify_symbolic_expression (x
)
15687 == SYMBOL_TINY_ABSOLUTE
;
/* Return a const_int vector of VAL.  */
rtx
aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
{
  rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
  return gen_const_vec_duplicate (mode, c);
}

/* Check OP is a legal scalar immediate for the MOVI instruction.  */

bool
aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
{
  machine_mode vmode;

  vmode = aarch64_simd_container_mode (mode, 64);
  rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
  return aarch64_simd_valid_immediate (op_v, NULL);
}
15710 /* Construct and return a PARALLEL RTX vector with elements numbering the
15711 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15712 the vector - from the perspective of the architecture. This does not
15713 line up with GCC's perspective on lane numbers, so we end up with
15714 different masks depending on our target endian-ness. The diagram
15715 below may help. We must draw the distinction when building masks
15716 which select one half of the vector. An instruction selecting
15717 architectural low-lanes for a big-endian target, must be described using
15718 a mask selecting GCC high-lanes.
15720 Big-Endian Little-Endian
15722 GCC 0 1 2 3 3 2 1 0
15723 | x | x | x | x | | x | x | x | x |
15724 Architecture 3 2 1 0 3 2 1 0
15726 Low Mask: { 2, 3 } { 0, 1 }
15727 High Mask: { 0, 1 } { 2, 3 }
15729 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15732 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
15734 rtvec v
= rtvec_alloc (nunits
/ 2);
15735 int high_base
= nunits
/ 2;
15741 if (BYTES_BIG_ENDIAN
)
15742 base
= high
? low_base
: high_base
;
15744 base
= high
? high_base
: low_base
;
15746 for (i
= 0; i
< nunits
/ 2; i
++)
15747 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
15749 t1
= gen_rtx_PARALLEL (mode
, v
);
15753 /* Check OP for validity as a PARALLEL RTX vector with elements
15754 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15755 from the perspective of the architecture. See the diagram above
15756 aarch64_simd_vect_par_cnst_half for more details. */
15759 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
15763 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
15766 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
15767 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
15768 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
15771 if (count_op
!= count_ideal
)
15774 for (i
= 0; i
< count_ideal
; i
++)
15776 rtx elt_op
= XVECEXP (op
, 0, i
);
15777 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
15779 if (!CONST_INT_P (elt_op
)
15780 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
/* Return a PARALLEL containing NELTS elements, with element I equal
   to BASE + I * STEP.  */

rtx
aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
{
  rtvec vec = rtvec_alloc (nelts);
  for (unsigned int i = 0; i < nelts; ++i)
    RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
  return gen_rtx_PARALLEL (VOIDmode, vec);
}

/* Return true if OP is a PARALLEL of CONST_INTs that form a linear
   series with step STEP.  */

bool
aarch64_stepped_int_parallel_p (rtx op, int step)
{
  if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
    return false;

  unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
  for (int i = 1; i < XVECLEN (op, 0); ++i)
    if (!CONST_INT_P (XVECEXP (op, 0, i))
	|| UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
      return false;

  return true;
}
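
/* For example, (parallel [(const_int 1) (const_int 3) (const_int 5)])
   is a stepped parallel with base 1 and STEP 2.  */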
15816 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15817 HIGH (exclusive). */
15819 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
15822 HOST_WIDE_INT lane
;
15823 gcc_assert (CONST_INT_P (operand
));
15824 lane
= INTVAL (operand
);
15826 if (lane
< low
|| lane
>= high
)
15829 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
15831 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
/* Perform endian correction on lane number N, which indexes a vector
   of mode MODE, and return the result as an SImode rtx.  */

rtx
aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
{
  return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
}
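
/* For example, assuming the usual ENDIAN_LANE_N mapping of N to
   NUNITS - 1 - N on big-endian targets, lane 0 of a V4SI value becomes
   architectural lane 3 on big-endian and stays lane 0 on
   little-endian.  */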
/* Return TRUE if OP is a valid vector addressing mode.  */
bool
aarch64_simd_mem_operand_p (rtx op)
{
  return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
			|| REG_P (XEXP (op, 0)));
}
15853 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15856 aarch64_sve_ld1r_operand_p (rtx op
)
15858 struct aarch64_address_info addr
;
15862 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
15863 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
15864 && addr
.type
== ADDRESS_REG_IMM
15865 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
15868 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15870 aarch64_sve_ld1rq_operand_p (rtx op
)
15872 struct aarch64_address_info addr
;
15873 scalar_mode elem_mode
= GET_MODE_INNER (GET_MODE (op
));
15875 || !aarch64_classify_address (&addr
, XEXP (op
, 0), elem_mode
, false))
15878 if (addr
.type
== ADDRESS_REG_IMM
)
15879 return offset_4bit_signed_scaled_p (TImode
, addr
.const_offset
);
15881 if (addr
.type
== ADDRESS_REG_REG
)
15882 return (1U << addr
.shift
) == GET_MODE_SIZE (elem_mode
);
/* Return true if OP is a valid MEM operand for an SVE LDR instruction.
   The conditions for STR are the same.  */
bool
aarch64_sve_ldr_operand_p (rtx op)
{
  struct aarch64_address_info addr;

  return (MEM_P (op)
	  && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
				       false, ADDR_QUERY_ANY)
	  && addr.type == ADDRESS_REG_IMM);
}
15900 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15901 We need to be able to access the individual pieces, so the range
15902 is different from LD[234] and ST[234]. */
15904 aarch64_sve_struct_memory_operand_p (rtx op
)
15909 machine_mode mode
= GET_MODE (op
);
15910 struct aarch64_address_info addr
;
15911 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
15913 || addr
.type
!= ADDRESS_REG_IMM
)
15916 poly_int64 first
= addr
.const_offset
;
15917 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
15918 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
15919 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
15922 /* Emit a register copy from operand to operand, taking care not to
15923 early-clobber source registers in the process.
15925 COUNT is the number of components into which the copy needs to be
15928 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
15929 unsigned int count
)
15932 int rdest
= REGNO (operands
[0]);
15933 int rsrc
= REGNO (operands
[1]);
15935 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
15937 for (i
= 0; i
< count
; i
++)
15938 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
15939 gen_rtx_REG (mode
, rsrc
+ i
));
15941 for (i
= 0; i
< count
; i
++)
15942 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
15943 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
   one of VSTRUCT modes: OI, CI, or XI.  */
int
aarch64_simd_attr_length_rglist (machine_mode mode)
{
  /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
  return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
}

/* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
   alignment of a vector to 128 bits.  SVE predicates have an alignment of
   16 bits.  */
static HOST_WIDE_INT
aarch64_simd_vector_alignment (const_tree type)
{
  /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
     be set for non-predicate vectors of booleans.  Modes are the most
     direct way we have of identifying real SVE predicate types.  */
  if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
    return 16;
  if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
    return 128;
  return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
}
15971 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15973 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
15975 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
15977 /* If the length of the vector is fixed, try to align to that length,
15978 otherwise don't try to align at all. */
15979 HOST_WIDE_INT result
;
15980 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
15981 result
= TYPE_ALIGN (TREE_TYPE (type
));
15984 return TYPE_ALIGN (type
);
15987 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15989 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
15994 /* For fixed-length vectors, check that the vectorizer will aim for
15995 full-vector alignment. This isn't true for generic GCC vectors
15996 that are wider than the ABI maximum of 128 bits. */
15997 poly_uint64 preferred_alignment
=
15998 aarch64_vectorize_preferred_vector_alignment (type
);
15999 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
16000 && maybe_ne (wi::to_widest (TYPE_SIZE (type
)),
16001 preferred_alignment
))
16004 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
16008 /* Return true if the vector misalignment factor is supported by the
16011 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
16012 const_tree type
, int misalignment
,
16015 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
16017 /* Return if movmisalign pattern is not supported for this mode. */
16018 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
16021 /* Misalignment factor is unknown at compile time. */
16022 if (misalignment
== -1)
16025 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
16029 /* If VALS is a vector constant that can be loaded into a register
16030 using DUP, generate instructions to do so and return an RTX to
16031 assign to the register. Otherwise return NULL_RTX. */
16033 aarch64_simd_dup_constant (rtx vals
)
16035 machine_mode mode
= GET_MODE (vals
);
16036 machine_mode inner_mode
= GET_MODE_INNER (mode
);
16039 if (!const_vec_duplicate_p (vals
, &x
))
16042 /* We can load this constant by using DUP and a constant in a
16043 single ARM register. This will be cheaper than a vector
16045 x
= copy_to_mode_reg (inner_mode
, x
);
16046 return gen_vec_duplicate (mode
, x
);
16050 /* Generate code to load VALS, which is a PARALLEL containing only
16051 constants (for vec_init) or CONST_VECTOR, efficiently into a
16052 register. Returns an RTX to copy into the register, or NULL_RTX
16053 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
16055 aarch64_simd_make_constant (rtx vals
)
16057 machine_mode mode
= GET_MODE (vals
);
16059 rtx const_vec
= NULL_RTX
;
16063 if (GET_CODE (vals
) == CONST_VECTOR
)
16065 else if (GET_CODE (vals
) == PARALLEL
)
16067 /* A CONST_VECTOR must contain only CONST_INTs and
16068 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
16069 Only store valid constants in a CONST_VECTOR. */
16070 int n_elts
= XVECLEN (vals
, 0);
16071 for (i
= 0; i
< n_elts
; ++i
)
16073 rtx x
= XVECEXP (vals
, 0, i
);
16074 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
16077 if (n_const
== n_elts
)
16078 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
16081 gcc_unreachable ();
16083 if (const_vec
!= NULL_RTX
16084 && aarch64_simd_valid_immediate (const_vec
, NULL
))
16085 /* Load using MOVI/MVNI. */
16087 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
16088 /* Loaded using DUP. */
16090 else if (const_vec
!= NULL_RTX
)
16091 /* Load from constant pool. We cannot take advantage of single-cycle
16092 LD1 because we need a PC-relative addressing mode. */
16095 /* A PARALLEL containing something not valid inside CONST_VECTOR.
16096 We cannot construct an initializer. */
16100 /* Expand a vector initialisation sequence, such that TARGET is
16101 initialised to contain VALS. */
16104 aarch64_expand_vector_init (rtx target
, rtx vals
)
16106 machine_mode mode
= GET_MODE (target
);
16107 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
16108 /* The number of vector elements. */
16109 int n_elts
= XVECLEN (vals
, 0);
16110 /* The number of vector elements which are not constant. */
16112 rtx any_const
= NULL_RTX
;
16113 /* The first element of vals. */
16114 rtx v0
= XVECEXP (vals
, 0, 0);
16115 bool all_same
= true;
  /* This is a special vec_init<M><N> where N is not an element mode but a
     vector mode with half the elements of M.  We expect to find two entries
     of mode N in VALS and we must put their concatenation into TARGET.  */
16120 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
16122 gcc_assert (known_eq (GET_MODE_SIZE (mode
),
16123 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals
, 0, 0)))));
16124 rtx lo
= XVECEXP (vals
, 0, 0);
16125 rtx hi
= XVECEXP (vals
, 0, 1);
16126 machine_mode narrow_mode
= GET_MODE (lo
);
16127 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
);
16128 gcc_assert (narrow_mode
== GET_MODE (hi
));
16130 /* When we want to concatenate a half-width vector with zeroes we can
16131 use the aarch64_combinez[_be] patterns. Just make sure that the
16132 zeroes are in the right half. */
16133 if (BYTES_BIG_ENDIAN
16134 && aarch64_simd_imm_zero (lo
, narrow_mode
)
16135 && general_operand (hi
, narrow_mode
))
16136 emit_insn (gen_aarch64_combinez_be (narrow_mode
, target
, hi
, lo
));
16137 else if (!BYTES_BIG_ENDIAN
16138 && aarch64_simd_imm_zero (hi
, narrow_mode
)
16139 && general_operand (lo
, narrow_mode
))
16140 emit_insn (gen_aarch64_combinez (narrow_mode
, target
, lo
, hi
));
16143 /* Else create the two half-width registers and combine them. */
16145 lo
= force_reg (GET_MODE (lo
), lo
);
16147 hi
= force_reg (GET_MODE (hi
), hi
);
16149 if (BYTES_BIG_ENDIAN
)
16150 std::swap (lo
, hi
);
16151 emit_insn (gen_aarch64_simd_combine (narrow_mode
, target
, lo
, hi
));
16156 /* Count the number of variable elements to initialise. */
16157 for (int i
= 0; i
< n_elts
; ++i
)
16159 rtx x
= XVECEXP (vals
, 0, i
);
16160 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
16165 all_same
&= rtx_equal_p (x
, v0
);
16168 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16169 how best to handle this. */
16172 rtx constant
= aarch64_simd_make_constant (vals
);
16173 if (constant
!= NULL_RTX
)
16175 emit_move_insn (target
, constant
);
16180 /* Splat a single non-constant element if we can. */
16183 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
16184 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
16188 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
16189 gcc_assert (icode
!= CODE_FOR_nothing
);
16191 /* If there are only variable elements, try to optimize
16192 the insertion using dup for the most common element
16193 followed by insertions. */
16195 /* The algorithm will fill matches[*][0] with the earliest matching element,
16196 and matches[X][1] with the count of duplicate elements (if X is the
16197 earliest element which has duplicates). */
16199 if (n_var
== n_elts
&& n_elts
<= 16)
16201 int matches
[16][2] = {0};
16202 for (int i
= 0; i
< n_elts
; i
++)
16204 for (int j
= 0; j
<= i
; j
++)
16206 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
16214 int maxelement
= 0;
16216 for (int i
= 0; i
< n_elts
; i
++)
16217 if (matches
[i
][1] > maxv
)
16220 maxv
= matches
[i
][1];
16223 /* Create a duplicate of the most common element, unless all elements
16224 are equally useless to us, in which case just immediately set the
16225 vector register using the first element. */
16229 /* For vectors of two 64-bit elements, we can do even better. */
16231 && (inner_mode
== E_DImode
16232 || inner_mode
== E_DFmode
))
16235 rtx x0
= XVECEXP (vals
, 0, 0);
16236 rtx x1
= XVECEXP (vals
, 0, 1);
16237 /* Combine can pick up this case, but handling it directly
16238 here leaves clearer RTL.
16240 This is load_pair_lanes<mode>, and also gives us a clean-up
16241 for store_pair_lanes<mode>. */
16242 if (memory_operand (x0
, inner_mode
)
16243 && memory_operand (x1
, inner_mode
)
16244 && !STRICT_ALIGNMENT
16245 && rtx_equal_p (XEXP (x1
, 0),
16246 plus_constant (Pmode
,
16248 GET_MODE_SIZE (inner_mode
))))
16251 if (inner_mode
== DFmode
)
16252 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
16254 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
16259 /* The subreg-move sequence below will move into lane zero of the
16260 vector register. For big-endian we want that position to hold
16261 the last element of VALS. */
16262 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
16263 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
16264 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
16268 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
16269 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
16272 /* Insert the rest. */
16273 for (int i
= 0; i
< n_elts
; i
++)
16275 rtx x
= XVECEXP (vals
, 0, i
);
16276 if (matches
[i
][0] == maxelement
)
16278 x
= copy_to_mode_reg (inner_mode
, x
);
16279 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
16284 /* Initialise a vector which is part-variable. We want to first try
16285 to build those lanes which are constant in the most efficient way we
16287 if (n_var
!= n_elts
)
16289 rtx copy
= copy_rtx (vals
);
16291 /* Load constant part of vector. We really don't care what goes into the
16292 parts we will overwrite, but we're more likely to be able to load the
16293 constant efficiently if it has fewer, larger, repeating parts
16294 (see aarch64_simd_valid_immediate). */
16295 for (int i
= 0; i
< n_elts
; i
++)
16297 rtx x
= XVECEXP (vals
, 0, i
);
16298 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
16300 rtx subst
= any_const
;
16301 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
16303 /* Look in the copied vector, as more elements are const. */
16304 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
16305 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
16311 XVECEXP (copy
, 0, i
) = subst
;
16313 aarch64_expand_vector_init (target
, copy
);
16316 /* Insert the variable lanes directly. */
16317 for (int i
= 0; i
< n_elts
; i
++)
16319 rtx x
= XVECEXP (vals
, 0, i
);
16320 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
16322 x
= copy_to_mode_reg (inner_mode
, x
);
16323 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
/* Emit RTL corresponding to:
   insr TARGET, ELEM.  */

static void
emit_insr (rtx target, rtx elem)
{
  machine_mode mode = GET_MODE (target);
  scalar_mode elem_mode = GET_MODE_INNER (mode);
  elem = force_reg (elem_mode, elem);

  insn_code icode = optab_handler (vec_shl_insert_optab, mode);
  gcc_assert (icode != CODE_FOR_nothing);
  emit_insn (GEN_FCN (icode) (target, target, elem));
}
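
/* SVE INSR shifts the whole vector up by one element and writes ELEM
   into element 0, so the callers below build a vector by emitting
   emit_insr for its elements in reverse order.  */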
16342 /* Subroutine of aarch64_sve_expand_vector_init for handling
16343 trailing constants.
16344 This function works as follows:
16345 (a) Create a new vector consisting of trailing constants.
16346 (b) Initialize TARGET with the constant vector using emit_move_insn.
16347 (c) Insert remaining elements in TARGET using insr.
16348 NELTS is the total number of elements in original vector while
16349 while NELTS_REQD is the number of elements that are actually
16352 ??? The heuristic used is to do above only if number of constants
16353 is at least half the total number of elements. May need fine tuning. */
16356 aarch64_sve_expand_vector_init_handle_trailing_constants
16357 (rtx target
, const rtx_vector_builder
&builder
, int nelts
, int nelts_reqd
)
16359 machine_mode mode
= GET_MODE (target
);
16360 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
16361 int n_trailing_constants
= 0;
16363 for (int i
= nelts_reqd
- 1;
16364 i
>= 0 && aarch64_legitimate_constant_p (elem_mode
, builder
.elt (i
));
16366 n_trailing_constants
++;
16368 if (n_trailing_constants
>= nelts_reqd
/ 2)
16370 rtx_vector_builder
v (mode
, 1, nelts
);
16371 for (int i
= 0; i
< nelts
; i
++)
16372 v
.quick_push (builder
.elt (i
+ nelts_reqd
- n_trailing_constants
));
16373 rtx const_vec
= v
.build ();
16374 emit_move_insn (target
, const_vec
);
16376 for (int i
= nelts_reqd
- n_trailing_constants
- 1; i
>= 0; i
--)
16377 emit_insr (target
, builder
.elt (i
));
16385 /* Subroutine of aarch64_sve_expand_vector_init.
16387 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16388 (b) Skip trailing elements from BUILDER, which are the same as
16389 element NELTS_REQD - 1.
16390 (c) Insert earlier elements in reverse order in TARGET using insr. */
16393 aarch64_sve_expand_vector_init_insert_elems (rtx target
,
16394 const rtx_vector_builder
&builder
,
16397 machine_mode mode
= GET_MODE (target
);
16398 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
16400 struct expand_operand ops
[2];
16401 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
16402 gcc_assert (icode
!= CODE_FOR_nothing
);
16404 create_output_operand (&ops
[0], target
, mode
);
16405 create_input_operand (&ops
[1], builder
.elt (nelts_reqd
- 1), elem_mode
);
16406 expand_insn (icode
, 2, ops
);
16408 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
16409 for (int i
= nelts_reqd
- ndups
- 1; i
>= 0; i
--)
16410 emit_insr (target
, builder
.elt (i
));
/* Subroutine of aarch64_sve_expand_vector_init to handle the case
   when all trailing elements of builder are the same.
   This works as follows:
   (a) Use the expand_insn interface to broadcast the last vector element
       into TARGET.
   (b) Insert the remaining elements in TARGET using insr.

   ??? The heuristic used is to do the above if the number of identical
   trailing elements is at least 3/4 of the total number of elements,
   loosely based on the heuristic from mostly_zeros_p.  May need
   fine-tuning.  */

static bool
aarch64_sve_expand_vector_init_handle_trailing_same_elem
 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
{
  int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
  if (ndups >= (3 * nelts_reqd) / 4)
    {
      aarch64_sve_expand_vector_init_insert_elems (target, builder,
						   nelts_reqd - ndups + 1);
      return true;
    }

  return false;
}
/* Initialize register TARGET from BUILDER.  NELTS is the constant number
   of elements in BUILDER.

   The function tries to initialize TARGET from BUILDER if it fits one
   of the special cases outlined below.

   Failing that, the function divides BUILDER into two sub-vectors:
   v_even = even elements of BUILDER;
   v_odd = odd elements of BUILDER;

   and recursively calls itself with v_even and v_odd.

   if (recursive call succeeded for v_even or v_odd)
     TARGET = zip (v_even, v_odd)

   The function returns true if it managed to build TARGET from BUILDER
   with one of the special cases, false otherwise.

   Example: {a, 1, b, 2, c, 3, d, 4}

   The vector gets divided into:
   v_even = {a, b, c, d}
   v_odd = {1, 2, 3, 4}

   aarch64_sve_expand_vector_init (v_odd) hits case 1 and
   initializes tmp2 from the constant vector v_odd using emit_move_insn.

   aarch64_sve_expand_vector_init (v_even) fails since v_even contains
   4 non-constant elements, so we construct tmp1 from v_even using insr.

   Finally

   TARGET = zip (tmp1, tmp2)

   which sets TARGET to {a, 1, b, 2, c, 3, d, 4}.  */

static bool
aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
				int nelts, int nelts_reqd)
{
  machine_mode mode = GET_MODE (target);

  /* Case 1: Vector contains trailing constants.  */

  if (aarch64_sve_expand_vector_init_handle_trailing_constants
       (target, builder, nelts, nelts_reqd))
    return true;

  /* Case 2: Vector contains leading constants.  */

  rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
  for (int i = 0; i < nelts_reqd; i++)
    rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
  rev_builder.finalize ();

  if (aarch64_sve_expand_vector_init_handle_trailing_constants
       (target, rev_builder, nelts, nelts_reqd))
    {
      /* The leading constants were handled as trailing constants of the
	 reversed vector, so reverse TARGET to undo that.  */
      emit_insn (gen_aarch64_sve_rev (mode, target, target));
      return true;
    }

  /* Case 3: Vector contains trailing same element.  */

  if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
       (target, builder, nelts_reqd))
    return true;

  /* Case 4: Vector contains leading same element.  */

  if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
       (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
    {
      emit_insn (gen_aarch64_sve_rev (mode, target, target));
      return true;
    }

  /* Avoid recursing below 4-elements.
     ??? The threshold 4 may need fine-tuning.  */

  if (nelts_reqd <= 4)
    return false;

  rtx_vector_builder v_even (mode, 1, nelts);
  rtx_vector_builder v_odd (mode, 1, nelts);

  for (int i = 0; i < nelts * 2; i += 2)
    {
      v_even.quick_push (builder.elt (i));
      v_odd.quick_push (builder.elt (i + 1));
    }

  v_even.finalize ();
  v_odd.finalize ();

  rtx tmp1 = gen_reg_rtx (mode);
  bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
						    nelts, nelts_reqd / 2);

  rtx tmp2 = gen_reg_rtx (mode);
  bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
						   nelts, nelts_reqd / 2);

  if (!did_even_p && !did_odd_p)
    return false;

  /* Initialize v_even and v_odd using INSR if it didn't match any of the
     special cases and zip v_even, v_odd.  */

  if (!did_even_p)
    aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);

  if (!did_odd_p)
    aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);

  rtvec v = gen_rtvec (2, tmp1, tmp2);
  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));

  return true;
}
/* Initialize register TARGET from the elements in PARALLEL rtx VALS.  */

void
aarch64_sve_expand_vector_init (rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  int nelts = XVECLEN (vals, 0);

  rtx_vector_builder v (mode, 1, nelts);
  for (int i = 0; i < nelts; i++)
    v.quick_push (XVECEXP (vals, 0, i));
  v.finalize ();

  /* If neither sub-vectors of v could be initialized specially,
     then use INSR to insert all elements from v into TARGET.
     ??? This might not be optimal for vectors with large
     initializers like 16-element or above.
     For nelts < 4, it probably isn't useful to handle specially.  */

  if (nelts < 4
      || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
    aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
}
/* Check whether VALUE is a vector constant in which every element
   is either a power of 2 or a negated power of 2.  If so, return
   a constant vector of log2s, and flip CODE between PLUS and MINUS
   if VALUE contains negated powers of 2.  Return NULL_RTX otherwise.  */

static rtx
aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
{
  if (GET_CODE (value) != CONST_VECTOR)
    return NULL_RTX;

  rtx_vector_builder builder;
  if (!builder.new_unary_operation (GET_MODE (value), value, false))
    return NULL_RTX;

  scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
  /* 1 if the result of the multiplication must be negated,
     0 if it mustn't, or -1 if we don't yet care.  */
  int negate = -1;
  unsigned int encoded_nelts = const_vector_encoded_nelts (value);
  for (unsigned int i = 0; i < encoded_nelts; ++i)
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
      if (!CONST_SCALAR_INT_P (elt))
	return NULL_RTX;
      rtx_mode_t val (elt, int_mode);
      wide_int pow2 = wi::neg (val);
      if (val != pow2)
	{
	  /* It matters whether we negate or not.  Make that choice,
	     and make sure that it's consistent with previous elements.  */
	  if (negate == !wi::neg_p (val))
	    return NULL_RTX;
	  negate = wi::neg_p (val);
	  if (!negate)
	    pow2 = val;
	}
      /* POW2 is now the value that we want to be a power of 2.  */
      int shift = wi::exact_log2 (pow2);
      if (shift < 0)
	return NULL_RTX;
      builder.quick_push (gen_int_mode (shift, int_mode));
    }
  if (negate == -1)
    /* PLUS and MINUS are equivalent; canonicalize on PLUS.  */
    code = PLUS;
  else if (negate == 1)
    code = code == PLUS ? MINUS : PLUS;
  return builder.build ();
}
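/* Illustrative note (not from the original source): for a multiply-add whose
   multiplier is the constant vector {4, 4, 4, 4}, the helper above would
   return the shift vector {2, 2, 2, 2} and leave CODE as PLUS, so
   x + y * 4 can be emitted as x + (y << 2).  For {-8, -8, -8, -8} it would
   return {3, 3, 3, 3} and flip PLUS to MINUS, giving x - (y << 3).  The
   element values here are made up purely for illustration.  */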
/* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
   CODE is PLUS for the former and MINUS for the latter.  OPERANDS is the
   operands array, in the same order as for fma_optab.  Return true if
   the function emitted all the necessary instructions, false if the caller
   should generate the pattern normally with the new OPERANDS array.  */

bool
aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
{
  machine_mode mode = GET_MODE (operands[0]);
  if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
    {
      rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
				  NULL_RTX, true, OPTAB_DIRECT);
      force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
			  operands[3], product, operands[0], true,
			  OPTAB_DIRECT);
      return true;
    }
  operands[2] = force_reg (mode, operands[2]);
  return false;
}
/* Likewise, but for a conditional pattern.  */

bool
aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
{
  machine_mode mode = GET_MODE (operands[0]);
  if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
    {
      rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
				  NULL_RTX, true, OPTAB_DIRECT);
      emit_insn (gen_cond (code, mode, operands[0], operands[1],
			   operands[4], product, operands[5]));
      return true;
    }
  operands[3] = force_reg (mode, operands[3]);
  return false;
}
/* Implement TARGET_SHIFT_TRUNCATION_MASK.  */
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
  if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
    return 0;
  return GET_MODE_UNIT_BITSIZE (mode) - 1;
}
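/* Illustrative note (not from the original source): when
   SHIFT_COUNT_TRUNCATED holds for the target, a DImode shift gets a mask of
   63 here, so the middle end may drop an explicit "& 63" from
   "x << (n & 63)" because the variable-shift instructions already use the
   amount modulo the register width.  Vector modes always return 0, i.e. no
   truncation of the shift amount is assumed for them.  */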
/* Select a format to encode pointers in exception handling data.  */
int
aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
{
  int type;
  switch (aarch64_cmodel)
    {
    case AARCH64_CMODEL_TINY:
    case AARCH64_CMODEL_TINY_PIC:
    case AARCH64_CMODEL_SMALL:
    case AARCH64_CMODEL_SMALL_PIC:
    case AARCH64_CMODEL_SMALL_SPIC:
      /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
	 for everything.  */
      type = DW_EH_PE_sdata4;
      break;
    default:
      /* No assumptions here.  8-byte relocs required.  */
      type = DW_EH_PE_sdata8;
      break;
    }
  return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
}
/* Output .variant_pcs for aarch64_vector_pcs function symbols.  */

static void
aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
{
  if (aarch64_simd_decl_p (decl))
    {
      fprintf (stream, "\t.variant_pcs\t");
      assemble_name (stream, name);
      fprintf (stream, "\n");
    }
}
/* The last .arch and .tune assembly strings that we printed.  */
static std::string aarch64_last_printed_arch_string;
static std::string aarch64_last_printed_tune_string;

/* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
   by the function fndecl.  */

void
aarch64_declare_function_name (FILE *stream, const char* name,
				tree fndecl)
{
  tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  struct cl_target_option *targ_options;
  if (target_parts)
    targ_options = TREE_TARGET_OPTION (target_parts);
  else
    targ_options = TREE_TARGET_OPTION (target_option_current_node);
  gcc_assert (targ_options);

  const struct processor *this_arch
    = aarch64_get_arch (targ_options->x_explicit_arch);

  uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (isa_flags,
						  this_arch->flags);
  /* Only update the assembler .arch string if it is distinct from the last
     such string we printed.  */
  std::string to_print = this_arch->name + extension;
  if (to_print != aarch64_last_printed_arch_string)
    {
      asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
      aarch64_last_printed_arch_string = to_print;
    }

  /* Print the cpu name we're tuning for in the comments, might be
     useful to readers of the generated asm.  Do it only when it changes
     from function to function and verbose assembly is requested.  */
  const struct processor *this_tune
    = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);

  if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
    {
      asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
		   this_tune->name);
      aarch64_last_printed_tune_string = this_tune->name;
    }

  aarch64_asm_output_variant_pcs (stream, fndecl, name);

  /* Don't forget the type directive for ELF.  */
  ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
  ASM_OUTPUT_LABEL (stream, name);
}
/* Implement ASM_OUTPUT_DEF_FROM_DECLS.  Output .variant_pcs for aliases.  */

void
aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
{
  const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
  const char *value = IDENTIFIER_POINTER (target);
  aarch64_asm_output_variant_pcs (stream, decl, name);
  ASM_OUTPUT_DEF (stream, name, value);
}

/* Implement ASM_OUTPUT_EXTERNAL.  Output .variant_pcs for undefined
   function symbol references.  */

void
aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
{
  default_elf_asm_output_external (stream, decl, name);
  aarch64_asm_output_variant_pcs (stream, decl, name);
}
/* Triggered after a .cfi_startproc directive is emitted into the assembly file.
   Used to output the .cfi_b_key_frame directive when signing the current
   function with the B key.  */

void
aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
{
  if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
      && aarch64_ra_sign_key == AARCH64_KEY_B)
    asm_fprintf (f, "\t.cfi_b_key_frame\n");
}
/* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */

static void
aarch64_start_file (void)
{
  struct cl_target_option *default_options
    = TREE_TARGET_OPTION (target_option_default_node);

  const struct processor *default_arch
    = aarch64_get_arch (default_options->x_explicit_arch);
  uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
						   default_arch->flags);

  aarch64_last_printed_arch_string = default_arch->name + extension;
  aarch64_last_printed_tune_string = "";
  asm_fprintf (asm_out_file, "\t.arch %s\n",
	       aarch64_last_printed_arch_string.c_str ());

  default_file_start ();
}
/* Emit load exclusive.  */

static void
aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
			     rtx mem, rtx model_rtx)
{
  if (mode == TImode)
    emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
						gen_highpart (DImode, rval),
						mem, model_rtx));
  else
    emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
}

/* Emit store exclusive.  */

static void
aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
			      rtx mem, rtx rval, rtx model_rtx)
{
  if (mode == TImode)
    emit_insn (gen_aarch64_store_exclusive_pair
	       (bval, mem, operand_subword (rval, 0, 0, TImode),
		operand_subword (rval, 1, 0, TImode), model_rtx));
  else
    emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
}
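/* Illustrative note (not from the original source): for a TImode object the
   helpers above expand to the paired-exclusive forms, roughly

	ldxp	x0, x1, [x2]		// load exclusive pair
	...
	stxp	w3, x0, x1, [x2]	// store exclusive pair, w3 = status

   while narrower modes use single LDXR/STXR (or LDAXR/STLXR, depending on
   the memory model encoded in MODEL_RTX).  Register numbers are made up.  */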
/* Mark the previous jump instruction as unlikely.  */

static void
aarch64_emit_unlikely_jump (rtx insn)
{
  rtx_insn *jump = emit_jump_insn (insn);
  add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
}
/* We store the names of the various atomic helpers in a 5x4 array.
   Return the libcall function given MODE, MODEL and NAMES.  */

rtx
aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
			const atomic_ool_names *names)
{
  memmodel model = memmodel_base (INTVAL (model_rtx));
  int mode_idx, model_idx;

  switch (mode)
    {
    case E_QImode:
      mode_idx = 0;
      break;
    case E_HImode:
      mode_idx = 1;
      break;
    case E_SImode:
      mode_idx = 2;
      break;
    case E_DImode:
      mode_idx = 3;
      break;
    case E_TImode:
      mode_idx = 4;
      break;
    default:
      gcc_unreachable ();
    }

  switch (model)
    {
    case MEMMODEL_RELAXED:
      model_idx = 0;
      break;
    case MEMMODEL_CONSUME:
    case MEMMODEL_ACQUIRE:
      model_idx = 1;
      break;
    case MEMMODEL_RELEASE:
      model_idx = 2;
      break;
    case MEMMODEL_ACQ_REL:
    case MEMMODEL_SEQ_CST:
      model_idx = 3;
      break;
    default:
      gcc_unreachable ();
    }

  return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
				      VISIBILITY_HIDDEN);
}

#define DEF0(B, N) \
  { "__aarch64_" #B #N "_relax", \
    "__aarch64_" #B #N "_acq", \
    "__aarch64_" #B #N "_rel", \
    "__aarch64_" #B #N "_acq_rel" }

#define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
		 { NULL, NULL, NULL, NULL }
#define DEF5(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)

static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };

#undef DEF0
#undef DEF4
#undef DEF5
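/* Illustrative note (not from the original source): given a 4-byte SImode
   access with MEMMODEL_ACQUIRE, aarch64_atomic_ool_func selects
   names->str[2][1], which for the CAS table above is the libgcc helper
   "__aarch64_cas4_acq".  A relaxed 8-byte swap would similarly resolve to
   "__aarch64_swp8_relax".  */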
/* Expand a compare and swap pattern.  */

void
aarch64_expand_compare_and_swap (rtx operands[])
{
  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
  machine_mode mode, r_mode;

  bval = operands[0];
  rval = operands[1];
  mem = operands[2];
  oldval = operands[3];
  newval = operands[4];
  is_weak = operands[5];
  mod_s = operands[6];
  mod_f = operands[7];
  mode = GET_MODE (mem);

  /* Normally the succ memory model must be stronger than fail, but in the
     unlikely event of fail being ACQUIRE and succ being RELEASE we need to
     promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
  if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
      && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
    mod_s = GEN_INT (MEMMODEL_ACQ_REL);

  r_mode = mode;
  if (mode == QImode || mode == HImode)
    {
      r_mode = SImode;
      rval = gen_reg_rtx (r_mode);
    }

  if (TARGET_LSE)
    {
      /* The CAS insn requires oldval and rval overlap, but we need to
	 have a copy of oldval saved across the operation to tell if
	 the operation is successful.  */
      if (reg_overlap_mentioned_p (rval, oldval))
	rval = copy_to_mode_reg (r_mode, oldval);
      else
	emit_move_insn (rval, gen_lowpart (r_mode, oldval));

      emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
						   newval, mod_s));
      cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
    }
  else if (TARGET_OUTLINE_ATOMICS)
    {
      /* Oldval must satisfy compare afterward.  */
      if (!aarch64_plus_operand (oldval, mode))
	oldval = force_reg (mode, oldval);
      rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
      rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
				      oldval, mode, newval, mode,
				      XEXP (mem, 0), Pmode);
      cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
    }
  else
    {
      /* The oldval predicate varies by mode.  Test it and force to reg.  */
      insn_code code = code_for_aarch64_compare_and_swap (mode);
      if (!insn_data[code].operand[2].predicate (oldval, mode))
	oldval = force_reg (mode, oldval);

      emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
				 is_weak, mod_s, mod_f));
      cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
    }

  if (r_mode != mode)
    rval = gen_lowpart (mode, rval);
  emit_move_insn (operands[1], rval);

  x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
  emit_insn (gen_rtx_SET (bval, x));
}
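/* Illustrative note (not from the original source): with LSE enabled, a
   4-byte __atomic_compare_exchange with SEQ_CST ordering can collapse to
   roughly

	mov	w3, w_expected
	casal	w3, w_desired, [x_ptr]
	cmp	w3, w_expected
	cset	w0, eq

   whereas the load/store-exclusive fallback expands into a retry loop,
   which is split out later by aarch64_split_compare_and_swap below.
   Register names are made up.  */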
/* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
   sequence implementing an atomic operation.  */

static void
aarch64_emit_post_barrier (enum memmodel model)
{
  const enum memmodel base_model = memmodel_base (model);

  if (is_mm_sync (model)
      && (base_model == MEMMODEL_ACQUIRE
	  || base_model == MEMMODEL_ACQ_REL
	  || base_model == MEMMODEL_SEQ_CST))
    {
      emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
    }
}
/* Split a compare and swap pattern.  */

void
aarch64_split_compare_and_swap (rtx operands[])
{
  rtx rval, mem, oldval, newval, scratch, x, model_rtx;
  machine_mode mode;
  bool is_weak;
  rtx_code_label *label1, *label2;
  enum memmodel model;

  rval = operands[0];
  mem = operands[1];
  oldval = operands[2];
  newval = operands[3];
  is_weak = (operands[4] != const0_rtx);
  model_rtx = operands[5];
  scratch = operands[7];
  mode = GET_MODE (mem);
  model = memmodel_from_int (INTVAL (model_rtx));

  /* When OLDVAL is zero and we want the strong version we can emit a tighter
    loop:
    .label1:
	LD[A]XR	rval, [mem]
	CBNZ	rval, .label2
	ST[L]XR	scratch, newval, [mem]
	CBNZ	scratch, .label1
    .label2:
	CMP	rval, 0.  */
  bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
			oldval == const0_rtx && mode != TImode);

  label1 = NULL;
  if (!is_weak)
    {
      label1 = gen_label_rtx ();
      emit_label (label1);
    }
  label2 = gen_label_rtx ();

  /* The initial load can be relaxed for a __sync operation since a final
     barrier will be emitted to stop code hoisting.  */
  if (is_mm_sync (model))
    aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
  else
    aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);

  if (strong_zero_p)
    x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
  else
    {
      rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
      x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
    }
  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
			    gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));

  aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);

  if (!is_weak)
    {
      if (aarch64_track_speculation)
	{
	  /* Emit an explicit compare instruction, so that we can correctly
	     track the condition codes.  */
	  rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
	  x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
	}
      else
	x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);

      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
				gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
    }
  else
    aarch64_gen_compare_reg (NE, scratch, const0_rtx);

  emit_label (label2);

  /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
     to set the condition flags.  If this is not used it will be removed by
     later passes.  */
  if (strong_zero_p)
    aarch64_gen_compare_reg (NE, rval, const0_rtx);

  /* Emit any final barrier needed for a __sync operation.  */
  if (is_mm_sync (model))
    aarch64_emit_post_barrier (model);
}
/* Split an atomic operation.  */

void
aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
			 rtx value, rtx model_rtx, rtx cond)
{
  machine_mode mode = GET_MODE (mem);
  machine_mode wmode = (mode == DImode ? DImode : SImode);
  const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
  const bool is_sync = is_mm_sync (model);
  rtx_code_label *label;
  rtx x;

  /* Split the atomic operation into a sequence.  */
  label = gen_label_rtx ();
  emit_label (label);

  if (new_out)
    new_out = gen_lowpart (wmode, new_out);
  if (old_out)
    old_out = gen_lowpart (wmode, old_out);
  else
    old_out = new_out;
  value = simplify_gen_subreg (wmode, value, mode, 0);

  /* The initial load can be relaxed for a __sync operation since a final
     barrier will be emitted to stop code hoisting.  */
  if (is_sync)
    aarch64_emit_load_exclusive (mode, old_out, mem,
				 GEN_INT (MEMMODEL_RELAXED));
  else
    aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);

  switch (code)
    {
    case SET:
      new_out = value;
      break;

    case NOT:
      x = gen_rtx_AND (wmode, old_out, value);
      emit_insn (gen_rtx_SET (new_out, x));
      x = gen_rtx_NOT (wmode, new_out);
      emit_insn (gen_rtx_SET (new_out, x));
      break;

    case MINUS:
      if (CONST_INT_P (value))
	{
	  value = GEN_INT (-INTVAL (value));
	  code = PLUS;
	}
      /* Fall through.  */

    default:
      x = gen_rtx_fmt_ee (code, wmode, old_out, value);
      emit_insn (gen_rtx_SET (new_out, x));
      break;
    }

  aarch64_emit_store_exclusive (mode, cond, mem,
				gen_lowpart (mode, new_out), model_rtx);

  if (aarch64_track_speculation)
    {
      /* Emit an explicit compare instruction, so that we can correctly
	 track the condition codes.  */
      rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
      x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
    }
  else
    x = gen_rtx_NE (VOIDmode, cond, const0_rtx);

  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));

  /* Emit any final barrier needed for a __sync operation.  */
  if (is_sync)
    aarch64_emit_post_barrier (model);
}
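/* Illustrative note (not from the original source): for a relaxed
   __atomic_fetch_add on an SImode object, the split sequence above is a
   retry loop of roughly

   .Lretry:
	ldxr	w0, [x2]		// old_out
	add	w1, w0, w3		// new_out = old_out + value
	stxr	w4, w1, [x2]		// cond = store-exclusive status
	cbnz	w4, .Lretry

   with LDAXR/STLXR substituted for acquire/release models and a trailing
   barrier only for __sync-style operations.  Register numbers are made up.  */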
static void
aarch64_init_libfuncs (void)
{
  /* Half-precision float operations.  The compiler handles all operations
     with NULL libfuncs by converting to SFmode.  */

  /* Conversions.  */
  set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
  set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");

  /* Arithmetic.  */
  set_optab_libfunc (add_optab, HFmode, NULL);
  set_optab_libfunc (sdiv_optab, HFmode, NULL);
  set_optab_libfunc (smul_optab, HFmode, NULL);
  set_optab_libfunc (neg_optab, HFmode, NULL);
  set_optab_libfunc (sub_optab, HFmode, NULL);

  /* Comparisons.  */
  set_optab_libfunc (eq_optab, HFmode, NULL);
  set_optab_libfunc (ne_optab, HFmode, NULL);
  set_optab_libfunc (lt_optab, HFmode, NULL);
  set_optab_libfunc (le_optab, HFmode, NULL);
  set_optab_libfunc (ge_optab, HFmode, NULL);
  set_optab_libfunc (gt_optab, HFmode, NULL);
  set_optab_libfunc (unord_optab, HFmode, NULL);
}

/* Target hook for c_mode_for_suffix.  */
static machine_mode
aarch64_c_mode_for_suffix (char suffix)
{
  if (suffix == 'q')
    return TFmode;

  return VOIDmode;
}
/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
   by:

   (-1)^s * (n/16) * 2^r

   Where:
     's' is the sign bit.
     'n' is an integer in the range 16 <= n <= 31.
     'r' is an integer in the range -3 <= r <= 4.  */

/* Return true iff X can be represented by a quarter-precision
   floating point immediate operand X.  Note, we cannot represent 0.0.  */
bool
aarch64_float_const_representable_p (rtx x)
{
  /* This represents our current view of how many bits
     make up the mantissa.  */
  int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
  int exponent;
  unsigned HOST_WIDE_INT mantissa, mask;
  REAL_VALUE_TYPE r, m;
  bool fail;

  x = unwrap_const_vec_duplicate (x);
  if (!CONST_DOUBLE_P (x))
    return false;

  if (GET_MODE (x) == VOIDmode
      || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
    return false;

  r = *CONST_DOUBLE_REAL_VALUE (x);

  /* We cannot represent infinities, NaNs or +/-zero.  We won't
     know if we have +zero until we analyse the mantissa, but we
     can reject the other invalid values.  */
  if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
      || REAL_VALUE_MINUS_ZERO (r))
    return false;

  /* Extract exponent.  */
  r = real_value_abs (&r);
  exponent = REAL_EXP (&r);

  /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
     highest (sign) bit, with a fixed binary point at bit point_pos.
     m1 holds the low part of the mantissa, m2 the high part.
     WARNING: If we ever have a representation using more than 2 * H_W_I - 1
     bits for the mantissa, this can fail (low bits will be lost).  */
  real_ldexp (&m, &r, point_pos - exponent);
  wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);

  /* If the low part of the mantissa has bits set we cannot represent
     the value.  */
  if (w.ulow () != 0)
    return false;
  /* We have rejected the lower HOST_WIDE_INT, so update our
     understanding of how many bits lie in the mantissa and
     look only at the high HOST_WIDE_INT.  */
  mantissa = w.elt (1);
  point_pos -= HOST_BITS_PER_WIDE_INT;

  /* We can only represent values with a mantissa of the form 1.xxxx.  */
  mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
  if ((mantissa & mask) != 0)
    return false;

  /* Having filtered unrepresentable values, we may now remove all
     but the highest 5 bits.  */
  mantissa >>= point_pos - 5;

  /* We cannot represent the value 0.0, so reject it.  This is handled
     elsewhere.  */
  if (mantissa == 0)
    return false;

  /* Then, as bit 4 is always set, we can mask it off, leaving
     the mantissa in the range [0, 15].  */
  mantissa &= ~(1 << 4);
  gcc_assert (mantissa <= 15);

  /* GCC internally does not use IEEE754-like encoding (where normalized
     significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
     Our mantissa values are shifted 4 places to the left relative to
     normalized IEEE754 so we must modify the exponent returned by REAL_EXP
     by 5 places to correct for GCC's representation.  */
  exponent = 5 - exponent;

  return (exponent >= 0 && exponent <= 7);
}
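/* Illustrative note (not from the original source): using the formula in the
   comment above, 0.5 = (-1)^0 * (16/16) * 2^-1 and 31.0 = (-1)^0 * (31/16) * 2^4
   are both representable, so they can be emitted directly as FMOV (immediate)
   operands, whereas 0.1 has no exact (n/16) * 2^r form and 0.0 is explicitly
   excluded, so both of those take the constant-pool / integer-move path
   instead.  */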
17345 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17346 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17347 output MOVI/MVNI, ORR or BIC immediate. */
17349 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
17350 enum simd_immediate_check which
)
17353 static char templ
[40];
17354 const char *mnemonic
;
17355 const char *shift_op
;
17356 unsigned int lane_count
= 0;
17359 struct simd_immediate_info info
;
17361 /* This will return true to show const_vector is legal for use as either
17362 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17363 It will also update INFO to show how the immediate should be generated.
17364 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17365 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
17366 gcc_assert (is_valid
);
17368 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
17369 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
17371 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
17373 gcc_assert (info
.insn
== simd_immediate_info::MOV
17374 && info
.u
.mov
.shift
== 0);
17375 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17376 move immediate path. */
17377 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
17378 info
.u
.mov
.value
= GEN_INT (0);
17381 const unsigned int buf_size
= 20;
17382 char float_buf
[buf_size
] = {'\0'};
17383 real_to_decimal_for_mode (float_buf
,
17384 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
17385 buf_size
, buf_size
, 1, info
.elt_mode
);
17387 if (lane_count
== 1)
17388 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
17390 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
17391 lane_count
, element_char
, float_buf
);
17396 gcc_assert (CONST_INT_P (info
.u
.mov
.value
));
17398 if (which
== AARCH64_CHECK_MOV
)
17400 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
17401 shift_op
= (info
.u
.mov
.modifier
== simd_immediate_info::MSL
17403 if (lane_count
== 1)
17404 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
17405 mnemonic
, UINTVAL (info
.u
.mov
.value
));
17406 else if (info
.u
.mov
.shift
)
17407 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
17408 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
17409 element_char
, UINTVAL (info
.u
.mov
.value
), shift_op
,
17412 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
17413 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
17414 element_char
, UINTVAL (info
.u
.mov
.value
));
17418 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17419 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
17420 if (info
.u
.mov
.shift
)
17421 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
17422 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
17423 element_char
, UINTVAL (info
.u
.mov
.value
), "lsl",
17426 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
17427 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
17428 element_char
, UINTVAL (info
.u
.mov
.value
));
17434 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
17437 /* If a floating point number was passed and we desire to use it in an
17438 integer mode do the conversion to integer. */
17439 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
17441 unsigned HOST_WIDE_INT ival
;
17442 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
17443 gcc_unreachable ();
17444 immediate
= gen_int_mode (ival
, mode
);
17447 machine_mode vmode
;
17448 /* use a 64 bit mode for everything except for DI/DF mode, where we use
17449 a 128 bit vector mode. */
17450 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
17452 vmode
= aarch64_simd_container_mode (mode
, width
);
17453 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
17454 return aarch64_output_simd_mov_immediate (v_op
, width
);
17457 /* Return the output string to use for moving immediate CONST_VECTOR
17458 into an SVE register. */
17461 aarch64_output_sve_mov_immediate (rtx const_vector
)
17463 static char templ
[40];
17464 struct simd_immediate_info info
;
17467 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
17468 gcc_assert (is_valid
);
17470 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
17472 machine_mode vec_mode
= GET_MODE (const_vector
);
17473 if (aarch64_sve_pred_mode_p (vec_mode
))
17475 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
17476 if (info
.insn
== simd_immediate_info::MOV
)
17478 gcc_assert (info
.u
.mov
.value
== const0_rtx
);
17479 snprintf (buf
, sizeof (buf
), "pfalse\t%%0.b");
17483 gcc_assert (info
.insn
== simd_immediate_info::PTRUE
);
17484 unsigned int total_bytes
;
17485 if (info
.u
.pattern
== AARCH64_SV_ALL
17486 && BYTES_PER_SVE_VECTOR
.is_constant (&total_bytes
))
17487 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", element_char
,
17488 total_bytes
/ GET_MODE_SIZE (info
.elt_mode
));
17490 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, %s", element_char
,
17491 svpattern_token (info
.u
.pattern
));
17496 if (info
.insn
== simd_immediate_info::INDEX
)
17498 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
17499 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
17500 element_char
, INTVAL (info
.u
.index
.base
),
17501 INTVAL (info
.u
.index
.step
));
17505 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
17507 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
17508 info
.u
.mov
.value
= GEN_INT (0);
17511 const int buf_size
= 20;
17512 char float_buf
[buf_size
] = {};
17513 real_to_decimal_for_mode (float_buf
,
17514 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
17515 buf_size
, buf_size
, 1, info
.elt_mode
);
17517 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
17518 element_char
, float_buf
);
17523 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
17524 element_char
, INTVAL (info
.u
.mov
.value
));
17528 /* Split operands into moves from op[1] + op[2] into op[0]. */
17531 aarch64_split_combinev16qi (rtx operands
[3])
17533 unsigned int dest
= REGNO (operands
[0]);
17534 unsigned int src1
= REGNO (operands
[1]);
17535 unsigned int src2
= REGNO (operands
[2]);
17536 machine_mode halfmode
= GET_MODE (operands
[1]);
17537 unsigned int halfregs
= REG_NREGS (operands
[1]);
17538 rtx destlo
, desthi
;
17540 gcc_assert (halfmode
== V16QImode
);
17542 if (src1
== dest
&& src2
== dest
+ halfregs
)
17544 /* No-op move. Can't split to nothing; emit something. */
17545 emit_note (NOTE_INSN_DELETED
);
17549 /* Preserve register attributes for variable tracking. */
17550 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
17551 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
17552 GET_MODE_SIZE (halfmode
));
17554 /* Special case of reversed high/low parts. */
17555 if (reg_overlap_mentioned_p (operands
[2], destlo
)
17556 && reg_overlap_mentioned_p (operands
[1], desthi
))
17558 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
17559 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
17560 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
17562 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
17564 /* Try to avoid unnecessary moves if part of the result
17565 is in the right place already. */
17567 emit_move_insn (destlo
, operands
[1]);
17568 if (src2
!= dest
+ halfregs
)
17569 emit_move_insn (desthi
, operands
[2]);
17573 if (src2
!= dest
+ halfregs
)
17574 emit_move_insn (desthi
, operands
[2]);
17576 emit_move_insn (destlo
, operands
[1]);
17580 /* vec_perm support. */
17582 struct expand_vec_perm_d
17584 rtx target
, op0
, op1
;
17585 vec_perm_indices perm
;
17586 machine_mode vmode
;
17587 unsigned int vec_flags
;
17592 /* Generate a variable permutation. */
17595 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
17597 machine_mode vmode
= GET_MODE (target
);
17598 bool one_vector_p
= rtx_equal_p (op0
, op1
);
17600 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
17601 gcc_checking_assert (GET_MODE (op0
) == vmode
);
17602 gcc_checking_assert (GET_MODE (op1
) == vmode
);
17603 gcc_checking_assert (GET_MODE (sel
) == vmode
);
17604 gcc_checking_assert (TARGET_SIMD
);
17608 if (vmode
== V8QImode
)
17610 /* Expand the argument to a V16QI mode by duplicating it. */
17611 rtx pair
= gen_reg_rtx (V16QImode
);
17612 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
17613 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
17617 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
17624 if (vmode
== V8QImode
)
17626 pair
= gen_reg_rtx (V16QImode
);
17627 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
17628 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
17632 pair
= gen_reg_rtx (OImode
);
17633 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
17634 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
17639 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17640 NELT is the number of elements in the vector. */
17643 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
17646 machine_mode vmode
= GET_MODE (target
);
17647 bool one_vector_p
= rtx_equal_p (op0
, op1
);
17650 /* The TBL instruction does not use a modulo index, so we must take care
17651 of that ourselves. */
17652 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
17653 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
17654 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
17656 /* For big-endian, we also need to reverse the index within the vector
17657 (but not which vector). */
17658 if (BYTES_BIG_ENDIAN
)
17660 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17662 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
17663 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
17664 NULL
, 0, OPTAB_LIB_WIDEN
);
17666 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
17669 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17672 emit_unspec2 (rtx target
, int code
, rtx op0
, rtx op1
)
17674 emit_insn (gen_rtx_SET (target
,
17675 gen_rtx_UNSPEC (GET_MODE (target
),
17676 gen_rtvec (2, op0
, op1
), code
)));
17679 /* Expand an SVE vec_perm with the given operands. */
17682 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
17684 machine_mode data_mode
= GET_MODE (target
);
17685 machine_mode sel_mode
= GET_MODE (sel
);
17686 /* Enforced by the pattern condition. */
17687 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
17689 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17690 size of the two value vectors, i.e. the upper bits of the indices
17691 are effectively ignored. SVE TBL instead produces 0 for any
17692 out-of-range indices, so we need to modulo all the vec_perm indices
17693 to ensure they are all in range. */
17694 rtx sel_reg
= force_reg (sel_mode
, sel
);
17696 /* Check if the sel only references the first values vector. */
17697 if (GET_CODE (sel
) == CONST_VECTOR
17698 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
17700 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
17704 /* Check if the two values vectors are the same. */
17705 if (rtx_equal_p (op0
, op1
))
17707 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
17708 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
17709 NULL
, 0, OPTAB_DIRECT
);
17710 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
17714 /* Run TBL on for each value vector and combine the results. */
17716 rtx res0
= gen_reg_rtx (data_mode
);
17717 rtx res1
= gen_reg_rtx (data_mode
);
17718 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
17719 if (GET_CODE (sel
) != CONST_VECTOR
17720 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
17722 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
17724 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
17725 NULL
, 0, OPTAB_DIRECT
);
17727 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
17728 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
17729 NULL
, 0, OPTAB_DIRECT
);
17730 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
17731 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
17732 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
17734 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
17737 /* Recognize patterns suitable for the TRN instructions. */
17739 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
17742 poly_uint64 nelt
= d
->perm
.length ();
17743 rtx out
, in0
, in1
, x
;
17744 machine_mode vmode
= d
->vmode
;
17746 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17749 /* Note that these are little-endian tests.
17750 We correct for big-endian later. */
17751 if (!d
->perm
[0].is_constant (&odd
)
17752 || (odd
!= 0 && odd
!= 1)
17753 || !d
->perm
.series_p (0, 2, odd
, 2)
17754 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
17763 /* We don't need a big-endian lane correction for SVE; see the comment
17764 at the head of aarch64-sve.md for details. */
17765 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17767 x
= in0
, in0
= in1
, in1
= x
;
17772 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17773 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
17777 /* Recognize patterns suitable for the UZP instructions. */
17779 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
17782 rtx out
, in0
, in1
, x
;
17783 machine_mode vmode
= d
->vmode
;
17785 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17788 /* Note that these are little-endian tests.
17789 We correct for big-endian later. */
17790 if (!d
->perm
[0].is_constant (&odd
)
17791 || (odd
!= 0 && odd
!= 1)
17792 || !d
->perm
.series_p (0, 1, odd
, 2))
17801 /* We don't need a big-endian lane correction for SVE; see the comment
17802 at the head of aarch64-sve.md for details. */
17803 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17805 x
= in0
, in0
= in1
, in1
= x
;
17810 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17811 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
17815 /* Recognize patterns suitable for the ZIP instructions. */
17817 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
17820 poly_uint64 nelt
= d
->perm
.length ();
17821 rtx out
, in0
, in1
, x
;
17822 machine_mode vmode
= d
->vmode
;
17824 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17827 /* Note that these are little-endian tests.
17828 We correct for big-endian later. */
17829 poly_uint64 first
= d
->perm
[0];
17830 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
17831 || !d
->perm
.series_p (0, 2, first
, 1)
17832 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
17834 high
= maybe_ne (first
, 0U);
17842 /* We don't need a big-endian lane correction for SVE; see the comment
17843 at the head of aarch64-sve.md for details. */
17844 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17846 x
= in0
, in0
= in1
, in1
= x
;
17851 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17852 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
17856 /* Recognize patterns for the EXT insn. */
17859 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
17861 HOST_WIDE_INT location
;
17864 /* The first element always refers to the first vector.
17865 Check if the extracted indices are increasing by one. */
17866 if (d
->vec_flags
== VEC_SVE_PRED
17867 || !d
->perm
[0].is_constant (&location
)
17868 || !d
->perm
.series_p (0, 1, location
, 1))
17875 /* The case where (location == 0) is a no-op for both big- and little-endian,
17876 and is removed by the mid-end at optimization levels -O1 and higher.
17878 We don't need a big-endian lane correction for SVE; see the comment
17879 at the head of aarch64-sve.md for details. */
17880 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
17882 /* After setup, we want the high elements of the first vector (stored
17883 at the LSB end of the register), and the low elements of the second
17884 vector (stored at the MSB end of the register). So swap. */
17885 std::swap (d
->op0
, d
->op1
);
17886 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17887 to_constant () is safe since this is restricted to Advanced SIMD
17889 location
= d
->perm
.length ().to_constant () - location
;
17892 offset
= GEN_INT (location
);
17893 emit_set_insn (d
->target
,
17894 gen_rtx_UNSPEC (d
->vmode
,
17895 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
17900 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17901 within each 64-bit, 32-bit or 16-bit granule. */
17904 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
17906 HOST_WIDE_INT diff
;
17907 unsigned int i
, size
, unspec
;
17908 machine_mode pred_mode
;
17910 if (d
->vec_flags
== VEC_SVE_PRED
17911 || !d
->one_vector_p
17912 || !d
->perm
[0].is_constant (&diff
))
17915 size
= (diff
+ 1) * GET_MODE_UNIT_SIZE (d
->vmode
);
17918 unspec
= UNSPEC_REV64
;
17919 pred_mode
= VNx2BImode
;
17921 else if (size
== 4)
17923 unspec
= UNSPEC_REV32
;
17924 pred_mode
= VNx4BImode
;
17926 else if (size
== 2)
17928 unspec
= UNSPEC_REV16
;
17929 pred_mode
= VNx8BImode
;
17934 unsigned int step
= diff
+ 1;
17935 for (i
= 0; i
< step
; ++i
)
17936 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
17943 if (d
->vec_flags
== VEC_SVE_DATA
)
17945 machine_mode int_mode
= aarch64_sve_int_mode (pred_mode
);
17946 rtx target
= gen_reg_rtx (int_mode
);
17947 if (BYTES_BIG_ENDIAN
)
17948 /* The act of taking a subreg between INT_MODE and d->vmode
17949 is itself a reversing operation on big-endian targets;
17950 see the comment at the head of aarch64-sve.md for details.
17951 First reinterpret OP0 as INT_MODE without using a subreg
17952 and without changing the contents. */
17953 emit_insn (gen_aarch64_sve_reinterpret (int_mode
, target
, d
->op0
));
17956 /* For SVE we use REV[BHW] unspecs derived from the element size
17957 of v->mode and vector modes whose elements have SIZE bytes.
17958 This ensures that the vector modes match the predicate modes. */
17959 int unspec
= aarch64_sve_rev_unspec (d
->vmode
);
17960 rtx pred
= aarch64_ptrue_reg (pred_mode
);
17961 emit_insn (gen_aarch64_pred (unspec
, int_mode
, target
, pred
,
17962 gen_lowpart (int_mode
, d
->op0
)));
17964 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
17967 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
17968 emit_set_insn (d
->target
, src
);
17972 /* Recognize patterns for the REV insn, which reverses elements within
17976 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
17978 poly_uint64 nelt
= d
->perm
.length ();
17980 if (!d
->one_vector_p
|| d
->vec_flags
!= VEC_SVE_DATA
)
17983 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
17990 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
17991 emit_set_insn (d
->target
, src
);
17996 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
17998 rtx out
= d
->target
;
18001 machine_mode vmode
= d
->vmode
;
18004 if (d
->vec_flags
== VEC_SVE_PRED
18005 || d
->perm
.encoding ().encoded_nelts () != 1
18006 || !d
->perm
[0].is_constant (&elt
))
18009 if (d
->vec_flags
== VEC_SVE_DATA
&& elt
>= 64 * GET_MODE_UNIT_SIZE (vmode
))
18016 /* The generic preparation in aarch64_expand_vec_perm_const_1
18017 swaps the operand order and the permute indices if it finds
18018 d->perm[0] to be in the second operand. Thus, we can always
18019 use d->op0 and need not do any extra arithmetic to get the
18020 correct lane number. */
18022 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
18024 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
18025 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
18026 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
18031 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
18033 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
18034 machine_mode vmode
= d
->vmode
;
18036 /* Make sure that the indices are constant. */
18037 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
18038 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
18039 if (!d
->perm
[i
].is_constant ())
18045 /* Generic code will try constant permutation twice. Once with the
18046 original mode and again with the elements lowered to QImode.
18047 So wait and don't do the selector expansion ourselves. */
18048 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
18051 /* to_constant is safe since this routine is specific to Advanced SIMD
18053 unsigned int nelt
= d
->perm
.length ().to_constant ();
18054 for (unsigned int i
= 0; i
< nelt
; ++i
)
18055 /* If big-endian and two vectors we end up with a weird mixed-endian
18056 mode on NEON. Reverse the index within each word but not the word
18057 itself. to_constant is safe because we checked is_constant above. */
18058 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
18059 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
18060 : d
->perm
[i
].to_constant ());
18062 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
18063 sel
= force_reg (vmode
, sel
);
18065 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
18069 /* Try to implement D using an SVE TBL instruction. */
18072 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
18074 unsigned HOST_WIDE_INT nelt
;
18076 /* Permuting two variable-length vectors could overflow the
18078 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
18084 machine_mode sel_mode
= mode_for_int_vector (d
->vmode
).require ();
18085 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
18086 if (d
->one_vector_p
)
18087 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
18089 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
18093 /* Try to implement D using SVE SEL instruction. */
18096 aarch64_evpc_sel (struct expand_vec_perm_d
*d
)
18098 machine_mode vmode
= d
->vmode
;
18099 int unit_size
= GET_MODE_UNIT_SIZE (vmode
);
18101 if (d
->vec_flags
!= VEC_SVE_DATA
18105 int n_patterns
= d
->perm
.encoding ().npatterns ();
18106 poly_int64 vec_len
= d
->perm
.length ();
18108 for (int i
= 0; i
< n_patterns
; ++i
)
18109 if (!known_eq (d
->perm
[i
], i
)
18110 && !known_eq (d
->perm
[i
], vec_len
+ i
))
18113 for (int i
= n_patterns
; i
< n_patterns
* 2; i
++)
18114 if (!d
->perm
.series_p (i
, n_patterns
, i
, n_patterns
)
18115 && !d
->perm
.series_p (i
, n_patterns
, vec_len
+ i
, n_patterns
))
18121 machine_mode pred_mode
= aarch64_sve_pred_mode (unit_size
).require ();
18123 rtx_vector_builder
builder (pred_mode
, n_patterns
, 2);
18124 for (int i
= 0; i
< n_patterns
* 2; i
++)
18126 rtx elem
= known_eq (d
->perm
[i
], i
) ? CONST1_RTX (BImode
)
18127 : CONST0_RTX (BImode
);
18128 builder
.quick_push (elem
);
18131 rtx const_vec
= builder
.build ();
18132 rtx pred
= force_reg (pred_mode
, const_vec
);
18133 emit_insn (gen_vcond_mask (vmode
, vmode
, d
->target
, d
->op1
, d
->op0
, pred
));
18138 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
18140 /* The pattern matching functions above are written to look for a small
18141 number to begin the sequence (0, 1, N/2). If we begin with an index
18142 from the second operand, we can swap the operands. */
18143 poly_int64 nelt
= d
->perm
.length ();
18144 if (known_ge (d
->perm
[0], nelt
))
18146 d
->perm
.rotate_inputs (1);
18147 std::swap (d
->op0
, d
->op1
);
18150 if ((d
->vec_flags
== VEC_ADVSIMD
18151 || d
->vec_flags
== VEC_SVE_DATA
18152 || d
->vec_flags
== VEC_SVE_PRED
)
18153 && known_gt (nelt
, 1))
18155 if (aarch64_evpc_rev_local (d
))
18157 else if (aarch64_evpc_rev_global (d
))
18159 else if (aarch64_evpc_ext (d
))
18161 else if (aarch64_evpc_dup (d
))
18163 else if (aarch64_evpc_zip (d
))
18165 else if (aarch64_evpc_uzp (d
))
18167 else if (aarch64_evpc_trn (d
))
18169 else if (aarch64_evpc_sel (d
))
18171 if (d
->vec_flags
== VEC_SVE_DATA
)
18172 return aarch64_evpc_sve_tbl (d
);
18173 else if (d
->vec_flags
== VEC_ADVSIMD
)
18174 return aarch64_evpc_tbl (d
);
18179 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
18182 aarch64_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
18183 rtx op1
, const vec_perm_indices
&sel
)
18185 struct expand_vec_perm_d d
;
18187 /* Check whether the mask can be applied to a single vector. */
18188 if (sel
.ninputs () == 1
18189 || (op0
&& rtx_equal_p (op0
, op1
)))
18190 d
.one_vector_p
= true;
18191 else if (sel
.all_from_input_p (0))
18193 d
.one_vector_p
= true;
18196 else if (sel
.all_from_input_p (1))
18198 d
.one_vector_p
= true;
18202 d
.one_vector_p
= false;
18204 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
18205 sel
.nelts_per_input ());
18207 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
18211 d
.testing_p
= !target
;
18214 return aarch64_expand_vec_perm_const_1 (&d
);
18216 rtx_insn
*last
= get_last_insn ();
18217 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
18218 gcc_assert (last
== get_last_insn ());
18223 /* Generate a byte permute mask for a register of mode MODE,
18224 which has NUNITS units. */
18227 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
18229 /* We have to reverse each vector because we dont have
18230 a permuted load that can reverse-load according to ABI rules. */
18232 rtvec v
= rtvec_alloc (16);
18234 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
18236 gcc_assert (BYTES_BIG_ENDIAN
);
18237 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
18239 for (i
= 0; i
< nunits
; i
++)
18240 for (j
= 0; j
< usize
; j
++)
18241 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
18242 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
18243 return force_reg (V16QImode
, mask
);
18246 /* Expand an SVE integer comparison using the SVE equivalent of:
18248 (set TARGET (CODE OP0 OP1)). */
18251 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
18253 machine_mode pred_mode
= GET_MODE (target
);
18254 machine_mode data_mode
= GET_MODE (op0
);
18255 rtx res
= aarch64_sve_emit_int_cmp (target
, pred_mode
, code
, data_mode
,
18257 if (!rtx_equal_p (target
, res
))
18258 emit_move_insn (target
, res
);
18261 /* Return the UNSPEC_COND_* code for comparison CODE. */
18263 static unsigned int
18264 aarch64_unspec_cond_code (rtx_code code
)
18269 return UNSPEC_COND_FCMNE
;
18271 return UNSPEC_COND_FCMEQ
;
18273 return UNSPEC_COND_FCMLT
;
18275 return UNSPEC_COND_FCMGT
;
18277 return UNSPEC_COND_FCMLE
;
18279 return UNSPEC_COND_FCMGE
;
18281 return UNSPEC_COND_FCMUO
;
18283 gcc_unreachable ();
18289 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18291 where <X> is the operation associated with comparison CODE.
18292 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18295 aarch64_emit_sve_fp_cond (rtx target
, rtx_code code
, rtx pred
,
18296 bool known_ptrue_p
, rtx op0
, rtx op1
)
18298 rtx flag
= gen_int_mode (known_ptrue_p
, SImode
);
18299 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (pred
),
18300 gen_rtvec (4, pred
, flag
, op0
, op1
),
18301 aarch64_unspec_cond_code (code
));
18302 emit_set_insn (target
, unspec
);
18305 /* Emit the SVE equivalent of:
18307 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
18308 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
18309 (set TARGET (ior:PRED_MODE TMP1 TMP2))
18311 where <Xi> is the operation associated with comparison CODEi.
18312 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18315 aarch64_emit_sve_or_fp_conds (rtx target
, rtx_code code1
, rtx_code code2
,
18316 rtx pred
, bool known_ptrue_p
, rtx op0
, rtx op1
)
18318 machine_mode pred_mode
= GET_MODE (pred
);
18319 rtx tmp1
= gen_reg_rtx (pred_mode
);
18320 aarch64_emit_sve_fp_cond (tmp1
, code1
, pred
, known_ptrue_p
, op0
, op1
);
18321 rtx tmp2
= gen_reg_rtx (pred_mode
);
18322 aarch64_emit_sve_fp_cond (tmp2
, code2
, pred
, known_ptrue_p
, op0
, op1
);
18323 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
18326 /* Emit the SVE equivalent of:
18328 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18329 (set TARGET (not TMP))
18331 where <X> is the operation associated with comparison CODE.
18332 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18335 aarch64_emit_sve_invert_fp_cond (rtx target
, rtx_code code
, rtx pred
,
18336 bool known_ptrue_p
, rtx op0
, rtx op1
)
18338 machine_mode pred_mode
= GET_MODE (pred
);
18339 rtx tmp
= gen_reg_rtx (pred_mode
);
18340 aarch64_emit_sve_fp_cond (tmp
, code
, pred
, known_ptrue_p
, op0
, op1
);
18341 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
/* Expand an SVE floating-point comparison using the SVE equivalent of:

     (set TARGET (CODE OP0 OP1))

   If CAN_INVERT_P is true, the caller can also handle inverted results;
   return true if the result is in fact inverted.  */
bool
aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
                                  rtx op0, rtx op1, bool can_invert_p)
{
  machine_mode pred_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);

  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  switch (code)
    {
    case UNORDERED:
      /* UNORDERED has no immediate form.  */
      op1 = force_reg (data_mode, op1);
      /* fall through */
    case LT:
    case LE:
    case GT:
    case GE:
    case EQ:
    case NE:
      {
        /* There is native support for the comparison.  */
        aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
        return false;
      }

    case LTGT:
      /* This is a trapping operation (LT or GT).  */
      aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
      return false;

    case UNEQ:
      if (!flag_trapping_math)
        {
          /* This would trap for signaling NaNs.  */
          op1 = force_reg (data_mode, op1);
          aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
                                        ptrue, true, op0, op1);
          return false;
        }
      /* fall through */
    case UNLT:
    case UNLE:
    case UNGT:
    case UNGE:
      if (flag_trapping_math)
        {
          /* Work out which elements are ordered.  */
          rtx ordered = gen_reg_rtx (pred_mode);
          op1 = force_reg (data_mode, op1);
          aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
                                           ptrue, true, op0, op1);

          /* Test the opposite condition for the ordered elements,
             then invert the result.  */
          if (code == UNEQ)
            code = NE;
          else
            code = reverse_condition_maybe_unordered (code);
          if (can_invert_p)
            {
              aarch64_emit_sve_fp_cond (target, code,
                                        ordered, false, op0, op1);
              return true;
            }
          aarch64_emit_sve_invert_fp_cond (target, code,
                                           ordered, false, op0, op1);
          return false;
        }
      break;

    case ORDERED:
      /* ORDERED has no immediate form.  */
      op1 = force_reg (data_mode, op1);
      break;

    default:
      gcc_unreachable ();
    }

  /* There is native support for the inverse comparison.  */
  code = reverse_condition_maybe_unordered (code);
  if (can_invert_p)
    {
      aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
      return true;
    }
  aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
  return false;
}
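
/* Sketch of the trapping-math path above: for a UNGE comparison the code
   first computes an "ordered" predicate as the inverse of FCMUO, then
   evaluates the reversed comparison (LT) only for the ordered lanes;
   inverting that result (or handing the inverted form back to the caller
   when CAN_INVERT_P) makes the unordered lanes read as true, matching UNGE
   semantics without raising spurious exceptions.  */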
/* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
   of the data being selected and CMP_MODE is the mode of the values being
   compared.  */
void
aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
                          rtx *ops)
{
  machine_mode pred_mode
    = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
                             GET_MODE_SIZE (cmp_mode)).require ();
  rtx pred = gen_reg_rtx (pred_mode);
  if (FLOAT_MODE_P (cmp_mode))
    {
      if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
                                            ops[4], ops[5], true))
        std::swap (ops[1], ops[2]);
    }
  else
    aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);

  if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
    ops[1] = force_reg (data_mode, ops[1]);
  /* The "false" value can only be zero if the "true" value is a constant.  */
  if (register_operand (ops[1], data_mode)
      || !aarch64_simd_reg_or_zero (ops[2], data_mode))
    ops[2] = force_reg (data_mode, ops[2]);

  rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
  emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
}
/* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
   true.  However due to issues with register allocation it is preferable
   to avoid tieing integer scalar and FP scalar modes.  Executing integer
   operations in general registers is better than treating them as scalar
   vector operations.  This reduces latency and avoids redundant int<->FP
   moves.  So tie modes if they are either the same class, or vector modes
   with other vector modes, vector structs or any scalar mode.  */

static bool
aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
  if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
    return true;

  /* We specifically want to allow elements of "structure" modes to
     be tieable to the structure.  This more general condition allows
     other rarer situations too.  The reason we don't extend this to
     predicate modes is that there are no predicate structure modes
     nor any specific instructions for extracting part of a predicate
     register.  */
  if (aarch64_vector_data_mode_p (mode1)
      && aarch64_vector_data_mode_p (mode2))
    return true;

  /* Also allow any scalar modes with vectors.  */
  if (aarch64_vector_mode_supported_p (mode1)
      || aarch64_vector_mode_supported_p (mode2))
    return true;

  return false;
}
/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */

static rtx
aarch64_move_pointer (rtx pointer, poly_int64 amount)
{
  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);

  return adjust_automodify_address (pointer, GET_MODE (pointer),
                                    next, amount);
}

/* Return a new RTX holding the result of moving POINTER forward by the
   size of the mode it points to.  */

static rtx
aarch64_progress_pointer (rtx pointer)
{
  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
}
/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
   MODE bytes.  */

static void
aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
                                              machine_mode mode)
{
  rtx reg = gen_reg_rtx (mode);

  /* "Cast" the pointers to the correct mode.  */
  *src = adjust_address (*src, mode, 0);
  *dst = adjust_address (*dst, mode, 0);
  /* Emit the memcpy.  */
  emit_move_insn (reg, *src);
  emit_move_insn (*dst, reg);
  /* Move the pointers forward.  */
  *src = aarch64_progress_pointer (*src);
  *dst = aarch64_progress_pointer (*dst);
}
/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
   we succeed, otherwise return false.  */

bool
aarch64_expand_cpymem (rtx *operands)
{
  int n, mode_bits;
  rtx dst = operands[0];
  rtx src = operands[1];
  rtx base;
  machine_mode cur_mode = BLKmode, next_mode;
  bool speed_p = !optimize_function_for_size_p (cfun);

  /* When optimizing for size, give a better estimate of the length of a
     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
     will always require an even number of instructions to do now.  And each
     operation requires both a load+store, so divide the max number by 2.  */
  int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;

  /* We can't do anything smart if the amount to copy is not constant.  */
  if (!CONST_INT_P (operands[2]))
    return false;

  n = INTVAL (operands[2]);

  /* Try to keep the number of instructions low.  For all cases we will do at
     most two moves for the residual amount, since we'll always overlap the
     remainder.  */
  if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
    return false;

  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
  dst = adjust_automodify_address (dst, VOIDmode, base, 0);

  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
  src = adjust_automodify_address (src, VOIDmode, base, 0);

  /* Convert n to bits to make the rest of the code simpler.  */
  n = n * BITS_PER_UNIT;

  /* Maximum amount to copy in one go.  The AArch64 back-end has integer modes
     larger than TImode, but we should not use them for loads/stores here.  */
  const int copy_limit = GET_MODE_BITSIZE (TImode);

  while (n > 0)
    {
      /* Find the largest mode in which to do the copy without over-reading
         or over-writing.  */
      opt_scalar_int_mode mode_iter;
      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
        if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
          cur_mode = mode_iter.require ();

      gcc_assert (cur_mode != BLKmode);

      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);

      n -= mode_bits;

      /* Do certain trailing copies as overlapping if it's going to be
         cheaper.  i.e. less instructions to do so.  For instance doing a 15
         byte copy it's more efficient to do two overlapping 8 byte copies than
         8 + 4 + 2 + 1.  */
      if (n > 0 && n <= 8 * BITS_PER_UNIT)
        {
          next_mode = smallest_mode_for_size (n, MODE_INT);
          int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
          src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
          dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
          n = n_bits;
        }
    }

  return true;
}
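
/* Worked example of the trailing-overlap case above: for a 15-byte copy the
   first iteration moves 8 bytes, leaving n == 56 bits.  smallest_mode_for_size
   then picks DImode, the pointers are moved back by one byte
   ((56 - 64) / BITS_PER_UNIT == -1) and a second, overlapping 8-byte move
   covers bytes 7..14, instead of an 8 + 4 + 2 + 1 sequence.  */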
/* Split a DImode store of a CONST_INT SRC to MEM DST as two
   SImode stores.  Handle the case when the constant has identical
   bottom and top halves.  This is beneficial when the two stores can be
   merged into an STP and we avoid synthesising potentially expensive
   immediates twice.  Return true if such a split is possible.  */

bool
aarch64_split_dimode_const_store (rtx dst, rtx src)
{
  rtx lo = gen_lowpart (SImode, src);
  rtx hi = gen_highpart_mode (SImode, DImode, src);

  bool size_p = optimize_function_for_size_p (cfun);

  if (!rtx_equal_p (lo, hi))
    return false;

  unsigned int orig_cost
    = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
  unsigned int lo_cost
    = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);

  /* We want to transform:
     MOV	x1, 49370
     MOVK	x1, 0x140, lsl 16
     MOVK	x1, 0xc0da, lsl 32
     MOVK	x1, 0x140, lsl 48
     STR	x1, [x0]
   into:
     MOV	w1, 49370
     MOVK	w1, 0x140, lsl 16
     STP	w1, w1, [x0]
   So we want to perform this only when we save two instructions
   or more.  When optimizing for size, however, accept any code size
   savings we can.  */
  if (size_p && orig_cost <= lo_cost)
    return false;

  if (!size_p
      && (orig_cost <= lo_cost + 1))
    return false;

  rtx mem_lo = adjust_address (dst, SImode, 0);
  if (!aarch64_mem_pair_operand (mem_lo, SImode))
    return false;

  rtx tmp_reg = gen_reg_rtx (SImode);
  aarch64_expand_mov_immediate (tmp_reg, lo);
  rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
  /* Don't emit an explicit store pair as this may not be always profitable.
     Let the sched-fusion logic decide whether to merge them.  */
  emit_move_insn (mem_lo, tmp_reg);
  emit_move_insn (mem_hi, tmp_reg);

  return true;
}
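
/* For instance, the DImode constant 0xc0da0140c0da0140 has identical 32-bit
   halves: the SImode half needs two instructions (MOV + MOVK) against four
   for the full 64-bit immediate, so the split above saves two immediate
   instructions and exposes a pair of stores that can fuse into an STP.  */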
/* Generate RTL for a conditional branch with rtx comparison CODE in
   mode CC_MODE.  The destination of the unlikely conditional branch
   is LABEL_REF.  */

void
aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
                              rtx label_ref)
{
  rtx x;
  x = gen_rtx_fmt_ee (code, VOIDmode,
                      gen_rtx_REG (cc_mode, CC_REGNUM),
                      const0_rtx);

  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
                            gen_rtx_LABEL_REF (VOIDmode, label_ref),
                            pc_rtx);
  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
}
/* Generate DImode scratch registers for 128-bit (TImode) addition.

   OP1 represents the TImode destination operand 1
   OP2 represents the TImode destination operand 2
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */

void
aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
                            rtx *low_in1, rtx *low_in2,
                            rtx *high_dest, rtx *high_in1,
                            rtx *high_in2)
{
  *low_dest = gen_reg_rtx (DImode);
  *low_in1 = gen_lowpart (DImode, op1);
  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
                                  subreg_lowpart_offset (DImode, TImode));
  *high_dest = gen_reg_rtx (DImode);
  *high_in1 = gen_highpart (DImode, op1);
  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
                                   subreg_highpart_offset (DImode, TImode));
}
/* Generate DImode scratch registers for 128-bit (TImode) subtraction.

   This function differs from 'aarch64_addti_scratch_regs' in that
   OP1 can be an immediate constant (zero).  We must call
   subreg_highpart_offset with DImode and TImode arguments, otherwise
   VOIDmode will be used for the const_int which generates an internal
   error from subreg_size_highpart_offset which does not expect a size of zero.

   OP1 represents the TImode destination operand 1
   OP2 represents the TImode destination operand 2
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */

void
aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
                             rtx *low_in1, rtx *low_in2,
                             rtx *high_dest, rtx *high_in1,
                             rtx *high_in2)
{
  *low_dest = gen_reg_rtx (DImode);
  *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
                                  subreg_lowpart_offset (DImode, TImode));

  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
                                  subreg_lowpart_offset (DImode, TImode));
  *high_dest = gen_reg_rtx (DImode);

  *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
                                   subreg_highpart_offset (DImode, TImode));
  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
                                   subreg_highpart_offset (DImode, TImode));
}
/* Generate RTL for 128-bit (TImode) subtraction with overflow.

   OP0 represents the TImode destination operand 0
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2
   UNSIGNED_P is true if the operation is being performed on unsigned
   values.  */
void
aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
                       rtx low_in2, rtx high_dest, rtx high_in1,
                       rtx high_in2, bool unsigned_p)
{
  if (low_in2 == const0_rtx)
    {
      low_dest = low_in1;
      high_in2 = force_reg (DImode, high_in2);
      if (unsigned_p)
        emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
      else
        emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
    }
  else
    {
      if (CONST_INT_P (low_in2))
        {
          high_in2 = force_reg (DImode, high_in2);
          emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
                                              GEN_INT (-INTVAL (low_in2))));
        }
      else
        emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));

      if (unsigned_p)
        emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
      else
        emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
    }

  emit_move_insn (gen_lowpart (DImode, op0), low_dest);
  emit_move_insn (gen_highpart (DImode, op0), high_dest);
}
/* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */

static unsigned HOST_WIDE_INT
aarch64_asan_shadow_offset (void)
{
  if (TARGET_ILP32)
    return (HOST_WIDE_INT_1 << 29);
  else
    return (HOST_WIDE_INT_1 << 36);
}
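
/* With the usual ASan mapping (shadow = (addr >> 3) + offset), an LP64
   address such as 0x7f0000001000 is therefore shadowed at
   (0x7f0000001000 >> 3) + (1 << 36).  */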
/* Implement TARGET_GEN_CCMP_FIRST.  */

static rtx
aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
                        int code, tree treeop0, tree treeop1)
{
  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
  rtx op0, op1;
  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
  insn_code icode;
  struct expand_operand ops[4];

  start_sequence ();
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  op_mode = GET_MODE (op0);
  if (op_mode == VOIDmode)
    op_mode = GET_MODE (op1);

  switch (op_mode)
    {
    case E_QImode:
    case E_HImode:
    case E_SImode:
      cmp_mode = SImode;
      icode = CODE_FOR_cmpsi;
      break;

    case E_DImode:
      cmp_mode = DImode;
      icode = CODE_FOR_cmpdi;
      break;

    case E_SFmode:
      cmp_mode = SFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
      break;

    case E_DFmode:
      cmp_mode = DFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
      break;

    default:
      end_sequence ();
      return NULL_RTX;
    }

  op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
  op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
  if (!op0 || !op1)
    {
      end_sequence ();
      return NULL_RTX;
    }
  *prep_seq = get_insns ();
  end_sequence ();

  create_fixed_operand (&ops[0], op0);
  create_fixed_operand (&ops[1], op1);

  start_sequence ();
  if (!maybe_expand_insn (icode, 2, ops))
    {
      end_sequence ();
      return NULL_RTX;
    }
  *gen_seq = get_insns ();
  end_sequence ();

  return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
                         gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
}
/* Implement TARGET_GEN_CCMP_NEXT.  */

static rtx
aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
                       int cmp_code, tree treeop0, tree treeop1, int bit_code)
{
  rtx op0, op1, target;
  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
  insn_code icode;
  struct expand_operand ops[6];
  int aarch64_cond;

  push_to_sequence (*prep_seq);
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  op_mode = GET_MODE (op0);
  if (op_mode == VOIDmode)
    op_mode = GET_MODE (op1);

  switch (op_mode)
    {
    case E_QImode:
    case E_HImode:
    case E_SImode:
      cmp_mode = SImode;
      icode = CODE_FOR_ccmpsi;
      break;

    case E_DImode:
      cmp_mode = DImode;
      icode = CODE_FOR_ccmpdi;
      break;

    case E_SFmode:
      cmp_mode = SFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
      break;

    case E_DFmode:
      cmp_mode = DFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
      break;

    default:
      end_sequence ();
      return NULL_RTX;
    }

  op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
  op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
  if (!op0 || !op1)
    {
      end_sequence ();
      return NULL_RTX;
    }
  *prep_seq = get_insns ();
  end_sequence ();

  target = gen_rtx_REG (cc_mode, CC_REGNUM);
  aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);

  if (bit_code != AND)
    {
      prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
                                                GET_MODE (XEXP (prev, 0))),
                             VOIDmode, XEXP (prev, 0), const0_rtx);
      aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
    }

  create_fixed_operand (&ops[0], XEXP (prev, 0));
  create_fixed_operand (&ops[1], target);
  create_fixed_operand (&ops[2], op0);
  create_fixed_operand (&ops[3], op1);
  create_fixed_operand (&ops[4], prev);
  create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));

  push_to_sequence (*gen_seq);
  if (!maybe_expand_insn (icode, 6, ops))
    {
      end_sequence ();
      return NULL_RTX;
    }

  *gen_seq = get_insns ();
  end_sequence ();

  return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
}
#undef TARGET_GEN_CCMP_FIRST
#define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first

#undef TARGET_GEN_CCMP_NEXT
#define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next

/* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
   instruction fusion of some sort.  */

static bool
aarch64_macro_fusion_p (void)
{
  return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
}
/* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
   should be kept together during scheduling.  */

static bool
aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
{
  rtx prev_set = single_set (prev);
  rtx curr_set = single_set (curr);
  /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
  bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);

  if (!aarch64_macro_fusion_p ())
    return false;

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
    {
      /* We are trying to match:
         prev (mov)  == (set (reg r0) (const_int imm16))
         curr (movk) == (set (zero_extract (reg r0)
                                           (const_int 16)
                                           (const_int 16))
                             (const_int imm16_1))  */
      rtx set_dest = SET_DEST (curr_set);

      if (GET_CODE (set_dest) == ZERO_EXTRACT
          && CONST_INT_P (SET_SRC (curr_set))
          && CONST_INT_P (SET_SRC (prev_set))
          && CONST_INT_P (XEXP (set_dest, 2))
          && INTVAL (XEXP (set_dest, 2)) == 16
          && REG_P (XEXP (set_dest, 0))
          && REG_P (SET_DEST (prev_set))
          && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
        return true;
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
    {
      /* We're trying to match:
         prev (adrp) == (set (reg r1)
                             (high (symbol_ref ("SYM"))))
         curr (add) == (set (reg r0)
                            (lo_sum (reg r1)
                                    (symbol_ref ("SYM"))))
         Note that r0 need not necessarily be the same as r1, especially
         during pre-regalloc scheduling.  */

      if (satisfies_constraint_Ush (SET_SRC (prev_set))
          && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
        {
          if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
              && REG_P (XEXP (SET_SRC (curr_set), 0))
              && REGNO (XEXP (SET_SRC (curr_set), 0))
                 == REGNO (SET_DEST (prev_set))
              && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
                              XEXP (SET_SRC (curr_set), 1)))
            return true;
        }
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
    {
      /* We're trying to match:
         prev (movk) == (set (zero_extract (reg r0)
                                           (const_int 16)
                                           (const_int 32))
                             (const_int imm16_1))
         curr (movk) == (set (zero_extract (reg r0)
                                           (const_int 16)
                                           (const_int 48))
                             (const_int imm16_2))  */

      if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
          && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
          && REG_P (XEXP (SET_DEST (prev_set), 0))
          && REG_P (XEXP (SET_DEST (curr_set), 0))
          && REGNO (XEXP (SET_DEST (prev_set), 0))
             == REGNO (XEXP (SET_DEST (curr_set), 0))
          && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
          && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
          && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
          && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
          && CONST_INT_P (SET_SRC (prev_set))
          && CONST_INT_P (SET_SRC (curr_set)))
        return true;
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
    {
      /* We're trying to match:
         prev (adrp) == (set (reg r0)
                             (high (symbol_ref ("SYM"))))
         curr (ldr) == (set (reg r1)
                            (mem (lo_sum (reg r0)
                                         (symbol_ref ("SYM")))))
                or
         curr (ldr) == (set (reg r1)
                            (zero_extend (mem
                                          (lo_sum (reg r0)
                                                  (symbol_ref ("SYM"))))))  */
      if (satisfies_constraint_Ush (SET_SRC (prev_set))
          && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
        {
          rtx curr_src = SET_SRC (curr_set);

          if (GET_CODE (curr_src) == ZERO_EXTEND)
            curr_src = XEXP (curr_src, 0);

          if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
              && REG_P (XEXP (XEXP (curr_src, 0), 0))
              && REGNO (XEXP (XEXP (curr_src, 0), 0))
                 == REGNO (SET_DEST (prev_set))
              && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
                              XEXP (SET_SRC (prev_set), 0)))
            return true;
        }
    }

  if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
      && any_condjump_p (curr))
    {
      unsigned int condreg1, condreg2;
      rtx cc_reg_1;
      aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
      cc_reg_1 = gen_rtx_REG (CCmode, condreg1);

      if (reg_referenced_p (cc_reg_1, PATTERN (curr))
          && prev
          && modified_in_p (cc_reg_1, prev))
        {
          enum attr_type prev_type = get_attr_type (prev);

          /* FIXME: this misses some which are considered simple arithmetic
             instructions for ThunderX.  Simple shifts are missed here.  */
          if (prev_type == TYPE_ALUS_SREG
              || prev_type == TYPE_ALUS_IMM
              || prev_type == TYPE_LOGICS_REG
              || prev_type == TYPE_LOGICS_IMM)
            return true;
        }
    }

  if (prev_set && curr_set
      && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
      && any_condjump_p (curr))
    {
      /* We're trying to match:
         prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
         curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
                                                        (const_int 0))
                                                (label_ref ("SYM"))
                                                (pc))  */
      if (SET_DEST (curr_set) == (pc_rtx)
          && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
          && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
          && REG_P (SET_DEST (prev_set))
          && REGNO (SET_DEST (prev_set))
             == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
        {
          /* Fuse ALU operations followed by conditional branch instruction.  */
          switch (get_attr_type (prev))
            {
            case TYPE_ALU_IMM:
            case TYPE_ALU_SREG:
            case TYPE_ADC_REG:
            case TYPE_ADC_IMM:
            case TYPE_ADCS_REG:
            case TYPE_ADCS_IMM:
            case TYPE_LOGIC_REG:
            case TYPE_LOGIC_IMM:
            case TYPE_CSEL:
            case TYPE_ADR:
            case TYPE_MOV_IMM:
            case TYPE_SHIFT_REG:
            case TYPE_SHIFT_IMM:
              return true;

            default:;
            }
        }
    }

  return false;
}
/* Return true iff the instruction fusion described by OP is enabled.  */

bool
aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
{
  return (aarch64_tune_params.fusible_ops & op) != 0;
}
/* If MEM is in the form of [base+offset], extract the two parts
   of address and set to BASE and OFFSET, otherwise return false
   after clearing BASE and OFFSET.  */

static bool
extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
{
  rtx addr;

  gcc_assert (MEM_P (mem));

  addr = XEXP (mem, 0);

  if (REG_P (addr))
    {
      *base = addr;
      *offset = const0_rtx;
      return true;
    }

  if (GET_CODE (addr) == PLUS
      && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
    {
      *base = XEXP (addr, 0);
      *offset = XEXP (addr, 1);
      return true;
    }

  *base = NULL_RTX;
  *offset = NULL_RTX;

  return false;
}
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};
/* If INSN is a load or store of address in the form of [base+offset],
   extract the two parts and set to BASE and OFFSET.  Return scheduling
   fusion type this INSN is.  */

static enum sched_fusion_type
fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
{
  rtx x, dest, src;
  enum sched_fusion_type fusion = SCHED_FUSION_LD;

  gcc_assert (INSN_P (insn));
  x = PATTERN (insn);
  if (GET_CODE (x) != SET)
    return SCHED_FUSION_NONE;

  src = SET_SRC (x);
  dest = SET_DEST (x);

  machine_mode dest_mode = GET_MODE (dest);

  if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
    return SCHED_FUSION_NONE;

  if (GET_CODE (src) == SIGN_EXTEND)
    {
      fusion = SCHED_FUSION_LD_SIGN_EXTEND;
      src = XEXP (src, 0);
      if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
        return SCHED_FUSION_NONE;
    }
  else if (GET_CODE (src) == ZERO_EXTEND)
    {
      fusion = SCHED_FUSION_LD_ZERO_EXTEND;
      src = XEXP (src, 0);
      if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
        return SCHED_FUSION_NONE;
    }

  if (GET_CODE (src) == MEM && REG_P (dest))
    extract_base_offset_in_addr (src, base, offset);
  else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
    {
      fusion = SCHED_FUSION_ST;
      extract_base_offset_in_addr (dest, base, offset);
    }
  else
    return SCHED_FUSION_NONE;

  if (*base == NULL_RTX || *offset == NULL_RTX)
    fusion = SCHED_FUSION_NONE;

  return fusion;
}
/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.

   Currently we only support fusing ldr or str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, other kinds of instruction fusion can be added by returning
   different priorities.

   It's important that irrelevant instructions get the largest FUSION_PRI.  */

static void
aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
                               int *fusion_pri, int *pri)
{
  int tmp, off_val;
  rtx base, offset;
  enum sched_fusion_type fusion;

  gcc_assert (INSN_P (insn));

  tmp = max_pri - 1;
  fusion = fusion_load_store (insn, &base, &offset);
  if (fusion == SCHED_FUSION_NONE)
    {
      *pri = tmp;
      *fusion_pri = tmp;
      return;
    }

  /* Set FUSION_PRI according to fusion type and base register.  */
  *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);

  /* Calculate PRI.  */
  tmp /= 2;

  /* INSN with smaller offset goes first.  */
  off_val = (int)(INTVAL (offset));
  if (off_val >= 0)
    tmp -= (off_val & 0xfffff);
  else
    tmp += ((- off_val) & 0xfffff);

  *pri = tmp;
  return;
}
/* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
   Adjust priority of sha1h instructions so they are scheduled before
   other SHA1 instructions.  */

static int
aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
{
  rtx x = PATTERN (insn);

  if (GET_CODE (x) == SET)
    {
      x = SET_SRC (x);

      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
        return priority + 10;
    }

  return priority;
}
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp.  LOAD is true if they are load instructions.
   MODE is the mode of memory operands.  */

bool
aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
                                machine_mode mode)
{
  HOST_WIDE_INT offval_1, offval_2, msize;
  enum reg_class rclass_1, rclass_2;
  rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;

  if (load)
    {
      mem_1 = operands[1];
      mem_2 = operands[3];
      reg_1 = operands[0];
      reg_2 = operands[2];
      gcc_assert (REG_P (reg_1) && REG_P (reg_2));
      if (REGNO (reg_1) == REGNO (reg_2))
        return false;
    }
  else
    {
      mem_1 = operands[0];
      mem_2 = operands[2];
      reg_1 = operands[1];
      reg_2 = operands[3];
    }

  /* The mems cannot be volatile.  */
  if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
    return false;

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 byte.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
          & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && !optimize_size
      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
    return false;

  /* Check if the addresses are in the form of [base+offset].  */
  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
  if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
  if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
    return false;

  /* Check if the bases are same.  */
  if (!rtx_equal_p (base_1, base_2))
    return false;

  /* The operands must be of the same size.  */
  gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
                        GET_MODE_SIZE (GET_MODE (mem_2))));

  offval_1 = INTVAL (offset_1);
  offval_2 = INTVAL (offset_2);
  /* We should only be trying this for fixed-sized modes.  There is no
     SVE LDP/STP instruction.  */
  msize = GET_MODE_SIZE (mode).to_constant ();
  /* Check if the offsets are consecutive.  */
  if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
    return false;

  /* Check if the addresses are clobbered by load.  */
  if (load)
    {
      if (reg_mentioned_p (reg_1, mem_1))
        return false;

      /* In increasing order, the last load can clobber the address.  */
      if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
        return false;
    }

  /* One of the memory accesses must be a mempair operand.
     If it is not the first one, they need to be swapped by the
     peephole.  */
  if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
       && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
    return false;

  if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
    rclass_1 = FP_REGS;
  else
    rclass_1 = GENERAL_REGS;

  if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
    rclass_2 = FP_REGS;
  else
    rclass_2 = GENERAL_REGS;

  /* Check if the registers are of same class.  */
  if (rclass_1 != rclass_2)
    return false;

  return true;
}
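
/* For example, once this check succeeds the ldp/stp peepholes can turn
     ldr  w0, [x2]
     ldr  w1, [x2, 4]
   into a single "ldp w0, w1, [x2]", provided both destinations are in the
   same register class and neither load clobbers the base address.  */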
/* Given OPERANDS of consecutive load/store that can be merged,
   swap them if they are not in ascending order.  */
void
aarch64_swap_ldrstr_operands (rtx *operands, bool load)
{
  rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
  HOST_WIDE_INT offval_1, offval_2;

  if (load)
    {
      mem_1 = operands[1];
      mem_2 = operands[3];
    }
  else
    {
      mem_1 = operands[0];
      mem_2 = operands[2];
    }

  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);

  offval_1 = INTVAL (offset_1);
  offval_2 = INTVAL (offset_2);

  if (offval_1 > offval_2)
    {
      /* Irrespective of whether this is a load or a store,
         we do the same swap.  */
      std::swap (operands[0], operands[2]);
      std::swap (operands[1], operands[3]);
    }
}
/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
   comparison between the two.  */

static int
aarch64_host_wide_int_compare (const void *x, const void *y)
{
  return wi::cmps (* ((const HOST_WIDE_INT *) x),
                   * ((const HOST_WIDE_INT *) y));
}
/* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
   other pointing to a REG rtx containing an offset, compare the offsets
   of the two pairs.

   Return:

        1 iff offset (X) > offset (Y)
        0 iff offset (X) == offset (Y)
        -1 iff offset (X) < offset (Y)  */

static int
aarch64_ldrstr_offset_compare (const void *x, const void *y)
{
  const rtx * operands_1 = (const rtx *) x;
  const rtx * operands_2 = (const rtx *) y;
  rtx mem_1, mem_2, base, offset_1, offset_2;

  if (MEM_P (operands_1[0]))
    mem_1 = operands_1[0];
  else
    mem_1 = operands_1[1];

  if (MEM_P (operands_2[0]))
    mem_2 = operands_2[0];
  else
    mem_2 = operands_2[1];

  /* Extract the offsets.  */
  extract_base_offset_in_addr (mem_1, &base, &offset_1);
  extract_base_offset_in_addr (mem_2, &base, &offset_2);

  gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);

  return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
}
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp by adjusting the offset.  LOAD is true if they
   are load instructions.  MODE is the mode of memory operands.

   Given below consecutive stores:

     str  w1, [xb, 0x100]
     str  w1, [xb, 0x104]
     str  w1, [xb, 0x108]
     str  w1, [xb, 0x10c]

   Though the offsets are out of the range supported by stp, we can
   still pair them after adjusting the offset, like:

     add  scratch, xb, 0x100
     stp  w1, w1, [scratch]
     stp  w1, w1, [scratch, 0x8]

   The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */

bool
aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
                                       scalar_mode mode)
{
  const int num_insns = 4;
  enum reg_class rclass;
  HOST_WIDE_INT offvals[num_insns], msize;
  rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];

  if (load)
    {
      for (int i = 0; i < num_insns; i++)
        {
          reg[i] = operands[2 * i];
          mem[i] = operands[2 * i + 1];

          gcc_assert (REG_P (reg[i]));
        }

      /* Do not attempt to merge the loads if the loads clobber each other.  */
      for (int i = 0; i < 8; i += 2)
        for (int j = i + 2; j < 8; j += 2)
          if (reg_overlap_mentioned_p (operands[i], operands[j]))
            return false;
    }
  else
    for (int i = 0; i < num_insns; i++)
      {
        mem[i] = operands[2 * i];
        reg[i] = operands[2 * i + 1];
      }

  /* Skip if memory operand is by itself valid for ldp/stp.  */
  if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
    return false;

  for (int i = 0; i < num_insns; i++)
    {
      /* The mems cannot be volatile.  */
      if (MEM_VOLATILE_P (mem[i]))
        return false;

      /* Check if the addresses are in the form of [base+offset].  */
      extract_base_offset_in_addr (mem[i], base + i, offset + i);
      if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
        return false;
    }

  /* Check if the registers are of same class.  */
  rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
    ? FP_REGS : GENERAL_REGS;

  for (int i = 1; i < num_insns; i++)
    if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
      {
        if (rclass != FP_REGS)
          return false;
      }
    else
      {
        if (rclass != GENERAL_REGS)
          return false;
      }

  /* Only the last register in the order in which they occur
     may be clobbered by the load.  */
  if (rclass == GENERAL_REGS && load)
    for (int i = 0; i < num_insns - 1; i++)
      if (reg_mentioned_p (reg[i], mem[i]))
        return false;

  /* Check if the bases are same.  */
  for (int i = 0; i < num_insns - 1; i++)
    if (!rtx_equal_p (base[i], base[i + 1]))
      return false;

  for (int i = 0; i < num_insns; i++)
    offvals[i] = INTVAL (offset[i]);

  msize = GET_MODE_SIZE (mode);

  /* Check if the offsets can be put in the right order to do a ldp/stp.  */
  qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
         aarch64_host_wide_int_compare);

  if (!(offvals[1] == offvals[0] + msize
        && offvals[3] == offvals[2] + msize))
    return false;

  /* Check that offsets are within range of each other.  The ldp/stp
     instructions have 7 bit immediate offsets, so use 0x80.  */
  if (offvals[2] - offvals[0] >= msize * 0x80)
    return false;

  /* The offsets must be aligned with respect to each other.  */
  if (offvals[0] % msize != offvals[2] % msize)
    return false;

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 byte.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
          & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && !optimize_size
      && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
    return false;

  return true;
}
/* Given OPERANDS of consecutive load/store, this function pairs them
   into LDP/STP after adjusting the offset.  It depends on the fact
   that the operands can be sorted so the offsets are correct for STP.
   MODE is the mode of memory operands.  CODE is the rtl operator
   which should be applied to all memory operands, it's SIGN_EXTEND,
   ZERO_EXTEND or UNKNOWN.  */

bool
aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
                             scalar_mode mode, RTX_CODE code)
{
  rtx base, offset_1, offset_3, t1, t2;
  rtx mem_1, mem_2, mem_3, mem_4;
  rtx temp_operands[8];
  HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
                stp_off_upper_limit, stp_off_lower_limit, msize;

  /* We make changes on a copy as we may still bail out.  */
  for (int i = 0; i < 8; i ++)
    temp_operands[i] = operands[i];

  /* Sort the operands.  */
  qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);

  /* Copy the memory operands so that if we have to bail for some
     reason the original addresses are unchanged.  */
  if (load)
    {
      mem_1 = copy_rtx (temp_operands[1]);
      mem_2 = copy_rtx (temp_operands[3]);
      mem_3 = copy_rtx (temp_operands[5]);
      mem_4 = copy_rtx (temp_operands[7]);
    }
  else
    {
      mem_1 = copy_rtx (temp_operands[0]);
      mem_2 = copy_rtx (temp_operands[2]);
      mem_3 = copy_rtx (temp_operands[4]);
      mem_4 = copy_rtx (temp_operands[6]);
      gcc_assert (code == UNKNOWN);
    }

  extract_base_offset_in_addr (mem_1, &base, &offset_1);
  extract_base_offset_in_addr (mem_3, &base, &offset_3);
  gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
              && offset_3 != NULL_RTX);

  /* Adjust offset so it can fit in LDP/STP instruction.  */
  msize = GET_MODE_SIZE (mode);
  stp_off_upper_limit = msize * (0x40 - 1);
  stp_off_lower_limit = - msize * 0x40;

  off_val_1 = INTVAL (offset_1);
  off_val_3 = INTVAL (offset_3);

  /* The base offset is optimally half way between the two STP/LDP offsets.  */
  if (msize <= 4)
    base_off = (off_val_1 + off_val_3) / 2;
  else
    /* However, due to issues with negative LDP/STP offset generation for
       larger modes (DF, DI and vector modes), we must not use negative
       addresses smaller than 9 signed unadjusted bits can store.  This
       provides the most range in this case.  */
    base_off = off_val_1;

  /* Adjust the base so that it is aligned with the addresses but still
     optimal.  */
  if (base_off % msize != off_val_1 % msize)
    /* Fix the offset, bearing in mind we want to make it bigger not
       smaller.  */
    base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
  else if (msize <= 4)
    /* The negative range of LDP/STP is one larger than the positive range.  */
    base_off += msize;

  /* Check if base offset is too big or too small.  We can attempt to resolve
     this issue by setting it to the maximum value and seeing if the offsets
     still fit.  */
  if (base_off >= 0x1000)
    {
      base_off = 0x1000 - 1;
      /* We must still make sure that the base offset is aligned with respect
         to the address.  But it may not be made any bigger.  */
      base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }

  /* Likewise for the case where the base is too small.  */
  if (base_off <= -0x1000)
    {
      base_off = -0x1000 + 1;
      base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }

  /* Offset of the first STP/LDP.  */
  new_off_1 = off_val_1 - base_off;

  /* Offset of the second STP/LDP.  */
  new_off_3 = off_val_3 - base_off;

  /* The offsets must be within the range of the LDP/STP instructions.  */
  if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
      || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
    return false;

  replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
                                                  new_off_1), true);
  replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
                                                  new_off_1 + msize), true);
  replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
                                                  new_off_3), true);
  replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
                                                  new_off_3 + msize), true);

  if (!aarch64_mem_pair_operand (mem_1, mode)
      || !aarch64_mem_pair_operand (mem_3, mode))
    return false;

  if (code == ZERO_EXTEND)
    {
      mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
    }
  else if (code == SIGN_EXTEND)
    {
      mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
    }

  if (load)
    {
      operands[0] = temp_operands[0];
      operands[1] = mem_1;
      operands[2] = temp_operands[2];
      operands[3] = mem_2;
      operands[4] = temp_operands[4];
      operands[5] = mem_3;
      operands[6] = temp_operands[6];
      operands[7] = mem_4;
    }
  else
    {
      operands[0] = mem_1;
      operands[1] = temp_operands[1];
      operands[2] = mem_2;
      operands[3] = temp_operands[3];
      operands[4] = mem_3;
      operands[5] = temp_operands[5];
      operands[6] = mem_4;
      operands[7] = temp_operands[7];
    }

  /* Emit adjusting instruction.  */
  emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
  /* Emit ldp/stp instructions.  */
  t1 = gen_rtx_SET (operands[0], operands[1]);
  t2 = gen_rtx_SET (operands[2], operands[3]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
  t1 = gen_rtx_SET (operands[4], operands[5]);
  t2 = gen_rtx_SET (operands[6], operands[7]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));

  return true;
}
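
/* Worked example with the four SImode stores at [xb, 0x100..0x10c] from the
   comment above: msize == 4, so base_off starts at (0x100 + 0x108) / 2
   == 0x104 and is then nudged so the resulting offsets stay aligned and
   make best use of the slightly larger negative range; the two STPs end up
   with small offsets relative to the single ADD that easily fit the signed
   7-bit scaled LDP/STP range.  */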
/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
   it isn't worth branching around empty masked ops (including masked
   stores).  */

static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}

/* Return 1 if pseudo register should be created and used to hold
   GOT address for PIC code.  */

bool
aarch64_use_pseudo_pic_reg (void)
{
  return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
}
/* Implement TARGET_UNSPEC_MAY_TRAP_P.  */

static int
aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
{
  switch (XINT (x, 1))
    {
    case UNSPEC_GOTSMALLPIC:
    case UNSPEC_GOTSMALLPIC28K:
    case UNSPEC_GOTTINYPIC:
      return 0;
    default:
      break;
    }

  return default_unspec_may_trap_p (x, flags);
}
/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */

int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}
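
/* For example, the constant 8.0 yields 3 (8 == 2^3), while -4.0, 0.75 or a
   NaN all yield -1.  */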
/* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
   power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for
   x == (1/2^n) return n.  Otherwise return -1.  */
int
aarch64_fpconst_pow2_recip (rtx x)
{
  REAL_VALUE_TYPE r0;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r0 = *CONST_DOUBLE_REAL_VALUE (x);
  if (exact_real_inverse (DFmode, &r0)
      && !REAL_VALUE_NEGATIVE (r0))
    {
      int ret = exact_log2 (real_to_integer (&r0));
      if (ret >= 1 && ret <= 32)
        return ret;
    }
  return -1;
}
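
/* For example, 0.125 == 1/2^3 yields 3 and 1.0/4294967296.0 == 1/2^32
   yields 32; values whose reciprocal exponent falls outside 1..32
   yield -1.  */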
/* If X is a vector of equal CONST_DOUBLE values and that value is
   Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */

int
aarch64_vec_fpconst_pow_of_2 (rtx x)
{
  int nelts;
  if (GET_CODE (x) != CONST_VECTOR
      || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
    return -1;

  if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
    return -1;

  int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
  if (firstval <= 0)
    return -1;

  for (int i = 1; i < nelts; i++)
    if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
      return -1;

  return firstval;
}
/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
   to float.

   __fp16 always promotes through this hook.
   _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
   through the generic excess precision logic rather than here.  */

static tree
aarch64_promoted_type (const_tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t)
      && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
    return float_type_node;

  return NULL_TREE;
}
/* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */

static bool
aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
                           optimization_type opt_type)
{
  switch (op)
    {
    case rsqrt_optab:
      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);

    default:
      return true;
    }
}
/* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */

static unsigned int
aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
                                        int *offset)
{
  /* Polynomial invariant 1 == (VG / 2) - 1.  */
  gcc_assert (i == 1);
  *factor = 2;
  *offset = 1;
  return AARCH64_DWARF_VG;
}
/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
{
  return (mode == HFmode
          ? true
          : default_libgcc_floating_mode_supported_p (mode));
}

/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_scalar_mode_supported_p (scalar_mode mode)
{
  return (mode == HFmode
          ? true
          : default_scalar_mode_supported_p (mode));
}
/* Set the value of FLT_EVAL_METHOD.
   ISO/IEC TS 18661-3 defines two values that we'd like to make use of:

    0: evaluate all operations and constants, whose semantic type has at
       most the range and precision of type float, to the range and
       precision of float; evaluate all other operations and constants to
       the range and precision of the semantic type;

    N, where _FloatN is a supported interchange floating type
       evaluate all operations and constants, whose semantic type has at
       most the range and precision of _FloatN type, to the range and
       precision of the _FloatN type; evaluate all other operations and
       constants to the range and precision of the semantic type;

   If we have the ARMv8.2-A extensions then we support _Float16 in native
   precision, so we should set this to 16.  Otherwise, we support the type,
   but want to evaluate expressions in float precision, so set this to
   0.  */

static enum flt_eval_method
aarch64_excess_precision (enum excess_precision_type type)
{
  switch (type)
    {
    case EXCESS_PRECISION_TYPE_FAST:
    case EXCESS_PRECISION_TYPE_STANDARD:
      /* We can calculate either in 16-bit range and precision or
         32-bit range and precision.  Make that decision based on whether
         we have native support for the ARMv8.2-A 16-bit floating-point
         instructions or not.  */
      return (TARGET_FP_F16INST
              ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
              : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
    case EXCESS_PRECISION_TYPE_IMPLICIT:
      return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
    default:
      gcc_unreachable ();
    }
  return FLT_EVAL_METHOD_UNPREDICTABLE;
}
/* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
   scheduled for speculative execution.  Reject the long-running division
   and square-root instructions.  */

static bool
aarch64_sched_can_speculate_insn (rtx_insn *insn)
{
  switch (get_attr_type (insn))
    {
      case TYPE_SDIV:
      case TYPE_UDIV:
      case TYPE_FDIVS:
      case TYPE_FDIVD:
      case TYPE_FSQRTS:
      case TYPE_FSQRTD:
      case TYPE_NEON_FP_SQRT_S:
      case TYPE_NEON_FP_SQRT_D:
      case TYPE_NEON_FP_SQRT_S_Q:
      case TYPE_NEON_FP_SQRT_D_Q:
      case TYPE_NEON_FP_DIV_S:
      case TYPE_NEON_FP_DIV_D:
      case TYPE_NEON_FP_DIV_S_Q:
      case TYPE_NEON_FP_DIV_D_Q:
        return false;
      default:
        return true;
    }
}
/* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */

static int
aarch64_compute_pressure_classes (reg_class *classes)
{
  int i = 0;
  classes[i++] = GENERAL_REGS;
  classes[i++] = FP_REGS;
  /* PR_REGS isn't a useful pressure class because many predicate pseudo
     registers need to go in PR_LO_REGS at some point during their
     lifetime.  Splitting it into two halves has the effect of making
     all predicates count against PR_LO_REGS, so that we try whenever
     possible to restrict the number of live predicates to 8.  This
     greatly reduces the amount of spilling in certain loops.  */
  classes[i++] = PR_LO_REGS;
  classes[i++] = PR_HI_REGS;
  return i;
}
/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */

static bool
aarch64_can_change_mode_class (machine_mode from,
                               machine_mode to, reg_class_t)
{
  if (BYTES_BIG_ENDIAN)
    {
      bool from_sve_p = aarch64_sve_data_mode_p (from);
      bool to_sve_p = aarch64_sve_data_mode_p (to);

      /* Don't allow changes between SVE data modes and non-SVE modes.
         See the comment at the head of aarch64-sve.md for details.  */
      if (from_sve_p != to_sve_p)
        return false;

      /* Don't allow changes in element size: lane 0 of the new vector
         would not then be lane 0 of the old vector.  See the comment
         above aarch64_maybe_expand_sve_subreg_move for a more detailed
         description.

         In the worst case, this forces a register to be spilled in
         one mode and reloaded in the other, which handles the
         endianness correctly.  */
      if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
        return false;
    }
  return true;
}
/* Implement TARGET_EARLY_REMAT_MODES.  */

static void
aarch64_select_early_remat_modes (sbitmap modes)
{
  /* SVE values are not normally live across a call, so it should be
     worth doing early rematerialization even in VL-specific mode.  */
  for (int i = 0; i < NUM_MACHINE_MODES; ++i)
    if (aarch64_sve_mode_p ((machine_mode) i))
      bitmap_set_bit (modes, i);
}
/* Override the default target speculation_safe_value.  */
static rtx
aarch64_speculation_safe_value (machine_mode mode,
                                rtx result, rtx val, rtx failval)
{
  /* Maybe we should warn if falling back to hard barriers.  They are
     likely to be noticeably more expensive than the alternative below.  */
  if (!aarch64_track_speculation)
    return default_speculation_safe_value (mode, result, val, failval);

  if (!REG_P (val))
    val = copy_to_mode_reg (mode, val);

  if (!aarch64_reg_or_zero (failval, mode))
    failval = copy_to_mode_reg (mode, failval);

  emit_insn (gen_despeculate_copy (mode, result, val, failval));
  return result;
}
/* Implement TARGET_ESTIMATED_POLY_VALUE.
   Look into the tuning structure for an estimate.
   VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
   Advanced SIMD 128 bits.  */

static HOST_WIDE_INT
aarch64_estimated_poly_value (poly_int64 val)
{
  enum aarch64_sve_vector_bits_enum width_source
    = aarch64_tune_params.sve_width;

  /* If we still don't have an estimate, use the default.  */
  if (width_source == SVE_SCALABLE)
    return default_estimated_poly_value (val);

  HOST_WIDE_INT over_128 = width_source - 128;
  return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
}
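
/* For example, if the tuning model says SVE vectors are 256 bits wide,
   over_128 is 128 and a poly_int64 of (4, 4) - 4 bytes plus 4 bytes per
   extra 128-bit chunk - is estimated as 4 + 4 * 128 / 128 == 8.  */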
/* Return true for types that could be supported as SIMD return or
   argument types.  */

static bool
supported_simd_type (tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
    {
      HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
      return s == 1 || s == 2 || s == 4 || s == 8;
    }
  return false;
}

/* Return true for types that currently are supported as SIMD return
   or argument types.  */

static bool
currently_supported_simd_type (tree t, tree b)
{
  if (COMPLEX_FLOAT_TYPE_P (t))
    return false;

  if (TYPE_SIZE (t) != TYPE_SIZE (b))
    return false;

  return supported_simd_type (t);
}
/* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN.  */

static int
aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
                                                struct cgraph_simd_clone *clonei,
                                                tree base_type, int num)
{
  tree t, ret_type, arg_type;
  unsigned int elt_bits, vec_bits, count;

  if (clonei->simdlen
      && (clonei->simdlen < 2
          || clonei->simdlen > 1024
          || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
    {
      warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
                  "unsupported simdlen %d", clonei->simdlen);
      return 0;
    }

  ret_type = TREE_TYPE (TREE_TYPE (node->decl));
  if (TREE_CODE (ret_type) != VOID_TYPE
      && !currently_supported_simd_type (ret_type, base_type))
    {
      if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
        warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
                    "GCC does not currently support mixed size types "
                    "for %<simd%> functions");
      else if (supported_simd_type (ret_type))
        warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
                    "GCC does not currently support return type %qT "
                    "for %<simd%> functions", ret_type);
      else
        warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
                    "unsupported return type %qT for %<simd%> functions",
                    ret_type);
      return 0;
    }

  for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
    {
      arg_type = TREE_TYPE (t);

      if (!currently_supported_simd_type (arg_type, base_type))
        {
          if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
            warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
                        "GCC does not currently support mixed size types "
                        "for %<simd%> functions");
          else
            warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
                        "GCC does not currently support argument type %qT "
                        "for %<simd%> functions", arg_type);
          return 0;
        }
    }

  clonei->vecsize_mangle = 'n';
  clonei->mask_mode = VOIDmode;
  elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
  if (clonei->simdlen == 0)
    {
      count = 2;
      vec_bits = (num == 0 ? 64 : 128);
      clonei->simdlen = vec_bits / elt_bits;
    }
  else
    {
      count = 1;
      vec_bits = clonei->simdlen * elt_bits;
      if (vec_bits != 64 && vec_bits != 128)
        {
          warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
                      "GCC does not currently support simdlen %d for type %qT",
                      clonei->simdlen, base_type);
          return 0;
        }
    }
  clonei->vecsize_int = vec_bits;
  clonei->vecsize_float = vec_bits;
  return count;
}
/* Implement TARGET_SIMD_CLONE_ADJUST.  */

static void
aarch64_simd_clone_adjust (struct cgraph_node *node)
{
  /* Add aarch64_vector_pcs target attribute to SIMD clones so they
     use the correct ABI.  */

  tree t = TREE_TYPE (node->decl);
  TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
                                        TYPE_ATTRIBUTES (t));
}

/* Implement TARGET_SIMD_CLONE_USABLE.  */

static int
aarch64_simd_clone_usable (struct cgraph_node *node)
{
  switch (node->simdclone->vecsize_mangle)
    {
    case 'n':
      if (!TARGET_SIMD)
        return -1;
      return 0;
    default:
      gcc_unreachable ();
    }
}
/* Implement TARGET_COMP_TYPE_ATTRIBUTES.  */

static int
aarch64_comp_type_attributes (const_tree type1, const_tree type2)
{
  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
      != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
    return 0;
  return 1;
}

/* Implement TARGET_GET_MULTILIB_ABI_NAME.  */

static const char *
aarch64_get_multilib_abi_name (void)
{
  if (TARGET_BIG_END)
    return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
  return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
}
/* Implement TARGET_STACK_PROTECT_GUARD.  In case of a
   global variable based guard use the default else
   return a null tree.  */
static tree
aarch64_stack_protect_guard (void)
{
  if (aarch64_stack_protector_guard == SSP_GLOBAL)
    return default_stack_protect_guard ();

  return NULL_TREE;
}
/* Implement TARGET_ASM_FILE_END for AArch64.  This adds the AArch64 GNU NOTE
   section at the end if needed.  */
#define GNU_PROPERTY_AARCH64_FEATURE_1_AND	0xc0000000
#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI	(1U << 0)
#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC	(1U << 1)
void
aarch64_file_end_indicate_exec_stack ()
{
  file_end_indicate_exec_stack ();

  unsigned feature_1_and = 0;
  if (aarch64_bti_enabled ())
    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;

  if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;

  if (feature_1_and)
    {
      /* Generate .note.gnu.property section.  */
      switch_to_section (get_section (".note.gnu.property",
                                      SECTION_NOTYPE, NULL));

      /* PT_NOTE header: namesz, descsz, type.
         namesz = 4 ("GNU\0")
         descsz = 16 (Size of the program property array)
                  [(12 + padding) * Number of array elements]
         type   = 5 (NT_GNU_PROPERTY_TYPE_0).  */
      assemble_align (POINTER_SIZE);
      assemble_integer (GEN_INT (4), 4, 32, 1);
      assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
      assemble_integer (GEN_INT (5), 4, 32, 1);

      /* PT_NOTE name.  */
      assemble_string ("GNU", 4);

      /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
         type   = GNU_PROPERTY_AARCH64_FEATURE_1_AND
         datasz = 4
         data   = feature_1_and.  */
      assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
      assemble_integer (GEN_INT (4), 4, 32, 1);
      assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);

      /* Pad the size of the note to the required alignment.  */
      assemble_align (POINTER_SIZE);
    }
}
#undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
#undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
#undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
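
/* With BTI enabled (and no return-address signing) the note emitted above
   corresponds roughly to the following assembly on an LP64 target:

	.section .note.gnu.property,"a"
	.align	3
	.long	4		// namesz
	.long	16		// descsz
	.long	5		// NT_GNU_PROPERTY_TYPE_0
	.string	"GNU"
	.long	0xc0000000	// GNU_PROPERTY_AARCH64_FEATURE_1_AND
	.long	4		// datasz
	.long	1		// GNU_PROPERTY_AARCH64_FEATURE_1_BTI
	.align	3  */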

/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */
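
/* Note on the harness (describing the build-time test setup, not this file):
   the selftests above run during the GCC build itself, e.g. via the
   "make selftest" target, which invokes the compiler with the -fself-test=
   option so that locate_file () can resolve aarch64/times-two.rtl relative
   to the selftest source directory.  */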

#undef TARGET_STACK_PROTECT_GUARD
#define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_VARIABLE_ISSUE
#define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
#define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
  aarch64_remove_extra_call_preserved_regs

#undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
#define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
  aarch64_return_call_with_max_clobbers

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"