1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
24 #define INCLUDE_STRING
26 #include "coretypes.h"
37 #include "stringpool.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
56 #include "langhooks.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
78 /* This file should be included last. */
79 #include "target-def.h"
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
87 enum insn_type
{ MOV
, MVN
, INDEX
, PTRUE
};
88 enum modifier_type
{ LSL
, MSL
};
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode
, rtx
);
92 simd_immediate_info (scalar_int_mode
, unsigned HOST_WIDE_INT
,
93 insn_type
= MOV
, modifier_type
= LSL
,
95 simd_immediate_info (scalar_mode
, rtx
, rtx
);
96 simd_immediate_info (scalar_int_mode
, aarch64_svpattern
);
98 /* The mode of the elements. */
101 /* The instruction to use to move the immediate into a vector. */
106 /* For MOV and MVN. */
109 /* The value of each element. */
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier
;
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
127 aarch64_svpattern pattern
;
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in
, rtx value_in
)
135 : elt_mode (elt_mode_in
), insn (MOV
)
137 u
.mov
.value
= value_in
;
138 u
.mov
.modifier
= LSL
;
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
147 unsigned HOST_WIDE_INT value_in
,
148 insn_type insn_in
, modifier_type modifier_in
,
149 unsigned int shift_in
)
150 : elt_mode (elt_mode_in
), insn (insn_in
)
152 u
.mov
.value
= gen_int_mode (value_in
, elt_mode_in
);
153 u
.mov
.modifier
= modifier_in
;
154 u
.mov
.shift
= shift_in
;
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in
, rtx base_in
, rtx step_in
)
161 : elt_mode (elt_mode_in
), insn (INDEX
)
163 u
.index
.base
= base_in
;
164 u
.index
.step
= step_in
;
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
171 aarch64_svpattern pattern_in
)
172 : elt_mode (elt_mode_in
), insn (PTRUE
)
174 u
.pattern
= pattern_in
;
177 /* The current code model. */
178 enum aarch64_code_model aarch64_cmodel
;
180 /* The number of 64-bit elements in an SVE vector. */
181 poly_uint16 aarch64_sve_vg
;
184 #undef TARGET_HAVE_TLS
185 #define TARGET_HAVE_TLS 1
188 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
189 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
191 machine_mode
*, int *,
193 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
194 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
195 static void aarch64_override_options_after_change (void);
196 static bool aarch64_vector_mode_supported_p (machine_mode
);
197 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
198 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
202 static machine_mode
aarch64_simd_container_mode (scalar_mode
, poly_int64
);
203 static bool aarch64_print_address_internal (FILE*, machine_mode
, rtx
,
204 aarch64_addr_query_type
);
205 static HOST_WIDE_INT
aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
);
207 /* Major revision number of the ARM Architecture implemented by the target. */
208 unsigned aarch64_architecture_version
;
210 /* The processor for which instructions should be scheduled. */
211 enum aarch64_processor aarch64_tune
= cortexa53
;
213 /* Mask to specify which instruction scheduling options should be used. */
214 uint64_t aarch64_tune_flags
= 0;
216 /* Global flag for PC relative loads. */
217 bool aarch64_pcrelative_literal_loads
;
219 /* Global flag for whether frame pointer is enabled. */
220 bool aarch64_use_frame_pointer
;
222 #define BRANCH_PROTECT_STR_MAX 255
223 char *accepted_branch_protection_string
= NULL
;
225 static enum aarch64_parse_opt_result
226 aarch64_parse_branch_protection (const char*, char**);
228 /* Support for command line parsing of boolean flags in the tuning structures. */
230 struct aarch64_flag_desc
236 #define AARCH64_FUSION_PAIR(name, internal_name) \
237 { name, AARCH64_FUSE_##internal_name },
238 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
240 { "none", AARCH64_FUSE_NOTHING
},
241 #include "aarch64-fusion-pairs.def"
242 { "all", AARCH64_FUSE_ALL
},
243 { NULL
, AARCH64_FUSE_NOTHING
}
246 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
247 { name, AARCH64_EXTRA_TUNE_##internal_name },
248 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
250 { "none", AARCH64_EXTRA_TUNE_NONE
},
251 #include "aarch64-tuning-flags.def"
252 { "all", AARCH64_EXTRA_TUNE_ALL
},
253 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
256 /* Tuning parameters. */
258 static const struct cpu_addrcost_table generic_addrcost_table
=
268 0, /* register_offset */
269 0, /* register_sextend */
270 0, /* register_zextend */
274 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
284 1, /* register_offset */
285 1, /* register_sextend */
286 2, /* register_zextend */
290 static const struct cpu_addrcost_table xgene1_addrcost_table
=
300 0, /* register_offset */
301 1, /* register_sextend */
302 1, /* register_zextend */
306 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
316 2, /* register_offset */
317 3, /* register_sextend */
318 3, /* register_zextend */
322 static const struct cpu_addrcost_table tsv110_addrcost_table
=
332 0, /* register_offset */
333 1, /* register_sextend */
334 1, /* register_zextend */
338 static const struct cpu_addrcost_table qdf24xx_addrcost_table
=
348 3, /* register_offset */
349 3, /* register_sextend */
350 3, /* register_zextend */
354 static const struct cpu_regmove_cost generic_regmove_cost
=
357 /* Avoid the use of slow int<->fp moves for spilling by setting
358 their cost higher than memmov_cost. */
364 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
367 /* Avoid the use of slow int<->fp moves for spilling by setting
368 their cost higher than memmov_cost. */
374 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
377 /* Avoid the use of slow int<->fp moves for spilling by setting
378 their cost higher than memmov_cost. */
384 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost (actual, 4 and 9). */
394 static const struct cpu_regmove_cost thunderx_regmove_cost
=
402 static const struct cpu_regmove_cost xgene1_regmove_cost
=
405 /* Avoid the use of slow int<->fp moves for spilling by setting
406 their cost higher than memmov_cost. */
412 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
415 /* Avoid the use of int<->fp moves for spilling. */
421 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
424 /* Avoid the use of int<->fp moves for spilling. */
430 static const struct cpu_regmove_cost tsv110_regmove_cost
=
433 /* Avoid the use of slow int<->fp moves for spilling by setting
434 their cost higher than memmov_cost. */
440 /* Generic costs for vector insn classes. */
441 static const struct cpu_vector_cost generic_vector_cost
=
443 1, /* scalar_int_stmt_cost */
444 1, /* scalar_fp_stmt_cost */
445 1, /* scalar_load_cost */
446 1, /* scalar_store_cost */
447 1, /* vec_int_stmt_cost */
448 1, /* vec_fp_stmt_cost */
449 2, /* vec_permute_cost */
450 1, /* vec_to_scalar_cost */
451 1, /* scalar_to_vec_cost */
452 1, /* vec_align_load_cost */
453 1, /* vec_unalign_load_cost */
454 1, /* vec_unalign_store_cost */
455 1, /* vec_store_cost */
456 3, /* cond_taken_branch_cost */
457 1 /* cond_not_taken_branch_cost */
460 /* QDF24XX costs for vector insn classes. */
461 static const struct cpu_vector_cost qdf24xx_vector_cost
=
463 1, /* scalar_int_stmt_cost */
464 1, /* scalar_fp_stmt_cost */
465 1, /* scalar_load_cost */
466 1, /* scalar_store_cost */
467 1, /* vec_int_stmt_cost */
468 3, /* vec_fp_stmt_cost */
469 2, /* vec_permute_cost */
470 1, /* vec_to_scalar_cost */
471 1, /* scalar_to_vec_cost */
472 1, /* vec_align_load_cost */
473 1, /* vec_unalign_load_cost */
474 1, /* vec_unalign_store_cost */
475 1, /* vec_store_cost */
476 3, /* cond_taken_branch_cost */
477 1 /* cond_not_taken_branch_cost */
480 /* ThunderX costs for vector insn classes. */
481 static const struct cpu_vector_cost thunderx_vector_cost
=
483 1, /* scalar_int_stmt_cost */
484 1, /* scalar_fp_stmt_cost */
485 3, /* scalar_load_cost */
486 1, /* scalar_store_cost */
487 4, /* vec_int_stmt_cost */
488 1, /* vec_fp_stmt_cost */
489 4, /* vec_permute_cost */
490 2, /* vec_to_scalar_cost */
491 2, /* scalar_to_vec_cost */
492 3, /* vec_align_load_cost */
493 5, /* vec_unalign_load_cost */
494 5, /* vec_unalign_store_cost */
495 1, /* vec_store_cost */
496 3, /* cond_taken_branch_cost */
497 3 /* cond_not_taken_branch_cost */
/* TSV110 costs for vector insn classes. */
500 static const struct cpu_vector_cost tsv110_vector_cost
=
502 1, /* scalar_int_stmt_cost */
503 1, /* scalar_fp_stmt_cost */
504 5, /* scalar_load_cost */
505 1, /* scalar_store_cost */
506 2, /* vec_int_stmt_cost */
507 2, /* vec_fp_stmt_cost */
508 2, /* vec_permute_cost */
509 3, /* vec_to_scalar_cost */
510 2, /* scalar_to_vec_cost */
511 5, /* vec_align_load_cost */
512 5, /* vec_unalign_load_cost */
513 1, /* vec_unalign_store_cost */
514 1, /* vec_store_cost */
515 1, /* cond_taken_branch_cost */
516 1 /* cond_not_taken_branch_cost */
519 /* Cortex-A57 costs for vector insn classes. */
520 static const struct cpu_vector_cost cortexa57_vector_cost
=
522 1, /* scalar_int_stmt_cost */
523 1, /* scalar_fp_stmt_cost */
524 4, /* scalar_load_cost */
525 1, /* scalar_store_cost */
526 2, /* vec_int_stmt_cost */
527 2, /* vec_fp_stmt_cost */
528 3, /* vec_permute_cost */
529 8, /* vec_to_scalar_cost */
530 8, /* scalar_to_vec_cost */
531 4, /* vec_align_load_cost */
532 4, /* vec_unalign_load_cost */
533 1, /* vec_unalign_store_cost */
534 1, /* vec_store_cost */
535 1, /* cond_taken_branch_cost */
536 1 /* cond_not_taken_branch_cost */
/* Exynos M1 costs for vector insn classes. */
539 static const struct cpu_vector_cost exynosm1_vector_cost
=
541 1, /* scalar_int_stmt_cost */
542 1, /* scalar_fp_stmt_cost */
543 5, /* scalar_load_cost */
544 1, /* scalar_store_cost */
545 3, /* vec_int_stmt_cost */
546 3, /* vec_fp_stmt_cost */
547 3, /* vec_permute_cost */
548 3, /* vec_to_scalar_cost */
549 3, /* scalar_to_vec_cost */
550 5, /* vec_align_load_cost */
551 5, /* vec_unalign_load_cost */
552 1, /* vec_unalign_store_cost */
553 1, /* vec_store_cost */
554 1, /* cond_taken_branch_cost */
555 1 /* cond_not_taken_branch_cost */
558 /* X-Gene 1 costs for vector insn classes. */
559 static const struct cpu_vector_cost xgene1_vector_cost
=
561 1, /* scalar_int_stmt_cost */
562 1, /* scalar_fp_stmt_cost */
563 5, /* scalar_load_cost */
564 1, /* scalar_store_cost */
565 2, /* vec_int_stmt_cost */
566 2, /* vec_fp_stmt_cost */
567 2, /* vec_permute_cost */
568 4, /* vec_to_scalar_cost */
569 4, /* scalar_to_vec_cost */
570 10, /* vec_align_load_cost */
571 10, /* vec_unalign_load_cost */
572 2, /* vec_unalign_store_cost */
573 2, /* vec_store_cost */
574 2, /* cond_taken_branch_cost */
575 1 /* cond_not_taken_branch_cost */
578 /* Costs for vector insn classes for Vulcan. */
579 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
581 1, /* scalar_int_stmt_cost */
582 6, /* scalar_fp_stmt_cost */
583 4, /* scalar_load_cost */
584 1, /* scalar_store_cost */
585 5, /* vec_int_stmt_cost */
586 6, /* vec_fp_stmt_cost */
587 3, /* vec_permute_cost */
588 6, /* vec_to_scalar_cost */
589 5, /* scalar_to_vec_cost */
590 8, /* vec_align_load_cost */
591 8, /* vec_unalign_load_cost */
592 4, /* vec_unalign_store_cost */
593 4, /* vec_store_cost */
594 2, /* cond_taken_branch_cost */
595 1 /* cond_not_taken_branch_cost */
598 /* Generic costs for branch instructions. */
599 static const struct cpu_branch_cost generic_branch_cost
=
601 1, /* Predictable. */
602 3 /* Unpredictable. */
605 /* Generic approximation modes. */
606 static const cpu_approx_modes generic_approx_modes
=
608 AARCH64_APPROX_NONE
, /* division */
609 AARCH64_APPROX_NONE
, /* sqrt */
610 AARCH64_APPROX_NONE
/* recip_sqrt */
613 /* Approximation modes for Exynos M1. */
614 static const cpu_approx_modes exynosm1_approx_modes
=
616 AARCH64_APPROX_NONE
, /* division */
617 AARCH64_APPROX_ALL
, /* sqrt */
618 AARCH64_APPROX_ALL
/* recip_sqrt */
621 /* Approximation modes for X-Gene 1. */
622 static const cpu_approx_modes xgene1_approx_modes
=
624 AARCH64_APPROX_NONE
, /* division */
625 AARCH64_APPROX_NONE
, /* sqrt */
626 AARCH64_APPROX_ALL
/* recip_sqrt */
629 /* Generic prefetch settings (which disable prefetch). */
630 static const cpu_prefetch_tune generic_prefetch_tune
=
633 -1, /* l1_cache_size */
634 -1, /* l1_cache_line_size */
635 -1, /* l2_cache_size */
636 true, /* prefetch_dynamic_strides */
637 -1, /* minimum_stride */
638 -1 /* default_opt_level */
641 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
644 -1, /* l1_cache_size */
645 64, /* l1_cache_line_size */
646 -1, /* l2_cache_size */
647 true, /* prefetch_dynamic_strides */
648 -1, /* minimum_stride */
649 -1 /* default_opt_level */
652 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
655 32, /* l1_cache_size */
656 64, /* l1_cache_line_size */
657 512, /* l2_cache_size */
658 false, /* prefetch_dynamic_strides */
659 2048, /* minimum_stride */
660 3 /* default_opt_level */
663 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
666 32, /* l1_cache_size */
667 128, /* l1_cache_line_size */
668 16*1024, /* l2_cache_size */
669 true, /* prefetch_dynamic_strides */
670 -1, /* minimum_stride */
671 3 /* default_opt_level */
674 static const cpu_prefetch_tune thunderx_prefetch_tune
=
677 32, /* l1_cache_size */
678 128, /* l1_cache_line_size */
679 -1, /* l2_cache_size */
680 true, /* prefetch_dynamic_strides */
681 -1, /* minimum_stride */
682 -1 /* default_opt_level */
685 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
688 32, /* l1_cache_size */
689 64, /* l1_cache_line_size */
690 256, /* l2_cache_size */
691 true, /* prefetch_dynamic_strides */
692 -1, /* minimum_stride */
693 -1 /* default_opt_level */
696 static const cpu_prefetch_tune tsv110_prefetch_tune
=
699 64, /* l1_cache_size */
700 64, /* l1_cache_line_size */
701 512, /* l2_cache_size */
702 true, /* prefetch_dynamic_strides */
703 -1, /* minimum_stride */
704 -1 /* default_opt_level */
707 static const cpu_prefetch_tune xgene1_prefetch_tune
=
710 32, /* l1_cache_size */
711 64, /* l1_cache_line_size */
712 256, /* l2_cache_size */
713 true, /* prefetch_dynamic_strides */
714 -1, /* minimum_stride */
715 -1 /* default_opt_level */
718 static const struct tune_params generic_tunings
=
720 &cortexa57_extra_costs
,
721 &generic_addrcost_table
,
722 &generic_regmove_cost
,
723 &generic_vector_cost
,
724 &generic_branch_cost
,
725 &generic_approx_modes
,
726 SVE_NOT_IMPLEMENTED
, /* sve_width */
729 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
730 "16:12", /* function_align. */
731 "4", /* jump_align. */
732 "8", /* loop_align. */
733 2, /* int_reassoc_width. */
734 4, /* fp_reassoc_width. */
735 1, /* vec_reassoc_width. */
736 2, /* min_div_recip_mul_sf. */
737 2, /* min_div_recip_mul_df. */
738 0, /* max_case_values. */
739 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
741 &generic_prefetch_tune
744 static const struct tune_params cortexa35_tunings
=
746 &cortexa53_extra_costs
,
747 &generic_addrcost_table
,
748 &cortexa53_regmove_cost
,
749 &generic_vector_cost
,
750 &generic_branch_cost
,
751 &generic_approx_modes
,
752 SVE_NOT_IMPLEMENTED
, /* sve_width */
755 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
756 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
757 "16", /* function_align. */
758 "4", /* jump_align. */
759 "8", /* loop_align. */
760 2, /* int_reassoc_width. */
761 4, /* fp_reassoc_width. */
762 1, /* vec_reassoc_width. */
763 2, /* min_div_recip_mul_sf. */
764 2, /* min_div_recip_mul_df. */
765 0, /* max_case_values. */
766 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
767 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
768 &generic_prefetch_tune
771 static const struct tune_params cortexa53_tunings
=
773 &cortexa53_extra_costs
,
774 &generic_addrcost_table
,
775 &cortexa53_regmove_cost
,
776 &generic_vector_cost
,
777 &generic_branch_cost
,
778 &generic_approx_modes
,
779 SVE_NOT_IMPLEMENTED
, /* sve_width */
782 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
783 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
784 "16", /* function_align. */
785 "4", /* jump_align. */
786 "8", /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
795 &generic_prefetch_tune
798 static const struct tune_params cortexa57_tunings
=
800 &cortexa57_extra_costs
,
801 &generic_addrcost_table
,
802 &cortexa57_regmove_cost
,
803 &cortexa57_vector_cost
,
804 &generic_branch_cost
,
805 &generic_approx_modes
,
806 SVE_NOT_IMPLEMENTED
, /* sve_width */
809 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
811 "16", /* function_align. */
812 "4", /* jump_align. */
813 "8", /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
822 &generic_prefetch_tune
825 static const struct tune_params cortexa72_tunings
=
827 &cortexa57_extra_costs
,
828 &generic_addrcost_table
,
829 &cortexa57_regmove_cost
,
830 &cortexa57_vector_cost
,
831 &generic_branch_cost
,
832 &generic_approx_modes
,
833 SVE_NOT_IMPLEMENTED
, /* sve_width */
836 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
837 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
838 "16", /* function_align. */
839 "4", /* jump_align. */
840 "8", /* loop_align. */
841 2, /* int_reassoc_width. */
842 4, /* fp_reassoc_width. */
843 1, /* vec_reassoc_width. */
844 2, /* min_div_recip_mul_sf. */
845 2, /* min_div_recip_mul_df. */
846 0, /* max_case_values. */
847 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
848 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
849 &generic_prefetch_tune
852 static const struct tune_params cortexa73_tunings
=
854 &cortexa57_extra_costs
,
855 &generic_addrcost_table
,
856 &cortexa57_regmove_cost
,
857 &cortexa57_vector_cost
,
858 &generic_branch_cost
,
859 &generic_approx_modes
,
860 SVE_NOT_IMPLEMENTED
, /* sve_width */
861 4, /* memmov_cost. */
863 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
864 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
865 "16", /* function_align. */
866 "4", /* jump_align. */
867 "8", /* loop_align. */
868 2, /* int_reassoc_width. */
869 4, /* fp_reassoc_width. */
870 1, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
876 &generic_prefetch_tune
881 static const struct tune_params exynosm1_tunings
=
883 &exynosm1_extra_costs
,
884 &exynosm1_addrcost_table
,
885 &exynosm1_regmove_cost
,
886 &exynosm1_vector_cost
,
887 &generic_branch_cost
,
888 &exynosm1_approx_modes
,
889 SVE_NOT_IMPLEMENTED
, /* sve_width */
892 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
893 "4", /* function_align. */
894 "4", /* jump_align. */
895 "4", /* loop_align. */
896 2, /* int_reassoc_width. */
897 4, /* fp_reassoc_width. */
898 1, /* vec_reassoc_width. */
899 2, /* min_div_recip_mul_sf. */
900 2, /* min_div_recip_mul_df. */
901 48, /* max_case_values. */
902 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
903 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
904 &exynosm1_prefetch_tune
907 static const struct tune_params thunderxt88_tunings
=
909 &thunderx_extra_costs
,
910 &generic_addrcost_table
,
911 &thunderx_regmove_cost
,
912 &thunderx_vector_cost
,
913 &generic_branch_cost
,
914 &generic_approx_modes
,
915 SVE_NOT_IMPLEMENTED
, /* sve_width */
918 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
919 "8", /* function_align. */
920 "8", /* jump_align. */
921 "8", /* loop_align. */
922 2, /* int_reassoc_width. */
923 4, /* fp_reassoc_width. */
924 1, /* vec_reassoc_width. */
925 2, /* min_div_recip_mul_sf. */
926 2, /* min_div_recip_mul_df. */
927 0, /* max_case_values. */
928 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
929 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
930 &thunderxt88_prefetch_tune
933 static const struct tune_params thunderx_tunings
=
935 &thunderx_extra_costs
,
936 &generic_addrcost_table
,
937 &thunderx_regmove_cost
,
938 &thunderx_vector_cost
,
939 &generic_branch_cost
,
940 &generic_approx_modes
,
941 SVE_NOT_IMPLEMENTED
, /* sve_width */
944 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
945 "8", /* function_align. */
946 "8", /* jump_align. */
947 "8", /* loop_align. */
948 2, /* int_reassoc_width. */
949 4, /* fp_reassoc_width. */
950 1, /* vec_reassoc_width. */
951 2, /* min_div_recip_mul_sf. */
952 2, /* min_div_recip_mul_df. */
953 0, /* max_case_values. */
954 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
955 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
956 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
957 &thunderx_prefetch_tune
960 static const struct tune_params tsv110_tunings
=
963 &tsv110_addrcost_table
,
964 &tsv110_regmove_cost
,
966 &generic_branch_cost
,
967 &generic_approx_modes
,
968 SVE_NOT_IMPLEMENTED
, /* sve_width */
971 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_CMP_BRANCH
972 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
973 "16", /* function_align. */
974 "4", /* jump_align. */
975 "8", /* loop_align. */
976 2, /* int_reassoc_width. */
977 4, /* fp_reassoc_width. */
978 1, /* vec_reassoc_width. */
979 2, /* min_div_recip_mul_sf. */
980 2, /* min_div_recip_mul_df. */
981 0, /* max_case_values. */
982 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
983 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
984 &tsv110_prefetch_tune
987 static const struct tune_params xgene1_tunings
=
990 &xgene1_addrcost_table
,
991 &xgene1_regmove_cost
,
993 &generic_branch_cost
,
994 &xgene1_approx_modes
,
995 SVE_NOT_IMPLEMENTED
, /* sve_width */
998 AARCH64_FUSE_NOTHING
, /* fusible_ops */
999 "16", /* function_align. */
1000 "16", /* jump_align. */
1001 "16", /* loop_align. */
1002 2, /* int_reassoc_width. */
1003 4, /* fp_reassoc_width. */
1004 1, /* vec_reassoc_width. */
1005 2, /* min_div_recip_mul_sf. */
1006 2, /* min_div_recip_mul_df. */
1007 17, /* max_case_values. */
1008 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1009 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1010 &xgene1_prefetch_tune
/* Tuning parameters for emag.  Reuses the X-Gene 1 extra-cost, address-cost,
   register-move, vector-cost, approximation and prefetch tables. */
1013 static const struct tune_params emag_tunings
=
1015 &xgene1_extra_costs
,
1016 &xgene1_addrcost_table
,
1017 &xgene1_regmove_cost
,
1018 &xgene1_vector_cost
,
1019 &generic_branch_cost
,
1020 &xgene1_approx_modes
,
1021 SVE_NOT_IMPLEMENTED
, /* sve_width */
1022 6, /* memmov_cost */
1024 AARCH64_FUSE_NOTHING
, /* fusible_ops */
1025 "16", /* function_align. */
1026 "16", /* jump_align. */
1027 "16", /* loop_align. */
1028 2, /* int_reassoc_width. */
1029 4, /* fp_reassoc_width. */
1030 1, /* vec_reassoc_width. */
1031 2, /* min_div_recip_mul_sf. */
1032 2, /* min_div_recip_mul_df. */
1033 17, /* max_case_values. */
1034 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1035 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1036 &xgene1_prefetch_tune
/* Tuning parameters built on the qdf24xx cost and prefetch tables. */
1039 static const struct tune_params qdf24xx_tunings
=
1041 &qdf24xx_extra_costs
,
1042 &qdf24xx_addrcost_table
,
1043 &qdf24xx_regmove_cost
,
1044 &qdf24xx_vector_cost
,
1045 &generic_branch_cost
,
1046 &generic_approx_modes
,
1047 SVE_NOT_IMPLEMENTED
, /* sve_width */
1048 4, /* memmov_cost */
1050 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1051 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
1052 "16", /* function_align. */
1053 "8", /* jump_align. */
1054 "16", /* loop_align. */
1055 2, /* int_reassoc_width. */
1056 4, /* fp_reassoc_width. */
1057 1, /* vec_reassoc_width. */
1058 2, /* min_div_recip_mul_sf. */
1059 2, /* min_div_recip_mul_df. */
1060 0, /* max_case_values. */
1061 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1062 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS
, /* tune_flags. */
1063 &qdf24xx_prefetch_tune
1066 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values. */
1068 static const struct tune_params saphira_tunings
=
1070 &generic_extra_costs
,
1071 &generic_addrcost_table
,
1072 &generic_regmove_cost
,
1073 &generic_vector_cost
,
1074 &generic_branch_cost
,
1075 &generic_approx_modes
,
1076 SVE_NOT_IMPLEMENTED
, /* sve_width */
1077 4, /* memmov_cost */
1079 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1080 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
1081 "16", /* function_align. */
1082 "8", /* jump_align. */
1083 "16", /* loop_align. */
1084 2, /* int_reassoc_width. */
1085 4, /* fp_reassoc_width. */
1086 1, /* vec_reassoc_width. */
1087 2, /* min_div_recip_mul_sf. */
1088 2, /* min_div_recip_mul_df. */
1089 0, /* max_case_values. */
1090 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1091 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1092 &generic_prefetch_tune
1095 static const struct tune_params thunderx2t99_tunings
=
1097 &thunderx2t99_extra_costs
,
1098 &thunderx2t99_addrcost_table
,
1099 &thunderx2t99_regmove_cost
,
1100 &thunderx2t99_vector_cost
,
1101 &generic_branch_cost
,
1102 &generic_approx_modes
,
1103 SVE_NOT_IMPLEMENTED
, /* sve_width */
1104 4, /* memmov_cost. */
1105 4, /* issue_rate. */
1106 (AARCH64_FUSE_CMP_BRANCH
| AARCH64_FUSE_AES_AESMC
1107 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
1108 "16", /* function_align. */
1109 "8", /* jump_align. */
1110 "16", /* loop_align. */
1111 3, /* int_reassoc_width. */
1112 2, /* fp_reassoc_width. */
1113 2, /* vec_reassoc_width. */
1114 2, /* min_div_recip_mul_sf. */
1115 2, /* min_div_recip_mul_df. */
1116 0, /* max_case_values. */
1117 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1118 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1119 &thunderx2t99_prefetch_tune
1122 static const struct tune_params neoversen1_tunings
=
1124 &cortexa57_extra_costs
,
1125 &generic_addrcost_table
,
1126 &generic_regmove_cost
,
1127 &cortexa57_vector_cost
,
1128 &generic_branch_cost
,
1129 &generic_approx_modes
,
1130 SVE_NOT_IMPLEMENTED
, /* sve_width */
1131 4, /* memmov_cost */
1133 AARCH64_FUSE_AES_AESMC
, /* fusible_ops */
1134 "32:16", /* function_align. */
1135 "32:16", /* jump_align. */
1136 "32:16", /* loop_align. */
1137 2, /* int_reassoc_width. */
1138 4, /* fp_reassoc_width. */
1139 2, /* vec_reassoc_width. */
1140 2, /* min_div_recip_mul_sf. */
1141 2, /* min_div_recip_mul_df. */
1142 0, /* max_case_values. */
1143 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1144 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1145 &generic_prefetch_tune
1148 /* Support for fine-grained override of the tuning structures. */
1149 struct aarch64_tuning_override_function
1152 void (*parse_override
)(const char*, struct tune_params
*);
1155 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
1156 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
1157 static void aarch64_parse_sve_width_string (const char*, struct tune_params
*);
1159 static const struct aarch64_tuning_override_function
1160 aarch64_tuning_override_functions
[] =
1162 { "fuse", aarch64_parse_fuse_string
},
1163 { "tune", aarch64_parse_tune_string
},
1164 { "sve_width", aarch64_parse_sve_width_string
},
1168 /* A processor implementing AArch64. */
1171 const char *const name
;
1172 enum aarch64_processor ident
;
1173 enum aarch64_processor sched_core
;
1174 enum aarch64_arch arch
;
1175 unsigned architecture_version
;
1176 const uint64_t flags
;
1177 const struct tune_params
*const tune
;
1180 /* Architectures implementing AArch64. */
1181 static const struct processor all_architectures
[] =
1183 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1184 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1185 #include "aarch64-arches.def"
1186 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1189 /* Processor cores implementing AArch64. */
1190 static const struct processor all_cores
[] =
1192 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1193 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1194 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1195 FLAGS, &COSTS##_tunings},
1196 #include "aarch64-cores.def"
1197 {"generic", generic
, cortexa53
, AARCH64_ARCH_8A
, 8,
1198 AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
1199 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1203 /* Target specification. These are populated by the -march, -mtune, -mcpu
1204 handling code or by target attributes. */
1205 static const struct processor
*selected_arch
;
1206 static const struct processor
*selected_cpu
;
1207 static const struct processor
*selected_tune
;
1209 enum aarch64_key_type aarch64_ra_sign_key
= AARCH64_KEY_A
;
1211 /* The current tuning set. */
1212 struct tune_params aarch64_tune_params
= generic_tunings
;
1214 /* Table of machine attributes. */
1215 static const struct attribute_spec aarch64_attribute_table
[] =
1217 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1218 affects_type_identity, handler, exclude } */
1219 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL
, NULL
},
1220 { NULL
, 0, 0, false, false, false, false, NULL
, NULL
}
1223 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1225 /* An ISA extension in the co-processor and main instruction set space. */
1226 struct aarch64_option_extension
1228 const char *const name
;
1229 const unsigned long flags_on
;
1230 const unsigned long flags_off
;
1233 typedef enum aarch64_cond_code
1235 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
1236 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
1237 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
/* Map condition code X to its inverse.  The aarch64_cond_code values are
   laid out in complementary pairs (EQ/NE, CS/CC, MI/PL, ...), so flipping
   the low bit yields the inverse condition.  X is parenthesized so that
   an expression argument is converted as a whole rather than having the
   cast bind to its first operand only.  */
#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) (X)) ^ 1))
1243 struct aarch64_branch_protect_type
1245 /* The type's name that the user passes to the branch-protection option string. */
1248 /* Function to handle the protection type and set global variables.
1249 First argument is the string token corresponding with this type and the
1250 second argument is the next token in the option string.
1252 * AARCH64_PARSE_OK: Handling was successful.
1253 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the caller
1254 should print an error.
1255 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints its own error. */
1257 enum aarch64_parse_opt_result (*handler
)(char*, char*);
1258 /* A list of types that can follow this type in the option string. */
1259 const aarch64_branch_protect_type
* subtypes
;
1260 unsigned int num_subtypes
;
/* Handler registered for the "none" entry of aarch64_branch_protect_types.
   STR is the token for this type and REST is the token that follows it in
   the option string.  Sets aarch64_ra_sign_scope to AARCH64_FUNCTION_NONE
   and clears aarch64_enable_bti.  On the error path it reports
   "unexpected REST after STR" and returns AARCH64_PARSE_INVALID_FEATURE;
   otherwise it returns AARCH64_PARSE_OK.
   NOTE(review): the condition guarding the error path is not visible in
   this excerpt; presumably it fires when REST is non-null -- confirm.  */
1263 static enum aarch64_parse_opt_result
1264 aarch64_handle_no_branch_protection (char* str
, char* rest
)
1266 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
1267 aarch64_enable_bti
= 0;
/* The handler prints its own diagnostic, hence INVALID_FEATURE.  */
1270 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1271 return AARCH64_PARSE_INVALID_FEATURE
;
1273 return AARCH64_PARSE_OK
;
/* Handler registered for the "standard" entry of
   aarch64_branch_protect_types.  STR is the token for this type and REST
   is the token that follows it in the option string.  Sets
   aarch64_ra_sign_scope to AARCH64_FUNCTION_NON_LEAF, aarch64_ra_sign_key
   to AARCH64_KEY_A and aarch64_enable_bti to 1.  On the error path it
   reports "unexpected REST after STR" and returns
   AARCH64_PARSE_INVALID_FEATURE; otherwise it returns AARCH64_PARSE_OK.
   NOTE(review): the condition guarding the error path is not visible in
   this excerpt; presumably it fires when REST is non-null -- confirm.  */
1276 static enum aarch64_parse_opt_result
1277 aarch64_handle_standard_branch_protection (char* str
, char* rest
)
1279 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1280 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1281 aarch64_enable_bti
= 1;
/* The handler prints its own diagnostic, hence INVALID_FEATURE.  */
1284 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1285 return AARCH64_PARSE_INVALID_FEATURE
;
1287 return AARCH64_PARSE_OK
;
1290 static enum aarch64_parse_opt_result
1291 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED
,
1292 char* rest ATTRIBUTE_UNUSED
)
1294 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1295 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1296 return AARCH64_PARSE_OK
;
1299 static enum aarch64_parse_opt_result
1300 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED
,
1301 char* rest ATTRIBUTE_UNUSED
)
1303 aarch64_ra_sign_scope
= AARCH64_FUNCTION_ALL
;
1304 return AARCH64_PARSE_OK
;
1307 static enum aarch64_parse_opt_result
1308 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED
,
1309 char* rest ATTRIBUTE_UNUSED
)
1311 aarch64_ra_sign_key
= AARCH64_KEY_B
;
1312 return AARCH64_PARSE_OK
;
1315 static enum aarch64_parse_opt_result
1316 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED
,
1317 char* rest ATTRIBUTE_UNUSED
)
1319 aarch64_enable_bti
= 1;
1320 return AARCH64_PARSE_OK
;
1323 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes
[] = {
1324 { "leaf", aarch64_handle_pac_ret_leaf
, NULL
, 0 },
1325 { "b-key", aarch64_handle_pac_ret_b_key
, NULL
, 0 },
1326 { NULL
, NULL
, NULL
, 0 }
1329 static const struct aarch64_branch_protect_type aarch64_branch_protect_types
[] = {
1330 { "none", aarch64_handle_no_branch_protection
, NULL
, 0 },
1331 { "standard", aarch64_handle_standard_branch_protection
, NULL
, 0 },
1332 { "pac-ret", aarch64_handle_pac_ret_protection
, aarch64_pac_ret_subtypes
,
1333 ARRAY_SIZE (aarch64_pac_ret_subtypes
) },
1334 { "bti", aarch64_handle_bti_protection
, NULL
, 0 },
1335 { NULL
, NULL
, NULL
, 0 }
1338 /* The condition codes of the processor, and the inverse function. */
1339 static const char * const aarch64_condition_codes
[] =
1341 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1342 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1345 /* The preferred condition codes for SVE conditions. */
1346 static const char *const aarch64_sve_condition_codes
[] =
1348 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1349 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1352 /* Return the assembly token for svpattern value PATTERN. */
1355 svpattern_token (enum aarch64_svpattern pattern
)
1359 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1360 AARCH64_FOR_SVPATTERN (CASE
)
1362 case AARCH64_NUM_SVPATTERNS
:
1368 /* Generate code to enable conditional branches in functions over 1 MiB. */
/* OPERANDS is the operand array handed to output_asm_insn and POS_LABEL
   is the index of the label operand within it.  DEST is the prefix given
   to ASM_GENERATE_INTERNAL_LABEL when naming a fresh local label, and
   BRANCH_FORMAT is the branch text to which that label's name is
   appended.  Two pieces of assembly are output: BRANCH_FORMAT targeting
   the local label, then an unconditional "b" to the original label
   operand followed by the definition of the local label.  */
1370 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
1371 const char * branch_format
)
1373 rtx_code_label
* tmp_label
= gen_label_rtx ();
1374 char label_buf
[256];
/* Build the assembler name of the fresh label from DEST and the label
   number, then strip any name encoding from it.  */
1376 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
1377 CODE_LABEL_NUMBER (tmp_label
));
1378 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
/* Remember the original label operand and temporarily replace it with
   the fresh label while the first piece is output.  */
1379 rtx dest_label
= operands
[pos_label
];
1380 operands
[pos_label
] = tmp_label
;
/* First piece: BRANCH_FORMAT immediately followed by the local label's
   name.  */
1382 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
1383 output_asm_insn (buffer
, operands
);
/* Second piece: "b %l<POS_LABEL>" and, on the next line, the local
   label definition.  The original label operand is restored before the
   text is output, so the branch goes to the original destination.  */
1385 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
1386 operands
[pos_label
] = dest_label
;
1387 output_asm_insn (buffer
, operands
);
/* Report an error saying that the current options are incompatible with
   a use of MODE.  The message names "-mgeneral-regs-only" when
   TARGET_GENERAL_REGS_ONLY is set and the "+nofp" feature modifier in
   the other pair of messages, and it mentions floating-point types when
   FLOAT_MODE_P (MODE) holds and vector types in the alternative message.
   NOTE(review): the else lines that separate the four error calls are
   not visible in this excerpt -- confirm the exact nesting.  */
1392 aarch64_err_no_fpadvsimd (machine_mode mode
)
1394 if (TARGET_GENERAL_REGS_ONLY
)
1395 if (FLOAT_MODE_P (mode
))
1396 error ("%qs is incompatible with the use of floating-point types",
1397 "-mgeneral-regs-only");
1399 error ("%qs is incompatible with the use of vector types",
1400 "-mgeneral-regs-only");
1402 if (FLOAT_MODE_P (mode
))
1403 error ("%qs feature modifier is incompatible with the use of"
1404 " floating-point types", "+nofp");
1406 error ("%qs feature modifier is incompatible with the use of"
1407 " vector types", "+nofp");
1410 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1411 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1412 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1413 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1414 and GENERAL_REGS is lower than the memory cost (in this case the best class
1415 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1416 cost results in bad allocations with many redundant int<->FP moves which
1417 are expensive on various cores.
1418 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1419 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1420 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1421 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1422 The result of this is that it is no longer inefficient to have a higher
1423 memory move cost than the register move cost.
1427 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
1428 reg_class_t best_class
)
1432 if (!reg_class_subset_p (GENERAL_REGS
, allocno_class
)
1433 || !reg_class_subset_p (FP_REGS
, allocno_class
))
1434 return allocno_class
;
1436 if (!reg_class_subset_p (GENERAL_REGS
, best_class
)
1437 || !reg_class_subset_p (FP_REGS
, best_class
))
1440 mode
= PSEUDO_REGNO_MODE (regno
);
1441 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
/* Return the min_div_recip_mul_sf field of the current tuning
   (aarch64_tune_params) when the units of MODE are 4 bytes wide, and the
   min_div_recip_mul_df field for every other mode.
   NOTE(review): presumably the TARGET_MIN_DIVISIONS_FOR_RECIP_MUL hook;
   the hook registration is not visible in this excerpt -- confirm.  */
1445 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1447 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1448 return aarch64_tune_params
.min_div_recip_mul_sf
;
1449 return aarch64_tune_params
.min_div_recip_mul_df
;
1452 /* Return the reassociation width of treeop OPC with mode MODE. */
1454 aarch64_reassociation_width (unsigned opc
, machine_mode mode
)
1456 if (VECTOR_MODE_P (mode
))
1457 return aarch64_tune_params
.vec_reassoc_width
;
1458 if (INTEGRAL_MODE_P (mode
))
1459 return aarch64_tune_params
.int_reassoc_width
;
1460 /* Avoid reassociating floating point addition so we emit more FMAs. */
1461 if (FLOAT_MODE_P (mode
) && opc
!= PLUS_EXPR
)
1462 return aarch64_tune_params
.fp_reassoc_width
;
1466 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1468 aarch64_dbx_register_number (unsigned regno
)
1470 if (GP_REGNUM_P (regno
))
1471 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1472 else if (regno
== SP_REGNUM
)
1473 return AARCH64_DWARF_SP
;
1474 else if (FP_REGNUM_P (regno
))
1475 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1476 else if (PR_REGNUM_P (regno
))
1477 return AARCH64_DWARF_P0
+ regno
- P0_REGNUM
;
1478 else if (regno
== VG_REGNUM
)
1479 return AARCH64_DWARF_VG
;
1481 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1482 equivalent DWARF register. */
1483 return DWARF_FRAME_REGISTERS
;
1486 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1487 integer, otherwise return X unmodified. */
1489 aarch64_bit_representation (rtx x
)
1491 if (CONST_DOUBLE_P (x
))
1492 x
= gen_lowpart (int_mode_for_mode (GET_MODE (x
)).require (), x
);
1496 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1498 aarch64_advsimd_struct_mode_p (machine_mode mode
)
1501 && (mode
== OImode
|| mode
== CImode
|| mode
== XImode
));
1504 /* Return true if MODE is an SVE predicate mode. */
1506 aarch64_sve_pred_mode_p (machine_mode mode
)
1509 && (mode
== VNx16BImode
1510 || mode
== VNx8BImode
1511 || mode
== VNx4BImode
1512 || mode
== VNx2BImode
));
1515 /* Three mutually-exclusive flags describing a vector or predicate type. */
1516 const unsigned int VEC_ADVSIMD
= 1;
1517 const unsigned int VEC_SVE_DATA
= 2;
1518 const unsigned int VEC_SVE_PRED
= 4;
1519 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1520 a structure of 2, 3 or 4 vectors. */
1521 const unsigned int VEC_STRUCT
= 8;
1522 /* Useful combinations of the above. */
1523 const unsigned int VEC_ANY_SVE
= VEC_SVE_DATA
| VEC_SVE_PRED
;
1524 const unsigned int VEC_ANY_DATA
= VEC_ADVSIMD
| VEC_SVE_DATA
;
1526 /* Return a set of flags describing the vector properties of mode MODE.
1527 Ignore modes that are not supported by the current target. */
1529 aarch64_classify_vector_mode (machine_mode mode
)
1531 if (aarch64_advsimd_struct_mode_p (mode
))
1532 return VEC_ADVSIMD
| VEC_STRUCT
;
1534 if (aarch64_sve_pred_mode_p (mode
))
1535 return VEC_SVE_PRED
;
1537 /* Make the decision based on the mode's enum value rather than its
1538 properties, so that we keep the correct classification regardless
1539 of -msve-vector-bits. */
1542 /* Single SVE vectors. */
1550 return TARGET_SVE
? VEC_SVE_DATA
: 0;
1552 /* x2 SVE vectors. */
1560 /* x3 SVE vectors. */
1568 /* x4 SVE vectors. */
1576 return TARGET_SVE
? VEC_SVE_DATA
| VEC_STRUCT
: 0;
1578 /* 64-bit Advanced SIMD vectors. */
1582 /* ...E_V1DImode doesn't exist. */
1586 /* 128-bit Advanced SIMD vectors. */
1594 return TARGET_SIMD
? VEC_ADVSIMD
: 0;
1601 /* Return true if MODE is any of the data vector modes, including structure modes. */
1604 aarch64_vector_data_mode_p (machine_mode mode
)
1606 return aarch64_classify_vector_mode (mode
) & VEC_ANY_DATA
;
1609 /* Return true if MODE is any form of SVE mode, including predicates,
1610 vectors and structures. */
1612 aarch64_sve_mode_p (machine_mode mode
)
1614 return aarch64_classify_vector_mode (mode
) & VEC_ANY_SVE
;
1617 /* Return true if MODE is an SVE data vector mode; either a single vector
1618 or a structure of vectors. */
1620 aarch64_sve_data_mode_p (machine_mode mode
)
1622 return aarch64_classify_vector_mode (mode
) & VEC_SVE_DATA
;
1625 /* Implement target hook TARGET_ARRAY_MODE. */
1626 static opt_machine_mode
1627 aarch64_array_mode (machine_mode mode
, unsigned HOST_WIDE_INT nelems
)
1629 if (aarch64_classify_vector_mode (mode
) == VEC_SVE_DATA
1630 && IN_RANGE (nelems
, 2, 4))
1631 return mode_for_vector (GET_MODE_INNER (mode
),
1632 GET_MODE_NUNITS (mode
) * nelems
);
1634 return opt_machine_mode ();
1637 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1639 aarch64_array_mode_supported_p (machine_mode mode
,
1640 unsigned HOST_WIDE_INT nelems
)
1643 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1644 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1645 && (nelems
>= 2 && nelems
<= 4))
1651 /* Return the SVE predicate mode to use for elements that have
1652 ELEM_NBYTES bytes, if such a mode exists. */
1655 aarch64_sve_pred_mode (unsigned int elem_nbytes
)
1659 if (elem_nbytes
== 1)
1661 if (elem_nbytes
== 2)
1663 if (elem_nbytes
== 4)
1665 if (elem_nbytes
== 8)
1668 return opt_machine_mode ();
1671 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1673 static opt_machine_mode
1674 aarch64_get_mask_mode (poly_uint64 nunits
, poly_uint64 nbytes
)
1676 if (TARGET_SVE
&& known_eq (nbytes
, BYTES_PER_SVE_VECTOR
))
1678 unsigned int elem_nbytes
= vector_element_size (nbytes
, nunits
);
1679 machine_mode pred_mode
;
1680 if (aarch64_sve_pred_mode (elem_nbytes
).exists (&pred_mode
))
1684 return default_get_mask_mode (nunits
, nbytes
);
1687 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1689 static opt_machine_mode
1690 aarch64_sve_data_mode (scalar_mode inner_mode
, poly_uint64 nunits
)
1692 enum mode_class mclass
= (is_a
<scalar_float_mode
> (inner_mode
)
1693 ? MODE_VECTOR_FLOAT
: MODE_VECTOR_INT
);
1695 FOR_EACH_MODE_IN_CLASS (mode
, mclass
)
1696 if (inner_mode
== GET_MODE_INNER (mode
)
1697 && known_eq (nunits
, GET_MODE_NUNITS (mode
))
1698 && aarch64_sve_data_mode_p (mode
))
1700 return opt_machine_mode ();
1703 /* Return the integer element mode associated with SVE mode MODE. */
1705 static scalar_int_mode
1706 aarch64_sve_element_int_mode (machine_mode mode
)
1708 unsigned int elt_bits
= vector_element_size (BITS_PER_SVE_VECTOR
,
1709 GET_MODE_NUNITS (mode
));
1710 return int_mode_for_size (elt_bits
, 0).require ();
1713 /* Return the integer vector mode associated with SVE mode MODE.
1714 Unlike mode_for_int_vector, this can handle the case in which
1715 MODE is a predicate (and thus has a different total size). */
1718 aarch64_sve_int_mode (machine_mode mode
)
1720 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
1721 return aarch64_sve_data_mode (int_mode
, GET_MODE_NUNITS (mode
)).require ();
1724 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1725 prefer to use the first arithmetic operand as the else value if
1726 the else value doesn't matter, since that exactly matches the SVE
1727 destructive merging form. For ternary operations we could either
1728 pick the first operand and use FMAD-like instructions or the last
1729 operand and use FMLA-like instructions; the latter seems more natural. */
1733 aarch64_preferred_else_value (unsigned, tree
, unsigned int nops
, tree
*ops
)
1735 return nops
== 3 ? ops
[2] : ops
[0];
1738 /* Implement TARGET_HARD_REGNO_NREGS. */
1741 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1743 /* ??? Logically we should only need to provide a value when
1744 HARD_REGNO_MODE_OK says that the combination is valid,
1745 but at the moment we need to handle all modes. Just ignore
1746 any runtime parts for registers that can't store them. */
1747 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
1748 switch (aarch64_regno_regclass (regno
))
1753 if (aarch64_sve_data_mode_p (mode
))
1754 return exact_div (GET_MODE_SIZE (mode
),
1755 BYTES_PER_SVE_VECTOR
).to_constant ();
1756 return CEIL (lowest_size
, UNITS_PER_VREG
);
1762 return CEIL (lowest_size
, UNITS_PER_WORD
);
1767 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1770 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
1772 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1773 return regno
== CC_REGNUM
;
1775 if (regno
== VG_REGNUM
)
1776 /* This must have the same size as _Unwind_Word. */
1777 return mode
== DImode
;
1779 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1780 if (vec_flags
& VEC_SVE_PRED
)
1781 return PR_REGNUM_P (regno
);
1783 if (PR_REGNUM_P (regno
))
1786 if (regno
== SP_REGNUM
)
1787 /* The purpose of comparing with ptr_mode is to support the
1788 global register variable associated with the stack pointer
1789 register via the syntax of asm ("wsp") in ILP32. */
1790 return mode
== Pmode
|| mode
== ptr_mode
;
1792 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
1793 return mode
== Pmode
;
1795 if (GP_REGNUM_P (regno
))
1797 if (known_le (GET_MODE_SIZE (mode
), 8))
1799 else if (known_le (GET_MODE_SIZE (mode
), 16))
1800 return (regno
& 1) == 0;
1802 else if (FP_REGNUM_P (regno
))
1804 if (vec_flags
& VEC_STRUCT
)
1805 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
1807 return !VECTOR_MODE_P (mode
) || vec_flags
!= 0;
1813 /* Return true if this is a definition of a vectorized simd function. */
1816 aarch64_simd_decl_p (tree fndecl
)
1822 fntype
= TREE_TYPE (fndecl
);
1826 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1827 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype
)) != NULL
)
1833 /* Return the mode a register save/restore should use. DImode for integer
1834 registers, DFmode for FP registers in non-SIMD functions (they only save
1835 the bottom half of a 128 bit register), or TFmode for FP registers in SIMD functions. */
1839 aarch64_reg_save_mode (tree fndecl
, unsigned regno
)
1841 return GP_REGNUM_P (regno
)
1843 : (aarch64_simd_decl_p (fndecl
) ? E_TFmode
: E_DFmode
);
1846 /* Return true if the instruction is a call to a SIMD function, false
1847 if it is not a SIMD function or if we do not know anything about the function. */
1851 aarch64_simd_call_p (rtx_insn
*insn
)
1857 gcc_assert (CALL_P (insn
));
1858 call
= get_call_rtx_from (insn
);
1859 symbol
= XEXP (XEXP (call
, 0), 0);
1860 if (GET_CODE (symbol
) != SYMBOL_REF
)
1862 fndecl
= SYMBOL_REF_DECL (symbol
);
1866 return aarch64_simd_decl_p (fndecl
);
1869 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1870 a function that uses the SIMD ABI, take advantage of the extra
1871 call-preserved registers that the ABI provides. */
1874 aarch64_remove_extra_call_preserved_regs (rtx_insn
*insn
,
1875 HARD_REG_SET
*return_set
)
1877 if (aarch64_simd_call_p (insn
))
1879 for (int regno
= 0; regno
< FIRST_PSEUDO_REGISTER
; regno
++)
1880 if (FP_SIMD_SAVED_REGNUM_P (regno
))
1881 CLEAR_HARD_REG_BIT (*return_set
, regno
);
1885 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1886 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1887 clobbers the top 64 bits when restoring the bottom 64 bits. */
1890 aarch64_hard_regno_call_part_clobbered (rtx_insn
*insn
, unsigned int regno
,
1893 bool simd_p
= insn
&& CALL_P (insn
) && aarch64_simd_call_p (insn
);
1894 return FP_REGNUM_P (regno
)
1895 && maybe_gt (GET_MODE_SIZE (mode
), simd_p
? 16 : 8);
1898 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1901 aarch64_return_call_with_max_clobbers (rtx_insn
*call_1
, rtx_insn
*call_2
)
1903 gcc_assert (CALL_P (call_1
) && CALL_P (call_2
));
1905 if (!aarch64_simd_call_p (call_1
) || aarch64_simd_call_p (call_2
))
1911 /* Implement REGMODE_NATURAL_SIZE. */
1913 aarch64_regmode_natural_size (machine_mode mode
)
1915 /* The natural size for SVE data modes is one SVE data vector,
1916 and similarly for predicates. We can't independently modify
1917 anything smaller than that. */
1918 /* ??? For now, only do this for variable-width SVE registers.
1919 Doing it for constant-sized registers breaks lower-subreg.c. */
1920 /* ??? And once that's fixed, we should probably have similar
1921 code for Advanced SIMD. */
1922 if (!aarch64_sve_vg
.is_constant ())
1924 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1925 if (vec_flags
& VEC_SVE_PRED
)
1926 return BYTES_PER_SVE_PRED
;
1927 if (vec_flags
& VEC_SVE_DATA
)
1928 return BYTES_PER_SVE_VECTOR
;
1930 return UNITS_PER_WORD
;
1933 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1935 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned,
1938 /* The predicate mode determines which bits are significant and
1939 which are "don't care". Decreasing the number of lanes would
1940 lose data while increasing the number of lanes would make bits
1941 unnecessarily significant. */
1942 if (PR_REGNUM_P (regno
))
1944 if (known_ge (GET_MODE_SIZE (mode
), 4))
1950 /* Return true if I's bits are consecutive ones from the MSB. */
1952 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i
)
1954 return exact_log2 (-i
) != HOST_WIDE_INT_M1
;
1957 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1958 that strcpy from constants will be faster. */
1960 static HOST_WIDE_INT
1961 aarch64_constant_alignment (const_tree exp
, HOST_WIDE_INT align
)
1963 if (TREE_CODE (exp
) == STRING_CST
&& !optimize_size
)
1964 return MAX (align
, BITS_PER_WORD
);
1968 /* Return true if calls to DECL should be treated as
1969 long-calls (ie called via a register). */
1971 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
1976 /* Return true if calls to symbol-ref SYM should be treated as
1977 long-calls (ie called via a register). */
1979 aarch64_is_long_call_p (rtx sym
)
1981 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
1984 /* Return true if calls to symbol-ref SYM should not go through PLT stubs. */
1988 aarch64_is_noplt_call_p (rtx sym
)
1990 const_tree decl
= SYMBOL_REF_DECL (sym
);
1995 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
1996 && !targetm
.binds_local_p (decl
))
2002 /* Return true if the offsets to a zero/sign-extract operation
2003 represent an expression that matches an extend operation. The
2004 operands represent the parameters from
2006 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2008 aarch64_is_extend_from_extract (scalar_int_mode mode
, rtx mult_imm
,
2011 HOST_WIDE_INT mult_val
, extract_val
;
2013 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
2016 mult_val
= INTVAL (mult_imm
);
2017 extract_val
= INTVAL (extract_imm
);
2020 && extract_val
< GET_MODE_BITSIZE (mode
)
2021 && exact_log2 (extract_val
& ~7) > 0
2022 && (extract_val
& 7) <= 4
2023 && mult_val
== (1 << (extract_val
& 7)))
2029 /* Emit an insn that's a simple single-set. Both the operands must be
2030 known to be valid. */
2031 inline static rtx_insn
*
2032 emit_set_insn (rtx x
, rtx y
)
2034 return emit_insn (gen_rtx_SET (x
, y
));
2037 /* X and Y are two things to compare using CODE. Emit the compare insn and
2038 return the rtx for register 0 in the proper mode. */
2040 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
2042 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
2043 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
2045 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
2049 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2052 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code
, rtx x
, rtx y
,
2053 machine_mode y_mode
)
2055 if (y_mode
== E_QImode
|| y_mode
== E_HImode
)
2057 if (CONST_INT_P (y
))
2058 y
= GEN_INT (INTVAL (y
) & GET_MODE_MASK (y_mode
));
2062 machine_mode cc_mode
;
2064 t
= gen_rtx_ZERO_EXTEND (SImode
, y
);
2065 t
= gen_rtx_COMPARE (CC_SWPmode
, t
, x
);
2066 cc_mode
= CC_SWPmode
;
2067 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2068 emit_set_insn (cc_reg
, t
);
2073 return aarch64_gen_compare_reg (code
, x
, y
);
2076 /* Build the SYMBOL_REF for __tls_get_addr. */
2078 static GTY(()) rtx tls_get_addr_libfunc
;
2081 aarch64_tls_get_addr (void)
2083 if (!tls_get_addr_libfunc
)
2084 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
2085 return tls_get_addr_libfunc
;
2088 /* Return the TLS model to use for ADDR. */
2090 static enum tls_model
2091 tls_symbolic_operand_type (rtx addr
)
2093 enum tls_model tls_kind
= TLS_MODEL_NONE
;
2094 if (GET_CODE (addr
) == CONST
)
2097 rtx sym
= strip_offset (addr
, &addend
);
2098 if (GET_CODE (sym
) == SYMBOL_REF
)
2099 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
2101 else if (GET_CODE (addr
) == SYMBOL_REF
)
2102 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
2107 /* We'll allow lo_sum's in addresses in our legitimate addresses
2108 so that combine would take care of combining addresses where
2109 necessary, but for generation purposes, we'll generate the address
2112 tmp = hi (symbol_ref); adrp x1, foo
2113 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2117 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2118 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2122 Load TLS symbol, depending on TLS mechanism and TLS access model.
2124 Global Dynamic - Traditional TLS:
2125 adrp tmp, :tlsgd:imm
2126 add dest, tmp, #:tlsgd_lo12:imm
2129 Global Dynamic - TLS Descriptors:
2130 adrp dest, :tlsdesc:imm
2131 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2132 add dest, dest, #:tlsdesc_lo12:imm
2139 adrp tmp, :gottprel:imm
2140 ldr dest, [tmp, #:gottprel_lo12:imm]
2145 add t0, tp, #:tprel_hi12:imm, lsl #12
2146 add t0, t0, #:tprel_lo12_nc:imm
2150 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
2151 enum aarch64_symbol_type type
)
2155 case SYMBOL_SMALL_ABSOLUTE
:
2157 /* In ILP32, the mode of dest can be either SImode or DImode. */
2159 machine_mode mode
= GET_MODE (dest
);
2161 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2163 if (can_create_pseudo_p ())
2164 tmp_reg
= gen_reg_rtx (mode
);
2166 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2167 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
2171 case SYMBOL_TINY_ABSOLUTE
:
2172 emit_insn (gen_rtx_SET (dest
, imm
));
2175 case SYMBOL_SMALL_GOT_28K
:
2177 machine_mode mode
= GET_MODE (dest
);
2178 rtx gp_rtx
= pic_offset_table_rtx
;
2182 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2183 here before rtl expand. Tree IVOPT will generate rtl pattern to
2184 decide rtx costs, in which case pic_offset_table_rtx is not
2185 initialized. For that case no need to generate the first adrp
2186 instruction as the final cost for global variable access is
2190 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
2191 using the page base as GOT base, the first page may be wasted,
2192 in the worst scenario, there is only 28K space for GOT).
2194 The generated instruction sequence for accessing a global variable
2197 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2199 Only one instruction needed. But we must initialize
2200 pic_offset_table_rtx properly. We generate initialize insn for
2201 every global access, and allow CSE to remove all redundant.
2203 The final instruction sequences will look like the following
2204 for multiple global variable accesses.
2206 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2208 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2209 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2210 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2213 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
2214 crtl
->uses_pic_offset_table
= 1;
2215 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
2217 if (mode
!= GET_MODE (gp_rtx
))
2218 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
2222 if (mode
== ptr_mode
)
2225 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
2227 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
2229 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2233 gcc_assert (mode
== Pmode
);
2235 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
2236 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2239 /* The operand is expected to be MEM. Whenever the related insn
2240 pattern changed, above code which calculate mem should be
2242 gcc_assert (GET_CODE (mem
) == MEM
);
2243 MEM_READONLY_P (mem
) = 1;
2244 MEM_NOTRAP_P (mem
) = 1;
2249 case SYMBOL_SMALL_GOT_4G
:
2251 /* In ILP32, the mode of dest can be either SImode or DImode,
2252 while the got entry is always of SImode size. The mode of
2253 dest depends on how dest is used: if dest is assigned to a
2254 pointer (e.g. in the memory), it has SImode; it may have
2255 DImode if dest is dereferenced to access the memory.
2256 This is why we have to handle three different ldr_got_small
2257 patterns here (two patterns for ILP32). */
2262 machine_mode mode
= GET_MODE (dest
);
2264 if (can_create_pseudo_p ())
2265 tmp_reg
= gen_reg_rtx (mode
);
2267 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2268 if (mode
== ptr_mode
)
2271 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
2273 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
2275 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2279 gcc_assert (mode
== Pmode
);
2281 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
2282 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2285 gcc_assert (GET_CODE (mem
) == MEM
);
2286 MEM_READONLY_P (mem
) = 1;
2287 MEM_NOTRAP_P (mem
) = 1;
2292 case SYMBOL_SMALL_TLSGD
:
2295 machine_mode mode
= GET_MODE (dest
);
2296 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
2300 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
2302 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
2303 insns
= get_insns ();
2306 RTL_CONST_CALL_P (insns
) = 1;
2307 emit_libcall_block (insns
, dest
, result
, imm
);
2311 case SYMBOL_SMALL_TLSDESC
:
2313 machine_mode mode
= GET_MODE (dest
);
2314 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
2317 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2319 /* In ILP32, the got entry is always of SImode size. Unlike
2320 small GOT, the dest is fixed at reg 0. */
2322 emit_insn (gen_tlsdesc_small_si (imm
));
2324 emit_insn (gen_tlsdesc_small_di (imm
));
2325 tp
= aarch64_load_tp (NULL
);
2328 tp
= gen_lowpart (mode
, tp
);
2330 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
2332 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2336 case SYMBOL_SMALL_TLSIE
:
2338 /* In ILP32, the mode of dest can be either SImode or DImode,
2339 while the got entry is always of SImode size. The mode of
2340 dest depends on how dest is used: if dest is assigned to a
2341 pointer (e.g. in the memory), it has SImode; it may have
2342 DImode if dest is dereferenced to access the memory.
2343 This is why we have to handle three different tlsie_small
2344 patterns here (two patterns for ILP32). */
2345 machine_mode mode
= GET_MODE (dest
);
2346 rtx tmp_reg
= gen_reg_rtx (mode
);
2347 rtx tp
= aarch64_load_tp (NULL
);
2349 if (mode
== ptr_mode
)
2352 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
2355 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
2356 tp
= gen_lowpart (mode
, tp
);
2361 gcc_assert (mode
== Pmode
);
2362 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
2365 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
2367 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2371 case SYMBOL_TLSLE12
:
2372 case SYMBOL_TLSLE24
:
2373 case SYMBOL_TLSLE32
:
2374 case SYMBOL_TLSLE48
:
2376 machine_mode mode
= GET_MODE (dest
);
2377 rtx tp
= aarch64_load_tp (NULL
);
2380 tp
= gen_lowpart (mode
, tp
);
2384 case SYMBOL_TLSLE12
:
2385 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
2388 case SYMBOL_TLSLE24
:
2389 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
2392 case SYMBOL_TLSLE32
:
2393 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
2395 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2398 case SYMBOL_TLSLE48
:
2399 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
2401 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2409 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2413 case SYMBOL_TINY_GOT
:
2414 emit_insn (gen_ldr_got_tiny (dest
, imm
));
2417 case SYMBOL_TINY_TLSIE
:
2419 machine_mode mode
= GET_MODE (dest
);
2420 rtx tp
= aarch64_load_tp (NULL
);
2422 if (mode
== ptr_mode
)
2425 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
2428 tp
= gen_lowpart (mode
, tp
);
2429 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
2434 gcc_assert (mode
== Pmode
);
2435 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
2439 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2448 /* Emit a move from SRC to DEST. Assume that the move expanders can
2449 handle all moves if !can_create_pseudo_p (). The distinction is
2450 important because, unlike emit_move_insn, the move expanders know
2451 how to force Pmode objects into the constant pool even when the
2452 constant pool address is not itself legitimate. */
/* The result is whatever the selected emitter returns: emit_move_insn
   while new pseudos can still be created, emit_move_insn_1 otherwise. */
2454 aarch64_emit_move (rtx dest
, rtx src
)
2456 return (can_create_pseudo_p ()
2457 ? emit_move_insn (dest
, src
)
2458 : emit_move_insn_1 (dest
, src
));
2461 /* Apply UNOPTAB to OP and store the result in DEST. */
2464 aarch64_emit_unop (rtx dest
, optab unoptab
, rtx op
)
/* The operation is expanded in DEST's own mode and DEST is also handed
   to expand_unop; TMP is the rtx that expand_unop gives back. */
2466 rtx tmp
= expand_unop (GET_MODE (dest
), unoptab
, op
, dest
, 0);
/* Copy TMP into DEST. NOTE(review): any condition guarding this copy is
   not visible in this extract. */
2468 emit_move_insn (dest
, tmp
);
2471 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2474 aarch64_emit_binop (rtx dest
, optab binoptab
, rtx op0
, rtx op1
)
/* The operation is expanded in DEST's own mode and DEST is also handed
   to expand_binop; TMP is the rtx that expand_binop gives back.
   NOTE(review): the final argument of the call is not visible in this
   extract. */
2476 rtx tmp
= expand_binop (GET_MODE (dest
), binoptab
, op0
, op1
, dest
, 0,
/* Copy TMP into DEST. NOTE(review): any condition guarding this copy is
   not visible in this extract. */
2479 emit_move_insn (dest
, tmp
);
2482 /* Split a 128-bit move operation into two 64-bit move operations,
2483 taking care to handle partial overlap of register to register
2484 copies. Special cases are needed when moving between GP regs and
2485 FP regs. SRC can be a register, constant or memory; DST a register
2486 or memory. If either operand is memory it must not have any side effects. */
/* Preconditions checked by the assertions below: DST has mode TImode or
   TFmode, neither operand has side effects, and SRC either has the same
   mode as DST or has VOIDmode. */
2489 aarch64_split_128bit_move (rtx dst
, rtx src
)
2494 machine_mode mode
= GET_MODE (dst
);
2496 gcc_assert (mode
== TImode
|| mode
== TFmode
);
2497 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
2498 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
2500 if (REG_P (dst
) && REG_P (src
))
2502 int src_regno
= REGNO (src
);
2503 int dst_regno
= REGNO (dst
);
2505 /* Handle FP <-> GP regs. */
/* GP source, FP destination: write the low and then the high word of
   SRC into DST with the dedicated movlow/movhigh patterns. */
2506 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
2508 src_lo
= gen_lowpart (word_mode
, src
);
2509 src_hi
= gen_highpart (word_mode
, src
);
2511 emit_insn (gen_aarch64_movlow_di (mode
, dst
, src_lo
));
2512 emit_insn (gen_aarch64_movhigh_di (mode
, dst
, src_hi
));
/* FP source, GP destination: read each half of SRC into the matching
   word of DST. */
2515 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
2517 dst_lo
= gen_lowpart (word_mode
, dst
);
2518 dst_hi
= gen_highpart (word_mode
, dst
);
2520 emit_insn (gen_aarch64_movdi_low (mode
, dst_lo
, src
));
2521 emit_insn (gen_aarch64_movdi_high (mode
, dst_hi
, src
));
/* General case: split both operands into word_mode halves and move
   them one at a time. */
2526 dst_lo
= gen_lowpart (word_mode
, dst
);
2527 dst_hi
= gen_highpart (word_mode
, dst
);
2528 src_lo
= gen_lowpart (word_mode
, src
);
2529 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
2531 /* At most one pairing may overlap. */
/* When DST's low half overlaps SRC's high half, the high halves are
   moved first so that SRC_HI is read before DST_LO is written; the
   other order is used otherwise. */
2532 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
2534 aarch64_emit_move (dst_hi
, src_hi
);
2535 aarch64_emit_move (dst_lo
, src_lo
);
2539 aarch64_emit_move (dst_lo
, src_lo
);
2540 aarch64_emit_move (dst_hi
, src_hi
);
/* Return false only when SRC is a register and both DST and SRC are FP
   registers; return true in every other case. */
2545 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
2547 return (! REG_P (src
)
2548 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
2551 /* Split a complex SIMD combine. */
2554 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
2556 machine_mode src_mode
= GET_MODE (src1
);
2557 machine_mode dst_mode
= GET_MODE (dst
);
/* DST must be a register operand with a vector mode, and SRC1 and SRC2
   must both be register operands of SRC1's mode. */
2559 gcc_assert (VECTOR_MODE_P (dst_mode
));
2560 gcc_assert (register_operand (dst
, dst_mode
)
2561 && register_operand (src1
, src_mode
)
2562 && register_operand (src2
, src_mode
));
/* The combine pattern is selected by the source mode, not by DST's
   mode. */
2564 emit_insn (gen_aarch64_simd_combine (src_mode
, dst
, src1
, src2
));
2568 /* Split a complex SIMD move. */
2571 aarch64_split_simd_move (rtx dst
, rtx src
)
2573 machine_mode src_mode
= GET_MODE (src
);
2574 machine_mode dst_mode
= GET_MODE (dst
);
2576 gcc_assert (VECTOR_MODE_P (dst_mode
));
/* Only the register-to-register case emits anything in the lines shown;
   it additionally requires SRC to have a vector mode, and the split
   pattern is selected by SRC's mode. */
2578 if (REG_P (dst
) && REG_P (src
))
2580 gcc_assert (VECTOR_MODE_P (src_mode
));
2581 emit_insn (gen_aarch64_split_simd_mov (src_mode
, dst
, src
));
/* Return true if X is rtx_equal_p to the result of zero-extending Y
   from YMODE to XMODE. The extension is folded with
   simplify_const_unary_operation, and the function asserts that the
   folding produced a result. */
2586 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
2587 machine_mode ymode
, rtx y
)
2589 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
2590 gcc_assert (r
!= NULL
);
2591 return rtx_equal_p (x
, r
);
2595 /* Return TARGET if it is nonnull and a register of mode MODE.
2596 Otherwise, return a fresh register of mode MODE if we can,
2597 or TARGET reinterpreted as MODE if we can't. */
2600 aarch64_target_reg (rtx target
, machine_mode mode
)
/* TARGET already qualifies: a register whose mode is exactly MODE. */
2602 if (target
&& REG_P (target
) && GET_MODE (target
) == mode
)
/* No new pseudo can be created, so a TARGET must have been supplied;
   it is viewed in MODE through gen_lowpart. */
2604 if (!can_create_pseudo_p ())
2606 gcc_assert (target
);
2607 return gen_lowpart (mode
, target
);
2609 return gen_reg_rtx (mode
);
2612 /* Return a register that contains the constant in BUILDER, given that
2613 the constant is a legitimate move operand. Use TARGET as the register
2614 if it is nonnull and convenient. */
2617 aarch64_emit_set_immediate (rtx target
, rtx_vector_builder
&builder
)
/* Build the constant, choose a register of the constant's mode via
   aarch64_target_reg, and emit a plain SET of that register. */
2619 rtx src
= builder
.build ();
2620 target
= aarch64_target_reg (target
, GET_MODE (src
));
2621 emit_insn (gen_rtx_SET (target
, src
));
/* Get VALUE into a register. While new pseudos can be created this is
   force_reg (MODE, VALUE); the other visible path moves VALUE into X
   with aarch64_emit_move. NOTE(review): what that second path returns
   is not visible in this extract (presumably X). */
2626 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
2628 if (can_create_pseudo_p ())
2629 return force_reg (mode
, value
);
2633 aarch64_emit_move (x
, value
);
2638 /* Return true if predicate value X is a constant in which every element
2639 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2640 value, i.e. as a predicate in which all bits are significant. */
2643 aarch64_get_sve_pred_bits (rtx_vector_builder
&builder
, rtx x
)
2645 if (GET_CODE (x
) != CONST_VECTOR
)
/* FACTOR relates the number of units of VNx16BImode to the number of
   units of X's mode. The VNx16BI encoding uses FACTOR times as many
   patterns as X, with the same number of elements per pattern. */
2648 unsigned int factor
= vector_element_size (GET_MODE_NUNITS (VNx16BImode
),
2649 GET_MODE_NUNITS (GET_MODE (x
)));
2650 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (x
) * factor
;
2651 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (x
);
2652 builder
.new_vector (VNx16BImode
, npatterns
, nelts_per_pattern
);
2654 unsigned int nelts
= const_vector_encoded_nelts (x
);
2655 for (unsigned int i
= 0; i
< nelts
; ++i
)
2657 rtx elt
= CONST_VECTOR_ENCODED_ELT (x
, i
);
2658 if (!CONST_INT_P (elt
))
/* Each encoded element of X becomes FACTOR elements of the result:
   the element itself followed by FACTOR - 1 zeros. */
2661 builder
.quick_push (elt
);
2662 for (unsigned int j
= 1; j
< factor
; ++j
)
2663 builder
.quick_push (const0_rtx
);
2665 builder
.finalize ();
2669 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2670 widest predicate element size it can have (that is, the largest size
2671 for which each element would still be 0 or 1). */
2674 aarch64_widest_sve_pred_elt_size (rtx_vector_builder
&builder
)
2676 /* Start with the most optimistic assumption: that we only need
2677 one bit per pattern. This is what we will use if only the first
2678 bit in each pattern is ever set. */
/* MASK accumulates bit positions by OR; it is seeded with the byte size
   of DImode and with the number of patterns. */
2679 unsigned int mask
= GET_MODE_SIZE (DImode
);
2680 mask
|= builder
.npatterns ();
2682 /* Look for set bits. */
/* The scan starts at index 1, so element 0 is never examined.
   NOTE(review): the statements executed for a nonzero element are not
   visible in this extract. */
2683 unsigned int nelts
= builder
.encoded_nelts ();
2684 for (unsigned int i
= 1; i
< nelts
; ++i
)
2685 if (INTVAL (builder
.elt (i
)) != 0)
/* MASK & -MASK isolates the lowest set bit of MASK. */
2691 return mask
& -mask
;
2694 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2695 that the constant would have with predicate element size ELT_SIZE
2696 (ignoring the upper bits in each element) and return:
2698 * -1 if all bits are set
2699 * N if the predicate has N leading set bits followed by all clear bits
2700 * 0 if the predicate does not have any of these forms. */
2703 aarch64_partial_ptrue_length (rtx_vector_builder
&builder
,
2704 unsigned int elt_size
)
2706 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2707 followed by set bits. */
2708 if (builder
.nelts_per_pattern () == 3)
2711 /* Skip over leading set bits. */
/* Both scans below step by ELT_SIZE, so only the first bit of each
   ELT_SIZE-wide element is examined. */
2712 unsigned int nelts
= builder
.encoded_nelts ();
2714 for (; i
< nelts
; i
+= elt_size
)
2715 if (INTVAL (builder
.elt (i
)) == 0)
/* VL is the scan position expressed in whole elements of ELT_SIZE. */
2717 unsigned int vl
= i
/ elt_size
;
2719 /* Check for the all-true case. */
2723 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2724 repeating pattern of set bits followed by clear bits. */
2725 if (builder
.nelts_per_pattern () != 2)
2728 /* We have a "foreground" value and a duplicated "background" value.
2729 If the background might repeat and the last set bit belongs to it,
2730 we might have set bits followed by clear bits followed by set bits. */
2731 if (i
> builder
.npatterns () && maybe_ne (nelts
, builder
.full_nelts ()))
2734 /* Make sure that the rest are all clear. */
2735 for (; i
< nelts
; i
+= elt_size
)
2736 if (INTVAL (builder
.elt (i
)) != 0)
2742 /* See if there is an svpattern that encodes an SVE predicate of mode
2743 PRED_MODE in which the first VL bits are set and the rest are clear.
2744 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2745 A VL of -1 indicates an all-true vector. */
2748 aarch64_svpattern_for_vl (machine_mode pred_mode
, int vl
)
2751 return AARCH64_SV_ALL
;
/* A VL that might exceed the number of units in PRED_MODE has no
   pattern. */
2753 if (maybe_gt (vl
, GET_MODE_NUNITS (pred_mode
)))
2754 return AARCH64_NUM_SVPATTERNS
;
/* Lengths 1 to 8 are computed as an offset from AARCH64_SV_VL1. */
2756 if (vl
>= 1 && vl
<= 8)
2757 return aarch64_svpattern (AARCH64_SV_VL1
+ (vl
- 1));
/* Power-of-two lengths from 16 to 256 are computed as an offset from
   AARCH64_SV_VL16; exact_log2 (16) is 4, hence the "- 4". */
2759 if (vl
>= 16 && vl
<= 256 && pow2p_hwi (vl
))
2760 return aarch64_svpattern (AARCH64_SV_VL16
+ (exact_log2 (vl
) - 4));
/* The remaining patterns are only tried when the number of units in
   PRED_MODE is a compile-time constant, stored in MAX_VL. */
2763 if (GET_MODE_NUNITS (pred_mode
).is_constant (&max_vl
))
2765 if (vl
== (max_vl
/ 3) * 3)
2766 return AARCH64_SV_MUL3
;
2767 /* These would only trigger for non-power-of-2 lengths. */
2768 if (vl
== (max_vl
& -4))
2769 return AARCH64_SV_MUL4
;
2770 if (vl
== (1 << floor_log2 (max_vl
)))
2771 return AARCH64_SV_POW2
;
2773 return AARCH64_SV_ALL
;
2775 return AARCH64_NUM_SVPATTERNS
;
2778 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2779 bits has the lowest bit set and the upper bits clear. This is the
2780 VNx16BImode equivalent of a PTRUE for controlling elements of
2781 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2782 all bits are significant, even the upper zeros. */
2785 aarch64_ptrue_all (unsigned int elt_size
)
/* The builder is given ELT_SIZE patterns with one element each; the
   pushes below supply a single 1 followed by ELT_SIZE - 1 zeros. */
2787 rtx_vector_builder
builder (VNx16BImode
, elt_size
, 1);
2788 builder
.quick_push (const1_rtx
);
2789 for (unsigned int i
= 1; i
< elt_size
; ++i
)
2790 builder
.quick_push (const0_rtx
);
2791 return builder
.build ();
2794 /* Return an all-true predicate register of mode MODE. */
2797 aarch64_ptrue_reg (machine_mode mode
)
/* MODE must be a boolean vector mode. The constant is loaded as
   CONSTM1_RTX in VNx16BImode and the register is then viewed in MODE
   through gen_lowpart. */
2799 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
);
2800 rtx reg
= force_reg (VNx16BImode
, CONSTM1_RTX (VNx16BImode
));
2801 return gen_lowpart (mode
, reg
);
2804 /* Return an all-false predicate register of mode MODE. */
2807 aarch64_pfalse_reg (machine_mode mode
)
/* MODE must be a boolean vector mode. The constant is loaded as
   CONST0_RTX in VNx16BImode and the register is then viewed in MODE
   through gen_lowpart. */
2809 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
);
2810 rtx reg
= force_reg (VNx16BImode
, CONST0_RTX (VNx16BImode
));
2811 return gen_lowpart (mode
, reg
);
2814 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2815 true, or alternatively if we know that the operation predicated by
2816 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
2817 aarch64_sve_gp_strictness operand that describes the operation
2818 predicated by PRED1[0]. */
2821 aarch64_sve_pred_dominates_p (rtx
*pred1
, rtx pred2
)
/* Both predicates must have the same boolean vector mode, and PRED1[1]
   must satisfy aarch64_sve_gp_strictness. */
2823 machine_mode mode
= GET_MODE (pred2
);
2824 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
2825 && mode
== GET_MODE (pred1
[0])
2826 && aarch64_sve_gp_strictness (pred1
[1], SImode
));
/* True if PRED1[0] is the all-ones constant of MODE, if PRED1[1] is
   SVE_RELAXED_GP, or if the two predicates are rtx_equal_p. */
2827 return (pred1
[0] == CONSTM1_RTX (mode
)
2828 || INTVAL (pred1
[1]) == SVE_RELAXED_GP
2829 || rtx_equal_p (pred1
[0], pred2
));
2832 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2833 for it. PRED2[0] is the predicate for the instruction whose result
2834 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2835 for it. Return true if we can prove that the two predicates are
2836 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2837 with PRED1[0] without changing behavior. */
2840 aarch64_sve_same_pred_for_ptest_p (rtx
*pred1
, rtx
*pred2
)
/* Both predicates must share one boolean vector mode and both flag
   operands must satisfy aarch64_sve_ptrue_flag. */
2842 machine_mode mode
= GET_MODE (pred1
[0]);
2843 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
2844 && mode
== GET_MODE (pred2
[0])
2845 && aarch64_sve_ptrue_flag (pred1
[1], SImode
)
2846 && aarch64_sve_ptrue_flag (pred2
[1], SImode
));
/* A predicate counts as all-true when it is the all-ones constant of
   MODE or when its flag is SVE_KNOWN_PTRUE. */
2848 bool ptrue1_p
= (pred1
[0] == CONSTM1_RTX (mode
)
2849 || INTVAL (pred1
[1]) == SVE_KNOWN_PTRUE
);
2850 bool ptrue2_p
= (pred2
[0] == CONSTM1_RTX (mode
)
2851 || INTVAL (pred2
[1]) == SVE_KNOWN_PTRUE
);
/* Equivalent if both count as all-true or if the two are rtx_equal_p. */
2852 return (ptrue1_p
&& ptrue2_p
) || rtx_equal_p (pred1
[0], pred2
[0]);
2855 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
2856 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2857 Use TARGET as the target register if nonnull and convenient. */
2860 aarch64_sve_emit_int_cmp (rtx target
, machine_mode pred_mode
, rtx_code cmp
,
2861 machine_mode data_mode
, rtx op1
, rtx op2
)
/* Operand layout of the selected pattern: 0 is the predicate result,
   1 is an all-ones predicate of PRED_MODE, 2 is the SVE_KNOWN_PTRUE
   flag for it, and 3 and 4 are the two values being compared. */
2863 insn_code icode
= code_for_aarch64_pred_cmp (cmp
, data_mode
);
2864 expand_operand ops
[5];
2865 create_output_operand (&ops
[0], target
, pred_mode
);
2866 create_input_operand (&ops
[1], CONSTM1_RTX (pred_mode
), pred_mode
);
2867 create_integer_operand (&ops
[2], SVE_KNOWN_PTRUE
);
2868 create_input_operand (&ops
[3], op1
, data_mode
);
2869 create_input_operand (&ops
[4], op2
, data_mode
);
2870 expand_insn (icode
, 5, ops
);
/* The returned rtx is whatever expand_insn left in the output operand. */
2871 return ops
[0].value
;
2874 /* Use a comparison to convert integer vector SRC into MODE, which is
2875 the corresponding SVE predicate mode. Use TARGET for the result
2876 if it's nonnull and convenient. */
2879 aarch64_convert_sve_data_to_pred (rtx target
, machine_mode mode
, rtx src
)
/* The comparison used is SRC != 0, with the zero vector taken in SRC's
   own mode. */
2881 machine_mode src_mode
= GET_MODE (src
);
2882 return aarch64_sve_emit_int_cmp (target
, mode
, NE
, src_mode
,
2883 src
, CONST0_RTX (src_mode
));
2886 /* Return true if we can move VALUE into a register using a single
2887 CNT[BHWD] instruction. */
2890 aarch64_sve_cnt_immediate_p (poly_int64 value
)
2892 HOST_WIDE_INT factor
= value
.coeffs
[0];
2893 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
/* Both coefficients must be equal. FACTOR & -FACTOR is the lowest set
   bit of FACTOR, so the last test limits FACTOR to at most 16 times
   that power of two. */
2894 return (value
.coeffs
[1] == factor
2895 && IN_RANGE (factor
, 2, 16 * 16)
2896 && (factor
& 1) == 0
2897 && factor
<= 16 * (factor
& -factor
));
2900 /* Likewise for rtx X. */
/* X must first be accepted by poly_int_rtx_p, which extracts VALUE;
   any other rtx gives false. */
2903 aarch64_sve_cnt_immediate_p (rtx x
)
2906 return poly_int_rtx_p (x
, &value
) && aarch64_sve_cnt_immediate_p (value
);
2909 /* Return the asm string for an instruction with a CNT-like vector size
2910 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2911 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2912 first part of the operands template (the part that comes before the
2913 vector size itself). PATTERN is the pattern to use. FACTOR is the
2914 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
2915 in each quadword. If it is zero, we can use any element size. */
2918 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2919 aarch64_svpattern pattern
,
2920 unsigned int factor
,
2921 unsigned int nelts_per_vq
)
/* The text is formatted into a function-local static buffer sized from
   a sample string. */
2923 static char buffer
[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
2925 if (nelts_per_vq
== 0)
2926 /* There is some overlap in the ranges of the four CNT instructions.
2927 Here we always use the smallest possible element size, so that the
2928 multiplier is 1 wherever possible. */
2929 nelts_per_vq
= factor
& -factor
;
/* SHIFT is log2 of the element count, capped at 4, and indexes the
   size suffix: 1 gives 'd', 2 gives 'w', 3 gives 'h', 4 gives 'b'. */
2930 int shift
= std::min (exact_log2 (nelts_per_vq
), 4);
2931 gcc_assert (IN_RANGE (shift
, 1, 4));
2932 char suffix
= "dwhb"[shift
- 1];
/* Three output forms: mnemonic and operands only (pattern ALL and
   factor 1), with the pattern name appended (factor 1), or with both
   the pattern name and a "mul #" multiplier. */
2935 unsigned int written
;
2936 if (pattern
== AARCH64_SV_ALL
&& factor
== 1)
2937 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s",
2938 prefix
, suffix
, operands
);
2939 else if (factor
== 1)
2940 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, %s",
2941 prefix
, suffix
, operands
, svpattern_token (pattern
));
2943 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, %s, mul #%d",
2944 prefix
, suffix
, operands
, svpattern_token (pattern
),
/* Check that the formatted text was not truncated. */
2946 gcc_assert (written
< sizeof (buffer
));
2950 /* Return the asm string for an instruction with a CNT-like vector size
2951 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2952 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2953 first part of the operands template (the part that comes before the
2954 vector size itself). X is the value of the vector size operand,
2955 as a polynomial integer rtx; we need to convert this into an "all"
2956 pattern with a multiplier. */
2959 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
/* X must satisfy aarch64_sve_cnt_immediate_p. Its second coefficient is
   passed on as FACTOR, with NELTS_PER_VQ of 0 so that the callee picks
   the element size. */
2962 poly_int64 value
= rtx_to_poly_int64 (x
);
2963 gcc_assert (aarch64_sve_cnt_immediate_p (value
));
2964 return aarch64_output_sve_cnt_immediate (prefix
, operands
, AARCH64_SV_ALL
,
2965 value
.coeffs
[1], 0);
2968 /* Return true if we can add X using a single SVE INC or DEC instruction. */
2971 aarch64_sve_scalar_inc_dec_immediate_p (rtx x
)
/* X must be a poly_int rtx whose value, or whose negation, is a valid
   CNT immediate. */
2974 return (poly_int_rtx_p (x
, &value
)
2975 && (aarch64_sve_cnt_immediate_p (value
)
2976 || aarch64_sve_cnt_immediate_p (-value
)));
2979 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to operand 0. */
/* OFFSET must have equal coefficients. A positive coefficient is output
   as "inc" and any other as "dec" of the negated coefficient, in both
   cases on "%x0" with the ALL pattern. */
2983 aarch64_output_sve_scalar_inc_dec (rtx offset
)
2985 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
2986 gcc_assert (offset_value
.coeffs
[0] == offset_value
.coeffs
[1]);
2987 if (offset_value
.coeffs
[1] > 0)
2988 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL
,
2989 offset_value
.coeffs
[1], 0);
2991 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL
,
2992 -offset_value
.coeffs
[1], 0);
2995 /* Return true if we can add VALUE to a register using a single ADDVL
2996 or ADDPL instruction. */
2999 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value
)
/* The test below filters out a zero FACTOR and values whose two
   coefficients differ. */
3001 HOST_WIDE_INT factor
= value
.coeffs
[0];
3002 if (factor
== 0 || value
.coeffs
[1] != factor
)
3004 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3005 and a value of 16 is one vector width. */
/* Accept multiples of 16 within [-32, 31] vector widths, or multiples
   of 2 within [-32, 31] predicate widths. */
3006 return (((factor
& 15) == 0 && IN_RANGE (factor
, -32 * 16, 31 * 16))
3007 || ((factor
& 1) == 0 && IN_RANGE (factor
, -32 * 2, 31 * 2)));
3010 /* Likewise for rtx X. */
/* X must first be accepted by poly_int_rtx_p, which extracts VALUE;
   any other rtx gives false. */
3013 aarch64_sve_addvl_addpl_immediate_p (rtx x
)
3016 return (poly_int_rtx_p (x
, &value
)
3017 && aarch64_sve_addvl_addpl_immediate_p (value
));
3020 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3021 to operand 1 and storing the result in operand 0. */
3024 aarch64_output_sve_addvl_addpl (rtx offset
)
/* The text is formatted into a function-local static buffer; the
   3 * sizeof (int) term leaves room for the decimal digits of the
   immediate. */
3026 static char buffer
[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3027 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
3028 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value
));
/* A FACTOR that is a multiple of 16 is printed as ADDVL with
   FACTOR / 16; the other form is ADDPL with FACTOR / 2. */
3030 int factor
= offset_value
.coeffs
[1];
3031 if ((factor
& 15) == 0)
3032 snprintf (buffer
, sizeof (buffer
), "addvl\t%%x0, %%x1, #%d", factor
/ 16);
3034 snprintf (buffer
, sizeof (buffer
), "addpl\t%%x0, %%x1, #%d", factor
/ 2);
3038 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3039 instruction. If it is, store the number of elements in each vector
3040 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3041 factor in *FACTOR_OUT (if nonnull). */
3044 aarch64_sve_vector_inc_dec_immediate_p (rtx x
, int *factor_out
,
3045 unsigned int *nelts_per_vq_out
)
/* X must be a duplicate of a single element ELT, and ELT must be a
   poly_int rtx. */
3050 if (!const_vec_duplicate_p (x
, &elt
)
3051 || !poly_int_rtx_p (elt
, &value
))
/* Elements per 128-bit quadword, derived from the unit size of X's
   mode; only 8, 4 and 2 are accepted. */
3054 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x
));
3055 if (nelts_per_vq
!= 8 && nelts_per_vq
!= 4 && nelts_per_vq
!= 2)
3056 /* There's no vector INCB. */
/* The two coefficients of the element value must be equal. */
3059 HOST_WIDE_INT factor
= value
.coeffs
[0];
3060 if (value
.coeffs
[1] != factor
)
3063 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3064 if ((factor
% nelts_per_vq
) != 0
3065 || !IN_RANGE (abs (factor
), nelts_per_vq
, 16 * nelts_per_vq
))
/* FACTOR is stored with its sign. NOTE(review): the null check that the
   header comment implies for FACTOR_OUT is not visible in this
   extract. */
3069 *factor_out
= factor
;
3070 if (nelts_per_vq_out
)
3071 *nelts_per_vq_out
= nelts_per_vq
;
3075 /* Return true if X is a valid immediate for an SVE vector INC or DEC instruction. */
/* Single-argument form: forwards to the three-argument overload with
   null pointers for both outputs, so only the validity answer is
   produced. */
3079 aarch64_sve_vector_inc_dec_immediate_p (rtx x
)
3081 return aarch64_sve_vector_inc_dec_immediate_p (x
, NULL
, NULL
);
3084 /* Return the asm template for an SVE vector INC or DEC instruction.
3085 OPERANDS gives the operands before the vector count and X is the
3086 value of the vector count operand itself. */
3089 aarch64_output_sve_vector_inc_dec (const char *operands
, rtx x
)
/* FACTOR and NELTS_PER_VQ are filled in by the validity check.
   NOTE(review): what happens when the check fails, and the condition
   that selects between the two returns, are not visible in this
   extract. */
3092 unsigned int nelts_per_vq
;
3093 if (!aarch64_sve_vector_inc_dec_immediate_p (x
, &factor
, &nelts_per_vq
))
/* "dec" is given the negated factor; "inc" is given the factor as is. */
3096 return aarch64_output_sve_cnt_immediate ("dec", operands
, AARCH64_SV_ALL
,
3097 -factor
, nelts_per_vq
);
3099 return aarch64_output_sve_cnt_immediate ("inc", operands
, AARCH64_SV_ALL
,
3100 factor
, nelts_per_vq
);
/* Work out, and in the visible lines emit, a sequence that sets DEST to
   the integer immediate IMM in mode MODE, choosing among several
   strategies in turn.
   NOTE(review): the uses of GENERATE and the returned value are not
   visible in this extract. aarch64_mov128_immediate calls this with a
   null DEST and GENERATE false and sums two results, so the result is
   presumably an instruction count - confirm against the full source. */
3104 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
3105 scalar_int_mode mode
)
3108 unsigned HOST_WIDE_INT val
, val2
, mask
;
3109 int one_match
, zero_match
;
/* Immediates accepted by aarch64_move_imm are set with a single SET. */
3114 if (aarch64_move_imm (val
, mode
))
3117 emit_insn (gen_rtx_SET (dest
, imm
));
3121 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3122 (with XXXX non-zero). In that case check to see if the move can be done in
3124 val2
= val
& 0xffffffff;
3126 && aarch64_move_imm (val2
, SImode
)
3127 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
3130 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
3132 /* Check if we have to emit a second instruction by checking to see
3133 if any of the upper 32 bits of the original DI mode value is set. */
3137 i
= (val
>> 48) ? 48 : 32;
3140 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3141 GEN_INT ((val
>> i
) & 0xffff)));
/* The value has no bits above bit 31, or MODE is SImode: set the low
   16 bits, then insert bits 16-31 with the SImode or DImode insert
   pattern. */
3146 if ((val
>> 32) == 0 || mode
== SImode
)
3150 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
3152 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
3153 GEN_INT ((val
>> 16) & 0xffff)));
3155 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
3156 GEN_INT ((val
>> 16) & 0xffff)));
3161 /* Remaining cases are all for DImode. */
/* Count how many of the four chunks of VAL selected by MASK at bit
   offsets 0, 16, 32 and 48 are all zeros (ZERO_MATCH) and how many are
   all ones (ONE_MATCH). NOTE(review): the initial value of MASK is not
   visible in this extract. */
3164 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
3165 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
3166 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
3167 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
3169 if (zero_match
!= 2 && one_match
!= 2)
3171 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3172 For a 64-bit bitmask try whether changing 16 bits to all ones or
3173 zeroes creates a valid bitmask. To check any repeated bitmask,
3174 try using 16 bits from the other 32-bit half of val. */
/* I walks the four 16-bit positions while MASK is shifted along with
   it. NOTE(review): the assignments that form the first two VAL2
   candidates are not visible in this extract. */
3176 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
3179 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3182 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
/* Third candidate: clear the chunk, then fill it from the same
   position in the other 32-bit half. */
3184 val2
= val2
& ~mask
;
3185 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
3186 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
/* Set the bitmask candidate, then insert the real 16 bits of VAL at
   position I. */
3193 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
3194 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3195 GEN_INT ((val
>> i
) & 0xffff)));
3201 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3202 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3203 otherwise skip zero bits. */
/* VAL2 is VAL, or its complement when all-ones chunks outnumber
   all-zero chunks, so that a chunk which can be skipped reads as zero
   in VAL2. I is the position of the lowest chunk that is nonzero in
   VAL2 among offsets 0 and 16, and 32 otherwise. */
3207 val2
= one_match
> zero_match
? ~val
: val
;
3208 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
/* First instruction: chunk I of VAL with every other bit set to ones
   or to zeros, matching the choice made for VAL2. */
3211 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
3212 ? (val
| ~(mask
<< i
))
3213 : (val
& (mask
<< i
)))));
/* Visit the higher chunks and insert the ones that still need setting. */
3214 for (i
+= 16; i
< 64; i
+= 16)
3216 if ((val2
& (mask
<< i
)) == 0)
3219 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3220 GEN_INT ((val
>> i
) & 0xffff)));
3227 /* Return whether imm is a 128-bit immediate which is simple enough to expand inline. */
/* IMM is examined as two 64-bit halves. NOTE(review): the statement
   executed when IMM is a CONST_INT is not visible in this extract. */
3230 aarch64_mov128_immediate (rtx imm
)
3232 if (GET_CODE (imm
) == CONST_INT
)
/* Otherwise IMM is expected to be a CONST_WIDE_INT with exactly two
   elements: element 0 is taken as LO and element 1 as HI. */
3235 gcc_assert (CONST_WIDE_INT_NUNITS (imm
) == 2);
3237 rtx lo
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 0));
3238 rtx hi
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 1));
/* Each half is evaluated as a DImode immediate with GENERATE false and
   no destination; the answer is true when the two results sum to at
   most 4. */
3240 return aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, DImode
)
3241 + aarch64_internal_mov_immediate (NULL_RTX
, hi
, false, DImode
) <= 4;
3245 /* Return the number of temporary registers that aarch64_add_offset_1
3246 would need to add OFFSET to a register. */
3249 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset
)
/* Offsets whose magnitude is below 0x1000000 (24 bits) need no
   temporary; larger ones need one. */
3251 return abs_hwi (offset
) < 0x1000000 ? 0 : 1;
3254 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3255 a non-polynomial OFFSET. MODE is the mode of the addition.
3256 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3257 be set and CFA adjustments added to the generated instructions.
3259 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3260 temporary if register allocation is already complete. This temporary
3261 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3262 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3263 the immediate again.
3265 Since this function may be used to adjust the stack pointer, we must
3266 ensure that it cannot cause transient stack deallocation (for example
3267 by first incrementing SP and then decrementing when adjusting by a
3268 large immediate). */
3271 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
3272 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
3273 bool frame_related_p
, bool emit_move_imm
)
/* Skipping the immediate move requires a TEMP1 to be supplied, and a
   supplied TEMP1 must not overlap SRC. */
3275 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3276 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
/* MOFFSET is the magnitude of OFFSET. */
3278 HOST_WIDE_INT moffset
= abs_hwi (offset
);
/* Plain copy when DEST and SRC differ. NOTE(review): the enclosing
   condition (presumably a zero offset) is not visible in this
   extract. */
3283 if (!rtx_equal_p (dest
, src
))
3285 insn
= emit_insn (gen_rtx_SET (dest
, src
));
3286 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3291 /* Single instruction adjustment. */
3292 if (aarch64_uimm12_shift (moffset
))
3294 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
3295 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3299 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3302 a) the offset cannot be loaded by a 16-bit move or
3303 b) there is no spare register into which we can move it. */
3304 if (moffset
< 0x1000000
3305 && ((!temp1
&& !can_create_pseudo_p ())
3306 || !aarch64_move_imm (moffset
, mode
)))
/* LOW_OFF is the low 12 bits of the magnitude, given the sign of
   OFFSET; the second addition applies the remainder. Both steps carry
   the sign of OFFSET. */
3308 HOST_WIDE_INT low_off
= moffset
& 0xfff;
3310 low_off
= offset
< 0 ? -low_off
: low_off
;
3311 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
3312 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3313 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
3314 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3318 /* Emit a move immediate if required and an addition/subtraction. */
/* NOTE(review): the EMIT_MOVE_IMM test that presumably guards the next
   two statements is not visible in this extract. */
3321 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
3322 temp1
= aarch64_force_temporary (mode
, temp1
, GEN_INT (moffset
));
/* TEMP1 holds the magnitude, so a negative OFFSET is applied as a
   subtraction. */
3324 insn
= emit_insn (offset
< 0
3325 ? gen_sub3_insn (dest
, src
, temp1
)
3326 : gen_add3_insn (dest
, src
, temp1
));
/* For frame-related adjustments, attach a REG_CFA_ADJUST_CFA note that
   describes the net effect DEST = SRC + OFFSET. */
3327 if (frame_related_p
)
3329 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3330 rtx adj
= plus_constant (mode
, src
, offset
);
3331 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
3335 /* Return the number of temporary registers that aarch64_add_offset
3336 would need to move OFFSET into a register or add OFFSET to a register;
3337 ADD_P is true if we want the latter rather than the former. */
3340 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
3342 /* This follows the same structure as aarch64_add_offset. */
3343 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
/* Split OFFSET into the part described by (FACTOR, FACTOR) and a plain
   CONSTANT remainder. NOTE(review): the statements that update COUNT in
   the branches below are not visible in this extract. */
3346 unsigned int count
= 0;
3347 HOST_WIDE_INT factor
= offset
.coeffs
[1];
3348 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
3349 poly_int64
poly_offset (factor
, factor
);
3350 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
3351 /* Need one register for the ADDVL/ADDPL result. */
3353 else if (factor
!= 0)
3355 factor
= abs (factor
);
3356 if (factor
> 16 * (factor
& -factor
))
3357 /* Need one register for the CNT result and one for the multiplication
3358 factor. If necessary, the second temporary can be reused for the
3359 constant part of the offset. */
3361 /* Need one register for the CNT result (which might then
3365 return count
+ aarch64_add_offset_1_temporaries (constant
);
3368 /* If X can be represented as a poly_int64, return the number
3369 of temporaries that are required to add it to a register.
3370 Return -1 otherwise. */
3373 aarch64_add_offset_temporaries (rtx x
)
/* The count is for the add form (ADD_P true) of
   aarch64_offset_temporaries. */
3376 if (!poly_int_rtx_p (x
, &offset
))
3378 return aarch64_offset_temporaries (true, offset
);
3381 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3382 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3383 be set and CFA adjustments added to the generated instructions.
3385 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3386 temporary if register allocation is already complete. This temporary
3387 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3388 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3389 false to avoid emitting the immediate again.
3391 TEMP2, if nonnull, is a second temporary register that doesn't
3392 overlap either DEST or SRC.
3394 Since this function may be used to adjust the stack pointer, we must
3395 ensure that it cannot cause transient stack deallocation (for example
3396 by first incrementing SP and then decrementing when adjusting by a
3397 large immediate). */
3400 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
3401 poly_int64 offset
, rtx temp1
, rtx temp2
,
3402 bool frame_related_p
, bool emit_move_imm
= true)
/* Argument checks: skipping the immediate move requires TEMP1; TEMP1
   must not overlap SRC; TEMP2 must not overlap DEST. NOTE(review): one
   operand of the third assertion is not visible in this extract. */
3404 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3405 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
3406 gcc_assert (temp1
== NULL_RTX
3408 || !reg_overlap_mentioned_p (temp1
, dest
));
3409 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
3411 /* Try using ADDVL or ADDPL to add the whole value. */
3412 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
3414 rtx offset_rtx
= gen_int_mode (offset
, mode
);
3415 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3416 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3420 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3421 SVE vector register, over and above the minimum size of 128 bits.
3422 This is equivalent to half the value returned by CNTD with a
3423 vector shape of ALL. */
/* Split OFFSET into the part described by (FACTOR, FACTOR) and a plain
   CONSTANT remainder, which is added at the end by
   aarch64_add_offset_1. */
3424 HOST_WIDE_INT factor
= offset
.coeffs
[1];
3425 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
3427 /* Try using ADDVL or ADDPL to add the VG-based part. */
3428 poly_int64
poly_offset (factor
, factor
);
3429 if (src
!= const0_rtx
3430 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
3432 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
/* Frame-related adjustments are emitted directly into DEST; otherwise
   the sum is built in a temporary that becomes the new SRC. */
3433 if (frame_related_p
)
3435 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3436 RTX_FRAME_RELATED_P (insn
) = true;
3441 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
3442 src
= aarch64_force_temporary (mode
, temp1
, addr
);
3447 /* Otherwise use a CNT-based sequence. */
3448 else if (factor
!= 0)
3450 /* Use a subtraction if we have a negative factor. */
3451 rtx_code code
= PLUS
;
3458 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3459 into the multiplication. */
3463 /* Use a right shift by 1. */
/* LOW_BIT is the lowest set bit of FACTOR. */
3467 HOST_WIDE_INT low_bit
= factor
& -factor
;
3468 if (factor
<= 16 * low_bit
)
3470 if (factor
> 16 * 8)
3472 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3473 the value with the minimum multiplier and shift it into
3475 int extra_shift
= exact_log2 (low_bit
);
3476 shift
+= extra_shift
;
3477 factor
>>= extra_shift
;
3479 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
3483 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3484 directly, since that should increase the chances of being
3485 able to use a shift and add sequence. If LOW_BIT itself
3486 is out of range, just use CNTD. */
3487 if (low_bit
<= 16 * 8)
3492 val
= gen_int_mode (poly_int64 (low_bit
* 2, low_bit
* 2), mode
);
3493 val
= aarch64_force_temporary (mode
, temp1
, val
);
/* While pseudos can be created the multiplication is left to
   expand_mult; the other visible form loads the coefficient into TEMP2
   and builds an explicit MULT. */
3495 if (can_create_pseudo_p ())
3497 rtx coeff1
= gen_int_mode (factor
, mode
);
3498 val
= expand_mult (mode
, val
, coeff1
, NULL_RTX
, false, true);
3502 /* Go back to using a negative multiplication factor if we have
3503 no register from which to subtract. */
3504 if (code
== MINUS
&& src
== const0_rtx
)
3509 rtx coeff1
= gen_int_mode (factor
, mode
);
3510 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
3511 val
= gen_rtx_MULT (mode
, val
, coeff1
);
3517 /* Multiply by 1 << SHIFT. */
3518 val
= aarch64_force_temporary (mode
, temp1
, val
);
3519 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
/* A SHIFT of -1 stands for the division by 2, done as an arithmetic
   right shift by one. */
3521 else if (shift
== -1)
3524 val
= aarch64_force_temporary (mode
, temp1
, val
);
3525 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
3528 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3529 if (src
!= const0_rtx
)
3531 val
= aarch64_force_temporary (mode
, temp1
, val
);
3532 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
/* With no SRC to subtract from, a pending MINUS becomes a negation. */
3534 else if (code
== MINUS
)
3536 val
= aarch64_force_temporary (mode
, temp1
, val
);
3537 val
= gen_rtx_NEG (mode
, val
);
/* Write the VG-based result straight to DEST when no constant part
   remains or when the adjustment is frame-related; otherwise it is
   kept in a temporary as the new SRC. */
3540 if (constant
== 0 || frame_related_p
)
3542 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
3543 if (frame_related_p
)
3545 RTX_FRAME_RELATED_P (insn
) = true;
3546 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
3547 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
3556 src
= aarch64_force_temporary (mode
, temp1
, val
);
/* From here the immediate move is requested again. NOTE(review):
   presumably because TEMP1 may have been reused above; the surrounding
   lines are not visible in this extract. */
3561 emit_move_imm
= true;
/* Add the remaining plain constant. */
3564 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
3565 frame_related_p
, emit_move_imm
);
3568 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3569 than a poly_int64. */
3572 aarch64_split_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
3573 rtx offset_rtx
, rtx temp1
, rtx temp2
)
3575 aarch64_add_offset (mode
, dest
, src
, rtx_to_poly_int64 (offset_rtx
),
3576 temp1
, temp2
, false);
3579 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3580 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3581 if TEMP1 already contains abs (DELTA). */
3584 aarch64_add_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool emit_move_imm
)
3586 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, delta
,
3587 temp1
, temp2
, true, emit_move_imm
);
3590 /* Subtract DELTA from the stack pointer, marking the instructions
3591 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3595 aarch64_sub_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool frame_related_p
,
3596 bool emit_move_imm
= true)
3598 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, -delta
,
3599 temp1
, temp2
, frame_related_p
, emit_move_imm
);
3602 /* Set DEST to (vec_series BASE STEP). */
3605 aarch64_expand_vec_series (rtx dest
, rtx base
, rtx step
)
3607 machine_mode mode
= GET_MODE (dest
);
3608 scalar_mode inner
= GET_MODE_INNER (mode
);
3610 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3611 if (!aarch64_sve_index_immediate_p (base
))
3612 base
= force_reg (inner
, base
);
3613 if (!aarch64_sve_index_immediate_p (step
))
3614 step
= force_reg (inner
, step
);
3616 emit_set_insn (dest
, gen_rtx_VEC_SERIES (mode
, base
, step
));
3619 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3620 register of mode MODE. Use TARGET for the result if it's nonnull
3623 The two vector modes must have the same element mode. The behavior
3624 is to duplicate architectural lane N of SRC into architectural lanes
3625 N + I * STEP of the result. On big-endian targets, architectural
3626 lane 0 of an Advanced SIMD vector is the last element of the vector
3627 in memory layout, so for big-endian targets this operation has the
3628 effect of reversing SRC before duplicating it. Callers need to
3629 account for this. */
3632 aarch64_expand_sve_dupq (rtx target
, machine_mode mode
, rtx src
)
3634 machine_mode src_mode
= GET_MODE (src
);
3635 gcc_assert (GET_MODE_INNER (mode
) == GET_MODE_INNER (src_mode
));
3636 insn_code icode
= (BYTES_BIG_ENDIAN
3637 ? code_for_aarch64_vec_duplicate_vq_be (mode
)
3638 : code_for_aarch64_vec_duplicate_vq_le (mode
));
3641 expand_operand ops
[3];
3642 create_output_operand (&ops
[i
++], target
, mode
);
3643 create_output_operand (&ops
[i
++], src
, src_mode
);
3644 if (BYTES_BIG_ENDIAN
)
3646 /* Create a PARALLEL describing the reversal of SRC. */
3647 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (mode
);
3648 rtx sel
= aarch64_gen_stepped_int_parallel (nelts_per_vq
,
3649 nelts_per_vq
- 1, -1);
3650 create_fixed_operand (&ops
[i
++], sel
);
3652 expand_insn (icode
, i
, ops
);
3653 return ops
[0].value
;
3656 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3657 the memory image into DEST. Return true on success. */
3660 aarch64_expand_sve_ld1rq (rtx dest
, rtx src
)
3662 src
= force_const_mem (GET_MODE (src
), src
);
3666 /* Make sure that the address is legitimate. */
3667 if (!aarch64_sve_ld1rq_operand_p (src
))
3669 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
3670 src
= replace_equiv_address (src
, addr
);
3673 machine_mode mode
= GET_MODE (dest
);
3674 unsigned int elem_bytes
= GET_MODE_UNIT_SIZE (mode
);
3675 machine_mode pred_mode
= aarch64_sve_pred_mode (elem_bytes
).require ();
3676 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
3677 emit_insn (gen_aarch64_sve_ld1rq (mode
, dest
, src
, ptrue
));
3681 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3682 SVE data mode and isn't a legitimate constant. Use TARGET for the
3683 result if convenient.
3685 The returned register can have whatever mode seems most natural
3686 given the contents of SRC. */
3689 aarch64_expand_sve_const_vector (rtx target
, rtx src
)
3691 machine_mode mode
= GET_MODE (src
);
3692 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
3693 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
3694 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
3695 unsigned int elt_bits
= GET_MODE_BITSIZE (elt_mode
);
3696 unsigned int encoded_bits
= npatterns
* nelts_per_pattern
* elt_bits
;
3698 if (nelts_per_pattern
== 1 && encoded_bits
== 128)
3700 /* The constant is a duplicated quadword but can't be narrowed
3701 beyond a quadword. Get the memory image of the first quadword
3702 as a 128-bit vector and try using LD1RQ to load it from memory.
3704 The effect for both endiannesses is to load memory lane N into
3705 architectural lanes N + I * STEP of the result. On big-endian
3706 targets, the layout of the 128-bit vector in an Advanced SIMD
3707 register would be different from its layout in an SVE register,
3708 but this 128-bit vector is a memory value only. */
3709 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
3710 rtx vq_value
= simplify_gen_subreg (vq_mode
, src
, mode
, 0);
3711 if (vq_value
&& aarch64_expand_sve_ld1rq (target
, vq_value
))
3715 if (nelts_per_pattern
== 1 && encoded_bits
< 128)
3717 /* The vector is a repeating sequence of 64 bits or fewer.
3718 See if we can load them using an Advanced SIMD move and then
3719 duplicate it to fill a vector. This is better than using a GPR
3720 move because it keeps everything in the same register file. */
3721 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
3722 rtx_vector_builder
builder (vq_mode
, npatterns
, 1);
3723 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3725 /* We want memory lane N to go into architectural lane N,
3726 so reverse for big-endian targets. The DUP .Q pattern
3727 has a compensating reverse built-in. */
3728 unsigned int srci
= BYTES_BIG_ENDIAN
? npatterns
- i
- 1 : i
;
3729 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, srci
));
3731 rtx vq_src
= builder
.build ();
3732 if (aarch64_simd_valid_immediate (vq_src
, NULL
))
3734 vq_src
= force_reg (vq_mode
, vq_src
);
3735 return aarch64_expand_sve_dupq (target
, mode
, vq_src
);
3738 /* Get an integer representation of the repeating part of Advanced
3739 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3740 which for big-endian targets is lane-swapped wrt a normal
3741 Advanced SIMD vector. This means that for both endiannesses,
3742 memory lane N of SVE vector SRC corresponds to architectural
3743 lane N of a register holding VQ_SRC. This in turn means that
3744 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3745 as a single 128-bit value) and thus that memory lane 0 of SRC is
3746 in the lsb of the integer. Duplicating the integer therefore
3747 ensures that memory lane N of SRC goes into architectural lane
3748 N + I * INDEX of the SVE register. */
3749 scalar_mode int_mode
= int_mode_for_size (encoded_bits
, 0).require ();
3750 rtx elt_value
= simplify_gen_subreg (int_mode
, vq_src
, vq_mode
, 0);
3753 /* Pretend that we had a vector of INT_MODE to start with. */
3754 elt_mode
= int_mode
;
3755 mode
= aarch64_full_sve_mode (int_mode
).require ();
3757 /* If the integer can be moved into a general register by a
3758 single instruction, do that and duplicate the result. */
3759 if (CONST_INT_P (elt_value
)
3760 && aarch64_move_imm (INTVAL (elt_value
), elt_mode
))
3762 elt_value
= force_reg (elt_mode
, elt_value
);
3763 return expand_vector_broadcast (mode
, elt_value
);
3766 else if (npatterns
== 1)
3767 /* We're duplicating a single value, but can't do better than
3768 force it to memory and load from there. This handles things
3769 like symbolic constants. */
3770 elt_value
= CONST_VECTOR_ENCODED_ELT (src
, 0);
3774 /* Load the element from memory if we can, otherwise move it into
3775 a register and use a DUP. */
3776 rtx op
= force_const_mem (elt_mode
, elt_value
);
3778 op
= force_reg (elt_mode
, elt_value
);
3779 return expand_vector_broadcast (mode
, op
);
3783 /* Try using INDEX. */
3785 if (const_vec_series_p (src
, &base
, &step
))
3787 aarch64_expand_vec_series (target
, base
, step
);
3791 /* From here on, it's better to force the whole constant to memory
3793 if (GET_MODE_NUNITS (mode
).is_constant ())
3796 /* Expand each pattern individually. */
3797 gcc_assert (npatterns
> 1);
3798 rtx_vector_builder builder
;
3799 auto_vec
<rtx
, 16> vectors (npatterns
);
3800 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3802 builder
.new_vector (mode
, 1, nelts_per_pattern
);
3803 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
3804 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
3805 vectors
.quick_push (force_reg (mode
, builder
.build ()));
3808 /* Use permutes to interleave the separate vectors. */
3809 while (npatterns
> 1)
3812 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3814 rtx tmp
= (npatterns
== 1 ? target
: gen_reg_rtx (mode
));
3815 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
3816 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
3820 gcc_assert (vectors
[0] == target
);
3824 /* Use WHILE to set a predicate register of mode MODE in which the first
3825 VL bits are set and the rest are clear. Use TARGET for the register
3826 if it's nonnull and convenient. */
3829 aarch64_sve_move_pred_via_while (rtx target
, machine_mode mode
,
3832 rtx limit
= force_reg (DImode
, gen_int_mode (vl
, DImode
));
3833 target
= aarch64_target_reg (target
, mode
);
3834 emit_insn (gen_while_ult (DImode
, mode
, target
, const0_rtx
, limit
));
3839 aarch64_expand_sve_const_pred_1 (rtx
, rtx_vector_builder
&, bool);
3841 /* BUILDER is a constant predicate in which the index of every set bit
3842 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3843 by inverting every element at a multiple of ELT_SIZE and EORing the
3844 result with an ELT_SIZE PTRUE.
3846 Return a register that contains the constant on success, otherwise
3847 return null. Use TARGET as the register if it is nonnull and
3851 aarch64_expand_sve_const_pred_eor (rtx target
, rtx_vector_builder
&builder
,
3852 unsigned int elt_size
)
3854 /* Invert every element at a multiple of ELT_SIZE, keeping the
3856 rtx_vector_builder
inv_builder (VNx16BImode
, builder
.npatterns (),
3857 builder
.nelts_per_pattern ());
3858 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
3859 if ((i
& (elt_size
- 1)) == 0 && INTVAL (builder
.elt (i
)) == 0)
3860 inv_builder
.quick_push (const1_rtx
);
3862 inv_builder
.quick_push (const0_rtx
);
3863 inv_builder
.finalize ();
3865 /* See if we can load the constant cheaply. */
3866 rtx inv
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, inv_builder
, false);
3870 /* EOR the result with an ELT_SIZE PTRUE. */
3871 rtx mask
= aarch64_ptrue_all (elt_size
);
3872 mask
= force_reg (VNx16BImode
, mask
);
3873 target
= aarch64_target_reg (target
, VNx16BImode
);
3874 emit_insn (gen_aarch64_pred_z (XOR
, VNx16BImode
, target
, mask
, inv
, mask
));
3878 /* BUILDER is a constant predicate in which the index of every set bit
3879 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3880 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3881 register on success, otherwise return null. Use TARGET as the register
3882 if nonnull and convenient. */
3885 aarch64_expand_sve_const_pred_trn (rtx target
, rtx_vector_builder
&builder
,
3886 unsigned int elt_size
,
3887 unsigned int permute_size
)
3889 /* We're going to split the constant into two new constants A and B,
3890 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3891 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3893 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3894 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3896 where _ indicates elements that will be discarded by the permute.
3898 First calculate the ELT_SIZEs for A and B. */
3899 unsigned int a_elt_size
= GET_MODE_SIZE (DImode
);
3900 unsigned int b_elt_size
= GET_MODE_SIZE (DImode
);
3901 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); i
+= elt_size
)
3902 if (INTVAL (builder
.elt (i
)) != 0)
3904 if (i
& permute_size
)
3905 b_elt_size
|= i
- permute_size
;
3909 a_elt_size
&= -a_elt_size
;
3910 b_elt_size
&= -b_elt_size
;
3912 /* Now construct the vectors themselves. */
3913 rtx_vector_builder
a_builder (VNx16BImode
, builder
.npatterns (),
3914 builder
.nelts_per_pattern ());
3915 rtx_vector_builder
b_builder (VNx16BImode
, builder
.npatterns (),
3916 builder
.nelts_per_pattern ());
3917 unsigned int nelts
= builder
.encoded_nelts ();
3918 for (unsigned int i
= 0; i
< nelts
; ++i
)
3919 if (i
& (elt_size
- 1))
3921 a_builder
.quick_push (const0_rtx
);
3922 b_builder
.quick_push (const0_rtx
);
3924 else if ((i
& permute_size
) == 0)
3926 /* The A and B elements are significant. */
3927 a_builder
.quick_push (builder
.elt (i
));
3928 b_builder
.quick_push (builder
.elt (i
+ permute_size
));
3932 /* The A and B elements are going to be discarded, so pick whatever
3933 is likely to give a nice constant. We are targeting element
3934 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3935 with the aim of each being a sequence of ones followed by
3936 a sequence of zeros. So:
3938 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3939 duplicate the last X_ELT_SIZE element, to extend the
3940 current sequence of ones or zeros.
3942 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3943 zero, so that the constant really does have X_ELT_SIZE and
3944 not a smaller size. */
3945 if (a_elt_size
> permute_size
)
3946 a_builder
.quick_push (const0_rtx
);
3948 a_builder
.quick_push (a_builder
.elt (i
- a_elt_size
));
3949 if (b_elt_size
> permute_size
)
3950 b_builder
.quick_push (const0_rtx
);
3952 b_builder
.quick_push (b_builder
.elt (i
- b_elt_size
));
3954 a_builder
.finalize ();
3955 b_builder
.finalize ();
3957 /* Try loading A into a register. */
3958 rtx_insn
*last
= get_last_insn ();
3959 rtx a
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, a_builder
, false);
3963 /* Try loading B into a register. */
3965 if (a_builder
!= b_builder
)
3967 b
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, b_builder
, false);
3970 delete_insns_since (last
);
3975 /* Emit the TRN1 itself. */
3976 machine_mode mode
= aarch64_sve_pred_mode (permute_size
).require ();
3977 target
= aarch64_target_reg (target
, mode
);
3978 emit_insn (gen_aarch64_sve (UNSPEC_TRN1
, mode
, target
,
3979 gen_lowpart (mode
, a
),
3980 gen_lowpart (mode
, b
)));
3984 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
3985 constant in BUILDER into an SVE predicate register. Return the register
3986 on success, otherwise return null. Use TARGET for the register if
3987 nonnull and convenient.
3989 ALLOW_RECURSE_P is true if we can use methods that would call this
3990 function recursively. */
3993 aarch64_expand_sve_const_pred_1 (rtx target
, rtx_vector_builder
&builder
,
3994 bool allow_recurse_p
)
3996 if (builder
.encoded_nelts () == 1)
3997 /* A PFALSE or a PTRUE .B ALL. */
3998 return aarch64_emit_set_immediate (target
, builder
);
4000 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
4001 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
4003 /* If we can load the constant using PTRUE, use it as-is. */
4004 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
4005 if (aarch64_svpattern_for_vl (mode
, vl
) != AARCH64_NUM_SVPATTERNS
)
4006 return aarch64_emit_set_immediate (target
, builder
);
4008 /* Otherwise use WHILE to set the first VL bits. */
4009 return aarch64_sve_move_pred_via_while (target
, mode
, vl
);
4012 if (!allow_recurse_p
)
4015 /* Try inverting the vector in element size ELT_SIZE and then EORing
4016 the result with an ELT_SIZE PTRUE. */
4017 if (INTVAL (builder
.elt (0)) == 0)
4018 if (rtx res
= aarch64_expand_sve_const_pred_eor (target
, builder
,
4022 /* Try using TRN1 to permute two simpler constants. */
4023 for (unsigned int i
= elt_size
; i
<= 8; i
*= 2)
4024 if (rtx res
= aarch64_expand_sve_const_pred_trn (target
, builder
,
4031 /* Return an SVE predicate register that contains the VNx16BImode
4032 constant in BUILDER, without going through the move expanders.
4034 The returned register can have whatever mode seems most natural
4035 given the contents of BUILDER. Use TARGET for the result if
4039 aarch64_expand_sve_const_pred (rtx target
, rtx_vector_builder
&builder
)
4041 /* Try loading the constant using pure predicate operations. */
4042 if (rtx res
= aarch64_expand_sve_const_pred_1 (target
, builder
, true))
4045 /* Try forcing the constant to memory. */
4046 if (builder
.full_nelts ().is_constant ())
4047 if (rtx mem
= force_const_mem (VNx16BImode
, builder
.build ()))
4049 target
= aarch64_target_reg (target
, VNx16BImode
);
4050 emit_move_insn (target
, mem
);
4054 /* The last resort is to load the constant as an integer and then
4055 compare it against zero. Use -1 for set bits in order to increase
4056 the changes of using SVE DUPM or an Advanced SIMD byte mask. */
4057 rtx_vector_builder
int_builder (VNx16QImode
, builder
.npatterns (),
4058 builder
.nelts_per_pattern ());
4059 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
4060 int_builder
.quick_push (INTVAL (builder
.elt (i
))
4061 ? constm1_rtx
: const0_rtx
);
4062 return aarch64_convert_sve_data_to_pred (target
, VNx16BImode
,
4063 int_builder
.build ());
4066 /* Set DEST to immediate IMM. */
4069 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
4071 machine_mode mode
= GET_MODE (dest
);
4073 /* Check on what type of symbol it is. */
4074 scalar_int_mode int_mode
;
4075 if ((GET_CODE (imm
) == SYMBOL_REF
4076 || GET_CODE (imm
) == LABEL_REF
4077 || GET_CODE (imm
) == CONST
4078 || GET_CODE (imm
) == CONST_POLY_INT
)
4079 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
4083 HOST_WIDE_INT const_offset
;
4084 enum aarch64_symbol_type sty
;
4086 /* If we have (const (plus symbol offset)), separate out the offset
4087 before we start classifying the symbol. */
4088 rtx base
= strip_offset (imm
, &offset
);
4090 /* We must always add an offset involving VL separately, rather than
4091 folding it into the relocation. */
4092 if (!offset
.is_constant (&const_offset
))
4094 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
4095 emit_insn (gen_rtx_SET (dest
, imm
));
4098 /* Do arithmetic on 32-bit values if the result is smaller
4100 if (partial_subreg_p (int_mode
, SImode
))
4102 /* It is invalid to do symbol calculations in modes
4103 narrower than SImode. */
4104 gcc_assert (base
== const0_rtx
);
4105 dest
= gen_lowpart (SImode
, dest
);
4108 if (base
!= const0_rtx
)
4110 base
= aarch64_force_temporary (int_mode
, dest
, base
);
4111 aarch64_add_offset (int_mode
, dest
, base
, offset
,
4112 NULL_RTX
, NULL_RTX
, false);
4115 aarch64_add_offset (int_mode
, dest
, base
, offset
,
4116 dest
, NULL_RTX
, false);
4121 sty
= aarch64_classify_symbol (base
, const_offset
);
4124 case SYMBOL_FORCE_TO_MEM
:
4125 if (const_offset
!= 0
4126 && targetm
.cannot_force_const_mem (int_mode
, imm
))
4128 gcc_assert (can_create_pseudo_p ());
4129 base
= aarch64_force_temporary (int_mode
, dest
, base
);
4130 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
4131 NULL_RTX
, NULL_RTX
, false);
4135 mem
= force_const_mem (ptr_mode
, imm
);
4138 /* If we aren't generating PC relative literals, then
4139 we need to expand the literal pool access carefully.
4140 This is something that needs to be done in a number
4141 of places, so could well live as a separate function. */
4142 if (!aarch64_pcrelative_literal_loads
)
4144 gcc_assert (can_create_pseudo_p ());
4145 base
= gen_reg_rtx (ptr_mode
);
4146 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
4147 if (ptr_mode
!= Pmode
)
4148 base
= convert_memory_address (Pmode
, base
);
4149 mem
= gen_rtx_MEM (ptr_mode
, base
);
4152 if (int_mode
!= ptr_mode
)
4153 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
4155 emit_insn (gen_rtx_SET (dest
, mem
));
4159 case SYMBOL_SMALL_TLSGD
:
4160 case SYMBOL_SMALL_TLSDESC
:
4161 case SYMBOL_SMALL_TLSIE
:
4162 case SYMBOL_SMALL_GOT_28K
:
4163 case SYMBOL_SMALL_GOT_4G
:
4164 case SYMBOL_TINY_GOT
:
4165 case SYMBOL_TINY_TLSIE
:
4166 if (const_offset
!= 0)
4168 gcc_assert(can_create_pseudo_p ());
4169 base
= aarch64_force_temporary (int_mode
, dest
, base
);
4170 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
4171 NULL_RTX
, NULL_RTX
, false);
4176 case SYMBOL_SMALL_ABSOLUTE
:
4177 case SYMBOL_TINY_ABSOLUTE
:
4178 case SYMBOL_TLSLE12
:
4179 case SYMBOL_TLSLE24
:
4180 case SYMBOL_TLSLE32
:
4181 case SYMBOL_TLSLE48
:
4182 aarch64_load_symref_appropriately (dest
, imm
, sty
);
4190 if (!CONST_INT_P (imm
))
4192 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
)
4194 /* Only the low bit of each .H, .S and .D element is defined,
4195 so we can set the upper bits to whatever we like. If the
4196 predicate is all-true in MODE, prefer to set all the undefined
4197 bits as well, so that we can share a single .B predicate for
4199 if (imm
== CONSTM1_RTX (mode
))
4200 imm
= CONSTM1_RTX (VNx16BImode
);
4202 /* All methods for constructing predicate modes wider than VNx16BI
4203 will set the upper bits of each element to zero. Expose this
4204 by moving such constants as a VNx16BI, so that all bits are
4205 significant and so that constants for different modes can be
4206 shared. The wider constant will still be available as a
4208 rtx_vector_builder builder
;
4209 if (aarch64_get_sve_pred_bits (builder
, imm
))
4211 rtx res
= aarch64_expand_sve_const_pred (dest
, builder
);
4213 emit_move_insn (dest
, gen_lowpart (mode
, res
));
4218 if (GET_CODE (imm
) == HIGH
4219 || aarch64_simd_valid_immediate (imm
, NULL
))
4221 emit_insn (gen_rtx_SET (dest
, imm
));
4225 if (GET_CODE (imm
) == CONST_VECTOR
&& aarch64_sve_data_mode_p (mode
))
4226 if (rtx res
= aarch64_expand_sve_const_vector (dest
, imm
))
4229 emit_insn (gen_aarch64_sve_reinterpret (mode
, dest
, res
));
4233 rtx mem
= force_const_mem (mode
, imm
);
4235 emit_move_insn (dest
, mem
);
4239 aarch64_internal_mov_immediate (dest
, imm
, true,
4240 as_a
<scalar_int_mode
> (mode
));
4243 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4244 that is known to contain PTRUE. */
4247 aarch64_emit_sve_pred_move (rtx dest
, rtx pred
, rtx src
)
4249 expand_operand ops
[3];
4250 machine_mode mode
= GET_MODE (dest
);
4251 create_output_operand (&ops
[0], dest
, mode
);
4252 create_input_operand (&ops
[1], pred
, GET_MODE(pred
));
4253 create_input_operand (&ops
[2], src
, mode
);
4254 temporary_volatile_ok
v (true);
4255 expand_insn (code_for_aarch64_pred_mov (mode
), 3, ops
);
4258 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4259 operand is in memory. In this case we need to use the predicated LD1
4260 and ST1 instead of LDR and STR, both for correctness on big-endian
4261 targets and because LD1 and ST1 support a wider range of addressing modes.
4262 PRED_MODE is the mode of the predicate.
4264 See the comment at the head of aarch64-sve.md for details about the
4265 big-endian handling. */
4268 aarch64_expand_sve_mem_move (rtx dest
, rtx src
, machine_mode pred_mode
)
4270 machine_mode mode
= GET_MODE (dest
);
4271 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
4272 if (!register_operand (src
, mode
)
4273 && !register_operand (dest
, mode
))
4275 rtx tmp
= gen_reg_rtx (mode
);
4277 aarch64_emit_sve_pred_move (tmp
, ptrue
, src
);
4279 emit_move_insn (tmp
, src
);
4282 aarch64_emit_sve_pred_move (dest
, ptrue
, src
);
4285 /* Called only on big-endian targets. See whether an SVE vector move
4286 from SRC to DEST is effectively a REV[BHW] instruction, because at
4287 least one operand is a subreg of an SVE vector that has wider or
4288 narrower elements. Return true and emit the instruction if so.
4292 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4294 represents a VIEW_CONVERT between the following vectors, viewed
4297 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4298 R1: { [0], [1], [2], [3], ... }
4300 The high part of lane X in R2 should therefore correspond to lane X*2
4301 of R1, but the register representations are:
4304 R2: ...... [1].high [1].low [0].high [0].low
4305 R1: ...... [3] [2] [1] [0]
4307 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4308 We therefore need a reverse operation to swap the high and low values
4311 This is purely an optimization. Without it we would spill the
4312 subreg operand to the stack in one mode and reload it in the
4313 other mode, which has the same effect as the REV. */
4316 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
4318 gcc_assert (BYTES_BIG_ENDIAN
);
4319 if (GET_CODE (dest
) == SUBREG
)
4320 dest
= SUBREG_REG (dest
);
4321 if (GET_CODE (src
) == SUBREG
)
4322 src
= SUBREG_REG (src
);
4324 /* The optimization handles two single SVE REGs with different element
4328 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
4329 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
4330 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
4331 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
4334 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4335 rtx ptrue
= aarch64_ptrue_reg (VNx16BImode
);
4336 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
4338 emit_insn (gen_rtx_SET (dest
, unspec
));
4342 /* Return a copy of X with mode MODE, without changing its other
4343 attributes. Unlike gen_lowpart, this doesn't care whether the
4344 mode change is valid. */
4347 aarch64_replace_reg_mode (rtx x
, machine_mode mode
)
4349 if (GET_MODE (x
) == mode
)
4352 x
= shallow_copy_rtx (x
);
4353 set_mode_and_regno (x
, mode
, REGNO (x
));
4357 /* Return the SVE REV[BHW] unspec for reversing quantites of mode MODE
4358 stored in wider integer containers. */
4361 aarch64_sve_rev_unspec (machine_mode mode
)
4363 switch (GET_MODE_UNIT_SIZE (mode
))
4365 case 1: return UNSPEC_REVB
;
4366 case 2: return UNSPEC_REVH
;
4367 case 4: return UNSPEC_REVW
;
4372 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4376 aarch64_split_sve_subreg_move (rtx dest
, rtx ptrue
, rtx src
)
4378 /* Decide which REV operation we need. The mode with wider elements
4379 determines the mode of the operands and the mode with the narrower
4380 elements determines the reverse width. */
4381 machine_mode mode_with_wider_elts
= GET_MODE (dest
);
4382 machine_mode mode_with_narrower_elts
= GET_MODE (src
);
4383 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts
)
4384 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts
))
4385 std::swap (mode_with_wider_elts
, mode_with_narrower_elts
);
4387 unsigned int unspec
= aarch64_sve_rev_unspec (mode_with_narrower_elts
);
4388 unsigned int wider_bytes
= GET_MODE_UNIT_SIZE (mode_with_wider_elts
);
4389 machine_mode pred_mode
= aarch64_sve_pred_mode (wider_bytes
).require ();
4391 /* Get the operands in the appropriate modes and emit the instruction. */
4392 ptrue
= gen_lowpart (pred_mode
, ptrue
);
4393 dest
= aarch64_replace_reg_mode (dest
, mode_with_wider_elts
);
4394 src
= aarch64_replace_reg_mode (src
, mode_with_wider_elts
);
4395 emit_insn (gen_aarch64_pred (unspec
, mode_with_wider_elts
,
4400 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
4401 tree exp ATTRIBUTE_UNUSED
)
4403 if (aarch64_simd_decl_p (cfun
->decl
) != aarch64_simd_decl_p (decl
))
4409 /* Implement TARGET_PASS_BY_REFERENCE. */
4412 aarch64_pass_by_reference (cumulative_args_t
, const function_arg_info
&arg
)
4415 machine_mode dummymode
;
4418 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4419 if (arg
.mode
== BLKmode
&& arg
.type
)
4420 size
= int_size_in_bytes (arg
.type
);
4422 /* No frontends can create types with variable-sized modes, so we
4423 shouldn't be asked to pass or return them. */
4424 size
= GET_MODE_SIZE (arg
.mode
).to_constant ();
4426 /* Aggregates are passed by reference based on their size. */
4427 if (arg
.aggregate_type_p ())
4428 size
= int_size_in_bytes (arg
.type
);
4430 /* Variable sized arguments are always returned by reference. */
4434 /* Can this be a candidate to be passed in fp/simd register(s)? */
4435 if (aarch64_vfp_is_call_or_return_candidate (arg
.mode
, arg
.type
,
4440 /* Arguments which are variable sized or larger than 2 registers are
4441 passed by reference unless they are a homogenous floating point
4443 return size
> 2 * UNITS_PER_WORD
;
4446 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4448 aarch64_return_in_msb (const_tree valtype
)
4450 machine_mode dummy_mode
;
4453 /* Never happens in little-endian mode. */
4454 if (!BYTES_BIG_ENDIAN
)
4457 /* Only composite types smaller than or equal to 16 bytes can
4458 be potentially returned in registers. */
4459 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
4460 || int_size_in_bytes (valtype
) <= 0
4461 || int_size_in_bytes (valtype
) > 16)
4464 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4465 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4466 is always passed/returned in the least significant bits of fp/simd
4468 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
4469 &dummy_mode
, &dummy_int
, NULL
))
4475 /* Implement TARGET_FUNCTION_VALUE.
4476 Define how to find the value returned by a function. */
4479 aarch64_function_value (const_tree type
, const_tree func
,
4480 bool outgoing ATTRIBUTE_UNUSED
)
4485 machine_mode ag_mode
;
4487 mode
= TYPE_MODE (type
);
4488 if (INTEGRAL_TYPE_P (type
))
4489 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
4491 if (aarch64_return_in_msb (type
))
4493 HOST_WIDE_INT size
= int_size_in_bytes (type
);
4495 if (size
% UNITS_PER_WORD
!= 0)
4497 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
4498 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
4502 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
4503 &ag_mode
, &count
, NULL
))
4505 if (!aarch64_composite_type_p (type
, mode
))
4507 gcc_assert (count
== 1 && mode
== ag_mode
);
4508 return gen_rtx_REG (mode
, V0_REGNUM
);
4515 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
4516 for (i
= 0; i
< count
; i
++)
4518 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
4519 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
4520 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
4521 XVECEXP (par
, 0, i
) = tmp
;
4527 return gen_rtx_REG (mode
, R0_REGNUM
);
4530 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4531 Return true if REGNO is the number of a hard register in which the values
4532 of called function may come back. */
4535 aarch64_function_value_regno_p (const unsigned int regno
)
4537 /* Maximum of 16 bytes can be returned in the general registers. Examples
4538 of 16-byte return values are: 128-bit integers and 16-byte small
4539 structures (excluding homogeneous floating-point aggregates). */
4540 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
4543 /* Up to four fp/simd registers can return a function value, e.g. a
4544 homogeneous floating-point aggregate having four members. */
4545 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
4546 return TARGET_FLOAT
;
4551 /* Implement TARGET_RETURN_IN_MEMORY.
4553 If the type T of the result of a function is such that
4555 would require that arg be passed as a value in a register (or set of
4556 registers) according to the parameter passing rules, then the result
4557 is returned in the same registers as would be used for such an
4561 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
4564 machine_mode ag_mode
;
4567 if (!AGGREGATE_TYPE_P (type
)
4568 && TREE_CODE (type
) != COMPLEX_TYPE
4569 && TREE_CODE (type
) != VECTOR_TYPE
)
4570 /* Simple scalar types always returned in registers. */
4573 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
4580 /* Types larger than 2 registers returned in memory. */
4581 size
= int_size_in_bytes (type
);
4582 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
4586 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
4587 const_tree type
, int *nregs
)
4589 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4590 return aarch64_vfp_is_call_or_return_candidate (mode
,
4592 &pcum
->aapcs_vfp_rmode
,
4597 /* Given MODE and TYPE of a function argument, return the alignment in
4598 bits. The idea is to suppress any stronger alignment requested by
4599 the user and opt for the natural alignment (specified in AAPCS64 \S
4600 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4601 calculated in versions of GCC prior to GCC-9. This is a helper
4602 function for local use only. */
4605 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
,
4610 return GET_MODE_ALIGNMENT (mode
);
4612 if (integer_zerop (TYPE_SIZE (type
)))
4615 gcc_assert (TYPE_MODE (type
) == mode
);
4617 if (!AGGREGATE_TYPE_P (type
))
4618 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
4620 if (TREE_CODE (type
) == ARRAY_TYPE
)
4621 return TYPE_ALIGN (TREE_TYPE (type
));
4623 unsigned int alignment
= 0;
4624 unsigned int bitfield_alignment
= 0;
4625 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
4626 if (TREE_CODE (field
) == FIELD_DECL
)
4628 alignment
= std::max (alignment
, DECL_ALIGN (field
));
4629 if (DECL_BIT_FIELD_TYPE (field
))
4631 = std::max (bitfield_alignment
,
4632 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field
)));
4635 if (bitfield_alignment
> alignment
)
4638 return bitfield_alignment
;
4644 /* Layout a function argument according to the AAPCS64 rules. The rule
4645 numbers refer to the rule numbers in the AAPCS64. */
4648 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
4650 bool named ATTRIBUTE_UNUSED
)
4652 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4653 int ncrn
, nvrn
, nregs
;
4654 bool allocate_ncrn
, allocate_nvrn
;
4658 /* We need to do this once per argument. */
4659 if (pcum
->aapcs_arg_processed
)
4662 pcum
->aapcs_arg_processed
= true;
4664 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4666 size
= int_size_in_bytes (type
);
4668 /* No frontends can create types with variable-sized modes, so we
4669 shouldn't be asked to pass or return them. */
4670 size
= GET_MODE_SIZE (mode
).to_constant ();
4671 size
= ROUND_UP (size
, UNITS_PER_WORD
);
4673 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
4674 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
4679 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
4680 The following code thus handles passing by SIMD/FP registers first. */
4682 nvrn
= pcum
->aapcs_nvrn
;
4684 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
4685 and homogenous short-vector aggregates (HVA). */
4689 aarch64_err_no_fpadvsimd (mode
);
4691 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
4693 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
4694 if (!aarch64_composite_type_p (type
, mode
))
4696 gcc_assert (nregs
== 1);
4697 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
4703 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
4704 for (i
= 0; i
< nregs
; i
++)
4706 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
4707 V0_REGNUM
+ nvrn
+ i
);
4708 rtx offset
= gen_int_mode
4709 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
4710 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
4711 XVECEXP (par
, 0, i
) = tmp
;
4713 pcum
->aapcs_reg
= par
;
4719 /* C.3 NSRN is set to 8. */
4720 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
4725 ncrn
= pcum
->aapcs_ncrn
;
4726 nregs
= size
/ UNITS_PER_WORD
;
4728 /* C6 - C9. though the sign and zero extension semantics are
4729 handled elsewhere. This is the case where the argument fits
4730 entirely general registers. */
4731 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
4733 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
4735 /* C.8 if the argument has an alignment of 16 then the NGRN is
4736 rounded up to the next even number. */
4739 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4740 comparison is there because for > 16 * BITS_PER_UNIT
4741 alignment nregs should be > 2 and therefore it should be
4742 passed by reference rather than value. */
4743 && (aarch64_function_arg_alignment (mode
, type
, &abi_break
)
4744 == 16 * BITS_PER_UNIT
))
4746 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
4747 inform (input_location
, "parameter passing for argument of type "
4748 "%qT changed in GCC 9.1", type
);
4750 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
4753 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4754 A reg is still generated for it, but the caller should be smart
4755 enough not to use it. */
4756 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
4757 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
4763 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
4764 for (i
= 0; i
< nregs
; i
++)
4766 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
4767 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
4768 GEN_INT (i
* UNITS_PER_WORD
));
4769 XVECEXP (par
, 0, i
) = tmp
;
4771 pcum
->aapcs_reg
= par
;
4774 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
4779 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
4781 /* The argument is passed on stack; record the needed number of words for
4782 this argument and align the total size if necessary. */
4784 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
4786 if (aarch64_function_arg_alignment (mode
, type
, &abi_break
)
4787 == 16 * BITS_PER_UNIT
)
4789 int new_size
= ROUND_UP (pcum
->aapcs_stack_size
, 16 / UNITS_PER_WORD
);
4790 if (pcum
->aapcs_stack_size
!= new_size
)
4792 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
4793 inform (input_location
, "parameter passing for argument of type "
4794 "%qT changed in GCC 9.1", type
);
4795 pcum
->aapcs_stack_size
= new_size
;
4801 /* Implement TARGET_FUNCTION_ARG. */
4804 aarch64_function_arg (cumulative_args_t pcum_v
, const function_arg_info
&arg
)
4806 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4807 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
4809 if (arg
.end_marker_p ())
4812 aarch64_layout_arg (pcum_v
, arg
.mode
, arg
.type
, arg
.named
);
4813 return pcum
->aapcs_reg
;
4817 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
4818 const_tree fntype ATTRIBUTE_UNUSED
,
4819 rtx libname ATTRIBUTE_UNUSED
,
4820 const_tree fndecl ATTRIBUTE_UNUSED
,
4821 unsigned n_named ATTRIBUTE_UNUSED
)
4823 pcum
->aapcs_ncrn
= 0;
4824 pcum
->aapcs_nvrn
= 0;
4825 pcum
->aapcs_nextncrn
= 0;
4826 pcum
->aapcs_nextnvrn
= 0;
4827 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
4828 pcum
->aapcs_reg
= NULL_RTX
;
4829 pcum
->aapcs_arg_processed
= false;
4830 pcum
->aapcs_stack_words
= 0;
4831 pcum
->aapcs_stack_size
= 0;
4834 && fndecl
&& TREE_PUBLIC (fndecl
)
4835 && fntype
&& fntype
!= error_mark_node
)
4837 const_tree type
= TREE_TYPE (fntype
);
4838 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
4839 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
4840 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
4841 &mode
, &nregs
, NULL
))
4842 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
4848 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
4853 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4854 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
4856 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
4857 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
4858 != (pcum
->aapcs_stack_words
!= 0));
4859 pcum
->aapcs_arg_processed
= false;
4860 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
4861 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
4862 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
4863 pcum
->aapcs_stack_words
= 0;
4864 pcum
->aapcs_reg
= NULL_RTX
;
4869 aarch64_function_arg_regno_p (unsigned regno
)
4871 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
4872 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
4875 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4876 PARM_BOUNDARY bits of alignment, but will be given anything up
4877 to STACK_BOUNDARY bits if the type requires it. This makes sure
4878 that both before and after the layout of each argument, the Next
4879 Stacked Argument Address (NSAA) will have a minimum alignment of
4883 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
4886 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
,
4888 if (abi_break
& warn_psabi
)
4889 inform (input_location
, "parameter passing for argument of type "
4890 "%qT changed in GCC 9.1", type
);
4892 return MIN (MAX (alignment
, PARM_BOUNDARY
), STACK_BOUNDARY
);
4895 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4897 static fixed_size_mode
4898 aarch64_get_reg_raw_mode (int regno
)
4900 if (TARGET_SVE
&& FP_REGNUM_P (regno
))
4901 /* Don't use the SVE part of the register for __builtin_apply and
4902 __builtin_return. The SVE registers aren't used by the normal PCS,
4903 so using them there would be a waste of time. The PCS extensions
4904 for SVE types are fundamentally incompatible with the
4905 __builtin_return/__builtin_apply interface. */
4906 return as_a
<fixed_size_mode
> (V16QImode
);
4907 return default_get_reg_raw_mode (regno
);
4910 /* Implement TARGET_FUNCTION_ARG_PADDING.
4912 Small aggregate types are placed in the lowest memory address.
4914 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4916 static pad_direction
4917 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
4919 /* On little-endian targets, the least significant byte of every stack
4920 argument is passed at the lowest byte address of the stack slot. */
4921 if (!BYTES_BIG_ENDIAN
)
4924 /* Otherwise, integral, floating-point and pointer types are padded downward:
4925 the least significant byte of a stack argument is passed at the highest
4926 byte address of the stack slot. */
4928 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
4929 || POINTER_TYPE_P (type
))
4930 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
4931 return PAD_DOWNWARD
;
4933 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4937 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4939 It specifies padding for the last (may also be the only)
4940 element of a block move between registers and memory. If
4941 assuming the block is in the memory, padding upward means that
4942 the last element is padded after its highest significant byte,
4943 while in downward padding, the last element is padded at the
4944 its least significant byte side.
4946 Small aggregates and small complex types are always padded
4949 We don't need to worry about homogeneous floating-point or
4950 short-vector aggregates; their move is not affected by the
4951 padding direction determined here. Regardless of endianness,
4952 each element of such an aggregate is put in the least
4953 significant bits of a fp/simd register.
4955 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4956 register has useful data, and return the opposite if the most
4957 significant byte does. */
4960 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
4961 bool first ATTRIBUTE_UNUSED
)
4964 /* Small composite types are always padded upward. */
4965 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
4969 size
= int_size_in_bytes (type
);
4971 /* No frontends can create types with variable-sized modes, so we
4972 shouldn't be asked to pass or return them. */
4973 size
= GET_MODE_SIZE (mode
).to_constant ();
4974 if (size
< 2 * UNITS_PER_WORD
)
4978 /* Otherwise, use the default padding. */
4979 return !BYTES_BIG_ENDIAN
;
4982 static scalar_int_mode
4983 aarch64_libgcc_cmp_return_mode (void)
4988 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4990 /* We use the 12-bit shifted immediate arithmetic instructions so values
4991 must be multiple of (1 << 12), i.e. 4096. */
4992 #define ARITH_FACTOR 4096
4994 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4995 #error Cannot use simple address calculation for stack probing
4998 /* The pair of scratch registers used for stack probing. */
4999 #define PROBE_STACK_FIRST_REG R9_REGNUM
5000 #define PROBE_STACK_SECOND_REG R10_REGNUM
5002 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5003 inclusive. These are offsets from the current stack pointer. */
5006 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
5009 if (!poly_size
.is_constant (&size
))
5011 sorry ("stack probes for SVE frames");
5015 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
5017 /* See the same assertion on PROBE_INTERVAL above. */
5018 gcc_assert ((first
% ARITH_FACTOR
) == 0);
5020 /* See if we have a constant small number of probes to generate. If so,
5021 that's the easy case. */
5022 if (size
<= PROBE_INTERVAL
)
5024 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
5026 emit_set_insn (reg1
,
5027 plus_constant (Pmode
,
5028 stack_pointer_rtx
, -(first
+ base
)));
5029 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
5032 /* The run-time loop is made up of 8 insns in the generic case while the
5033 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
5034 else if (size
<= 4 * PROBE_INTERVAL
)
5036 HOST_WIDE_INT i
, rem
;
5038 emit_set_insn (reg1
,
5039 plus_constant (Pmode
,
5041 -(first
+ PROBE_INTERVAL
)));
5042 emit_stack_probe (reg1
);
5044 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5045 it exceeds SIZE. If only two probes are needed, this will not
5046 generate any code. Then probe at FIRST + SIZE. */
5047 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
5049 emit_set_insn (reg1
,
5050 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
5051 emit_stack_probe (reg1
);
5054 rem
= size
- (i
- PROBE_INTERVAL
);
5057 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
5059 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
5060 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
5063 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
5066 /* Otherwise, do the same as above, but in a loop. Note that we must be
5067 extra careful with variables wrapping around because we might be at
5068 the very top (or the very bottom) of the address space and we have
5069 to be able to handle this case properly; in particular, we use an
5070 equality test for the loop condition. */
5073 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
5075 /* Step 1: round SIZE to the previous multiple of the interval. */
5077 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
5080 /* Step 2: compute initial and final value of the loop counter. */
5082 /* TEST_ADDR = SP + FIRST. */
5083 emit_set_insn (reg1
,
5084 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
5086 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5087 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
5088 if (! aarch64_uimm12_shift (adjustment
))
5090 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
5092 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
5095 emit_set_insn (reg2
,
5096 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
5102 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5105 while (TEST_ADDR != LAST_ADDR)
5107 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5108 until it is equal to ROUNDED_SIZE. */
5110 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
5113 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5114 that SIZE is equal to ROUNDED_SIZE. */
5116 if (size
!= rounded_size
)
5118 HOST_WIDE_INT rem
= size
- rounded_size
;
5122 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
5124 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
5125 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
5128 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
5132 /* Make sure nothing is scheduled before we are done. */
5133 emit_insn (gen_blockage ());
5136 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5137 absolute addresses. */
5140 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
5142 static int labelno
= 0;
5146 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
5149 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
5151 HOST_WIDE_INT stack_clash_probe_interval
5152 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
5154 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5156 HOST_WIDE_INT interval
;
5157 if (flag_stack_clash_protection
)
5158 interval
= stack_clash_probe_interval
;
5160 interval
= PROBE_INTERVAL
;
5162 gcc_assert (aarch64_uimm12_shift (interval
));
5163 xops
[1] = GEN_INT (interval
);
5165 output_asm_insn ("sub\t%0, %0, %1", xops
);
5167 /* If doing stack clash protection then we probe up by the ABI specified
5168 amount. We do this because we're dropping full pages at a time in the
5169 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5170 if (flag_stack_clash_protection
)
5171 xops
[1] = GEN_INT (STACK_CLASH_CALLER_GUARD
);
5173 xops
[1] = CONST0_RTX (GET_MODE (xops
[1]));
5175 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5176 by this amount for each iteration. */
5177 output_asm_insn ("str\txzr, [%0, %1]", xops
);
5179 /* Test if TEST_ADDR == LAST_ADDR. */
5181 output_asm_insn ("cmp\t%0, %1", xops
);
5184 fputs ("\tb.ne\t", asm_out_file
);
5185 assemble_name_raw (asm_out_file
, loop_lab
);
5186 fputc ('\n', asm_out_file
);
5191 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5192 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5193 of GUARD_SIZE. When a probe is emitted it is done at most
5194 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5195 at most MIN_PROBE_THRESHOLD. By the end of this function
5196 BASE = BASE - ADJUSTMENT. */
5199 aarch64_output_probe_sve_stack_clash (rtx base
, rtx adjustment
,
5200 rtx min_probe_threshold
, rtx guard_size
)
5202 /* This function is not allowed to use any instruction generation function
5203 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5204 so instead emit the code you want using output_asm_insn. */
5205 gcc_assert (flag_stack_clash_protection
);
5206 gcc_assert (CONST_INT_P (min_probe_threshold
) && CONST_INT_P (guard_size
));
5207 gcc_assert (INTVAL (guard_size
) > INTVAL (min_probe_threshold
));
5209 /* The minimum required allocation before the residual requires probing. */
5210 HOST_WIDE_INT residual_probe_guard
= INTVAL (min_probe_threshold
);
5212 /* Clamp the value down to the nearest value that can be used with a cmp. */
5213 residual_probe_guard
= aarch64_clamp_to_uimm12_shift (residual_probe_guard
);
5214 rtx probe_offset_value_rtx
= gen_int_mode (residual_probe_guard
, Pmode
);
5216 gcc_assert (INTVAL (min_probe_threshold
) >= residual_probe_guard
);
5217 gcc_assert (aarch64_uimm12_shift (residual_probe_guard
));
5219 static int labelno
= 0;
5220 char loop_start_lab
[32];
5221 char loop_end_lab
[32];
5224 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab
, "SVLPSPL", labelno
);
5225 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab
, "SVLPEND", labelno
++);
5227 /* Emit loop start label. */
5228 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_start_lab
);
5230 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5231 xops
[0] = adjustment
;
5232 xops
[1] = probe_offset_value_rtx
;
5233 output_asm_insn ("cmp\t%0, %1", xops
);
5235 /* Branch to end if not enough adjustment to probe. */
5236 fputs ("\tb.lt\t", asm_out_file
);
5237 assemble_name_raw (asm_out_file
, loop_end_lab
);
5238 fputc ('\n', asm_out_file
);
5240 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5242 xops
[1] = probe_offset_value_rtx
;
5243 output_asm_insn ("sub\t%0, %0, %1", xops
);
5245 /* Probe at BASE. */
5246 xops
[1] = const0_rtx
;
5247 output_asm_insn ("str\txzr, [%0, %1]", xops
);
5249 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5250 xops
[0] = adjustment
;
5251 xops
[1] = probe_offset_value_rtx
;
5252 output_asm_insn ("sub\t%0, %0, %1", xops
);
5254 /* Branch to start if still more bytes to allocate. */
5255 fputs ("\tb\t", asm_out_file
);
5256 assemble_name_raw (asm_out_file
, loop_start_lab
);
5257 fputc ('\n', asm_out_file
);
5259 /* No probe leave. */
5260 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_end_lab
);
5262 /* BASE = BASE - ADJUSTMENT. */
5264 xops
[1] = adjustment
;
5265 output_asm_insn ("sub\t%0, %0, %1", xops
);
5269 /* Determine whether a frame chain needs to be generated. */
5271 aarch64_needs_frame_chain (void)
5273 /* Force a frame chain for EH returns so the return address is at FP+8. */
5274 if (frame_pointer_needed
|| crtl
->calls_eh_return
)
5277 /* A leaf function cannot have calls or write LR. */
5278 bool is_leaf
= crtl
->is_leaf
&& !df_regs_ever_live_p (LR_REGNUM
);
5280 /* Don't use a frame chain in leaf functions if leaf frame pointers
5282 if (flag_omit_leaf_frame_pointer
&& is_leaf
)
5285 return aarch64_use_frame_pointer
;
5288 /* Mark the registers that need to be saved by the callee and calculate
5289 the size of the callee-saved registers area and frame record (both FP
5290 and LR may be omitted). */
5292 aarch64_layout_frame (void)
5294 HOST_WIDE_INT offset
= 0;
5295 int regno
, last_fp_reg
= INVALID_REGNUM
;
5296 bool simd_function
= aarch64_simd_decl_p (cfun
->decl
);
5298 cfun
->machine
->frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
5300 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5301 the mid-end is doing. */
5302 crtl
->outgoing_args_size
= STACK_DYNAMIC_OFFSET (cfun
);
5304 #define SLOT_NOT_REQUIRED (-2)
5305 #define SLOT_REQUIRED (-1)
5307 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
5308 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
5310 /* If this is a non-leaf simd function with calls we assume that
5311 at least one of those calls is to a non-simd function and thus
5312 we must save V8 to V23 in the prologue. */
5314 if (simd_function
&& !crtl
->is_leaf
)
5316 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
5317 if (FP_SIMD_SAVED_REGNUM_P (regno
))
5318 df_set_regs_ever_live (regno
, true);
5321 /* First mark all the registers that really need to be saved... */
5322 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
5323 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
5325 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
5326 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
5328 /* ... that includes the eh data registers (if needed)... */
5329 if (crtl
->calls_eh_return
)
5330 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
5331 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
5334 /* ... and any callee saved register that dataflow says is live. */
5335 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
5336 if (df_regs_ever_live_p (regno
)
5337 && (regno
== R30_REGNUM
5338 || !call_used_regs
[regno
]))
5339 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
5341 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
5342 if (df_regs_ever_live_p (regno
)
5343 && (!call_used_regs
[regno
]
5344 || (simd_function
&& FP_SIMD_SAVED_REGNUM_P (regno
))))
5346 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
5347 last_fp_reg
= regno
;
5350 if (cfun
->machine
->frame
.emit_frame_chain
)
5352 /* FP and LR are placed in the linkage record. */
5353 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
5354 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
5355 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
5356 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
5357 offset
= 2 * UNITS_PER_WORD
;
5360 /* With stack-clash, LR must be saved in non-leaf functions. */
5361 gcc_assert (crtl
->is_leaf
5362 || (cfun
->machine
->frame
.reg_offset
[R30_REGNUM
]
5363 != SLOT_NOT_REQUIRED
));
5365 /* Now assign stack slots for them. */
5366 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
5367 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
5369 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
5370 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
5371 cfun
->machine
->frame
.wb_candidate1
= regno
;
5372 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
)
5373 cfun
->machine
->frame
.wb_candidate2
= regno
;
5374 offset
+= UNITS_PER_WORD
;
5377 HOST_WIDE_INT max_int_offset
= offset
;
5378 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
5379 bool has_align_gap
= offset
!= max_int_offset
;
5381 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
5382 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
5384 /* If there is an alignment gap between integer and fp callee-saves,
5385 allocate the last fp register to it if possible. */
5386 if (regno
== last_fp_reg
5389 && (offset
& 8) == 0)
5391 cfun
->machine
->frame
.reg_offset
[regno
] = max_int_offset
;
5395 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
5396 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
5397 cfun
->machine
->frame
.wb_candidate1
= regno
;
5398 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
5399 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
5400 cfun
->machine
->frame
.wb_candidate2
= regno
;
5401 offset
+= simd_function
? UNITS_PER_VREG
: UNITS_PER_WORD
;
5404 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
5406 cfun
->machine
->frame
.saved_regs_size
= offset
;
5408 HOST_WIDE_INT varargs_and_saved_regs_size
5409 = offset
+ cfun
->machine
->frame
.saved_varargs_size
;
5411 cfun
->machine
->frame
.hard_fp_offset
5412 = aligned_upper_bound (varargs_and_saved_regs_size
5413 + get_frame_size (),
5414 STACK_BOUNDARY
/ BITS_PER_UNIT
);
5416 /* Both these values are already aligned. */
5417 gcc_assert (multiple_p (crtl
->outgoing_args_size
,
5418 STACK_BOUNDARY
/ BITS_PER_UNIT
));
5419 cfun
->machine
->frame
.frame_size
5420 = (cfun
->machine
->frame
.hard_fp_offset
5421 + crtl
->outgoing_args_size
);
5423 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.saved_varargs_size
;
5425 cfun
->machine
->frame
.initial_adjust
= 0;
5426 cfun
->machine
->frame
.final_adjust
= 0;
5427 cfun
->machine
->frame
.callee_adjust
= 0;
5428 cfun
->machine
->frame
.callee_offset
= 0;
5430 HOST_WIDE_INT max_push_offset
= 0;
5431 if (cfun
->machine
->frame
.wb_candidate2
!= INVALID_REGNUM
)
5432 max_push_offset
= 512;
5433 else if (cfun
->machine
->frame
.wb_candidate1
!= INVALID_REGNUM
)
5434 max_push_offset
= 256;
5436 HOST_WIDE_INT const_size
, const_fp_offset
;
5437 if (cfun
->machine
->frame
.frame_size
.is_constant (&const_size
)
5438 && const_size
< max_push_offset
5439 && known_eq (crtl
->outgoing_args_size
, 0))
5441 /* Simple, small frame with no outgoing arguments:
5442 stp reg1, reg2, [sp, -frame_size]!
5443 stp reg3, reg4, [sp, 16] */
5444 cfun
->machine
->frame
.callee_adjust
= const_size
;
5446 else if (known_lt (crtl
->outgoing_args_size
5447 + cfun
->machine
->frame
.saved_regs_size
, 512)
5448 && !(cfun
->calls_alloca
5449 && known_lt (cfun
->machine
->frame
.hard_fp_offset
,
5452 /* Frame with small outgoing arguments:
5453 sub sp, sp, frame_size
5454 stp reg1, reg2, [sp, outgoing_args_size]
5455 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5456 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.frame_size
;
5457 cfun
->machine
->frame
.callee_offset
5458 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.hard_fp_offset
;
5460 else if (cfun
->machine
->frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
5461 && const_fp_offset
< max_push_offset
)
5463 /* Frame with large outgoing arguments but a small local area:
5464 stp reg1, reg2, [sp, -hard_fp_offset]!
5465 stp reg3, reg4, [sp, 16]
5466 sub sp, sp, outgoing_args_size */
5467 cfun
->machine
->frame
.callee_adjust
= const_fp_offset
;
5468 cfun
->machine
->frame
.final_adjust
5469 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
5473 /* Frame with large local area and outgoing arguments using frame pointer:
5474 sub sp, sp, hard_fp_offset
5475 stp x29, x30, [sp, 0]
5477 stp reg3, reg4, [sp, 16]
5478 sub sp, sp, outgoing_args_size */
5479 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
5480 cfun
->machine
->frame
.final_adjust
5481 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.initial_adjust
;
5484 cfun
->machine
->frame
.laid_out
= true;
5487 /* Return true if the register REGNO is saved on entry to
5488 the current function. */
5491 aarch64_register_saved_on_entry (int regno
)
5493 return cfun
->machine
->frame
.reg_offset
[regno
] >= 0;
5496 /* Return the next register up from REGNO up to LIMIT for the callee
5500 aarch64_next_callee_save (unsigned regno
, unsigned limit
)
5502 while (regno
<= limit
&& !aarch64_register_saved_on_entry (regno
))
5507 /* Push the register number REGNO of mode MODE to the stack with write-back
5508 adjusting the stack by ADJUSTMENT. */
5511 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
5512 HOST_WIDE_INT adjustment
)
5514 rtx base_rtx
= stack_pointer_rtx
;
5517 reg
= gen_rtx_REG (mode
, regno
);
5518 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
5519 plus_constant (Pmode
, base_rtx
, -adjustment
));
5520 mem
= gen_frame_mem (mode
, mem
);
5522 insn
= emit_move_insn (mem
, reg
);
5523 RTX_FRAME_RELATED_P (insn
) = 1;
5526 /* Generate and return an instruction to store the pair of registers
5527 REG and REG2 of mode MODE to location BASE with write-back adjusting
5528 the stack location BASE by ADJUSTMENT. */
5531 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
5532 HOST_WIDE_INT adjustment
)
5537 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
5538 GEN_INT (-adjustment
),
5539 GEN_INT (UNITS_PER_WORD
- adjustment
));
5541 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
5542 GEN_INT (-adjustment
),
5543 GEN_INT (UNITS_PER_WORD
- adjustment
));
5545 return gen_storewb_pairtf_di (base
, base
, reg
, reg2
,
5546 GEN_INT (-adjustment
),
5547 GEN_INT (UNITS_PER_VREG
- adjustment
));
5553 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5554 stack pointer by ADJUSTMENT. */
5557 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
5560 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno1
);
5562 if (regno2
== INVALID_REGNUM
)
5563 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
5565 rtx reg1
= gen_rtx_REG (mode
, regno1
);
5566 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5568 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
5570 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
5571 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
5572 RTX_FRAME_RELATED_P (insn
) = 1;
5575 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
5576 adjusting it by ADJUSTMENT afterwards. */
5579 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
5580 HOST_WIDE_INT adjustment
)
5585 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5586 GEN_INT (UNITS_PER_WORD
));
5588 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5589 GEN_INT (UNITS_PER_WORD
));
5591 return gen_loadwb_pairtf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5592 GEN_INT (UNITS_PER_VREG
));
5598 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5599 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5603 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
5606 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno1
);
5607 rtx reg1
= gen_rtx_REG (mode
, regno1
);
5609 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
5611 if (regno2
== INVALID_REGNUM
)
5613 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
5614 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
5615 emit_move_insn (reg1
, gen_frame_mem (mode
, mem
));
5619 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5620 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
5621 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
5626 /* Generate and return a store pair instruction of mode MODE to store
5627 register REG1 to MEM1 and register REG2 to MEM2. */
5630 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
5636 return gen_store_pair_dw_didi (mem1
, reg1
, mem2
, reg2
);
5639 return gen_store_pair_dw_dfdf (mem1
, reg1
, mem2
, reg2
);
5642 return gen_store_pair_dw_tftf (mem1
, reg1
, mem2
, reg2
);
5649 /* Generate and return a load pair instruction of mode MODE to load register
5650 REG1 from MEM1 and register REG2 from MEM2. */
5653 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
5659 return gen_load_pair_dw_didi (reg1
, mem1
, reg2
, mem2
);
5662 return gen_load_pair_dw_dfdf (reg1
, mem1
, reg2
, mem2
);
5665 return gen_load_pair_dw_tftf (reg1
, mem1
, reg2
, mem2
);
5672 /* Return TRUE if return address signing should be enabled for the current
5673 function, otherwise return FALSE. */
5676 aarch64_return_address_signing_enabled (void)
5678 /* This function should only be called after frame laid out. */
5679 gcc_assert (cfun
->machine
->frame
.laid_out
);
5681 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5682 if its LR is pushed onto stack. */
5683 return (aarch64_ra_sign_scope
== AARCH64_FUNCTION_ALL
5684 || (aarch64_ra_sign_scope
== AARCH64_FUNCTION_NON_LEAF
5685 && cfun
->machine
->frame
.reg_offset
[LR_REGNUM
] >= 0));
5688 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5690 aarch64_bti_enabled (void)
5692 return (aarch64_enable_bti
== 1);
5695 /* Emit code to save the callee-saved registers from register number START
5696 to LIMIT to the stack at the location starting at offset START_OFFSET,
5697 skipping any write-back candidates if SKIP_WB is true. */
5700 aarch64_save_callee_saves (machine_mode mode
, poly_int64 start_offset
,
5701 unsigned start
, unsigned limit
, bool skip_wb
)
5707 for (regno
= aarch64_next_callee_save (start
, limit
);
5709 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
5716 && (regno
== cfun
->machine
->frame
.wb_candidate1
5717 || regno
== cfun
->machine
->frame
.wb_candidate2
))
5720 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
5723 reg
= gen_rtx_REG (mode
, regno
);
5724 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
5725 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
5728 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
5729 offset_diff
= cfun
->machine
->frame
.reg_offset
[regno2
]
5730 - cfun
->machine
->frame
.reg_offset
[regno
];
5733 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
5734 && known_eq (GET_MODE_SIZE (mode
), offset_diff
))
5736 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5739 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
5740 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
5742 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
5745 /* The first part of a frame-related parallel insn is
5746 always assumed to be relevant to the frame
5747 calculations; subsequent parts are only
5748 frame-related if explicitly marked. */
5749 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
5753 insn
= emit_move_insn (mem
, reg
);
5755 RTX_FRAME_RELATED_P (insn
) = 1;
5759 /* Emit code to restore the callee registers of mode MODE from register
5760 number START up to and including LIMIT. Restore from the stack offset
5761 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5762 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5765 aarch64_restore_callee_saves (machine_mode mode
,
5766 poly_int64 start_offset
, unsigned start
,
5767 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
5769 rtx base_rtx
= stack_pointer_rtx
;
5774 for (regno
= aarch64_next_callee_save (start
, limit
);
5776 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
5778 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
5785 && (regno
== cfun
->machine
->frame
.wb_candidate1
5786 || regno
== cfun
->machine
->frame
.wb_candidate2
))
5789 reg
= gen_rtx_REG (mode
, regno
);
5790 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
5791 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
5793 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
5794 offset_diff
= cfun
->machine
->frame
.reg_offset
[regno2
]
5795 - cfun
->machine
->frame
.reg_offset
[regno
];
5798 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
5799 && known_eq (GET_MODE_SIZE (mode
), offset_diff
))
5801 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5804 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
5805 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
5806 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
5808 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
5812 emit_move_insn (reg
, mem
);
5813 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
5817 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5821 offset_4bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5823 HOST_WIDE_INT multiple
;
5824 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5825 && IN_RANGE (multiple
, -8, 7));
5828 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
5832 offset_6bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
5834 HOST_WIDE_INT multiple
;
5835 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5836 && IN_RANGE (multiple
, 0, 63));
5839 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5843 aarch64_offset_7bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5845 HOST_WIDE_INT multiple
;
5846 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5847 && IN_RANGE (multiple
, -64, 63));
5850 /* Return true if OFFSET is a signed 9-bit value. */
5853 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED
,
5856 HOST_WIDE_INT const_offset
;
5857 return (offset
.is_constant (&const_offset
)
5858 && IN_RANGE (const_offset
, -256, 255));
5861 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5865 offset_9bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5867 HOST_WIDE_INT multiple
;
5868 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5869 && IN_RANGE (multiple
, -256, 255));
5872 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5876 offset_12bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
5878 HOST_WIDE_INT multiple
;
5879 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5880 && IN_RANGE (multiple
, 0, 4095));
5883 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5886 aarch64_get_separate_components (void)
5888 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
5889 bitmap_clear (components
);
5891 /* The registers we need saved to the frame. */
5892 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
5893 if (aarch64_register_saved_on_entry (regno
))
5895 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5896 if (!frame_pointer_needed
)
5897 offset
+= cfun
->machine
->frame
.frame_size
5898 - cfun
->machine
->frame
.hard_fp_offset
;
5899 /* Check that we can access the stack slot of the register with one
5900 direct load with no adjustments needed. */
5901 if (offset_12bit_unsigned_scaled_p (DImode
, offset
))
5902 bitmap_set_bit (components
, regno
);
5905 /* Don't mess with the hard frame pointer. */
5906 if (frame_pointer_needed
)
5907 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
5909 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
5910 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
5911 /* If registers have been chosen to be stored/restored with
5912 writeback don't interfere with them to avoid having to output explicit
5913 stack adjustment instructions. */
5914 if (reg2
!= INVALID_REGNUM
)
5915 bitmap_clear_bit (components
, reg2
);
5916 if (reg1
!= INVALID_REGNUM
)
5917 bitmap_clear_bit (components
, reg1
);
5919 bitmap_clear_bit (components
, LR_REGNUM
);
5920 bitmap_clear_bit (components
, SP_REGNUM
);
5925 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5928 aarch64_components_for_bb (basic_block bb
)
5930 bitmap in
= DF_LIVE_IN (bb
);
5931 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
5932 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
5933 bool simd_function
= aarch64_simd_decl_p (cfun
->decl
);
5935 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
5936 bitmap_clear (components
);
5938 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5939 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
5940 if ((!call_used_regs
[regno
]
5941 || (simd_function
&& FP_SIMD_SAVED_REGNUM_P (regno
)))
5942 && (bitmap_bit_p (in
, regno
)
5943 || bitmap_bit_p (gen
, regno
)
5944 || bitmap_bit_p (kill
, regno
)))
5946 unsigned regno2
, offset
, offset2
;
5947 bitmap_set_bit (components
, regno
);
5949 /* If there is a callee-save at an adjacent offset, add it too
5950 to increase the use of LDP/STP. */
5951 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5952 regno2
= ((offset
& 8) == 0) ? regno
+ 1 : regno
- 1;
5954 if (regno2
<= LAST_SAVED_REGNUM
)
5956 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
5957 if ((offset
& ~8) == (offset2
& ~8))
5958 bitmap_set_bit (components
, regno2
);
5965 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5966 Nothing to do for aarch64. */
5969 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
5973 /* Return the next set bit in BMP from START onwards. Return the total number
5974 of bits in BMP if no set bit is found at or after START. */
5977 aarch64_get_next_set_bit (sbitmap bmp
, unsigned int start
)
5979 unsigned int nbits
= SBITMAP_SIZE (bmp
);
5983 gcc_assert (start
< nbits
);
5984 for (unsigned int i
= start
; i
< nbits
; i
++)
5985 if (bitmap_bit_p (bmp
, i
))
5991 /* Do the work for aarch64_emit_prologue_components and
5992 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5993 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5994 for these components or the epilogue sequence. That is, it determines
5995 whether we should emit stores or loads and what kind of CFA notes to attach
5996 to the insns. Otherwise the logic for the two sequences is very
6000 aarch64_process_components (sbitmap components
, bool prologue_p
)
6002 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
6003 ? HARD_FRAME_POINTER_REGNUM
6004 : STACK_POINTER_REGNUM
);
6006 unsigned last_regno
= SBITMAP_SIZE (components
);
6007 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
6008 rtx_insn
*insn
= NULL
;
6010 while (regno
!= last_regno
)
6012 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
6013 so DFmode for the vector registers is enough. For simd functions
6014 we want to save the low 128 bits. */
6015 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno
);
6017 rtx reg
= gen_rtx_REG (mode
, regno
);
6018 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
6019 if (!frame_pointer_needed
)
6020 offset
+= cfun
->machine
->frame
.frame_size
6021 - cfun
->machine
->frame
.hard_fp_offset
;
6022 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
6023 rtx mem
= gen_frame_mem (mode
, addr
);
6025 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
6026 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
6027 /* No more registers to handle after REGNO.
6028 Emit a single save/restore and exit. */
6029 if (regno2
== last_regno
)
6031 insn
= emit_insn (set
);
6032 RTX_FRAME_RELATED_P (insn
) = 1;
6034 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
6036 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
6040 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
6041 /* The next register is not of the same class or its offset is not
6042 mergeable with the current one into a pair. */
6043 if (!satisfies_constraint_Ump (mem
)
6044 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
6045 || (aarch64_simd_decl_p (cfun
->decl
) && FP_REGNUM_P (regno
))
6046 || maybe_ne ((offset2
- cfun
->machine
->frame
.reg_offset
[regno
]),
6047 GET_MODE_SIZE (mode
)))
6049 insn
= emit_insn (set
);
6050 RTX_FRAME_RELATED_P (insn
) = 1;
6052 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
6054 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
6060 /* REGNO2 can be saved/restored in a pair with REGNO. */
6061 rtx reg2
= gen_rtx_REG (mode
, regno2
);
6062 if (!frame_pointer_needed
)
6063 offset2
+= cfun
->machine
->frame
.frame_size
6064 - cfun
->machine
->frame
.hard_fp_offset
;
6065 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
6066 rtx mem2
= gen_frame_mem (mode
, addr2
);
6067 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
6068 : gen_rtx_SET (reg2
, mem2
);
6071 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
, reg2
));
6073 insn
= emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
6075 RTX_FRAME_RELATED_P (insn
) = 1;
6078 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
6079 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
6083 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
6084 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
6087 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
6091 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6094 aarch64_emit_prologue_components (sbitmap components
)
6096 aarch64_process_components (components
, true);
6099 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6102 aarch64_emit_epilogue_components (sbitmap components
)
6104 aarch64_process_components (components
, false);
6107 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6110 aarch64_set_handled_components (sbitmap components
)
6112 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
6113 if (bitmap_bit_p (components
, regno
))
6114 cfun
->machine
->reg_is_wrapped_separately
[regno
] = true;
6117 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
6118 determining the probe offset for alloca. */
6120 static HOST_WIDE_INT
6121 aarch64_stack_clash_protection_alloca_probe_range (void)
6123 return STACK_CLASH_CALLER_GUARD
;
6127 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6128 registers. If POLY_SIZE is not large enough to require a probe this function
6129 will only adjust the stack. When allocating the stack space
6130 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6131 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6132 arguments. If we are then we ensure that any allocation larger than the ABI
6133 defined buffer needs a probe so that the invariant of having a 1KB buffer is
6136 We emit barriers after each stack adjustment to prevent optimizations from
6137 breaking the invariant that we never drop the stack more than a page. This
6138 invariant is needed to make it easier to correctly handle asynchronous
6139 events, e.g. if we were to allow the stack to be dropped by more than a page
6140 and then have multiple probes up and we take a signal somewhere in between
6141 then the signal handler doesn't know the state of the stack and can make no
6142 assumptions about which pages have been probed. */
6145 aarch64_allocate_and_probe_stack_space (rtx temp1
, rtx temp2
,
6146 poly_int64 poly_size
,
6147 bool frame_related_p
,
6148 bool final_adjustment_p
)
6150 HOST_WIDE_INT guard_size
6151 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
6152 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
6153 /* When doing the final adjustment for the outgoing argument size we can't
6154 assume that LR was saved at position 0. So subtract its offset from the
6155 ABI safe buffer so that we don't accidentally allow an adjustment that
6156 would result in an allocation larger than the ABI buffer without
6158 HOST_WIDE_INT min_probe_threshold
6159 = final_adjustment_p
6160 ? guard_used_by_caller
- cfun
->machine
->frame
.reg_offset
[LR_REGNUM
]
6161 : guard_size
- guard_used_by_caller
;
6163 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
6165 /* We should always have a positive probe threshold. */
6166 gcc_assert (min_probe_threshold
> 0);
6168 if (flag_stack_clash_protection
&& !final_adjustment_p
)
6170 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
6171 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
6173 if (known_eq (frame_size
, 0))
6175 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME
, false);
6177 else if (known_lt (initial_adjust
, guard_size
- guard_used_by_caller
)
6178 && known_lt (final_adjust
, guard_used_by_caller
))
6180 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME
, true);
6184 /* If SIZE is not large enough to require probing, just adjust the stack and
6186 if (known_lt (poly_size
, min_probe_threshold
)
6187 || !flag_stack_clash_protection
)
6189 aarch64_sub_sp (temp1
, temp2
, poly_size
, frame_related_p
);
6194 /* Handle the SVE non-constant case first. */
6195 if (!poly_size
.is_constant (&size
))
6199 fprintf (dump_file
, "Stack clash SVE prologue: ");
6200 print_dec (poly_size
, dump_file
);
6201 fprintf (dump_file
, " bytes, dynamic probing will be required.\n");
6204 /* First calculate the amount of bytes we're actually spilling. */
6205 aarch64_add_offset (Pmode
, temp1
, CONST0_RTX (Pmode
),
6206 poly_size
, temp1
, temp2
, false, true);
6208 rtx_insn
*insn
= get_last_insn ();
6210 if (frame_related_p
)
6212 /* This is done to provide unwinding information for the stack
6213 adjustments we're about to do, however to prevent the optimizers
6214 from removing the R11 move and leaving the CFA note (which would be
6215 very wrong) we tie the old and new stack pointer together.
6216 The tie will expand to nothing but the optimizers will not touch
6218 rtx stack_ptr_copy
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
6219 emit_move_insn (stack_ptr_copy
, stack_pointer_rtx
);
6220 emit_insn (gen_stack_tie (stack_ptr_copy
, stack_pointer_rtx
));
6222 /* We want the CFA independent of the stack pointer for the
6223 duration of the loop. */
6224 add_reg_note (insn
, REG_CFA_DEF_CFA
, stack_ptr_copy
);
6225 RTX_FRAME_RELATED_P (insn
) = 1;
6228 rtx probe_const
= gen_int_mode (min_probe_threshold
, Pmode
);
6229 rtx guard_const
= gen_int_mode (guard_size
, Pmode
);
6231 insn
= emit_insn (gen_probe_sve_stack_clash (Pmode
, stack_pointer_rtx
,
6232 stack_pointer_rtx
, temp1
,
6233 probe_const
, guard_const
));
6235 /* Now reset the CFA register if needed. */
6236 if (frame_related_p
)
6238 add_reg_note (insn
, REG_CFA_DEF_CFA
,
6239 gen_rtx_PLUS (Pmode
, stack_pointer_rtx
,
6240 gen_int_mode (poly_size
, Pmode
)));
6241 RTX_FRAME_RELATED_P (insn
) = 1;
6249 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6250 " bytes, probing will be required.\n", size
);
6252 /* Round size down to a multiple of guard_size, and calculate the
6253 residual as the difference between the original size and the rounded
6255 HOST_WIDE_INT rounded_size
= ROUND_DOWN (size
, guard_size
);
6256 HOST_WIDE_INT residual
= size
- rounded_size
;
6258 /* We can handle a small number of allocations/probes inline. Otherwise
6260 if (rounded_size
<= STACK_CLASH_MAX_UNROLL_PAGES
* guard_size
)
6262 for (HOST_WIDE_INT i
= 0; i
< rounded_size
; i
+= guard_size
)
6264 aarch64_sub_sp (NULL
, temp2
, guard_size
, true);
6265 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
6266 guard_used_by_caller
));
6267 emit_insn (gen_blockage ());
6269 dump_stack_clash_frame_info (PROBE_INLINE
, size
!= rounded_size
);
6273 /* Compute the ending address. */
6274 aarch64_add_offset (Pmode
, temp1
, stack_pointer_rtx
, -rounded_size
,
6275 temp1
, NULL
, false, true);
6276 rtx_insn
*insn
= get_last_insn ();
6278 /* For the initial allocation, we don't have a frame pointer
6279 set up, so we always need CFI notes. If we're doing the
6280 final allocation, then we may have a frame pointer, in which
6281 case it is the CFA, otherwise we need CFI notes.
6283 We can determine which allocation we are doing by looking at
6284 the value of FRAME_RELATED_P since the final allocations are not
6286 if (frame_related_p
)
6288 /* We want the CFA independent of the stack pointer for the
6289 duration of the loop. */
6290 add_reg_note (insn
, REG_CFA_DEF_CFA
,
6291 plus_constant (Pmode
, temp1
, rounded_size
));
6292 RTX_FRAME_RELATED_P (insn
) = 1;
6295 /* This allocates and probes the stack. Note that this re-uses some of
6296 the existing Ada stack protection code. However we are guaranteed not
6297 to enter the non loop or residual branches of that code.
6299 The non-loop part won't be entered because if our allocation amount
6300 doesn't require a loop, the case above would handle it.
6302 The residual amount won't be entered because TEMP1 is a multiple of
6303 the allocation size. The residual will always be 0. As such, the only
6304 part we are actually using from that code is the loop setup. The
6305 actual probing is done in aarch64_output_probe_stack_range. */
6306 insn
= emit_insn (gen_probe_stack_range (stack_pointer_rtx
,
6307 stack_pointer_rtx
, temp1
));
6309 /* Now reset the CFA register if needed. */
6310 if (frame_related_p
)
6312 add_reg_note (insn
, REG_CFA_DEF_CFA
,
6313 plus_constant (Pmode
, stack_pointer_rtx
, rounded_size
));
6314 RTX_FRAME_RELATED_P (insn
) = 1;
6317 emit_insn (gen_blockage ());
6318 dump_stack_clash_frame_info (PROBE_LOOP
, size
!= rounded_size
);
6321 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6322 be probed. This maintains the requirement that each page is probed at
6323 least once. For initial probing we probe only if the allocation is
6324 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6325 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6326 GUARD_SIZE. This means that for any allocation that is large enough to
6327 trigger a probe here, we'll have at least one, and if they're not large
6328 enough for this code to emit anything for them, the page would have been
6329 probed by the saving of FP/LR either by this function or any callees. If
6330 we don't have any callees then we won't have more stack adjustments and so
6334 HOST_WIDE_INT residual_probe_offset
= guard_used_by_caller
;
6335 /* If we're doing final adjustments, and we've done any full page
6336 allocations then any residual needs to be probed. */
6337 if (final_adjustment_p
&& rounded_size
!= 0)
6338 min_probe_threshold
= 0;
6339 /* If doing a small final adjustment, we always probe at offset 0.
6340 This is done to avoid issues when LR is not at position 0 or when
6341 the final adjustment is smaller than the probing offset. */
6342 else if (final_adjustment_p
&& rounded_size
== 0)
6343 residual_probe_offset
= 0;
6345 aarch64_sub_sp (temp1
, temp2
, residual
, frame_related_p
);
6346 if (residual
>= min_probe_threshold
)
6350 "Stack clash AArch64 prologue residuals: "
6351 HOST_WIDE_INT_PRINT_DEC
" bytes, probing will be required."
6354 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
6355 residual_probe_offset
));
6356 emit_insn (gen_blockage ());
6361 /* Return 1 if the register is used by the epilogue. We need to say the
6362 return register is used, but only after epilogue generation is complete.
6363 Note that in the case of sibcalls, the values "used by the epilogue" are
6364 considered live at the start of the called function.
6366 For SIMD functions we need to return 1 for FP registers that are saved and
6367 restored by a function but are not zero in call_used_regs. If we do not do
6368 this optimizations may remove the restore of the register. */
6371 aarch64_epilogue_uses (int regno
)
6373 if (epilogue_completed
)
6375 if (regno
== LR_REGNUM
)
6377 if (aarch64_simd_decl_p (cfun
->decl
) && FP_SIMD_SAVED_REGNUM_P (regno
))
6383 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6384 is saved at BASE + OFFSET. */
6387 aarch64_add_cfa_expression (rtx_insn
*insn
, unsigned int reg
,
6388 rtx base
, poly_int64 offset
)
6390 rtx mem
= gen_frame_mem (DImode
, plus_constant (Pmode
, base
, offset
));
6391 add_reg_note (insn
, REG_CFA_EXPRESSION
,
6392 gen_rtx_SET (mem
, regno_reg_rtx
[reg
]));
6395 /* AArch64 stack frames generated by this compiler look like:
6397 +-------------------------------+
6399 | incoming stack arguments |
6401 +-------------------------------+
6402 | | <-- incoming stack pointer (aligned)
6403 | callee-allocated save area |
6404 | for register varargs |
6406 +-------------------------------+
6407 | local variables | <-- frame_pointer_rtx
6409 +-------------------------------+
6411 +-------------------------------+ |
6412 | callee-saved registers | | frame.saved_regs_size
6413 +-------------------------------+ |
6415 +-------------------------------+ |
6416 | FP' | / <- hard_frame_pointer_rtx (aligned)
6417 +-------------------------------+
6418 | dynamic allocation |
6419 +-------------------------------+
6421 +-------------------------------+
6422 | outgoing stack arguments | <-- arg_pointer
6424 +-------------------------------+
6425 | | <-- stack_pointer_rtx (aligned)
6427 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6428 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6431 By default for stack-clash we assume the guard is at least 64KB, but this
6432 value is configurable to either 4KB or 64KB. We also force the guard size to
6433 be the same as the probing interval and both values are kept in sync.
6435 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6436 on the guard size) of stack space without probing.
6438 When probing is needed, we emit a probe at the start of the prologue
6439 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6441 We have to track how much space has been allocated and the only stores
6442 to the stack we track as implicit probes are the FP/LR stores.
6444 For outgoing arguments we probe if the size is larger than 1KB, such that
6445 the ABI specified buffer is maintained for the next callee.
6447 The following registers are reserved during frame layout and should not be
6448 used for any other purpose:
6450 - r11: Used by stack clash protection when SVE is enabled.
6451 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6452 - r14 and r15: Used for speculation tracking.
6453 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6454 - r30(LR), r29(FP): Used by standard frame layout.
6456 These registers must be avoided in frame layout related code unless the
6457 explicit intention is to interact with one of the features listed above. */
6459 /* Generate the prologue instructions for entry into a function.
6460 Establish the stack frame by decreasing the stack pointer with a
6461 properly calculated size and, if necessary, create a frame record
6462 filled with the values of LR and previous frame pointer. The
6463 current FP is also set up if it is in use. */
6466 aarch64_expand_prologue (void)
6468 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
6469 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
6470 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
6471 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
6472 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
6473 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
6474 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
6475 bool emit_frame_chain
= cfun
->machine
->frame
.emit_frame_chain
;
6478 /* Sign return address for functions. */
6479 if (aarch64_return_address_signing_enabled ())
6481 switch (aarch64_ra_sign_key
)
6484 insn
= emit_insn (gen_paciasp ());
6487 insn
= emit_insn (gen_pacibsp ());
6492 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
6493 RTX_FRAME_RELATED_P (insn
) = 1;
6496 if (flag_stack_usage_info
)
6497 current_function_static_stack_size
= constant_lower_bound (frame_size
);
6499 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
6501 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
6503 if (maybe_gt (frame_size
, PROBE_INTERVAL
)
6504 && maybe_gt (frame_size
, get_stack_check_protect ()))
6505 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6507 - get_stack_check_protect ()));
6509 else if (maybe_gt (frame_size
, 0))
6510 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size
);
6513 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
6514 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
6516 /* In theory we should never have both an initial adjustment
6517 and a callee save adjustment. Verify that is the case since the
6518 code below does not handle it for -fstack-clash-protection. */
6519 gcc_assert (known_eq (initial_adjust
, 0) || callee_adjust
== 0);
6521 /* Will only probe if the initial adjustment is larger than the guard
6522 less the amount of the guard reserved for use by the caller's
6524 aarch64_allocate_and_probe_stack_space (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
6527 if (callee_adjust
!= 0)
6528 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
6530 if (emit_frame_chain
)
6532 poly_int64 reg_offset
= callee_adjust
;
6533 if (callee_adjust
== 0)
6537 reg_offset
= callee_offset
;
6538 aarch64_save_callee_saves (DImode
, reg_offset
, reg1
, reg2
, false);
6540 aarch64_add_offset (Pmode
, hard_frame_pointer_rtx
,
6541 stack_pointer_rtx
, callee_offset
,
6542 tmp1_rtx
, tmp0_rtx
, frame_pointer_needed
);
6543 if (frame_pointer_needed
&& !frame_size
.is_constant ())
6545 /* Variable-sized frames need to describe the save slot
6546 address using DW_CFA_expression rather than DW_CFA_offset.
6547 This means that, without taking further action, the
6548 locations of the registers that we've already saved would
6549 remain based on the stack pointer even after we redefine
6550 the CFA based on the frame pointer. We therefore need new
6551 DW_CFA_expressions to re-express the save slots with addresses
6552 based on the frame pointer. */
6553 rtx_insn
*insn
= get_last_insn ();
6554 gcc_assert (RTX_FRAME_RELATED_P (insn
));
6556 /* Add an explicit CFA definition if this was previously
6558 if (!find_reg_note (insn
, REG_CFA_ADJUST_CFA
, NULL_RTX
))
6560 rtx src
= plus_constant (Pmode
, stack_pointer_rtx
,
6562 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
6563 gen_rtx_SET (hard_frame_pointer_rtx
, src
));
6566 /* Change the save slot expressions for the registers that
6567 we've already saved. */
6568 reg_offset
-= callee_offset
;
6569 aarch64_add_cfa_expression (insn
, reg2
, hard_frame_pointer_rtx
,
6570 reg_offset
+ UNITS_PER_WORD
);
6571 aarch64_add_cfa_expression (insn
, reg1
, hard_frame_pointer_rtx
,
6574 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
6577 aarch64_save_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
6578 callee_adjust
!= 0 || emit_frame_chain
);
6579 if (aarch64_simd_decl_p (cfun
->decl
))
6580 aarch64_save_callee_saves (TFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
6581 callee_adjust
!= 0 || emit_frame_chain
);
6583 aarch64_save_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
6584 callee_adjust
!= 0 || emit_frame_chain
);
6586 /* We may need to probe the final adjustment if it is larger than the guard
6587 that is assumed by the callee. */
6588 aarch64_allocate_and_probe_stack_space (tmp1_rtx
, tmp0_rtx
, final_adjust
,
6589 !frame_pointer_needed
, true);
6592 /* Return TRUE if we can use a simple_return insn.
6594 This function checks whether the callee saved stack is empty, which
6595 means no restore actions are needed. The pro_and_epilogue will use
6596 this to check whether shrink-wrapping opt is feasible. */
6599 aarch64_use_return_insn_p (void)
6601 if (!reload_completed
)
/* NOTE(review): the statement controlled by the test above is not visible
in this listing; confirm it against the full file. */
/* The frame counts as empty when the frame size recorded for the current
function is known to be zero. */
6607 return known_eq (cfun
->machine
->frame
.frame_size
, 0);
6610 /* Return false for non-leaf SIMD functions in order to avoid
6611 shrink-wrapping them. Shrink-wrapping them would lose the necessary
6612 save/restore of FP registers. */
6615 aarch64_use_simple_return_insn_p (void)
/* The only visible test is for a function for which aarch64_simd_decl_p
holds and which is not a leaf.
NOTE(review): the return statements are not visible in this listing;
confirm them against the full file. */
6617 if (aarch64_simd_decl_p (cfun
->decl
) && !crtl
->is_leaf
)
6623 /* Generate the epilogue instructions for returning from a function.
6624 This is almost exactly the reverse of the prolog sequence, except
6625 that we need to insert barriers to avoid scheduling loads that read
6626 from a deallocated stack, and we optimize the unwind records by
6627 emitting them all together if possible.
FOR_SIBCALL is tested below when deciding whether an explicit
return-address authentication insn is needed and whether the eh_return
stack adjustment is emitted.
NOTE(review): several declarations and braces of this function are not
visible in this listing (for example the declarations of cfi_ops, insn and
need_barrier_p); confirm them against the full file. */
6629 aarch64_expand_epilogue (bool for_sibcall
)
6631 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
6632 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
6633 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
6634 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
6635 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
6636 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
6639 /* A stack clash protection prologue may not have left EP0_REGNUM or
6640 EP1_REGNUM in a usable state. The same is true for allocations
6641 with an SVE component, since we then need both temporary registers
6642 for each allocation. For stack clash we are in a usable state if
6643 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.
NOTE(review): the shift below is evaluated in int before being widened to
HOST_WIDE_INT; this assumes the parameter value stays below 31 -- confirm. */
6644 HOST_WIDE_INT guard_size
6645 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
6646 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
6648 /* We can re-use the registers when the allocation amount is smaller than
6649 guard_size - guard_used_by_caller because we won't be doing any probes
6650 then. In such situations the register should remain live with the correct
value. */
6652 bool can_inherit_p
= (initial_adjust
.is_constant ()
6653 && final_adjust
.is_constant ())
6654 && (!flag_stack_clash_protection
6655 || known_lt (initial_adjust
,
6656 guard_size
- guard_used_by_caller
));
6658 /* We need to add memory barrier to prevent read from deallocated stack. */
6660 = maybe_ne (get_frame_size ()
6661 + cfun
->machine
->frame
.saved_varargs_size
, 0);
6663 /* Emit a barrier to prevent loads from a deallocated stack. */
6664 if (maybe_gt (final_adjust
, crtl
->outgoing_args_size
)
6665 || cfun
->calls_alloca
6666 || crtl
->calls_eh_return
)
6668 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
6669 need_barrier_p
= false;
6672 /* Restore the stack pointer from the frame pointer if it may not
6673 be the same as the stack pointer. */
6674 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
6675 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
6676 if (frame_pointer_needed
6677 && (maybe_ne (final_adjust
, 0) || cfun
->calls_alloca
))
6678 /* If writeback is used when restoring callee-saves, the CFA
6679 is restored on the instruction doing the writeback. */
6680 aarch64_add_offset (Pmode
, stack_pointer_rtx
,
6681 hard_frame_pointer_rtx
, -callee_offset
,
6682 tmp1_rtx
, tmp0_rtx
, callee_adjust
== 0);
6684 /* The case where we need to re-use the register here is very rare, so
6685 avoid the complicated condition and just always emit a move if the
6686 immediate doesn't fit. */
6687 aarch64_add_sp (tmp1_rtx
, tmp0_rtx
, final_adjust
, true);
/* Restore the general registers in DImode, then the vector registers in
TFmode when aarch64_simd_decl_p holds for the function, with a DFmode
restore of the same register range also present. Every restore passes
&cfi_ops.
NOTE(review): the keyword selecting between the TFmode and DFmode calls is
not visible in this listing. */
6689 aarch64_restore_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
6690 callee_adjust
!= 0, &cfi_ops
);
6691 if (aarch64_simd_decl_p (cfun
->decl
))
6692 aarch64_restore_callee_saves (TFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
6693 callee_adjust
!= 0, &cfi_ops
);
6695 aarch64_restore_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
6696 callee_adjust
!= 0, &cfi_ops
);
6699 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
6701 if (callee_adjust
!= 0)
6702 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
6704 if (callee_adjust
!= 0 || maybe_gt (initial_adjust
, 65536))
6706 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6707 insn
= get_last_insn ();
6708 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
6709 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
6710 RTX_FRAME_RELATED_P (insn
) = 1;
6714 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
6715 add restriction on emit_move optimization to leaf functions. */
6716 aarch64_add_sp (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
6717 (!can_inherit_p
|| !crtl
->is_leaf
6718 || df_regs_ever_live_p (EP0_REGNUM
)));
6722 /* Emit delayed restores and reset the CFA to be SP. */
6723 insn
= get_last_insn ();
6724 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
6725 REG_NOTES (insn
) = cfi_ops
;
6726 RTX_FRAME_RELATED_P (insn
) = 1;
6729 /* We prefer to emit the combined return/authenticate instruction RETAA,
6730 however there are three cases in which we must instead emit an explicit
6731 authentication instruction.
6733 1) Sibcalls don't return in a normal way, so if we're about to call one
6734 we must authenticate.
6736 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6737 generating code for !TARGET_ARMV8_3 we can't use it and must
6738 explicitly authenticate.
6740 3) On an eh_return path we make extra stack adjustments to update the
6741 canonical frame address to be the exception handler's CFA. We want
6742 to authenticate using the CFA of the function which calls eh_return.
*/
6744 if (aarch64_return_address_signing_enabled ()
6745 && (for_sibcall
|| !TARGET_ARMV8_3
|| crtl
->calls_eh_return
))
/* The authentication insn is selected by aarch64_ra_sign_key; the insn is
then marked frame-related with a REG_CFA_TOGGLE_RA_MANGLE note. */
6747 switch (aarch64_ra_sign_key
)
6750 insn
= emit_insn (gen_autiasp ());
6753 insn
= emit_insn (gen_autibsp ());
6758 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
6759 RTX_FRAME_RELATED_P (insn
) = 1;
6762 /* Stack adjustment for exception handler. */
6763 if (crtl
->calls_eh_return
&& !for_sibcall
)
6765 /* We need to unwind the stack by the offset computed by
6766 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6767 to be SP; letting the CFA move during this adjustment
6768 is just as correct as retaining the CFA from the body
6769 of the function. Therefore, do nothing special. */
6770 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
/* Record a use of the link register ahead of the return jump. */
6773 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
6775 emit_jump_insn (ret_rtx
);
6778 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6779 normally or return to a previous frame after unwinding.
6781 An EH return uses a single shared return sequence. The epilogue is
6782 exactly like a normal epilogue except that it has an extra input
6783 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6784 that must be applied after the frame has been destroyed. An extra label
6785 is inserted before the epilogue which initializes this register to zero,
6786 and this is the entry point for a normal return.
6788 An actual EH return updates the return address, initializes the stack
6789 adjustment and jumps directly into the epilogue (bypassing the zeroing
6790 of the adjustment). Since the return address is typically saved on the
6791 stack when a function makes a call, the saved LR must be updated outside
the epilogue.
6794 This poses problems as the store is generated well before the epilogue,
6795 so the offset of LR is not known yet. Also optimizations will remove the
6796 store as it appears dead, even after the epilogue is generated (as the
6797 base or offset for loading LR is different in many cases).
6799 To avoid these problems this implementation forces the frame pointer
6800 in eh_return functions so that the location of LR is fixed and known early.
6801 It also marks the store volatile, so no optimization is permitted to
6802 remove the store. */
6804 aarch64_eh_return_handler_rtx (void)
/* The handler slot is frame memory addressed as
hard_frame_pointer_rtx + UNITS_PER_WORD, in Pmode. */
6806 rtx tmp
= gen_frame_mem (Pmode
,
6807 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
6809 /* Mark the store volatile, so no optimization is permitted to remove it. */
6810 MEM_VOLATILE_P (tmp
) = true;
6814 /* Output code to add DELTA to the first argument, and then jump
6815 to FUNCTION. Used for C++ multiple inheritance.
FILE receives the assembly output. THUNK supplies the assembler name and is
passed to assemble_start_function / assemble_end_function. When VCALL_OFFSET
is nonzero, a further adjustment is loaded through the adjusted pointer and
added to it before the tail call.
NOTE(review): THUNK is marked ATTRIBUTE_UNUSED although the body uses it;
the attribute looks stale. */
6817 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
6818 HOST_WIDE_INT delta
,
6819 HOST_WIDE_INT vcall_offset
,
6822 /* The this pointer is always in x0. Note that this differs from
6823 Arm where the this pointer may be bumped to r1 if r0 is required
6824 to return a pointer to an aggregate. On AArch64 a result value
6825 pointer will be in x8. */
6826 int this_regno
= R0_REGNUM
;
6827 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
6829 const char *fnname
= IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk
));
6831 if (aarch64_bti_enabled ())
6832 emit_insn (gen_bti_c());
/* Pretend to be a post-reload pass while the thunk's insns are generated;
the flag is cleared again at the end of the function. */
6834 reload_completed
= 1;
6835 emit_note (NOTE_INSN_PROLOGUE_END
);
6837 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
6838 temp0
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
6839 temp1
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
/* Without a vcall offset the whole adjustment is a single add of DELTA. */
6841 if (vcall_offset
== 0)
6842 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
, temp1
, temp0
, false);
/* The vcall offset must be a multiple of the pointer size. */
6845 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
/* A DELTA in [-256, 256) is folded into a PRE_MODIFY address; an explicit
aarch64_add_offset of DELTA is also present below.
NOTE(review): the lines that select between the two and that set ADDR on
the second path are not visible in this listing. */
6850 if (delta
>= -256 && delta
< 256)
6851 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
6852 plus_constant (Pmode
, this_rtx
, delta
));
6854 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
,
6855 temp1
, temp0
, false);
/* Load the pointer-sized value at ADDR into TEMP0, zero-extending it to
Pmode when Pmode and ptr_mode differ. */
6858 if (Pmode
== ptr_mode
)
6859 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
6861 aarch64_emit_move (temp0
,
6862 gen_rtx_ZERO_EXTEND (Pmode
,
6863 gen_rtx_MEM (ptr_mode
, addr
)));
/* A VCALL_OFFSET in [-256, 4096 * POINTER_BYTES) is used as a constant
displacement from TEMP0; the other visible form materializes it in TEMP1
and adds the two registers. */
6865 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
6866 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
6869 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
6871 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
/* Load the adjustment into TEMP1, sign-extending it to Pmode when Pmode and
ptr_mode differ, then add it to the this pointer. */
6874 if (Pmode
== ptr_mode
)
6875 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
6877 aarch64_emit_move (temp1
,
6878 gen_rtx_SIGN_EXTEND (Pmode
,
6879 gen_rtx_MEM (ptr_mode
, addr
)));
6881 emit_insn (gen_add2_insn (this_rtx
, temp1
));
6884 /* Generate a tail call to the target function. */
6885 if (!TREE_USED (function
))
6887 assemble_external (function
);
6888 TREE_USED (function
) = 1;
6890 funexp
= XEXP (DECL_RTL (function
), 0);
6891 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
6892 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
6893 SIBLING_CALL_P (insn
) = 1;
/* Run the final pass by hand over the insns generated above, bracketed by
the assembler's function start and end output. */
6895 insn
= get_insns ();
6896 shorten_branches (insn
);
6898 assemble_start_function (thunk
, fnname
);
6899 final_start_function (insn
, file
, 1);
6900 final (insn
, file
, 1);
6901 final_end_function ();
6902 assemble_end_function (thunk
, fnname
);
6904 /* Stop pretending to be a post-reload pass. */
6905 reload_completed
= 0;
/* Walk X and every sub-rtx of X looking for a SYMBOL_REF whose TLS model is
nonzero. UNSPEC_TLS wrappers are not descended into.
NOTE(review): the inner declaration of x below shadows the parameter X, and
the return statements are not visible in this listing. */
6909 aarch64_tls_referenced_p (rtx x
)
6911 if (!TARGET_HAVE_TLS
)
6913 subrtx_iterator::array_type array
;
6914 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
6916 const_rtx x
= *iter
;
6917 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
6919 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6920 TLS offsets, not real symbol references. */
6921 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
6922 iter
.skip_subrtxes ();
6928 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6929 a left shift of 0 or 12 bits.
That is, VAL has no bits set outside bits 0-11, or none outside bits
12-23. Zero and negative values follow from the same masks: zero passes,
negative values do not. */
6931 aarch64_uimm12_shift (HOST_WIDE_INT val
)
6933 return ((val
& (((HOST_WIDE_INT
) 0xfff) << 0)) == val
6934 || (val
& (((HOST_WIDE_INT
) 0xfff) << 12)) == val
6938 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6939 that can be created with a left shift of 0 or 12.
VAL must have no bits set above bit 23; this is asserted. */
6940 static HOST_WIDE_INT
6941 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
)
6943 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6944 handle correctly. */
6945 gcc_assert ((val
& 0xffffff) == val
);
/* Test whether VAL already fits in the low 12 bits.
NOTE(review): the statement controlled by this test is not visible in this
listing. */
6947 if (((val
& 0xfff) << 0) == val
)
/* Keep only bits 12-23, dropping the low 12 bits. */
6950 return val
& (0xfff << 12);
6953 /* Return true if val is an immediate that can be loaded into a
6954 register by a MOVZ instruction.
The visible tests accept a value whose set bits all lie within one 16-bit
field at bit 0 or 16, and, for modes wider than 4 bytes, at bit 32 or 48. */
6956 aarch64_movw_imm (HOST_WIDE_INT val
, scalar_int_mode mode
)
6958 if (GET_MODE_SIZE (mode
) > 4)
6960 if ((val
& (((HOST_WIDE_INT
) 0xffff) << 32)) == val
6961 || (val
& (((HOST_WIDE_INT
) 0xffff) << 48)) == val
)
/* NOTE(review): the result of the test above and the branch structure around
the masking below are not visible in this listing. */
6966 /* Ignore sign extension. */
6967 val
&= (HOST_WIDE_INT
) 0xffffffff;
6969 return ((val
& (((HOST_WIDE_INT
) 0xffff) << 0)) == val
6970 || (val
& (((HOST_WIDE_INT
) 0xffff) << 16)) == val
);
6973 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6974 64-bit (DImode) integer.
SIZE is the precision in bits of one unit of MODE; VAL is masked down to
its low SIZE bits.
NOTE(review): the replication step and the return statement are not visible
in this listing. */
6976 static unsigned HOST_WIDE_INT
6977 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val
, machine_mode mode
)
6979 unsigned int size
= GET_MODE_UNIT_PRECISION (mode
);
6982 val
&= (HOST_WIDE_INT_1U
<< size
) - 1;
6989 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.
Entry I has one bit set every (32 >> I) bits, so multiplying a pattern
narrower than that width by the entry repeats the pattern across 64 bits.
aarch64_bitmask_imm indexes this table with __builtin_clz (bits) - 26. */
6991 static const unsigned HOST_WIDE_INT bitmask_imm_mul
[] =
6993 0x0000000100000001ull
,
6994 0x0001000100010001ull
,
6995 0x0101010101010101ull
,
6996 0x1111111111111111ull
,
6997 0x5555555555555555ull
,
7001 /* Return true if val is a valid bitmask immediate.
VAL_IN is first replicated according to MODE by
aarch64_replicate_bitmask_imm. The test then looks for one run of one bits
repeated at a power-of-two period.
NOTE(review): several statements (including the declaration of bits, the
assignment to mask and the early returns) are not visible in this listing. */
7004 aarch64_bitmask_imm (HOST_WIDE_INT val_in
, machine_mode mode
)
7006 unsigned HOST_WIDE_INT val
, tmp
, mask
, first_one
, next_one
;
7009 /* Check for a single sequence of one bits and return quickly if so.
7010 The special cases of all ones and all zeroes returns false. */
7011 val
= aarch64_replicate_bitmask_imm (val_in
, mode
);
/* val & -val isolates the lowest set bit; adding it clears the lowest run
of ones and carries into the bit above that run. */
7012 tmp
= val
+ (val
& -val
);
/* TMP has at most one bit set exactly when VAL was a single run of ones
(or zero); val + 1 > 1 then rejects zero and all ones. */
7014 if (tmp
== (tmp
& -tmp
))
7015 return (val
+ 1) > 1;
7017 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7019 val
= (val
<< 32) | (val
& 0xffffffff);
7021 /* Invert if the immediate doesn't start with a zero bit - this means we
7022 only need to search for sequences of one bits. */
7026 /* Find the first set bit and set tmp to val with the first sequence of one
7027 bits removed. Return success if there is a single sequence of ones. */
7028 first_one
= val
& -val
;
7029 tmp
= val
& (val
+ first_one
);
7034 /* Find the next set bit and compute the difference in bit position. */
7035 next_one
= tmp
& -tmp
;
7036 bits
= clz_hwi (first_one
) - clz_hwi (next_one
);
7039 /* Check the bit position difference is a power of 2, and that the first
7040 sequence of one bits fits within 'bits' bits. */
7041 if ((mask
>> bits
) != 0 || bits
!= (bits
& -bits
))
7044 /* Check the sequence of one bits is repeated 64/bits times. */
7045 return val
== mask
* bitmask_imm_mul
[__builtin_clz (bits
) - 26];
7048 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
7049 Assumed precondition: VAL_IN is not zero.
The result is (2 << highest set bit) - (1 << lowest set bit), computed in
unsigned HOST_WIDE_INT.
NOTE(review): the precondition is asserted only after ctz_hwi and
floor_log2 have already been applied to VAL_IN. */
7051 unsigned HOST_WIDE_INT
7052 aarch64_and_split_imm1 (HOST_WIDE_INT val_in
)
7054 int lowest_bit_set
= ctz_hwi (val_in
);
7055 int highest_bit_set
= floor_log2 (val_in
);
7056 gcc_assert (val_in
!= 0);
7058 return ((HOST_WIDE_INT_UC (2) << highest_bit_set
) -
7059 (HOST_WIDE_INT_1U
<< lowest_bit_set
));
7062 /* Create constant where bits outside of lowest bit set to highest bit set
are set to 1. The result is VAL_IN with the complement of
aarch64_and_split_imm1 (VAL_IN) ORed in, so the bits of VAL_IN inside that
range are kept unchanged. */
7065 unsigned HOST_WIDE_INT
7066 aarch64_and_split_imm2 (HOST_WIDE_INT val_in
)
7068 return val_in
| ~aarch64_and_split_imm1 (val_in
);
7071 /* Return true if VAL_IN is a valid 'and' bitmask immediate.
MODE must be a scalar integer mode for the later tests to be reached. The
final test asks whether aarch64_and_split_imm2 (VAL_IN) is itself a bitmask
immediate.
NOTE(review): the results of the three guards below are not visible in this
listing. */
7074 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in
, machine_mode mode
)
7076 scalar_int_mode int_mode
;
7077 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
7080 if (aarch64_bitmask_imm (val_in
, int_mode
))
7083 if (aarch64_move_imm (val_in
, int_mode
))
7086 unsigned HOST_WIDE_INT imm2
= aarch64_and_split_imm2 (val_in
);
7088 return aarch64_bitmask_imm (imm2
, int_mode
);
7091 /* Return true if val is an immediate that can be loaded into a
7092 register in a single instruction.
The visible tests are aarch64_movw_imm on VAL and on ~VAL, followed by
aarch64_bitmask_imm on VAL. MODE must be a scalar integer mode for those
tests to be reached.
NOTE(review): the results of the two guards are not visible in this
listing. */
7094 aarch64_move_imm (HOST_WIDE_INT val
, machine_mode mode
)
7096 scalar_int_mode int_mode
;
7097 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
7100 if (aarch64_movw_imm (val
, int_mode
) || aarch64_movw_imm (~val
, int_mode
))
7102 return aarch64_bitmask_imm (val
, int_mode
);
/* Decide whether constant X of mode MODE must be kept out of the constant
pool. The visible tests look at a HIGH wrapper, at any CONST_POLY_INT
inside X, at how a symbol or label base is classified, and finally at
whether X refers to a TLS symbol.
NOTE(review): MODE is marked ATTRIBUTE_UNUSED although it is compared with
ptr_mode below; the declarations of base and offset and several return
statements are not visible in this listing. */
7106 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
7110 if (GET_CODE (x
) == HIGH
)
7113 /* There's no way to calculate VL-based values using relocations. */
7114 subrtx_iterator::array_type array
;
7115 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
7116 if (GET_CODE (*iter
) == CONST_POLY_INT
)
/* Separate X into a base and a constant offset. */
7119 split_const (x
, &base
, &offset
);
7120 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
7122 if (aarch64_classify_symbol (base
, INTVAL (offset
))
7123 != SYMBOL_FORCE_TO_MEM
)
7126 /* Avoid generating a 64-bit relocation in ILP32; leave
7127 to aarch64_expand_mov_immediate to handle it properly. */
7128 return mode
!= ptr_mode
;
7131 return aarch64_tls_referenced_p (x
);
7134 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7135 The expansion for a table switch is quite expensive due to the number
7136 of instructions, the table lookup and hard to predict indirect jump.
7137 When optimizing for speed, and -O3 enabled, use the per-core tuning if
7138 set, otherwise use tables for > 16 cases as a tradeoff between size and
7139 performance. When optimizing for size, use the default setting.
A per-core max_case_values of zero means that no per-core limit is set.
NOTE(review): the first half of the condition below is not visible in this
listing. */
7142 aarch64_case_values_threshold (void)
7144 /* Use the specified limit for the number of cases before using jump
7145 tables at higher optimization levels. */
7147 && selected_cpu
->tune
->max_case_values
!= 0)
7148 return selected_cpu
->tune
->max_case_values
;
/* A threshold of 17 corresponds to "more than 16 cases" above. */
7150 return optimize_size
? default_case_values_threshold () : 17;
7153 /* Return true if register REGNO is a valid index register.
7154 STRICT_P is true if REG_OK_STRICT is in effect.
A register number that is not a hard register is mapped through
reg_renumber; the final answer is whether the number is a general-purpose
register.
NOTE(review): the handling of STRICT_P is not visible in this listing. */
7157 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
7159 if (!HARD_REGISTER_NUM_P (regno
))
7167 regno
= reg_renumber
[regno
];
7169 return GP_REGNUM_P (regno
);
7172 /* Return true if register REGNO is a valid base register for mode MODE.
7173 STRICT_P is true if REG_OK_STRICT is in effect.
A register number that is not a hard register is mapped through
reg_renumber. Besides the general-purpose registers, the stack pointer and
the frame and argument pointer registers are accepted.
NOTE(review): no MODE parameter exists in the visible signature, and the
handling of STRICT_P is not visible in this listing. */
7176 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
7178 if (!HARD_REGISTER_NUM_P (regno
))
7186 regno
= reg_renumber
[regno
];
7189 /* The fake registers will be eliminated to either the stack or
7190 hard frame pointer, both of which are usually valid base registers.
7191 Reload deals with the cases where the eliminated form isn't valid. */
7192 return (GP_REGNUM_P (regno
)
7193 || regno
== SP_REGNUM
7194 || regno
== FRAME_POINTER_REGNUM
7195 || regno
== ARG_POINTER_REGNUM
);
7198 /* Return true if X is a valid base register for mode MODE.
7199 STRICT_P is true if REG_OK_STRICT is in effect.
The result requires X to be a REG whose number passes
aarch64_regno_ok_for_base_p. A SUBREG whose inner mode can be held in
GENERAL_REGS is also examined.
NOTE(review): no MODE parameter exists in the visible signature; the start
of the SUBREG condition and the statement it controls are not visible in
this listing. */
7202 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
7205 && GET_CODE (x
) == SUBREG
7206 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (x
))])
7209 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
7212 /* Return true if address offset is a valid index. If it is, fill in INFO
7213 appropriately. STRICT_P is true if REG_OK_STRICT is in effect.
X is the candidate index expression and MODE the mode of the access. Each
branch below recognizes one RTL shape, shown in the comment above it, and
extracts the index register, the shift amount and the address type. On the
visible success path INFO->offset receives the index register and
INFO->shift the shift amount.
The MULT shapes derive the shift with exact_log2 of the scale; the ASHIFT
shapes take the shift count directly.
NOTE(review): the exact_log2 results are used in later comparisons such as
32 + shift and 1 << shift without a visible check -- confirm that a scale
that is not a power of two is rejected before those are evaluated.
NOTE(review): the declarations of index and shift, the rejecting returns and
parts of the final validity conditions are not visible in this listing. */
7216 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
7217 machine_mode mode
, bool strict_p
)
7219 enum aarch64_address_type type
;
/* A plain Pmode register or subreg. */
7224 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
7225 && GET_MODE (x
) == Pmode
)
7227 type
= ADDRESS_REG_REG
;
7231 /* (sign_extend:DI (reg:SI)) */
7232 else if ((GET_CODE (x
) == SIGN_EXTEND
7233 || GET_CODE (x
) == ZERO_EXTEND
)
7234 && GET_MODE (x
) == DImode
7235 && GET_MODE (XEXP (x
, 0)) == SImode
)
7237 type
= (GET_CODE (x
) == SIGN_EXTEND
)
7238 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
7239 index
= XEXP (x
, 0);
7242 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7243 else if (GET_CODE (x
) == MULT
7244 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
7245 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
7246 && GET_MODE (XEXP (x
, 0)) == DImode
7247 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
7248 && CONST_INT_P (XEXP (x
, 1)))
7250 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
7251 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
7252 index
= XEXP (XEXP (x
, 0), 0);
7253 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
7255 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7256 else if (GET_CODE (x
) == ASHIFT
7257 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
7258 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
7259 && GET_MODE (XEXP (x
, 0)) == DImode
7260 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
7261 && CONST_INT_P (XEXP (x
, 1)))
7263 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
7264 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
7265 index
= XEXP (XEXP (x
, 0), 0);
7266 shift
= INTVAL (XEXP (x
, 1));
7268 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7269 else if ((GET_CODE (x
) == SIGN_EXTRACT
7270 || GET_CODE (x
) == ZERO_EXTRACT
)
7271 && GET_MODE (x
) == DImode
7272 && GET_CODE (XEXP (x
, 0)) == MULT
7273 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
7274 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
7276 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
7277 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
7278 index
= XEXP (XEXP (x
, 0), 0);
7279 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
/* The extract must cover exactly the low 32 + shift bits. */
7280 if (INTVAL (XEXP (x
, 1)) != 32 + shift
7281 || INTVAL (XEXP (x
, 2)) != 0)
7284 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7285 (const_int 0xffffffff<<shift)) */
7286 else if (GET_CODE (x
) == AND
7287 && GET_MODE (x
) == DImode
7288 && GET_CODE (XEXP (x
, 0)) == MULT
7289 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
7290 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
7291 && CONST_INT_P (XEXP (x
, 1)))
7293 type
= ADDRESS_REG_UXTW
;
7294 index
= XEXP (XEXP (x
, 0), 0);
7295 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
/* The mask must be exactly 0xffffffff shifted left by SHIFT. */
7296 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
7299 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7300 else if ((GET_CODE (x
) == SIGN_EXTRACT
7301 || GET_CODE (x
) == ZERO_EXTRACT
)
7302 && GET_MODE (x
) == DImode
7303 && GET_CODE (XEXP (x
, 0)) == ASHIFT
7304 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
7305 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
7307 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
7308 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
7309 index
= XEXP (XEXP (x
, 0), 0);
7310 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
7311 if (INTVAL (XEXP (x
, 1)) != 32 + shift
7312 || INTVAL (XEXP (x
, 2)) != 0)
7315 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7316 (const_int 0xffffffff<<shift)) */
7317 else if (GET_CODE (x
) == AND
7318 && GET_MODE (x
) == DImode
7319 && GET_CODE (XEXP (x
, 0)) == ASHIFT
7320 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
7321 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
7322 && CONST_INT_P (XEXP (x
, 1)))
7324 type
= ADDRESS_REG_UXTW
;
7325 index
= XEXP (XEXP (x
, 0), 0);
7326 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
7327 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
7330 /* (mult:P (reg:P) (const_int scale)) */
7331 else if (GET_CODE (x
) == MULT
7332 && GET_MODE (x
) == Pmode
7333 && GET_MODE (XEXP (x
, 0)) == Pmode
7334 && CONST_INT_P (XEXP (x
, 1)))
7336 type
= ADDRESS_REG_REG
;
7337 index
= XEXP (x
, 0);
7338 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
7340 /* (ashift:P (reg:P) (const_int shift)) */
7341 else if (GET_CODE (x
) == ASHIFT
7342 && GET_MODE (x
) == Pmode
7343 && GET_MODE (XEXP (x
, 0)) == Pmode
7344 && CONST_INT_P (XEXP (x
, 1)))
7346 type
= ADDRESS_REG_REG
;
7347 index
= XEXP (x
, 0);
7348 shift
= INTVAL (XEXP (x
, 1));
/* Look through a SUBREG of a mode that GENERAL_REGS can hold and use the
inner register as the index. */
7354 && GET_CODE (index
) == SUBREG
7355 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
7356 index
= SUBREG_REG (index
);
/* For SVE data modes the visible test involves only the plain register form
with a scale equal to the unit size of MODE. */
7358 if (aarch64_sve_data_mode_p (mode
))
7360 if (type
!= ADDRESS_REG_REG
7361 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
/* For other modes the visible test involves a shift of 1 to 3 whose scale
equals the size of MODE. */
7367 && !(IN_RANGE (shift
, 1, 3)
7368 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
7373 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
7376 info
->offset
= index
;
7377 info
->shift
= shift
;
7384 /* Return true if MODE is one of the modes for which we
7385 support LDP/STP operations.
The accepted modes are SImode, DImode, SFmode and DFmode, plus supported
vector modes of 8 bytes. Supported 16-byte vector modes are accepted only
when the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning flag is clear. */
7388 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
7390 return mode
== SImode
|| mode
== DImode
7391 || mode
== SFmode
|| mode
== DFmode
7392 || (aarch64_vector_mode_supported_p (mode
)
7393 && (known_eq (GET_MODE_SIZE (mode
), 8)
7394 || (known_eq (GET_MODE_SIZE (mode
), 16)
7395 && (aarch64_tune_params
.extra_tuning_flags
7396 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
) == 0)));
7399 /* Return true if REGNO is a virtual pointer register, or an eliminable
7400 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7401 include stack_pointer or hard_frame_pointer.
The accepted numbers are the inclusive range FIRST_VIRTUAL_REGISTER to
LAST_VIRTUAL_POINTER_REGISTER, FRAME_POINTER_REGNUM and
ARG_POINTER_REGNUM. */
7403 virt_or_elim_regno_p (unsigned regno
)
7405 return ((regno
>= FIRST_VIRTUAL_REGISTER
7406 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
7407 || regno
== FRAME_POINTER_REGNUM
7408 || regno
== ARG_POINTER_REGNUM
);
7411 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7412 If it is, fill in INFO appropriately. STRICT_P is true if
7413 REG_OK_STRICT is in effect. */
7416 aarch64_classify_address (struct aarch64_address_info
*info
,
7417 rtx x
, machine_mode mode
, bool strict_p
,
7418 aarch64_addr_query_type type
)
7420 enum rtx_code code
= GET_CODE (x
);
7424 HOST_WIDE_INT const_size
;
7426 /* On BE, we use load/store pair for all large int mode load/stores.
7427 TI/TFmode may also use a load/store pair. */
7428 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
7429 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
7430 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
7431 || type
== ADDR_QUERY_LDP_STP_N
7434 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
7436 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
7437 corresponds to the actual size of the memory being loaded/stored and the
7438 mode of the corresponding addressing mode is half of that. */
7439 if (type
== ADDR_QUERY_LDP_STP_N
7440 && known_eq (GET_MODE_SIZE (mode
), 16))
7443 bool allow_reg_index_p
= (!load_store_pair_p
7444 && (known_lt (GET_MODE_SIZE (mode
), 16)
7445 || vec_flags
== VEC_ADVSIMD
7446 || vec_flags
& VEC_SVE_DATA
));
7448 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7449 [Rn, #offset, MUL VL]. */
7450 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
7451 && (code
!= REG
&& code
!= PLUS
))
7454 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7456 if (advsimd_struct_p
7457 && !BYTES_BIG_ENDIAN
7458 && (code
!= POST_INC
&& code
!= REG
))
7461 gcc_checking_assert (GET_MODE (x
) == VOIDmode
7462 || SCALAR_INT_MODE_P (GET_MODE (x
)));
7468 info
->type
= ADDRESS_REG_IMM
;
7470 info
->offset
= const0_rtx
;
7471 info
->const_offset
= 0;
7472 return aarch64_base_register_rtx_p (x
, strict_p
);
7480 && virt_or_elim_regno_p (REGNO (op0
))
7481 && poly_int_rtx_p (op1
, &offset
))
7483 info
->type
= ADDRESS_REG_IMM
;
7486 info
->const_offset
= offset
;
7491 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
7492 && aarch64_base_register_rtx_p (op0
, strict_p
)
7493 && poly_int_rtx_p (op1
, &offset
))
7495 info
->type
= ADDRESS_REG_IMM
;
7498 info
->const_offset
= offset
;
7500 /* TImode and TFmode values are allowed in both pairs of X
7501 registers and individual Q registers. The available
7503 X,X: 7-bit signed scaled offset
7504 Q: 9-bit signed offset
7505 We conservatively require an offset representable in either mode.
7506 When performing the check for pairs of X registers i.e. LDP/STP
7507 pass down DImode since that is the natural size of the LDP/STP
7508 instruction memory accesses. */
7509 if (mode
== TImode
|| mode
== TFmode
)
7510 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
7511 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
7512 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
7514 /* A 7bit offset check because OImode will emit a ldp/stp
7515 instruction (only big endian will get here).
7516 For ldp/stp instructions, the offset is scaled for the size of a
7517 single element of the pair. */
7519 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
7521 /* Three 9/12 bit offsets checks because CImode will emit three
7522 ldr/str instructions (only big endian will get here). */
7524 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
7525 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode
,
7527 || offset_12bit_unsigned_scaled_p (V16QImode
,
7530 /* Two 7bit offsets checks because XImode will emit two ldp/stp
7531 instructions (only big endian will get here). */
7533 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
7534 && aarch64_offset_7bit_signed_scaled_p (TImode
,
7537 /* Make "m" use the LD1 offset range for SVE data modes, so
7538 that pre-RTL optimizers like ivopts will work to that
7539 instead of the wider LDR/STR range. */
7540 if (vec_flags
== VEC_SVE_DATA
)
7541 return (type
== ADDR_QUERY_M
7542 ? offset_4bit_signed_scaled_p (mode
, offset
)
7543 : offset_9bit_signed_scaled_p (mode
, offset
));
7545 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
7547 poly_int64 end_offset
= (offset
7548 + GET_MODE_SIZE (mode
)
7549 - BYTES_PER_SVE_VECTOR
);
7550 return (type
== ADDR_QUERY_M
7551 ? offset_4bit_signed_scaled_p (mode
, offset
)
7552 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
7553 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
7557 if (vec_flags
== VEC_SVE_PRED
)
7558 return offset_9bit_signed_scaled_p (mode
, offset
);
7560 if (load_store_pair_p
)
7561 return ((known_eq (GET_MODE_SIZE (mode
), 4)
7562 || known_eq (GET_MODE_SIZE (mode
), 8)
7563 || known_eq (GET_MODE_SIZE (mode
), 16))
7564 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
7566 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
7567 || offset_12bit_unsigned_scaled_p (mode
, offset
));
7570 if (allow_reg_index_p
)
7572 /* Look for base + (scaled/extended) index register. */
7573 if (aarch64_base_register_rtx_p (op0
, strict_p
)
7574 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
7579 if (aarch64_base_register_rtx_p (op1
, strict_p
)
7580 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
7593 info
->type
= ADDRESS_REG_WB
;
7594 info
->base
= XEXP (x
, 0);
7595 info
->offset
= NULL_RTX
;
7596 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
7600 info
->type
= ADDRESS_REG_WB
;
7601 info
->base
= XEXP (x
, 0);
7602 if (GET_CODE (XEXP (x
, 1)) == PLUS
7603 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
7604 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
7605 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
7607 info
->offset
= XEXP (XEXP (x
, 1), 1);
7608 info
->const_offset
= offset
;
7610 /* TImode and TFmode values are allowed in both pairs of X
7611 registers and individual Q registers. The available
7613 X,X: 7-bit signed scaled offset
7614 Q: 9-bit signed offset
7615 We conservatively require an offset representable in either mode.
7617 if (mode
== TImode
|| mode
== TFmode
)
7618 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
7619 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
7621 if (load_store_pair_p
)
7622 return ((known_eq (GET_MODE_SIZE (mode
), 4)
7623 || known_eq (GET_MODE_SIZE (mode
), 8)
7624 || known_eq (GET_MODE_SIZE (mode
), 16))
7625 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
7627 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
7634 /* load literal: pc-relative constant pool entry. Only supported
7635 for SI mode or larger. */
7636 info
->type
= ADDRESS_SYMBOLIC
;
7638 if (!load_store_pair_p
7639 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
7644 split_const (x
, &sym
, &addend
);
7645 return ((GET_CODE (sym
) == LABEL_REF
7646 || (GET_CODE (sym
) == SYMBOL_REF
7647 && CONSTANT_POOL_ADDRESS_P (sym
)
7648 && aarch64_pcrelative_literal_loads
)));
7653 info
->type
= ADDRESS_LO_SUM
;
7654 info
->base
= XEXP (x
, 0);
7655 info
->offset
= XEXP (x
, 1);
7656 if (allow_reg_index_p
7657 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
7660 split_const (info
->offset
, &sym
, &offs
);
7661 if (GET_CODE (sym
) == SYMBOL_REF
7662 && (aarch64_classify_symbol (sym
, INTVAL (offs
))
7663 == SYMBOL_SMALL_ABSOLUTE
))
7665 /* The symbol and offset must be aligned to the access size. */
7668 if (CONSTANT_POOL_ADDRESS_P (sym
))
7669 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
7670 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
7672 tree exp
= SYMBOL_REF_DECL (sym
);
7673 align
= TYPE_ALIGN (TREE_TYPE (exp
));
7674 align
= aarch64_constant_alignment (exp
, align
);
7676 else if (SYMBOL_REF_DECL (sym
))
7677 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
7678 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
7679 && SYMBOL_REF_BLOCK (sym
) != NULL
)
7680 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
7682 align
= BITS_PER_UNIT
;
7684 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
7685 if (known_eq (ref_size
, 0))
7686 ref_size
= GET_MODE_SIZE (DImode
);
7688 return (multiple_p (INTVAL (offs
), ref_size
)
7689 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
7699 /* Return true if the address X is valid for a PRFM instruction.
7700 STRICT_P is true if we should do strict checking with
7701 aarch64_classify_address. */
7704 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
7706 struct aarch64_address_info addr
;
7708 /* PRFM accepts the same addresses as DImode... */
7709 bool res
= aarch64_classify_address (&addr
, x
, DImode
, strict_p
);
7713 /* ... except writeback forms. */
7714 return addr
.type
!= ADDRESS_REG_WB
;
/* Return true if the base of X, as extracted by split_const, is a
   SYMBOL_REF or a LABEL_REF.  The offset part that split_const hands
   back is not examined.  */
7718 aarch64_symbolic_address_p (rtx x
)
7722 split_const (x
, &x
, &offset
);
7723 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
7726 /* Classify the base of symbolic expression X. */
7728 enum aarch64_symbol_type
7729 aarch64_classify_symbolic_expression (rtx x
)
7733 split_const (x
, &x
, &offset
);
7734 return aarch64_classify_symbol (x
, INTVAL (offset
));
7738 /* Return TRUE if X is a legitimate address for accessing memory in mode MODE. */
7741 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
7743 struct aarch64_address_info addr
;
7745 return aarch64_classify_address (&addr
, x
, mode
, strict_p
);
7748 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7749 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7751 aarch64_legitimate_address_p (machine_mode mode
, rtx x
, bool strict_p
,
7752 aarch64_addr_query_type type
)
7754 struct aarch64_address_info addr
;
7756 return aarch64_classify_address (&addr
, x
, mode
, strict_p
, type
);
7759 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7762 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
7763 poly_int64 orig_offset
,
7767 if (GET_MODE_SIZE (mode
).is_constant (&size
))
7769 HOST_WIDE_INT const_offset
, second_offset
;
7771 /* A general SVE offset is A * VQ + B. Remove the A component from
7772 coefficient 0 in order to get the constant B. */
7773 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
7775 /* Split an out-of-range address displacement into a base and
7776 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7777 range otherwise to increase opportunities for sharing the base
7778 address of different sizes. Unaligned accesses use the signed
7779 9-bit range, TImode/TFmode use the intersection of signed
7780 scaled 7-bit and signed 9-bit offset. */
7781 if (mode
== TImode
|| mode
== TFmode
)
7782 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
7783 else if ((const_offset
& (size
- 1)) != 0)
7784 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
7786 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
7788 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
7791 /* Split the offset into second_offset and the rest. */
7792 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
7793 *offset2
= gen_int_mode (second_offset
, Pmode
);
7798 /* Get the mode we should use as the basis of the range. For structure
7799 modes this is the mode of one vector. */
7800 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
7801 machine_mode step_mode
7802 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
7804 /* Get the "mul vl" multiplier we'd like to use. */
7805 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
7806 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
7807 if (vec_flags
& VEC_SVE_DATA
)
7808 /* LDR supports a 9-bit range, but the move patterns for
7809 structure modes require all vectors to be in range of the
7810 same base. The simplest way of accommodating that while still
7811 promoting reuse of anchor points between different modes is
7812 to use an 8-bit range unconditionally. */
7813 vnum
= ((vnum
+ 128) & 255) - 128;
7815 /* Predicates are only handled singly, so we might as well use the full 9-bit range. */
7817 vnum
= ((vnum
+ 256) & 511) - 256;
7821 /* Convert the "mul vl" multiplier into a byte offset. */
7822 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
7823 if (known_eq (second_offset
, orig_offset
))
7826 /* Split the offset into second_offset and the rest. */
7827 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
7828 *offset2
= gen_int_mode (second_offset
, Pmode
);
7833 /* Return the binary representation of floating point constant VALUE in INTVAL.
7834 If the value cannot be converted, return false without setting INTVAL.
7835 The conversion is done in the mode of VALUE. */
7837 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
7840 /* We make a general exception for 0. */
7841 if (aarch64_float_const_zero_rtx_p (value
))
7847 scalar_float_mode mode
;
7848 if (GET_CODE (value
) != CONST_DOUBLE
7849 || !is_a
<scalar_float_mode
> (GET_MODE (value
), &mode
)
7850 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
7851 /* Only support up to DF mode. */
7852 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
7855 unsigned HOST_WIDE_INT ival
= 0;
7858 real_to_target (res
,
7859 CONST_DOUBLE_REAL_VALUE (value
),
7860 REAL_MODE_FORMAT (mode
));
7864 int order
= BYTES_BIG_ENDIAN
? 1 : 0;
7865 ival
= zext_hwi (res
[order
], 32);
7866 ival
|= (zext_hwi (res
[1 - order
], 32) << 32);
7869 ival
= zext_hwi (res
[0], 32);
7875 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7876 single MOV(+MOVK) followed by an FMOV. */
7878 aarch64_float_const_rtx_p (rtx x
)
7880 machine_mode mode
= GET_MODE (x
);
7881 if (mode
== VOIDmode
)
7884 /* Determine whether it's cheaper to write float constants as
7885 mov/movk pairs over ldr/adrp pairs. */
7886 unsigned HOST_WIDE_INT ival
;
7888 if (GET_CODE (x
) == CONST_DOUBLE
7889 && SCALAR_FLOAT_MODE_P (mode
)
7890 && aarch64_reinterpret_float_as_int (x
, &ival
))
7892 scalar_int_mode imode
= (mode
== HFmode
7894 : int_mode_for_mode (mode
).require ());
7895 int num_instr
= aarch64_internal_mov_immediate
7896 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
7897 return num_instr
< 3;
7903 /* Return TRUE if rtx X is immediate constant 0.0 */
7905 aarch64_float_const_zero_rtx_p (rtx x
)
7907 if (GET_MODE (x
) == VOIDmode
)
7910 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
7911 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
7912 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
7915 /* Return TRUE if rtx X is immediate constant that fits in a single
7916 MOVI immediate operation. */
7918 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
7924 scalar_int_mode imode
;
7925 unsigned HOST_WIDE_INT ival
;
7927 if (GET_CODE (x
) == CONST_DOUBLE
7928 && SCALAR_FLOAT_MODE_P (mode
))
7930 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
7933 /* We make a general exception for 0. */
7934 if (aarch64_float_const_zero_rtx_p (x
))
7937 imode
= int_mode_for_mode (mode
).require ();
7939 else if (GET_CODE (x
) == CONST_INT
7940 && is_a
<scalar_int_mode
> (mode
, &imode
))
7945 /* use a 64 bit mode for everything except for DI/DF mode, where we use
7946 a 128 bit vector mode. */
7947 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
7949 vmode
= aarch64_simd_container_mode (imode
, width
);
7950 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
7952 return aarch64_simd_valid_immediate (v_op
, NULL
);
7956 /* Return the fixed registers used for condition codes. */
7959 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
7962 *p2
= INVALID_REGNUM
;
7966 /* This function is used by the call expanders of the machine description.
7967 RESULT is the register in which the result is returned. It's NULL for
7968 "call" and "sibcall".
7969 MEM is the location of the function call.
7970 SIBCALL indicates whether this function call is normal call or sibling call.
7971 It will generate different pattern accordingly. */
7974 aarch64_expand_call (rtx result
, rtx mem
, bool sibcall
)
7976 rtx call
, callee
, tmp
;
7980 gcc_assert (MEM_P (mem
));
7981 callee
= XEXP (mem
, 0);
7982 mode
= GET_MODE (callee
);
7983 gcc_assert (mode
== Pmode
);
7985 /* Decide if we should generate indirect calls by loading the
7986 address of the callee into a register before performing
7987 the branch-and-link. */
7988 if (SYMBOL_REF_P (callee
)
7989 ? (aarch64_is_long_call_p (callee
)
7990 || aarch64_is_noplt_call_p (callee
))
7992 XEXP (mem
, 0) = force_reg (mode
, callee
);
7994 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
7996 if (result
!= NULL_RTX
)
7997 call
= gen_rtx_SET (result
, call
);
8002 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
8004 vec
= gen_rtvec (2, call
, tmp
);
8005 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
8007 aarch64_emit_call_insn (call
);
8010 /* Emit call insn with PAT and do aarch64-specific handling: the
   emitted insn's CALL_INSN_FUNCTION_USAGE is extended with clobbers of
   the word_mode registers IP0 and IP1.  */
8013 aarch64_emit_call_insn (rtx pat
)
8015 rtx insn
= emit_call_insn (pat
);
/* Mark IP0 and IP1 as clobbered by the call.
   NOTE(review): presumably because veneers/PLT stubs inserted at link
   time may use them -- confirm.  */
8017 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
8018 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
8019 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
8023 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
8025 machine_mode mode_x
= GET_MODE (x
);
8026 rtx_code code_x
= GET_CODE (x
);
8028 /* All floating point compares return CCFP if it is an equality
8029 comparison, and CCFPE otherwise. */
8030 if (GET_MODE_CLASS (mode_x
) == MODE_FLOAT
)
8057 /* Equality comparisons of short modes against zero can be performed
8058 using the TST instruction with the appropriate bitmask. */
8059 if (y
== const0_rtx
&& (REG_P (x
) || SUBREG_P (x
))
8060 && (code
== EQ
|| code
== NE
)
8061 && (mode_x
== HImode
|| mode_x
== QImode
))
8064 /* Similarly, comparisons of zero_extends from shorter modes can
8065 be performed using an ANDS with an immediate mask. */
8066 if (y
== const0_rtx
&& code_x
== ZERO_EXTEND
8067 && (mode_x
== SImode
|| mode_x
== DImode
)
8068 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
8069 && (code
== EQ
|| code
== NE
))
8072 if ((mode_x
== SImode
|| mode_x
== DImode
)
8074 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
8075 && (code_x
== PLUS
|| code_x
== MINUS
|| code_x
== AND
8077 || (code_x
== ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
8078 && CONST_INT_P (XEXP (x
, 2)))))
8081 /* A compare with a shifted operand. Because of canonicalization,
8082 the comparison will have to be swapped when we emit the assembly
8084 if ((mode_x
== SImode
|| mode_x
== DImode
)
8085 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
8086 && (code_x
== ASHIFT
|| code_x
== ASHIFTRT
8087 || code_x
== LSHIFTRT
8088 || code_x
== ZERO_EXTEND
|| code_x
== SIGN_EXTEND
))
8091 /* Similarly for a negated operand, but we can only do this for
8093 if ((mode_x
== SImode
|| mode_x
== DImode
)
8094 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
8095 && (code
== EQ
|| code
== NE
)
8099 /* A test for unsigned overflow from an addition. */
8100 if ((mode_x
== DImode
|| mode_x
== TImode
)
8101 && (code
== LTU
|| code
== GEU
)
8103 && rtx_equal_p (XEXP (x
, 0), y
))
8106 /* A test for unsigned overflow from an add with carry. */
8107 if ((mode_x
== DImode
|| mode_x
== TImode
)
8108 && (code
== LTU
|| code
== GEU
)
8110 && CONST_SCALAR_INT_P (y
)
8111 && (rtx_mode_t (y
, mode_x
)
8112 == (wi::shwi (1, mode_x
)
8113 << (GET_MODE_BITSIZE (mode_x
).to_constant () / 2))))
8116 /* A test for signed overflow. */
8117 if ((mode_x
== DImode
|| mode_x
== TImode
)
8120 && GET_CODE (y
) == SIGN_EXTEND
)
8123 /* For everything else, return CCmode. */
8128 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
/* Return the condition code for comparison rtx X, as computed by
   aarch64_get_condition_code_1 from the rtx code of X and the mode of
   its first operand.  If that mode is not of class MODE_CC, the mode
   chosen by SELECT_CC_MODE for X's code and its two operands is used
   instead.  */
8131 aarch64_get_condition_code (rtx x
)
8133 machine_mode mode
= GET_MODE (XEXP (x
, 0));
8134 enum rtx_code comp_code
= GET_CODE (x
);
/* The first operand is not a CC-mode value, so derive the CC mode
   from the comparison itself.  */
8136 if (GET_MODE_CLASS (mode
) != MODE_CC
)
8137 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
8138 return aarch64_get_condition_code_1 (mode
, comp_code
);
8142 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
8150 case GE
: return AARCH64_GE
;
8151 case GT
: return AARCH64_GT
;
8152 case LE
: return AARCH64_LS
;
8153 case LT
: return AARCH64_MI
;
8154 case NE
: return AARCH64_NE
;
8155 case EQ
: return AARCH64_EQ
;
8156 case ORDERED
: return AARCH64_VC
;
8157 case UNORDERED
: return AARCH64_VS
;
8158 case UNLT
: return AARCH64_LT
;
8159 case UNLE
: return AARCH64_LE
;
8160 case UNGT
: return AARCH64_HI
;
8161 case UNGE
: return AARCH64_PL
;
8169 case NE
: return AARCH64_NE
;
8170 case EQ
: return AARCH64_EQ
;
8171 case GE
: return AARCH64_GE
;
8172 case GT
: return AARCH64_GT
;
8173 case LE
: return AARCH64_LE
;
8174 case LT
: return AARCH64_LT
;
8175 case GEU
: return AARCH64_CS
;
8176 case GTU
: return AARCH64_HI
;
8177 case LEU
: return AARCH64_LS
;
8178 case LTU
: return AARCH64_CC
;
8186 case NE
: return AARCH64_NE
;
8187 case EQ
: return AARCH64_EQ
;
8188 case GE
: return AARCH64_LE
;
8189 case GT
: return AARCH64_LT
;
8190 case LE
: return AARCH64_GE
;
8191 case LT
: return AARCH64_GT
;
8192 case GEU
: return AARCH64_LS
;
8193 case GTU
: return AARCH64_CC
;
8194 case LEU
: return AARCH64_CS
;
8195 case LTU
: return AARCH64_HI
;
8203 case NE
: return AARCH64_NE
; /* = any */
8204 case EQ
: return AARCH64_EQ
; /* = none */
8205 case GE
: return AARCH64_PL
; /* = nfrst */
8206 case LT
: return AARCH64_MI
; /* = first */
8207 case GEU
: return AARCH64_CS
; /* = nlast */
8208 case GTU
: return AARCH64_HI
; /* = pmore */
8209 case LEU
: return AARCH64_LS
; /* = plast */
8210 case LTU
: return AARCH64_CC
; /* = last */
8218 case NE
: return AARCH64_NE
;
8219 case EQ
: return AARCH64_EQ
;
8220 case GE
: return AARCH64_PL
;
8221 case LT
: return AARCH64_MI
;
8229 case NE
: return AARCH64_NE
;
8230 case EQ
: return AARCH64_EQ
;
8238 case LTU
: return AARCH64_CS
;
8239 case GEU
: return AARCH64_CC
;
8247 case GEU
: return AARCH64_CS
;
8248 case LTU
: return AARCH64_CC
;
8256 case NE
: return AARCH64_VS
;
8257 case EQ
: return AARCH64_VC
;
/* Return true if X is a duplicated vector constant (according to
   const_vec_duplicate_p) whose repeated element is a CONST_INT with a
   value in the range [MINVAL, MAXVAL].  */
8270 aarch64_const_vec_all_same_in_range_p (rtx x
,
8271 HOST_WIDE_INT minval
,
8272 HOST_WIDE_INT maxval
)
8275 return (const_vec_duplicate_p (x
, &elt
)
8276 && CONST_INT_P (elt
)
8277 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
/* Return true if X passes aarch64_const_vec_all_same_in_range_p with
   both bounds set to VAL, i.e. X duplicates the integer VAL.  */
8281 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
8283 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
8286 /* Return true if VEC is a constant in which every element is in the range
8287 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8290 aarch64_const_vec_all_in_range_p (rtx vec
,
8291 HOST_WIDE_INT minval
,
8292 HOST_WIDE_INT maxval
)
8294 if (GET_CODE (vec
) != CONST_VECTOR
8295 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
8299 if (!CONST_VECTOR_STEPPED_P (vec
))
8300 nunits
= const_vector_encoded_nelts (vec
);
8301 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
8304 for (int i
= 0; i
< nunits
; i
++)
8306 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
8307 if (!CONST_INT_P (vec_elem
)
8308 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
8315 #define AARCH64_CC_V 1
8316 #define AARCH64_CC_C (1 << 1)
8317 #define AARCH64_CC_Z (1 << 2)
8318 #define AARCH64_CC_N (1 << 3)
8320 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8321 static const int aarch64_nzcv_codes
[] =
8323 0, /* EQ, Z == 1. */
8324 AARCH64_CC_Z
, /* NE, Z == 0. */
8325 0, /* CS, C == 1. */
8326 AARCH64_CC_C
, /* CC, C == 0. */
8327 0, /* MI, N == 1. */
8328 AARCH64_CC_N
, /* PL, N == 0. */
8329 0, /* VS, V == 1. */
8330 AARCH64_CC_V
, /* VC, V == 0. */
8331 0, /* HI, C ==1 && Z == 0. */
8332 AARCH64_CC_C
, /* LS, !(C == 1 && Z == 0). */
8333 AARCH64_CC_V
, /* GE, N == V. */
8334 0, /* LT, N != V. */
8335 AARCH64_CC_Z
, /* GT, Z == 0 && N == V. */
8336 0, /* LE, !(Z == 0 && N == V). */
8341 /* Print floating-point vector immediate operand X to F, negating it
8342 first if NEGATE is true. Return true on success, false if it isn't
8343 a constant we can handle. */
8346 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
8350 if (!const_vec_duplicate_p (x
, &elt
))
8353 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
8355 r
= real_value_negate (&r
);
8357 /* Handle the SVE single-bit immediates specially, since they have a
8358 fixed form in the assembly syntax. */
8359 if (real_equal (&r
, &dconst0
))
8360 asm_fprintf (f
, "0.0");
8361 else if (real_equal (&r
, &dconst2
))
8362 asm_fprintf (f
, "2.0");
8363 else if (real_equal (&r
, &dconst1
))
8364 asm_fprintf (f
, "1.0");
8365 else if (real_equal (&r
, &dconsthalf
))
8366 asm_fprintf (f
, "0.5");
8369 const int buf_size
= 20;
8370 char float_buf
[buf_size
] = {'\0'};
8371 real_to_decimal_for_mode (float_buf
, &r
, buf_size
, buf_size
,
8373 asm_fprintf (f
, "%s", float_buf
);
8379 /* Return the equivalent letter for SIZE, which is given in bits:
   64 -> 'd', 32 -> 's', 16 -> 'h', 8 -> 'b'.  Any other SIZE reaches
   gcc_unreachable.  */
8381 sizetochar (int size
)
8385 case 64: return 'd';
8386 case 32: return 's';
8387 case 16: return 'h';
8388 case 8 : return 'b';
8389 default: gcc_unreachable ();
8393 /* Print operand X to file F in a target specific manner according to CODE.
8394 The acceptable formatting commands given by CODE are:
8395 'c': An integer or symbol address without a preceding #
8397 'C': Take the duplicated element in a vector constant
8398 and print it in hex.
8399 'D': Take the duplicated element in a vector constant
8400 and print it as an unsigned integer, in decimal.
8401 'e': Print the sign/zero-extend size as a character 8->b,
8402 16->h, 32->w. Can also be used for masks:
8403 0xff->b, 0xffff->h, 0xffffffff->w.
8404 'I': If the operand is a duplicated vector constant,
8405 replace it with the duplicated scalar. If the
8406 operand is then a floating-point constant, replace
8407 it with the integer bit representation. Print the
8408 transformed constant as a signed decimal number.
8409 'p': Prints N such that 2^N == X (X must be power of 2 and
8411 'P': Print the number of non-zero bits in X (a const_int).
8412 'H': Print the higher numbered register of a pair (TImode)
8414 'm': Print a condition (eq, ne, etc).
8415 'M': Same as 'm', but invert condition.
8416 'N': Take the duplicated element in a vector constant
8417 and print the negative of it in decimal.
8418 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8419 'S/T/U/V': Print a FP/SIMD register name for a register list.
8420 The register printed is the FP/SIMD register name
8421 of X + 0/1/2/3 for S/T/U/V.
8422 'R': Print a scalar FP/SIMD register name + 1.
8423 'X': Print bottom 16 bits of integer constant in hex.
8424 'w/x': Print a general register name or the zero register
8426 '0': Print a normal operand, if it's a general register,
8427 then we assume DImode.
8428 'k': Print NZCV for conditional compare instructions.
8429 'A': Output address constant representing the first
8430 argument of X, specifying a relocation offset
8432 'L': Output constant address specified by X
8433 with a relocation offset if appropriate.
8434 'G': Prints address of X, specifying a PC relative
8435 relocation mode if appropriate.
8436 'y': Output address of LDP or STP - this is used for
8437 some LDP/STPs which don't use a PARALLEL in their
8438 pattern (so the mode needs to be adjusted).
8439 'z': Output address of a typical LDP or STP. */
8442 aarch64_print_operand (FILE *f
, rtx x
, int code
)
8448 switch (GET_CODE (x
))
8451 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
8455 output_addr_const (f
, x
);
8459 if (GET_CODE (XEXP (x
, 0)) == PLUS
8460 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
8462 output_addr_const (f
, x
);
8468 output_operand_lossage ("unsupported operand for code '%c'", code
);
8474 x
= unwrap_const_vec_duplicate (x
);
8475 if (!CONST_INT_P (x
))
8477 output_operand_lossage ("invalid operand for '%%%c'", code
);
8481 HOST_WIDE_INT val
= INTVAL (x
);
8482 if ((val
& ~7) == 8 || val
== 0xff)
8484 else if ((val
& ~7) == 16 || val
== 0xffff)
8486 else if ((val
& ~7) == 32 || val
== 0xffffffff)
8490 output_operand_lossage ("invalid operand for '%%%c'", code
);
8500 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
8502 output_operand_lossage ("invalid operand for '%%%c'", code
);
8506 asm_fprintf (f
, "%d", n
);
8511 if (!CONST_INT_P (x
))
8513 output_operand_lossage ("invalid operand for '%%%c'", code
);
8517 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
8521 if (x
== const0_rtx
)
8523 asm_fprintf (f
, "xzr");
8527 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
8529 output_operand_lossage ("invalid operand for '%%%c'", code
);
8533 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
8538 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
8539 if (CONST_INT_P (x
))
8540 asm_fprintf (f
, "%wd", INTVAL (x
));
8543 output_operand_lossage ("invalid operand for '%%%c'", code
);
8553 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8554 if (x
== const_true_rtx
)
8561 if (!COMPARISON_P (x
))
8563 output_operand_lossage ("invalid operand for '%%%c'", code
);
8567 cond_code
= aarch64_get_condition_code (x
);
8568 gcc_assert (cond_code
>= 0);
8570 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
8571 if (GET_MODE (XEXP (x
, 0)) == CC_NZCmode
)
8572 fputs (aarch64_sve_condition_codes
[cond_code
], f
);
8574 fputs (aarch64_condition_codes
[cond_code
], f
);
8579 if (!const_vec_duplicate_p (x
, &elt
))
8581 output_operand_lossage ("invalid vector constant");
8585 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
8586 asm_fprintf (f
, "%wd", -INTVAL (elt
));
8587 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
8588 && aarch64_print_vector_float_operand (f
, x
, true))
8592 output_operand_lossage ("invalid vector constant");
8602 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
8604 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
8607 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
8614 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
8616 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
8619 asm_fprintf (f
, "%c%d",
8620 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
8621 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
8625 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
8627 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
8630 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
8634 if (!CONST_INT_P (x
))
8636 output_operand_lossage ("invalid operand for '%%%c'", code
);
8639 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
8644 /* Print a replicated constant in hex. */
8645 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
8647 output_operand_lossage ("invalid operand for '%%%c'", code
);
8650 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
8651 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
8657 /* Print a replicated constant in decimal, treating it as unsigned. */
8659 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
8661 output_operand_lossage ("invalid operand for '%%%c'", code
);
8664 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
8665 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
8672 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
8674 asm_fprintf (f
, "%czr", code
);
8678 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
8680 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
8684 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
8686 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
8695 output_operand_lossage ("missing operand");
8699 switch (GET_CODE (x
))
8702 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
8704 if (REG_NREGS (x
) == 1)
8705 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
8709 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
8710 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
8711 REGNO (x
) - V0_REGNUM
, suffix
,
8712 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
8716 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
8720 output_address (GET_MODE (x
), XEXP (x
, 0));
8725 output_addr_const (asm_out_file
, x
);
8729 asm_fprintf (f
, "%wd", INTVAL (x
));
8733 if (!VECTOR_MODE_P (GET_MODE (x
)))
8735 output_addr_const (asm_out_file
, x
);
8741 if (!const_vec_duplicate_p (x
, &elt
))
8743 output_operand_lossage ("invalid vector constant");
8747 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
8748 asm_fprintf (f
, "%wd", INTVAL (elt
));
8749 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
8750 && aarch64_print_vector_float_operand (f
, x
, false))
8754 output_operand_lossage ("invalid vector constant");
8760 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8761 be getting CONST_DOUBLEs holding integers. */
8762 gcc_assert (GET_MODE (x
) != VOIDmode
);
8763 if (aarch64_float_const_zero_rtx_p (x
))
8768 else if (aarch64_float_const_representable_p (x
))
8771 char float_buf
[buf_size
] = {'\0'};
8772 real_to_decimal_for_mode (float_buf
,
8773 CONST_DOUBLE_REAL_VALUE (x
),
8776 asm_fprintf (asm_out_file
, "%s", float_buf
);
8780 output_operand_lossage ("invalid constant");
8783 output_operand_lossage ("invalid operand");
8789 if (GET_CODE (x
) == HIGH
)
8792 switch (aarch64_classify_symbolic_expression (x
))
8794 case SYMBOL_SMALL_GOT_4G
:
8795 asm_fprintf (asm_out_file
, ":got:");
8798 case SYMBOL_SMALL_TLSGD
:
8799 asm_fprintf (asm_out_file
, ":tlsgd:");
8802 case SYMBOL_SMALL_TLSDESC
:
8803 asm_fprintf (asm_out_file
, ":tlsdesc:");
8806 case SYMBOL_SMALL_TLSIE
:
8807 asm_fprintf (asm_out_file
, ":gottprel:");
8810 case SYMBOL_TLSLE24
:
8811 asm_fprintf (asm_out_file
, ":tprel:");
8814 case SYMBOL_TINY_GOT
:
8821 output_addr_const (asm_out_file
, x
);
8825 switch (aarch64_classify_symbolic_expression (x
))
8827 case SYMBOL_SMALL_GOT_4G
:
8828 asm_fprintf (asm_out_file
, ":lo12:");
8831 case SYMBOL_SMALL_TLSGD
:
8832 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
8835 case SYMBOL_SMALL_TLSDESC
:
8836 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
8839 case SYMBOL_SMALL_TLSIE
:
8840 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
8843 case SYMBOL_TLSLE12
:
8844 asm_fprintf (asm_out_file
, ":tprel_lo12:");
8847 case SYMBOL_TLSLE24
:
8848 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
8851 case SYMBOL_TINY_GOT
:
8852 asm_fprintf (asm_out_file
, ":got:");
8855 case SYMBOL_TINY_TLSIE
:
8856 asm_fprintf (asm_out_file
, ":gottprel:");
8862 output_addr_const (asm_out_file
, x
);
8866 switch (aarch64_classify_symbolic_expression (x
))
8868 case SYMBOL_TLSLE24
:
8869 asm_fprintf (asm_out_file
, ":tprel_hi12:");
8874 output_addr_const (asm_out_file
, x
);
8879 HOST_WIDE_INT cond_code
;
8881 if (!CONST_INT_P (x
))
8883 output_operand_lossage ("invalid operand for '%%%c'", code
);
8887 cond_code
= INTVAL (x
);
8888 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
8889 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
8896 machine_mode mode
= GET_MODE (x
);
8898 if (GET_CODE (x
) != MEM
8899 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
8901 output_operand_lossage ("invalid operand for '%%%c'", code
);
8905 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
8907 ? ADDR_QUERY_LDP_STP_N
8908 : ADDR_QUERY_LDP_STP
))
8909 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
8914 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
8919 /* Print address 'x' of a memory access with mode 'mode'.
8920 'type' is the address query type that is passed on to
8921 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for LDP/STP. */
8923 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
8924 aarch64_addr_query_type type
)
8926 struct aarch64_address_info addr
;
8929 /* Check all addresses are Pmode - including ILP32. */
8930 if (GET_MODE (x
) != Pmode
8931 && (!CONST_INT_P (x
)
8932 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
8934 output_operand_lossage ("invalid address mode");
8938 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
8941 case ADDRESS_REG_IMM
:
8942 if (known_eq (addr
.const_offset
, 0))
8943 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
8944 else if (aarch64_sve_data_mode_p (mode
))
8947 = exact_div (addr
.const_offset
,
8948 BYTES_PER_SVE_VECTOR
).to_constant ();
8949 asm_fprintf (f
, "[%s, #%wd, mul vl]",
8950 reg_names
[REGNO (addr
.base
)], vnum
);
8952 else if (aarch64_sve_pred_mode_p (mode
))
8955 = exact_div (addr
.const_offset
,
8956 BYTES_PER_SVE_PRED
).to_constant ();
8957 asm_fprintf (f
, "[%s, #%wd, mul vl]",
8958 reg_names
[REGNO (addr
.base
)], vnum
);
8961 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
8962 INTVAL (addr
.offset
));
8965 case ADDRESS_REG_REG
:
8966 if (addr
.shift
== 0)
8967 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
8968 reg_names
[REGNO (addr
.offset
)]);
8970 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
8971 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
8974 case ADDRESS_REG_UXTW
:
8975 if (addr
.shift
== 0)
8976 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
8977 REGNO (addr
.offset
) - R0_REGNUM
);
8979 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
8980 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
8983 case ADDRESS_REG_SXTW
:
8984 if (addr
.shift
== 0)
8985 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
8986 REGNO (addr
.offset
) - R0_REGNUM
);
8988 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
8989 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
8992 case ADDRESS_REG_WB
:
8993 /* Writeback is only supported for fixed-width modes. */
8994 size
= GET_MODE_SIZE (mode
).to_constant ();
8995 switch (GET_CODE (x
))
8998 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
9001 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
9004 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
9007 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
9010 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
9011 INTVAL (addr
.offset
));
9014 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
9015 INTVAL (addr
.offset
));
9022 case ADDRESS_LO_SUM
:
9023 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
9024 output_addr_const (f
, addr
.offset
);
9025 asm_fprintf (f
, "]");
9028 case ADDRESS_SYMBOLIC
:
9029 output_addr_const (f
, x
);
9036 /* Print address 'x' of a memory access with mode 'mode'. */
9038 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
9040 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
9041 output_addr_const (f
, x
);
9045 aarch64_label_mentioned_p (rtx x
)
9050 if (GET_CODE (x
) == LABEL_REF
)
9053 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9054 referencing instruction, but they are constant offsets, not
9056 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
9059 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
9060 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
9066 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
9067 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
9070 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
9077 /* Implement REGNO_REG_CLASS. */
9080 aarch64_regno_regclass (unsigned regno
)
9082 if (GP_REGNUM_P (regno
))
9083 return GENERAL_REGS
;
9085 if (regno
== SP_REGNUM
)
9088 if (regno
== FRAME_POINTER_REGNUM
9089 || regno
== ARG_POINTER_REGNUM
)
9090 return POINTER_REGS
;
9092 if (FP_REGNUM_P (regno
))
9093 return (FP_LO8_REGNUM_P (regno
) ? FP_LO8_REGS
9094 : FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
);
9096 if (PR_REGNUM_P (regno
))
9097 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
9102 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9103 If OFFSET is out of range, return an offset of an anchor point
9104 that is in range. Return 0 otherwise. */
9106 static HOST_WIDE_INT
9107 aarch64_anchor_offset (HOST_WIDE_INT offset
, HOST_WIDE_INT size
,
9110 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9112 return (offset
+ 0x400) & ~0x7f0;
9114 /* For offsets that aren't a multiple of the access size, the limit is
9116 if (offset
& (size
- 1))
9118 /* BLKmode typically uses LDP of X-registers. */
9119 if (mode
== BLKmode
)
9120 return (offset
+ 512) & ~0x3ff;
9121 return (offset
+ 0x100) & ~0x1ff;
9124 /* Small negative offsets are supported. */
9125 if (IN_RANGE (offset
, -256, 0))
9128 if (mode
== TImode
|| mode
== TFmode
)
9129 return (offset
+ 0x100) & ~0x1ff;
9131 /* Use 12-bit offset by access size. */
9132 return offset
& (~0xfff * size
);
9136 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
9138 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9139 where mask is selected by alignment and size of the offset.
9140 We try to pick as large a range for the offset as possible to
9141 maximize the chance of a CSE. However, for aligned addresses
9142 we limit the range to 4k so that structures with different sized
9143 elements are likely to use the same base. We need to be careful
9144 not to split a CONST for some forms of address expression, otherwise
9145 it will generate sub-optimal code. */
9147 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
9149 rtx base
= XEXP (x
, 0);
9150 rtx offset_rtx
= XEXP (x
, 1);
9151 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
9153 if (GET_CODE (base
) == PLUS
)
9155 rtx op0
= XEXP (base
, 0);
9156 rtx op1
= XEXP (base
, 1);
9158 /* Force any scaling into a temp for CSE. */
9159 op0
= force_reg (Pmode
, op0
);
9160 op1
= force_reg (Pmode
, op1
);
9162 /* Let the pointer register be in op0. */
9163 if (REG_POINTER (op1
))
9164 std::swap (op0
, op1
);
9166 /* If the pointer is virtual or frame related, then we know that
9167 virtual register instantiation or register elimination is going
9168 to apply a second constant. We want the two constants folded
9169 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9170 if (virt_or_elim_regno_p (REGNO (op0
)))
9172 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
9173 NULL_RTX
, true, OPTAB_DIRECT
);
9174 return gen_rtx_PLUS (Pmode
, base
, op1
);
9177 /* Otherwise, in order to encourage CSE (and thence loop strength
9178 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9179 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
9180 NULL_RTX
, true, OPTAB_DIRECT
);
9181 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
9185 if (GET_MODE_SIZE (mode
).is_constant (&size
))
9187 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
9189 if (base_offset
!= 0)
9191 base
= plus_constant (Pmode
, base
, base_offset
);
9192 base
= force_operand (base
, NULL_RTX
);
9193 return plus_constant (Pmode
, base
, offset
- base_offset
);
9202 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
9205 secondary_reload_info
*sri
)
9207 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9208 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9209 comment at the head of aarch64-sve.md for more details about the
9210 big-endian handling. */
9211 if (BYTES_BIG_ENDIAN
9212 && reg_class_subset_p (rclass
, FP_REGS
)
9213 && !((REG_P (x
) && HARD_REGISTER_P (x
))
9214 || aarch64_simd_valid_immediate (x
, NULL
))
9215 && aarch64_sve_data_mode_p (mode
))
9217 sri
->icode
= CODE_FOR_aarch64_sve_reload_be
;
9221 /* If we have to disable direct literal pool loads and stores because the
9222 function is too big, then we need a scratch register. */
9223 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
9224 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
9225 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
9226 && !aarch64_pcrelative_literal_loads
)
9228 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
9232 /* Without the TARGET_SIMD instructions we cannot move a Q register
9233 to a Q register directly. We need a scratch. */
9234 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
9235 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
9236 && reg_class_subset_p (rclass
, FP_REGS
))
9238 sri
->icode
= code_for_aarch64_reload_mov (mode
);
9242 /* A TFmode or TImode memory access should be handled via an FP_REGS
9243 because AArch64 has richer addressing modes for LDR/STR instructions
9244 than LDP/STP instructions. */
9245 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
9246 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
9249 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
9250 return GENERAL_REGS
;
9256 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED
, const int to
)
9258 gcc_assert (from
== ARG_POINTER_REGNUM
|| from
== FRAME_POINTER_REGNUM
);
9260 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9261 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9262 if (frame_pointer_needed
)
9263 return to
== HARD_FRAME_POINTER_REGNUM
;
9268 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
9270 if (to
== HARD_FRAME_POINTER_REGNUM
)
9272 if (from
== ARG_POINTER_REGNUM
)
9273 return cfun
->machine
->frame
.hard_fp_offset
;
9275 if (from
== FRAME_POINTER_REGNUM
)
9276 return cfun
->machine
->frame
.hard_fp_offset
9277 - cfun
->machine
->frame
.locals_offset
;
9280 if (to
== STACK_POINTER_REGNUM
)
9282 if (from
== FRAME_POINTER_REGNUM
)
9283 return cfun
->machine
->frame
.frame_size
9284 - cfun
->machine
->frame
.locals_offset
;
9287 return cfun
->machine
->frame
.frame_size
;
9290 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9294 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
9298 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
9303 aarch64_asm_trampoline_template (FILE *f
)
9308 if (aarch64_bti_enabled ())
9310 asm_fprintf (f
, "\thint\t34 // bti c\n");
9317 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", IP1_REGNUM
- R0_REGNUM
, offset1
);
9318 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
,
9323 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[IP1_REGNUM
], offset1
);
9324 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[STATIC_CHAIN_REGNUM
],
9327 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
9329 /* The trampoline needs an extra padding instruction. If BTI is
9330 enabled the padding instruction is replaced by the BTI instruction at
9332 if (!aarch64_bti_enabled ())
9333 assemble_aligned_integer (4, const0_rtx
);
9335 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
9336 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
9340 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
9342 rtx fnaddr
, mem
, a_tramp
;
9343 const int tramp_code_sz
= 16;
9345 /* Don't need to copy the trailing D-words, we fill those in below. */
9346 emit_block_move (m_tramp
, assemble_trampoline_template (),
9347 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
9348 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
9349 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
9350 if (GET_MODE (fnaddr
) != ptr_mode
)
9351 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
9352 emit_move_insn (mem
, fnaddr
);
9354 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
9355 emit_move_insn (mem
, chain_value
);
9357 /* XXX We should really define a "clear_cache" pattern and use
9358 gen_clear_cache(). */
9359 a_tramp
= XEXP (m_tramp
, 0);
9360 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
9361 LCT_NORMAL
, VOIDmode
, a_tramp
, ptr_mode
,
9362 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
9366 static unsigned char
9367 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
9369 /* ??? Logically we should only need to provide a value when
9370 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9371 can hold MODE, but at the moment we need to handle all modes.
9372 Just ignore any runtime parts for registers that can't store them. */
9373 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
9377 case TAILCALL_ADDR_REGS
:
9381 case POINTER_AND_FP_REGS
:
9385 if (aarch64_sve_data_mode_p (mode
)
9386 && constant_multiple_p (GET_MODE_SIZE (mode
),
9387 BYTES_PER_SVE_VECTOR
, &nregs
))
9389 return (aarch64_vector_data_mode_p (mode
)
9390 ? CEIL (lowest_size
, UNITS_PER_VREG
)
9391 : CEIL (lowest_size
, UNITS_PER_WORD
));
9408 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
9410 if (regclass
== POINTER_REGS
)
9411 return GENERAL_REGS
;
9413 if (regclass
== STACK_REG
)
9416 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
9422 /* Register elimination can result in a request for
9423 SP+constant->FP_REGS. We cannot support such operations which
9424 use SP as source and an FP_REG as destination, so reject out
9426 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
9428 rtx lhs
= XEXP (x
, 0);
9430 /* Look through a possible SUBREG introduced by ILP32. */
9431 if (GET_CODE (lhs
) == SUBREG
)
9432 lhs
= SUBREG_REG (lhs
);
9434 gcc_assert (REG_P (lhs
));
9435 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
9444 aarch64_asm_output_labelref (FILE* f
, const char *name
)
9446 asm_fprintf (f
, "%U%s", name
);
9450 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
9452 if (priority
== DEFAULT_INIT_PRIORITY
)
9453 default_ctor_section_asm_out_constructor (symbol
, priority
);
9457 /* While priority is known to be in range [0, 65535], so 18 bytes
9458 would be enough, the compiler might not know that. To avoid
9459 -Wformat-truncation false positive, use a larger size. */
9461 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
9462 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
9463 switch_to_section (s
);
9464 assemble_align (POINTER_SIZE
);
9465 assemble_aligned_integer (POINTER_BYTES
, symbol
);
9470 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
9472 if (priority
== DEFAULT_INIT_PRIORITY
)
9473 default_dtor_section_asm_out_destructor (symbol
, priority
);
9477 /* While priority is known to be in range [0, 65535], so 18 bytes
9478 would be enough, the compiler might not know that. To avoid
9479 -Wformat-truncation false positive, use a larger size. */
9481 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
9482 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
9483 switch_to_section (s
);
9484 assemble_align (POINTER_SIZE
);
9485 assemble_aligned_integer (POINTER_BYTES
, symbol
);
9490 aarch64_output_casesi (rtx
*operands
)
9494 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
9496 static const char *const patterns
[4][2] =
9499 "ldrb\t%w3, [%0,%w1,uxtw]",
9500 "add\t%3, %4, %w3, sxtb #2"
9503 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9504 "add\t%3, %4, %w3, sxth #2"
9507 "ldr\t%w3, [%0,%w1,uxtw #2]",
9508 "add\t%3, %4, %w3, sxtw #2"
9510 /* We assume that DImode is only generated when not optimizing and
9511 that we don't really need 64-bit address offsets. That would
9512 imply an object file with 8GB of code in a single function! */
9514 "ldr\t%w3, [%0,%w1,uxtw #2]",
9515 "add\t%3, %4, %w3, sxtw #2"
9519 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
9521 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
9522 index
= exact_log2 (GET_MODE_SIZE (mode
));
9524 gcc_assert (index
>= 0 && index
<= 3);
9526 /* Need to implement table size reduction, by changing the code below. */
9527 output_asm_insn (patterns
[index
][0], operands
);
9528 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
9529 snprintf (buf
, sizeof (buf
),
9530 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
9531 output_asm_insn (buf
, operands
);
9532 output_asm_insn (patterns
[index
][1], operands
);
9533 output_asm_insn ("br\t%3", operands
);
9534 assemble_label (asm_out_file
, label
);
9539 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9540 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9544 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
9546 if (shift
>= 0 && shift
<= 3)
9549 for (size
= 8; size
<= 32; size
*= 2)
9551 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
9552 if (mask
== bits
<< shift
)
9559 /* Constant pools are per function only when PC relative
9560 literal loads are true or we are in the large memory
9564 aarch64_can_use_per_function_literal_pools_p (void)
9566 return (aarch64_pcrelative_literal_loads
9567 || aarch64_cmodel
== AARCH64_CMODEL_LARGE
);
9571 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
9573 /* We can't use blocks for constants when we're using a per-function
9575 return !aarch64_can_use_per_function_literal_pools_p ();
9578 /* Select appropriate section for constants depending
9579 on where we place literal pools. */
9582 aarch64_select_rtx_section (machine_mode mode
,
9584 unsigned HOST_WIDE_INT align
)
9586 if (aarch64_can_use_per_function_literal_pools_p ())
9587 return function_section (current_function_decl
);
9589 return default_elf_select_rtx_section (mode
, x
, align
);
9592 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9594 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
9595 HOST_WIDE_INT offset
)
9597 /* When using per-function literal pools, we must ensure that any code
9598 section is aligned to the minimal instruction length, lest we get
9599 errors from the assembler re "unaligned instructions". */
9600 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
9601 ASM_OUTPUT_ALIGN (f
, 2);
9606 /* Helper function for rtx cost calculation. Strip a shift expression
9607 from X. Returns the inner operand if successful, or the original
9608 expression on failure. */
9610 aarch64_strip_shift (rtx x
)
9614 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9615 we can convert both to ROR during final output. */
9616 if ((GET_CODE (op
) == ASHIFT
9617 || GET_CODE (op
) == ASHIFTRT
9618 || GET_CODE (op
) == LSHIFTRT
9619 || GET_CODE (op
) == ROTATERT
9620 || GET_CODE (op
) == ROTATE
)
9621 && CONST_INT_P (XEXP (op
, 1)))
9622 return XEXP (op
, 0);
9624 if (GET_CODE (op
) == MULT
9625 && CONST_INT_P (XEXP (op
, 1))
9626 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
9627 return XEXP (op
, 0);
9632 /* Helper function for rtx cost calculation. Strip an extend
9633 expression from X. Returns the inner operand if successful, or the
9634 original expression on failure. We deal with a number of possible
9635 canonicalization variations here. If STRIP_SHIFT is true, then
9636 we can strip off a shift also. */
9638 aarch64_strip_extend (rtx x
, bool strip_shift
)
9640 scalar_int_mode mode
;
9643 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
9646 /* Zero and sign extraction of a widened value. */
9647 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
9648 && XEXP (op
, 2) == const0_rtx
9649 && GET_CODE (XEXP (op
, 0)) == MULT
9650 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
9652 return XEXP (XEXP (op
, 0), 0);
9654 /* It can also be represented (for zero-extend) as an AND with an
9656 if (GET_CODE (op
) == AND
9657 && GET_CODE (XEXP (op
, 0)) == MULT
9658 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
9659 && CONST_INT_P (XEXP (op
, 1))
9660 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
9661 INTVAL (XEXP (op
, 1))) != 0)
9662 return XEXP (XEXP (op
, 0), 0);
9664 /* Now handle extended register, as this may also have an optional
9665 left shift by 1..4. */
9667 && GET_CODE (op
) == ASHIFT
9668 && CONST_INT_P (XEXP (op
, 1))
9669 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
9672 if (GET_CODE (op
) == ZERO_EXTEND
9673 || GET_CODE (op
) == SIGN_EXTEND
)
9682 /* Return true iff CODE is a shift supported in combination
9683 with arithmetic instructions. */
9686 aarch64_shift_p (enum rtx_code code
)
9688 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
9692 /* Return true iff X is a cheap shift without a sign extend. */
9695 aarch64_cheap_mult_shift_p (rtx x
)
9702 if (!(aarch64_tune_params
.extra_tuning_flags
9703 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
9706 if (GET_CODE (op0
) == SIGN_EXTEND
)
9709 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
9710 && UINTVAL (op1
) <= 4)
9713 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
9716 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
9718 if (l2
> 0 && l2
<= 4)
9724 /* Helper function for rtx cost calculation. Calculate the cost of
9725 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9726 Return the calculated cost of the expression, recursing manually in to
9727 operands where needed. */
9730 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
9733 const struct cpu_cost_table
*extra_cost
9734 = aarch64_tune_params
.insn_extra_cost
;
9736 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
9737 machine_mode mode
= GET_MODE (x
);
9739 gcc_checking_assert (code
== MULT
);
9744 if (VECTOR_MODE_P (mode
))
9745 mode
= GET_MODE_INNER (mode
);
9747 /* Integer multiply/fma. */
9748 if (GET_MODE_CLASS (mode
) == MODE_INT
)
9750 /* The multiply will be canonicalized as a shift, cost it as such. */
9751 if (aarch64_shift_p (GET_CODE (x
))
9752 || (CONST_INT_P (op1
)
9753 && exact_log2 (INTVAL (op1
)) > 0))
9755 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
9756 || GET_CODE (op0
) == SIGN_EXTEND
;
9761 /* If the shift is considered cheap,
9762 then don't add any cost. */
9763 if (aarch64_cheap_mult_shift_p (x
))
9765 else if (REG_P (op1
))
9766 /* ARITH + shift-by-register. */
9767 cost
+= extra_cost
->alu
.arith_shift_reg
;
9769 /* ARITH + extended register. We don't have a cost field
9770 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9771 cost
+= extra_cost
->alu
.extend_arith
;
9773 /* ARITH + shift-by-immediate. */
9774 cost
+= extra_cost
->alu
.arith_shift
;
9777 /* LSL (immediate). */
9778 cost
+= extra_cost
->alu
.shift
;
9781 /* Strip extends as we will have costed them in the case above. */
9783 op0
= aarch64_strip_extend (op0
, true);
9785 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
9790 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9791 compound and let the below cases handle it. After all, MNEG is a
9792 special-case alias of MSUB. */
9793 if (GET_CODE (op0
) == NEG
)
9795 op0
= XEXP (op0
, 0);
9799 /* Integer multiplies or FMAs have zero/sign extending variants. */
9800 if ((GET_CODE (op0
) == ZERO_EXTEND
9801 && GET_CODE (op1
) == ZERO_EXTEND
)
9802 || (GET_CODE (op0
) == SIGN_EXTEND
9803 && GET_CODE (op1
) == SIGN_EXTEND
))
9805 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
9806 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
9811 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9812 cost
+= extra_cost
->mult
[0].extend_add
;
9814 /* MUL/SMULL/UMULL. */
9815 cost
+= extra_cost
->mult
[0].extend
;
9821 /* This is either an integer multiply or a MADD. In both cases
9822 we want to recurse and cost the operands. */
9823 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
9824 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
9830 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
9833 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
9842 /* Floating-point FMA/FMUL can also support negations of the
9843 operands, unless the rounding mode is upward or downward in
9844 which case FNMUL is different than FMUL with operand negation. */
9845 bool neg0
= GET_CODE (op0
) == NEG
;
9846 bool neg1
= GET_CODE (op1
) == NEG
;
9847 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
9850 op0
= XEXP (op0
, 0);
9852 op1
= XEXP (op1
, 0);
9856 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9857 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
9860 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
9863 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
9864 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
9870 aarch64_address_cost (rtx x
,
9872 addr_space_t as ATTRIBUTE_UNUSED
,
9875 enum rtx_code c
= GET_CODE (x
);
9876 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
9877 struct aarch64_address_info info
;
9881 if (!aarch64_classify_address (&info
, x
, mode
, false))
9883 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
9885 /* This is a CONST or SYMBOL ref which will be split
9886 in a different way depending on the code model in use.
9887 Cost it through the generic infrastructure. */
9888 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
9889 /* Divide through by the cost of one instruction to
9890 bring it to the same units as the address costs. */
9891 cost_symbol_ref
/= COSTS_N_INSNS (1);
9892 /* The cost is then the cost of preparing the address,
9893 followed by an immediate (possibly 0) offset. */
9894 return cost_symbol_ref
+ addr_cost
->imm_offset
;
9898 /* This is most likely a jump table from a case
9900 return addr_cost
->register_offset
;
9906 case ADDRESS_LO_SUM
:
9907 case ADDRESS_SYMBOLIC
:
9908 case ADDRESS_REG_IMM
:
9909 cost
+= addr_cost
->imm_offset
;
9912 case ADDRESS_REG_WB
:
9913 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
9914 cost
+= addr_cost
->pre_modify
;
9915 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
9916 cost
+= addr_cost
->post_modify
;
9922 case ADDRESS_REG_REG
:
9923 cost
+= addr_cost
->register_offset
;
9926 case ADDRESS_REG_SXTW
:
9927 cost
+= addr_cost
->register_sextend
;
9930 case ADDRESS_REG_UXTW
:
9931 cost
+= addr_cost
->register_zextend
;
9941 /* For the sake of calculating the cost of the shifted register
9942 component, we can treat same sized modes in the same way. */
9943 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
9944 cost
+= addr_cost
->addr_scale_costs
.hi
;
9945 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
9946 cost
+= addr_cost
->addr_scale_costs
.si
;
9947 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
9948 cost
+= addr_cost
->addr_scale_costs
.di
;
9950 /* We can't tell, or this is a 128-bit vector. */
9951 cost
+= addr_cost
->addr_scale_costs
.ti
;
9957 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9958 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9962 aarch64_branch_cost (bool speed_p
, bool predictable_p
)
9964 /* When optimizing for speed, use the cost of unpredictable branches. */
9965 const struct cpu_branch_cost
*branch_costs
=
9966 aarch64_tune_params
.branch_costs
;
9968 if (!speed_p
|| predictable_p
)
9969 return branch_costs
->predictable
;
9971 return branch_costs
->unpredictable
;
9974 /* Return true if the RTX X in mode MODE is a zero or sign extract
9975 usable in an ADD or SUB (extended register) instruction. */
9977 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
9979 /* Catch add with a sign or zero extract.
9980 This is add_<optab><mode>_multp2. */
9981 if (GET_CODE (x
) == SIGN_EXTRACT
9982 || GET_CODE (x
) == ZERO_EXTRACT
)
9984 rtx op0
= XEXP (x
, 0);
9985 rtx op1
= XEXP (x
, 1);
9986 rtx op2
= XEXP (x
, 2);
9988 if (GET_CODE (op0
) == MULT
9989 && CONST_INT_P (op1
)
9990 && op2
== const0_rtx
9991 && CONST_INT_P (XEXP (op0
, 1))
9992 && aarch64_is_extend_from_extract (mode
,
9999 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10001 else if (GET_CODE (x
) == SIGN_EXTEND
10002 || GET_CODE (x
) == ZERO_EXTEND
)
10003 return REG_P (XEXP (x
, 0));
10009 aarch64_frint_unspec_p (unsigned int u
)
10013 case UNSPEC_FRINTZ
:
10014 case UNSPEC_FRINTP
:
10015 case UNSPEC_FRINTM
:
10016 case UNSPEC_FRINTA
:
10017 case UNSPEC_FRINTN
:
10018 case UNSPEC_FRINTX
:
10019 case UNSPEC_FRINTI
:
10027 /* Return true iff X is an rtx that will match an extr instruction
10028 i.e. as described in the *extr<mode>5_insn family of patterns.
10029 OP0 and OP1 will be set to the operands of the shifts involved
10030 on success and will be NULL_RTX otherwise. */
10033 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
10036 scalar_int_mode mode
;
10037 if (!is_a
<scalar_int_mode
> (GET_MODE (x
), &mode
))
10040 *res_op0
= NULL_RTX
;
10041 *res_op1
= NULL_RTX
;
10043 if (GET_CODE (x
) != IOR
)
10049 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
10050 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
10052 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10053 if (GET_CODE (op1
) == ASHIFT
)
10054 std::swap (op0
, op1
);
10056 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
10059 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
10060 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
10062 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
10063 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
10065 *res_op0
= XEXP (op0
, 0);
10066 *res_op1
= XEXP (op1
, 0);
10074 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10075 storing it in *COST. Result is true if the total cost of the operation
10076 has now been calculated. */
10078 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
10082 enum rtx_code cmpcode
;
10084 if (COMPARISON_P (op0
))
10086 inner
= XEXP (op0
, 0);
10087 comparator
= XEXP (op0
, 1);
10088 cmpcode
= GET_CODE (op0
);
10093 comparator
= const0_rtx
;
10097 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
10099 /* Conditional branch. */
10100 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
10104 if (cmpcode
== NE
|| cmpcode
== EQ
)
10106 if (comparator
== const0_rtx
)
10108 /* TBZ/TBNZ/CBZ/CBNZ. */
10109 if (GET_CODE (inner
) == ZERO_EXTRACT
)
10111 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
10112 ZERO_EXTRACT
, 0, speed
);
10115 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
10120 else if (cmpcode
== LT
|| cmpcode
== GE
)
10123 if (comparator
== const0_rtx
)
10128 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
10131 if (GET_CODE (op1
) == COMPARE
)
10133 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10134 if (XEXP (op1
, 1) == const0_rtx
)
10138 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
10139 const struct cpu_cost_table
*extra_cost
10140 = aarch64_tune_params
.insn_extra_cost
;
10142 if (GET_MODE_CLASS (mode
) == MODE_INT
)
10143 *cost
+= extra_cost
->alu
.arith
;
10145 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
10150 /* It's a conditional operation based on the status flags,
10151 so it must be some flavor of CSEL. */
10153 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10154 if (GET_CODE (op1
) == NEG
10155 || GET_CODE (op1
) == NOT
10156 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
10157 op1
= XEXP (op1
, 0);
10158 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
10160 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10161 op1
= XEXP (op1
, 0);
10162 op2
= XEXP (op2
, 0);
10165 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
10166 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
10170 /* We don't know what this is, cost all operands. */
10174 /* Check whether X is a bitfield operation of the form shift + extend that
10175 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10176 operand to which the bitfield operation is applied. Otherwise return
10180 aarch64_extend_bitfield_pattern_p (rtx x
)
10182 rtx_code outer_code
= GET_CODE (x
);
10183 machine_mode outer_mode
= GET_MODE (x
);
10185 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
10186 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
10189 rtx inner
= XEXP (x
, 0);
10190 rtx_code inner_code
= GET_CODE (inner
);
10191 machine_mode inner_mode
= GET_MODE (inner
);
10194 switch (inner_code
)
10197 if (CONST_INT_P (XEXP (inner
, 1))
10198 && (inner_mode
== QImode
|| inner_mode
== HImode
))
10199 op
= XEXP (inner
, 0);
10202 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
10203 && (inner_mode
== QImode
|| inner_mode
== HImode
))
10204 op
= XEXP (inner
, 0);
10207 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
10208 && (inner_mode
== QImode
|| inner_mode
== HImode
))
10209 op
= XEXP (inner
, 0);
10218 /* Return true if the mask and a shift amount from an RTX of the form
10219 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10220 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
10223 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode
, rtx mask
,
/* Both MASK and SHFT_AMNT must be constant integers, and the shift
   amount must be smaller than the bit size of MODE. */
10226 return CONST_INT_P (mask
) && CONST_INT_P (shft_amnt
)
10227 && INTVAL (shft_amnt
) < GET_MODE_BITSIZE (mode
)
/* MASK shifted right by SHFT_AMNT, plus one, must give a non-negative
   exact_log2 (presumably: it must be a power of two, i.e. the shifted
   mask is a run of low ones -- confirm against exact_log2). */
10228 && exact_log2 ((INTVAL (mask
) >> INTVAL (shft_amnt
)) + 1) >= 0
/* The left operand of this & is on a line missing from this listing.
   The visible part requires that operand to have no bits set below
   bit SHFT_AMNT. */
10230 & ((HOST_WIDE_INT_1U
<< INTVAL (shft_amnt
)) - 1)) == 0;
10233 /* Return true if the masks and a shift amount from an RTX of the form
10234 ((x & MASK1) | ((y << SHFT_AMNT) & MASK2)) are valid to combine into
10235 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
10238 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode
,
10239 unsigned HOST_WIDE_INT mask1
,
10240 unsigned HOST_WIDE_INT shft_amnt
,
10241 unsigned HOST_WIDE_INT mask2
)
10243 unsigned HOST_WIDE_INT t
;
10245 /* Verify that the two masks are exact complements of each other: no bit
   is set in both, and no bit is left out of both. */
10246 if (mask1
!= ~mask2
)
10249 /* Verify that mask2 is not all zeros or ones. */
10250 if (mask2
== 0 || mask2
== HOST_WIDE_INT_M1U
)
10253 /* The shift amount should always be less than the mode size. */
10254 gcc_assert (shft_amnt
< GET_MODE_BITSIZE (mode
));
10256 /* Verify that the mask being shifted is contiguous and would be in the
10257 least significant bits after shifting by shft_amnt. */
/* Adding the bit at position SHFT_AMNT to MASK2 carries through a
   contiguous run of ones that starts at that position. T == (T & -T)
   holds only when T is zero or has a single bit set, which is what
   such a carry leaves behind. */
10258 t
= mask2
+ (HOST_WIDE_INT_1U
<< shft_amnt
);
10259 return (t
== (t
& -t
));
10262 /* Calculate the cost of calculating X, storing it in *COST. Result
10263 is true if the total cost of the operation has now been calculated.
   MODE is the machine mode used when costing X and its operands, and
   SPEED is passed through to every recursive rtx_cost call.
   NOTE(review): PARAM is marked ATTRIBUTE_UNUSED but is forwarded to
   rtx_cost when costing a zero-extended memory operand further down;
   OUTER is not referenced in the visible part of the body. */
10265 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
10266 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
/* Per-operation extra costs, taken from the current tuning
   parameters. */
10269 const struct cpu_cost_table
*extra_cost
10270 = aarch64_tune_params
.insn_extra_cost
;
/* Rtx code of X. */
10271 int code
= GET_CODE (x
);
/* Filled in by the is_a <scalar_int_mode> / is_int_mode tests in the
   body. */
10272 scalar_int_mode int_mode
;
10274 /* By default, assume that everything has equivalent cost to the
10275 cheapest instruction. Any additional costs are applied as a delta
10276 above this default. */
10277 *cost
= COSTS_N_INSNS (1);
10282 /* The cost depends entirely on the operands to SET. */
10284 op0
= SET_DEST (x
);
10287 switch (GET_CODE (op0
))
10292 rtx address
= XEXP (op0
, 0);
10293 if (VECTOR_MODE_P (mode
))
10294 *cost
+= extra_cost
->ldst
.storev
;
10295 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10296 *cost
+= extra_cost
->ldst
.store
;
10297 else if (mode
== SFmode
)
10298 *cost
+= extra_cost
->ldst
.storef
;
10299 else if (mode
== DFmode
)
10300 *cost
+= extra_cost
->ldst
.stored
;
10303 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10307 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
10311 if (! REG_P (SUBREG_REG (op0
)))
10312 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
10314 /* Fall through. */
10316 /* The cost is one per vector-register copied. */
10317 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
10319 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
10320 *cost
= COSTS_N_INSNS (nregs
);
10322 /* const0_rtx is in general free, but we will use an
10323 instruction to set a register to 0. */
10324 else if (REG_P (op1
) || op1
== const0_rtx
)
10326 /* The cost is 1 per register copied. */
10327 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
10328 *cost
= COSTS_N_INSNS (nregs
);
10331 /* Cost is just the cost of the RHS of the set. */
10332 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
10337 /* Bit-field insertion. Strip any redundant widening of
10338 the RHS to meet the width of the target. */
10339 if (GET_CODE (op1
) == SUBREG
)
10340 op1
= SUBREG_REG (op1
);
10341 if ((GET_CODE (op1
) == ZERO_EXTEND
10342 || GET_CODE (op1
) == SIGN_EXTEND
)
10343 && CONST_INT_P (XEXP (op0
, 1))
10344 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
10345 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
10346 op1
= XEXP (op1
, 0);
10348 if (CONST_INT_P (op1
))
10350 /* MOV immediate is assumed to always be cheap. */
10351 *cost
= COSTS_N_INSNS (1);
10357 *cost
+= extra_cost
->alu
.bfi
;
10358 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
10364 /* We can't make sense of this, assume default cost. */
10365 *cost
= COSTS_N_INSNS (1);
10371 /* If an instruction can incorporate a constant within the
10372 instruction, the instruction's expression avoids calling
10373 rtx_cost() on the constant. If rtx_cost() is called on a
10374 constant, then it is usually because the constant must be
10375 moved into a register by one or more instructions.
10377 The exception is constant 0, which can be expressed
10378 as XZR/WZR and is therefore free. The exception to this is
10379 if we have (set (reg) (const0_rtx)) in which case we must cost
10380 the move. However, we can catch that when we cost the SET, so
10381 we don't need to consider that here. */
10382 if (x
== const0_rtx
)
10386 /* To an approximation, building any other constant is
10387 proportionally expensive to the number of instructions
10388 required to build that constant. This is true whether we
10389 are compiling for SPEED or otherwise. */
10390 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
10391 int_mode
= word_mode
;
10392 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
10393 (NULL_RTX
, x
, false, int_mode
));
10399 /* First determine number of instructions to do the move
10400 as an integer constant. */
10401 if (!aarch64_float_const_representable_p (x
)
10402 && !aarch64_can_const_movi_rtx_p (x
, mode
)
10403 && aarch64_float_const_rtx_p (x
))
10405 unsigned HOST_WIDE_INT ival
;
10406 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
10407 gcc_assert (succeed
);
10409 scalar_int_mode imode
= (mode
== HFmode
10411 : int_mode_for_mode (mode
).require ());
10412 int ncost
= aarch64_internal_mov_immediate
10413 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
10414 *cost
+= COSTS_N_INSNS (ncost
);
10420 /* mov[df,sf]_aarch64. */
10421 if (aarch64_float_const_representable_p (x
))
10422 /* FMOV (scalar immediate). */
10423 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
10424 else if (!aarch64_float_const_zero_rtx_p (x
))
10426 /* This will be a load from memory. */
10427 if (mode
== DFmode
)
10428 *cost
+= extra_cost
->ldst
.loadd
;
10430 *cost
+= extra_cost
->ldst
.loadf
;
10433 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10434 or MOV v0.s[0], wzr - neither of which are modeled by the
10435 cost tables. Just use the default cost. */
10445 /* For loads we want the base cost of a load, plus an
10446 approximation for the additional cost of the addressing mode. */
10448 rtx address
= XEXP (x
, 0);
10449 if (VECTOR_MODE_P (mode
))
10450 *cost
+= extra_cost
->ldst
.loadv
;
10451 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10452 *cost
+= extra_cost
->ldst
.load
;
10453 else if (mode
== SFmode
)
10454 *cost
+= extra_cost
->ldst
.loadf
;
10455 else if (mode
== DFmode
)
10456 *cost
+= extra_cost
->ldst
.loadd
;
10459 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10468 if (VECTOR_MODE_P (mode
))
10473 *cost
+= extra_cost
->vect
.alu
;
10478 if (GET_MODE_CLASS (mode
) == MODE_INT
)
10480 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
10481 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
10484 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
10488 /* Cost this as SUB wzr, X. */
10489 op0
= CONST0_RTX (mode
);
10494 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10496 /* Support (neg(fma...)) as a single instruction only if
10497 sign of zeros is unimportant. This matches the decision
10498 making in aarch64.md. */
10499 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
10502 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
10505 if (GET_CODE (op0
) == MULT
)
10508 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
10513 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
10523 if (VECTOR_MODE_P (mode
))
10524 *cost
+= extra_cost
->vect
.alu
;
10526 *cost
+= extra_cost
->alu
.clz
;
10535 if (op1
== const0_rtx
10536 && GET_CODE (op0
) == AND
)
10539 mode
= GET_MODE (op0
);
10543 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
10545 /* TODO: A write to the CC flags possibly costs extra, this
10546 needs encoding in the cost tables. */
10548 mode
= GET_MODE (op0
);
10550 if (GET_CODE (op0
) == AND
)
10556 if (GET_CODE (op0
) == PLUS
)
10558 /* ADDS (and CMN alias). */
10563 if (GET_CODE (op0
) == MINUS
)
10570 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
10571 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
10572 && CONST_INT_P (XEXP (op0
, 2)))
10574 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10575 Handle it here directly rather than going to cost_logic
10576 since we know the immediate generated for the TST is valid
10577 so we can avoid creating an intermediate rtx for it only
10578 for costing purposes. */
10580 *cost
+= extra_cost
->alu
.logical
;
10582 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
10583 ZERO_EXTRACT
, 0, speed
);
10587 if (GET_CODE (op1
) == NEG
)
10591 *cost
+= extra_cost
->alu
.arith
;
10593 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
10594 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
10600 /* Compare can freely swap the order of operands, and
10601 canonicalization puts the more complex operation first.
10602 But the integer MINUS logic expects the shift/extend
10603 operation in op1. */
10605 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
10613 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
10617 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
10619 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
10621 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
10622 /* FCMP supports constant 0.0 for no extra cost. */
10628 if (VECTOR_MODE_P (mode
))
10630 /* Vector compare. */
10632 *cost
+= extra_cost
->vect
.alu
;
10634 if (aarch64_float_const_zero_rtx_p (op1
))
10636 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra cost. */
10650 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
10652 /* Detect valid immediates. */
10653 if ((GET_MODE_CLASS (mode
) == MODE_INT
10654 || (GET_MODE_CLASS (mode
) == MODE_CC
10655 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
10656 && CONST_INT_P (op1
)
10657 && aarch64_uimm12_shift (INTVAL (op1
)))
10660 /* SUB(S) (immediate). */
10661 *cost
+= extra_cost
->alu
.arith
;
10665 /* Look for SUB (extended register). */
10666 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
10667 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
10670 *cost
+= extra_cost
->alu
.extend_arith
;
10672 op1
= aarch64_strip_extend (op1
, true);
10673 *cost
+= rtx_cost (op1
, VOIDmode
,
10674 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
10678 rtx new_op1
= aarch64_strip_extend (op1
, false);
10680 /* Cost this as an FMA-alike operation. */
10681 if ((GET_CODE (new_op1
) == MULT
10682 || aarch64_shift_p (GET_CODE (new_op1
)))
10683 && code
!= COMPARE
)
10685 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
10686 (enum rtx_code
) code
,
10691 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
10695 if (VECTOR_MODE_P (mode
))
10698 *cost
+= extra_cost
->vect
.alu
;
10700 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10703 *cost
+= extra_cost
->alu
.arith
;
10705 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10708 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10722 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
10723 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
10726 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
10727 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
10731 if (GET_MODE_CLASS (mode
) == MODE_INT
10732 && ((CONST_INT_P (op1
) && aarch64_uimm12_shift (INTVAL (op1
)))
10733 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
10735 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
10738 /* ADD (immediate). */
10739 *cost
+= extra_cost
->alu
.arith
;
10743 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
10745 /* Look for ADD (extended register). */
10746 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
10747 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
10750 *cost
+= extra_cost
->alu
.extend_arith
;
10752 op0
= aarch64_strip_extend (op0
, true);
10753 *cost
+= rtx_cost (op0
, VOIDmode
,
10754 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
10758 /* Strip any extend, leave shifts behind as we will
10759 cost them through mult_cost. */
10760 new_op0
= aarch64_strip_extend (op0
, false);
10762 if (GET_CODE (new_op0
) == MULT
10763 || aarch64_shift_p (GET_CODE (new_op0
)))
10765 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
10770 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
10774 if (VECTOR_MODE_P (mode
))
10777 *cost
+= extra_cost
->vect
.alu
;
10779 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10782 *cost
+= extra_cost
->alu
.arith
;
10784 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10787 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10794 *cost
= COSTS_N_INSNS (1);
10798 if (VECTOR_MODE_P (mode
))
10799 *cost
+= extra_cost
->vect
.alu
;
10801 *cost
+= extra_cost
->alu
.rev
;
10806 if (aarch_rev16_p (x
))
10808 *cost
= COSTS_N_INSNS (1);
10812 if (VECTOR_MODE_P (mode
))
10813 *cost
+= extra_cost
->vect
.alu
;
10815 *cost
+= extra_cost
->alu
.rev
;
10820 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
10822 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
10823 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
10825 *cost
+= extra_cost
->alu
.shift
;
10829 /* Fall through. */
10836 if (VECTOR_MODE_P (mode
))
10839 *cost
+= extra_cost
->vect
.alu
;
10844 && GET_CODE (op0
) == MULT
10845 && CONST_INT_P (XEXP (op0
, 1))
10846 && CONST_INT_P (op1
)
10847 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
10848 INTVAL (op1
)) != 0)
10850 /* This is a UBFM/SBFM. */
10851 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
10853 *cost
+= extra_cost
->alu
.bfx
;
10857 if (is_int_mode (mode
, &int_mode
))
10859 if (CONST_INT_P (op1
))
10861 /* We have a mask + shift version of a UBFIZ
10862 i.e. the *andim_ashift<mode>_bfiz pattern. */
10863 if (GET_CODE (op0
) == ASHIFT
10864 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
10867 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
10868 (enum rtx_code
) code
, 0, speed
);
10870 *cost
+= extra_cost
->alu
.bfx
;
10874 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
10876 /* We possibly get the immediate for free, this is not modelled. */
10878 *cost
+= rtx_cost (op0
, int_mode
,
10879 (enum rtx_code
) code
, 0, speed
);
10881 *cost
+= extra_cost
->alu
.logical
;
10890 /* Handle ORN, EON, or BIC. */
10891 if (GET_CODE (op0
) == NOT
)
10892 op0
= XEXP (op0
, 0);
10894 new_op0
= aarch64_strip_shift (op0
);
10896 /* If we had a shift on op0 then this is a logical-shift-
10897 by-register/immediate operation. Otherwise, this is just
10898 a logical operation. */
10901 if (new_op0
!= op0
)
10903 /* Shift by immediate. */
10904 if (CONST_INT_P (XEXP (op0
, 1)))
10905 *cost
+= extra_cost
->alu
.log_shift
;
10907 *cost
+= extra_cost
->alu
.log_shift_reg
;
10910 *cost
+= extra_cost
->alu
.logical
;
10913 /* In both cases we want to cost both operands. */
10914 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
10916 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
10926 op0
= aarch64_strip_shift (x
);
10928 if (VECTOR_MODE_P (mode
))
10931 *cost
+= extra_cost
->vect
.alu
;
10935 /* MVN-shifted-reg. */
10938 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
10941 *cost
+= extra_cost
->alu
.log_shift
;
10945 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10946 Handle the second form here taking care that 'a' in the above can be a shift. */
10948 else if (GET_CODE (op0
) == XOR
)
10950 rtx newop0
= XEXP (op0
, 0);
10951 rtx newop1
= XEXP (op0
, 1);
10952 rtx op0_stripped
= aarch64_strip_shift (newop0
);
10954 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
10955 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
10959 if (op0_stripped
!= newop0
)
10960 *cost
+= extra_cost
->alu
.log_shift
;
10962 *cost
+= extra_cost
->alu
.logical
;
10969 *cost
+= extra_cost
->alu
.logical
;
10976 /* If a value is written in SI mode, then zero extended to DI
10977 mode, the operation will in general be free as a write to
10978 a 'w' register implicitly zeroes the upper bits of an 'x'
10979 register. However, if this is
10981 (set (reg) (zero_extend (reg)))
10983 we must cost the explicit register move. */
10985 && GET_MODE (op0
) == SImode
10988 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
10990 /* If OP_COST is non-zero, then the cost of the zero extend
10991 is effectively the cost of the inner operation. Otherwise
10992 we have a MOV instruction and we take the cost from the MOV
10993 itself. This is true independently of whether we are
10994 optimizing for space or time. */
11000 else if (MEM_P (op0
))
11002 /* All loads can zero extend to any size for free. */
11003 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
11007 op0
= aarch64_extend_bitfield_pattern_p (x
);
11010 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
11012 *cost
+= extra_cost
->alu
.bfx
;
11018 if (VECTOR_MODE_P (mode
))
11021 *cost
+= extra_cost
->vect
.alu
;
11025 /* We generate an AND instead of UXTB/UXTH. */
11026 *cost
+= extra_cost
->alu
.logical
;
11032 if (MEM_P (XEXP (x
, 0)))
11037 rtx address
= XEXP (XEXP (x
, 0), 0);
11038 *cost
+= extra_cost
->ldst
.load_sign_extend
;
11041 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
11047 op0
= aarch64_extend_bitfield_pattern_p (x
);
11050 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
11052 *cost
+= extra_cost
->alu
.bfx
;
11058 if (VECTOR_MODE_P (mode
))
11059 *cost
+= extra_cost
->vect
.alu
;
11061 *cost
+= extra_cost
->alu
.extend
;
11069 if (CONST_INT_P (op1
))
11073 if (VECTOR_MODE_P (mode
))
11075 /* Vector shift (immediate). */
11076 *cost
+= extra_cost
->vect
.alu
;
11080 /* LSL (immediate), UBFM, UBFIZ and friends. These are all aliases. */
11082 *cost
+= extra_cost
->alu
.shift
;
11086 /* We can incorporate zero/sign extend for free. */
11087 if (GET_CODE (op0
) == ZERO_EXTEND
11088 || GET_CODE (op0
) == SIGN_EXTEND
)
11089 op0
= XEXP (op0
, 0);
11091 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
11096 if (VECTOR_MODE_P (mode
))
11099 /* Vector shift (register). */
11100 *cost
+= extra_cost
->vect
.alu
;
11106 *cost
+= extra_cost
->alu
.shift_reg
;
11108 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
11109 && CONST_INT_P (XEXP (op1
, 1))
11110 && known_eq (INTVAL (XEXP (op1
, 1)),
11111 GET_MODE_BITSIZE (mode
) - 1))
11113 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
11114 /* We already demanded XEXP (op1, 0) to be REG_P, so
11115 don't recurse into it. */
11119 return false; /* All arguments need to be in registers. */
11129 if (CONST_INT_P (op1
))
11131 /* ASR (immediate) and friends. */
11134 if (VECTOR_MODE_P (mode
))
11135 *cost
+= extra_cost
->vect
.alu
;
11137 *cost
+= extra_cost
->alu
.shift
;
11140 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
11145 if (VECTOR_MODE_P (mode
))
11148 /* Vector shift (register). */
11149 *cost
+= extra_cost
->vect
.alu
;
11154 /* ASR (register) and friends. */
11155 *cost
+= extra_cost
->alu
.shift_reg
;
11157 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
11158 && CONST_INT_P (XEXP (op1
, 1))
11159 && known_eq (INTVAL (XEXP (op1
, 1)),
11160 GET_MODE_BITSIZE (mode
) - 1))
11162 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
11163 /* We already demanded XEXP (op1, 0) to be REG_P, so
11164 don't recurse into it. */
11168 return false; /* All arguments need to be in registers. */
11173 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
11174 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
11178 *cost
+= extra_cost
->ldst
.load
;
11180 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
11181 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
11183 /* ADRP, followed by ADD. */
11184 *cost
+= COSTS_N_INSNS (1);
11186 *cost
+= 2 * extra_cost
->alu
.arith
;
11188 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
11189 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
11193 *cost
+= extra_cost
->alu
.arith
;
11198 /* One extra load instruction, after accessing the GOT. */
11199 *cost
+= COSTS_N_INSNS (1);
11201 *cost
+= extra_cost
->ldst
.load
;
11207 /* ADRP/ADD (immediate). */
11209 *cost
+= extra_cost
->alu
.arith
;
11217 if (VECTOR_MODE_P (mode
))
11218 *cost
+= extra_cost
->vect
.alu
;
11220 *cost
+= extra_cost
->alu
.bfx
;
11223 /* We can trust that the immediates used will be correct (there
11224 are no by-register forms), so we need only cost op0. */
11225 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
11229 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
11230 /* aarch64_rtx_mult_cost always handles recursion to its operands. */
11235 /* We can expand signed mod by power of 2 using a NEGS, two parallel
11236 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
11237 an unconditional negate. This case should only ever be reached through
11238 the set_smod_pow2_cheap check in expmed.c. */
11239 if (CONST_INT_P (XEXP (x
, 1))
11240 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
11241 && (mode
== SImode
|| mode
== DImode
))
11243 /* We expand to 4 instructions. Reset the baseline. */
11244 *cost
= COSTS_N_INSNS (4);
11247 *cost
+= 2 * extra_cost
->alu
.logical
11248 + 2 * extra_cost
->alu
.arith
;
11253 /* Fall-through. */
11257 /* Slightly prefer UMOD over SMOD. */
11258 if (VECTOR_MODE_P (mode
))
11259 *cost
+= extra_cost
->vect
.alu
;
11260 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11261 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
11262 + extra_cost
->mult
[mode
== DImode
].idiv
11263 + (code
== MOD
? 1 : 0));
11265 return false; /* All arguments need to be in registers. */
11272 if (VECTOR_MODE_P (mode
))
11273 *cost
+= extra_cost
->vect
.alu
;
11274 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11275 /* There is no integer SQRT, so only DIV and UDIV can get here. */
11277 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
11278 /* Slightly prefer UDIV over SDIV. */
11279 + (code
== DIV
? 1 : 0));
11281 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
11283 return false; /* All arguments need to be in registers. */
11286 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
11287 XEXP (x
, 2), cost
, speed
);
11300 return false; /* All arguments must be in registers. */
11309 if (VECTOR_MODE_P (mode
))
11310 *cost
+= extra_cost
->vect
.alu
;
11312 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
11315 /* FMSUB, FNMADD, and FNMSUB are free. */
11316 if (GET_CODE (op0
) == NEG
)
11317 op0
= XEXP (op0
, 0);
11319 if (GET_CODE (op2
) == NEG
)
11320 op2
= XEXP (op2
, 0);
11322 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11323 and the by-element operand as operand 0. */
11324 if (GET_CODE (op1
) == NEG
)
11325 op1
= XEXP (op1
, 0);
11327 /* Catch vector-by-element operations. The by-element operand can
11328 either be (vec_duplicate (vec_select (x))) or just
11329 (vec_select (x)), depending on whether we are multiplying by
11330 a vector or a scalar.
11332 Canonicalization is not very good in these cases, FMA4 will put the
11333 by-element operand as operand 0, FNMA4 will have it as operand 1. */
11334 if (GET_CODE (op0
) == VEC_DUPLICATE
)
11335 op0
= XEXP (op0
, 0);
11336 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
11337 op1
= XEXP (op1
, 0);
11339 if (GET_CODE (op0
) == VEC_SELECT
)
11340 op0
= XEXP (op0
, 0);
11341 else if (GET_CODE (op1
) == VEC_SELECT
)
11342 op1
= XEXP (op1
, 0);
11344 /* If the remaining parameters are not registers,
11345 get the cost to put them into registers. */
11346 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
11347 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
11348 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
11352 case UNSIGNED_FLOAT
:
11354 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
11360 if (VECTOR_MODE_P (mode
))
11362 /*Vector truncate. */
11363 *cost
+= extra_cost
->vect
.alu
;
11366 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
11370 case FLOAT_TRUNCATE
:
11373 if (VECTOR_MODE_P (mode
))
11375 /*Vector conversion. */
11376 *cost
+= extra_cost
->vect
.alu
;
11379 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
11386 /* Strip the rounding part. They will all be implemented
11387 by the fcvt* family of instructions anyway. */
11388 if (GET_CODE (x
) == UNSPEC
)
11390 unsigned int uns_code
= XINT (x
, 1);
11392 if (uns_code
== UNSPEC_FRINTA
11393 || uns_code
== UNSPEC_FRINTM
11394 || uns_code
== UNSPEC_FRINTN
11395 || uns_code
== UNSPEC_FRINTP
11396 || uns_code
== UNSPEC_FRINTZ
)
11397 x
= XVECEXP (x
, 0, 0);
11402 if (VECTOR_MODE_P (mode
))
11403 *cost
+= extra_cost
->vect
.alu
;
11405 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
11408 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11409 fixed-point fcvt. */
11410 if (GET_CODE (x
) == MULT
11411 && ((VECTOR_MODE_P (mode
)
11412 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
11413 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
11415 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
11420 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
11424 if (VECTOR_MODE_P (mode
))
11426 /* ABS (vector). */
11428 *cost
+= extra_cost
->vect
.alu
;
11430 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11434 /* FABD, which is analogous to FADD. */
11435 if (GET_CODE (op0
) == MINUS
)
11437 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
11438 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
11440 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11444 /* Simple FABS is analogous to FNEG. */
11446 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
11450 /* Integer ABS will either be split to
11451 two arithmetic instructions, or will be an ABS
11452 (scalar), which we don't model. */
11453 *cost
= COSTS_N_INSNS (2);
11455 *cost
+= 2 * extra_cost
->alu
.arith
;
11463 if (VECTOR_MODE_P (mode
))
11464 *cost
+= extra_cost
->vect
.alu
;
11467 /* FMAXNM/FMINNM/FMAX/FMIN.
11468 TODO: This may not be accurate for all implementations, but
11469 we do not model this in the cost tables. */
11470 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11476 /* The floating point round to integer frint* instructions. */
11477 if (aarch64_frint_unspec_p (XINT (x
, 1)))
11480 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
11485 if (XINT (x
, 1) == UNSPEC_RBIT
)
11488 *cost
+= extra_cost
->alu
.rev
;
11496 /* Decompose <su>muldi3_highpart. */
11497 if (/* (truncate:DI */
11500 && GET_MODE (XEXP (x
, 0)) == TImode
11501 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
11503 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
11504 /* (ANY_EXTEND:TI (reg:DI))
11505 (ANY_EXTEND:TI (reg:DI))) */
11506 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
11507 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
11508 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
11509 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
11510 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
11511 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
11512 /* (const_int 64) */
11513 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
11514 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
11518 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
11519 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
11520 mode
, MULT
, 0, speed
);
11521 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
11522 mode
, MULT
, 1, speed
);
11526 /* Fall through. */
11532 && flag_aarch64_verbose_cost
)
11533 fprintf (dump_file
,
11534 "\nFailed to cost RTX. Assuming default cost.\n");
11539 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11540 calculated for X. This cost is stored in *COST. Returns true
11541 if the total cost of X was calculated. All six arguments are
   forwarded unchanged to aarch64_rtx_costs. */
11543 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
11544 int param
, int *cost
, bool speed
)
11546 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
/* The first half of this condition is on a line missing from this
   listing; the visible half makes the dump depend on
   flag_aarch64_verbose_cost. */
11549 && flag_aarch64_verbose_cost
)
/* Dump X, then one line giving the SPEED setting ("Hot" or "Cold"),
   the cost, and whether RESULT marks it as final or partial. */
11551 print_rtl_single (dump_file
, x
);
11552 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
11553 speed
? "Hot" : "Cold",
11554 *cost
, result
? "final" : "partial");
/* Return the cost of moving a value of mode MODE from register class
   FROM_I to register class TO_I. Every result is built from the
   GP2GP, GP2FP, FP2GP and FP2FP entries of the regmove_cost table in
   the current tuning parameters. */
11561 aarch64_register_move_cost (machine_mode mode
,
11562 reg_class_t from_i
, reg_class_t to_i
)
11564 enum reg_class from
= (enum reg_class
) from_i
;
11565 enum reg_class to
= (enum reg_class
) to_i
;
11566 const struct cpu_regmove_cost
*regmove_cost
11567 = aarch64_tune_params
.regmove_cost
;
11569 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
/* The statement guarded by the test on TO is missing from this
   listing; presumably it mirrors the assignment to FROM below --
   confirm. */
11570 if (to
== TAILCALL_ADDR_REGS
|| to
== POINTER_REGS
)
11573 if (from
== TAILCALL_ADDR_REGS
|| from
== POINTER_REGS
)
11574 from
= GENERAL_REGS
;
11576 /* Moving between GPR and stack cost is the same as GP2GP. */
11577 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
11578 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
11579 return regmove_cost
->GP2GP
;
11581 /* To/From the stack register, we move via the gprs. */
/* Recurse twice with GENERAL_REGS as the intermediate class and add
   the two costs. */
11582 if (to
== STACK_REG
|| from
== STACK_REG
)
11583 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
11584 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
/* Modes whose size is known to be 16 bytes. */
11586 if (known_eq (GET_MODE_SIZE (mode
), 16))
11588 /* 128-bit operations on general registers require 2 instructions. */
11589 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
11590 return regmove_cost
->GP2GP
* 2;
11591 else if (from
== GENERAL_REGS
)
11592 return regmove_cost
->GP2FP
* 2;
11593 else if (to
== GENERAL_REGS
)
11594 return regmove_cost
->FP2GP
* 2;
11596 /* When AdvSIMD instructions are disabled it is not possible to move
11597 a 128-bit value directly between Q registers. This is handled in
11598 secondary reload. A general register is used as a scratch to move
11599 the upper DI value and the lower DI value is moved directly,
11600 hence the cost is the sum of three moves. */
/* The condition selecting between the next two returns is missing
   from this listing. */
11602 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
11604 return regmove_cost
->FP2FP
;
/* Remaining modes: one table entry, chosen by which side of the move
   is GENERAL_REGS. */
11607 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
11608 return regmove_cost
->GP2GP
;
11609 else if (from
== GENERAL_REGS
)
11610 return regmove_cost
->GP2FP
;
11611 else if (to
== GENERAL_REGS
)
11612 return regmove_cost
->FP2GP
;
11614 return regmove_cost
->FP2FP
;
/* Return the memory move cost from the current tuning parameters.
   MODE, RCLASS and IN are all ignored: the same memmov_cost value is
   returned for every mode, register class and direction. */
11618 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
11619 reg_class_t rclass ATTRIBUTE_UNUSED
,
11620 bool in ATTRIBUTE_UNUSED
)
11622 return aarch64_tune_params
.memmov_cost
;
11625 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11626 to optimize 1.0/sqrt. MODE is the mode of the operation. */
11629 use_rsqrt_p (machine_mode mode
)
/* Trapping math must be off and unsafe math optimizations on. In
   addition, either the tuning parameters enable the reciprocal square
   root approximation for MODE, or flag_mrecip_low_precision_sqrt is
   set. */
11631 return (!flag_trapping_math
11632 && flag_unsafe_math_optimizations
11633 && ((aarch64_tune_params
.approx_modes
->recip_sqrt
11634 & AARCH64_APPROX_MODE (mode
))
11635 || flag_mrecip_low_precision_sqrt
));
11638 /* Function to decide when to use the approximate reciprocal square root builtin. */
11642 aarch64_builtin_reciprocal (tree fndecl
)
/* Machine mode of the type of FNDECL's own type (presumably the
   function's return type -- confirm). */
11644 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
/* The statement guarded by this test is missing from this listing;
   the rsqrt lookup below is reached only when use_rsqrt_p accepts
   MODE. */
11646 if (!use_rsqrt_p (mode
))
/* Map FNDECL's machine-dependent function code through
   aarch64_builtin_rsqrt and return the result. */
11648 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl
));
11651 /* Emit instruction sequence to compute either the approximate square root
11652 or its approximate reciprocal, depending on the flag RECP, and return
11653 whether the sequence was emitted or not. DST receives the result and
   SRC is the argument; the working mode is taken from DST. */
11656 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
11658 machine_mode mode
= GET_MODE (dst
);
/* HFmode elements: the reciprocal form is asserted never to be
   requested. Any further handling of this case is on lines missing
   from this listing. */
11660 if (GET_MODE_INNER (mode
) == HFmode
)
11662 gcc_assert (!recp
);
/* True when the square root approximation is enabled neither by
   flag_mlow_precision_sqrt nor by the tuning parameters for MODE.
   The statement guarded here, and any enclosing condition, are not
   visible. */
11668 if (!(flag_mlow_precision_sqrt
11669 || (aarch64_tune_params
.approx_modes
->sqrt
11670 & AARCH64_APPROX_MODE (mode
))))
/* True under finite-math-only, under trapping math, without unsafe
   math optimizations, or when the function is optimized for size.
   The guarded statement is not visible. */
11673 if (flag_finite_math_only
11674 || flag_trapping_math
11675 || !flag_unsafe_math_optimizations
11676 || optimize_function_for_size_p (cfun
))
11680 /* Caller assumes we cannot fail. */
11681 gcc_assert (use_rsqrt_p (mode
));
/* Integer vector mode corresponding to MODE, and a fresh register in
   that mode to hold the mask. */
11683 machine_mode mmsk
= mode_for_int_vector (mode
).require ();
11684 rtx xmsk
= gen_reg_rtx (mmsk
);
11686 /* When calculating the approximate square root, compare the
11687 argument with 0.0 and create a mask. */
/* Part of the source expression of this SET is on a line missing from
   this listing. */
11688 emit_insn (gen_rtx_SET (xmsk
,
11690 gen_rtx_EQ (mmsk
, src
,
11691 CONST0_RTX (mode
)))));
11693 /* Estimate the approximate reciprocal square root. */
11694 rtx xdst
= gen_reg_rtx (mode
);
11695 emit_insn (gen_aarch64_rsqrte (mode
, xdst
, src
));
11697 /* Iterate over the series twice for SF and thrice for DF. */
11698 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
11700 /* Optionally iterate over the series once less for faster performance
11701 while sacrificing the accuracy. */
/* Each direction has its own low-precision flag. The adjustment that
   this test guards is missing from this listing. */
11702 if ((recp
&& flag_mrecip_low_precision_sqrt
)
11703 || (!recp
&& flag_mlow_precision_sqrt
))
11706 /* Iterate over the series to calculate the approximate reciprocal square
11708 rtx x1
= gen_reg_rtx (mode
);
11709 while (iterations
--)
11711 rtx x2
= gen_reg_rtx (mode
);
11712 emit_set_insn (x2
, gen_rtx_MULT (mode
, xdst
, xdst
));
11714 emit_insn (gen_aarch64_rsqrts (mode
, x1
, src
, x2
));
11716 if (iterations
> 0)
11717 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, x1
));
11722 /* Qualify the approximate reciprocal square root when the argument is
11723 0.0 by squashing the intermediary result to 0.0. */
11724 rtx xtmp
= gen_reg_rtx (mmsk
);
11725 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
11726 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
11727 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
11729 /* Calculate the approximate square root. */
11730 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, src
));
11733 /* Finalize the approximation. */
11734 emit_set_insn (dst
, gen_rtx_MULT (mode
, xdst
, x1
));
11739 /* Emit the instruction sequence to compute the approximation for the division
11740 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11743 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
11745 machine_mode mode
= GET_MODE (quo
);
11747 if (GET_MODE_INNER (mode
) == HFmode
)
11750 bool use_approx_division_p
= (flag_mlow_precision_div
11751 || (aarch64_tune_params
.approx_modes
->division
11752 & AARCH64_APPROX_MODE (mode
)));
11754 if (!flag_finite_math_only
11755 || flag_trapping_math
11756 || !flag_unsafe_math_optimizations
11757 || optimize_function_for_size_p (cfun
)
11758 || !use_approx_division_p
)
11761 if (!TARGET_SIMD
&& VECTOR_MODE_P (mode
))
11764 /* Estimate the approximate reciprocal. */
11765 rtx xrcp
= gen_reg_rtx (mode
);
11766 emit_insn (gen_aarch64_frecpe (mode
, xrcp
, den
));
11768 /* Iterate over the series twice for SF and thrice for DF. */
11769 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
11771 /* Optionally iterate over the series once less for faster performance,
11772 while sacrificing the accuracy. */
11773 if (flag_mlow_precision_div
)
11776 /* Iterate over the series to calculate the approximate reciprocal. */
11777 rtx xtmp
= gen_reg_rtx (mode
);
11778 while (iterations
--)
11780 emit_insn (gen_aarch64_frecps (mode
, xtmp
, xrcp
, den
));
11782 if (iterations
> 0)
11783 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
11786 if (num
!= CONST1_RTX (mode
))
11788 /* As the approximate reciprocal of DEN is already calculated, only
11789 calculate the approximate division when NUM is not 1.0. */
11790 rtx xnum
= force_reg (mode
, num
);
11791 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xnum
));
11794 /* Finalize the approximation. */
11795 emit_set_insn (quo
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
11799 /* Return the number of instructions that can be issued per cycle. */
11801 aarch64_sched_issue_rate (void)
11803 return aarch64_tune_params
.issue_rate
;
11807 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11809 int issue_rate
= aarch64_sched_issue_rate ();
11811 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
11815 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11816 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11817 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11820 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
11823 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
11827 /* Vectorizer cost model target hooks. */
11829 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11831 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
11833 int misalign ATTRIBUTE_UNUSED
)
11836 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
11839 if (vectype
!= NULL
)
11840 fp
= FLOAT_TYPE_P (vectype
);
11842 switch (type_of_cost
)
11845 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
11848 return costs
->scalar_load_cost
;
11851 return costs
->scalar_store_cost
;
11854 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
11857 return costs
->vec_align_load_cost
;
11860 return costs
->vec_store_cost
;
11862 case vec_to_scalar
:
11863 return costs
->vec_to_scalar_cost
;
11865 case scalar_to_vec
:
11866 return costs
->scalar_to_vec_cost
;
11868 case unaligned_load
:
11869 case vector_gather_load
:
11870 return costs
->vec_unalign_load_cost
;
11872 case unaligned_store
:
11873 case vector_scatter_store
:
11874 return costs
->vec_unalign_store_cost
;
11876 case cond_branch_taken
:
11877 return costs
->cond_taken_branch_cost
;
11879 case cond_branch_not_taken
:
11880 return costs
->cond_not_taken_branch_cost
;
11883 return costs
->vec_permute_cost
;
11885 case vec_promote_demote
:
11886 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
11888 case vec_construct
:
11889 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
11890 return elements
/ 2 + 1;
11893 gcc_unreachable ();
11897 /* Implement targetm.vectorize.add_stmt_cost. */
11899 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
11900 struct _stmt_vec_info
*stmt_info
, int misalign
,
11901 enum vect_cost_model_location where
)
11903 unsigned *cost
= (unsigned *) data
;
11904 unsigned retval
= 0;
11906 if (flag_vect_cost_model
)
11908 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
11910 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
11912 /* Statements in an inner loop relative to the loop being
11913 vectorized are weighted more heavily. The value here is
11914 arbitrary and could potentially be improved with analysis. */
11915 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
11916 count
*= 50; /* FIXME */
11918 retval
= (unsigned) (count
* stmt_cost
);
11919 cost
[where
] += retval
;
11925 static void initialize_aarch64_code_model (struct gcc_options
*);
11927 /* Parse the TO_PARSE string and put the architecture struct that it
11928 selects into RES and the architectural features into ISA_FLAGS.
11929 Return an aarch64_parse_opt_result describing the parse result.
11930 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11931 When the TO_PARSE string contains an invalid extension,
11932 a copy of the string is created and stored to INVALID_EXTENSION. */
11934 static enum aarch64_parse_opt_result
11935 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
11936 uint64_t *isa_flags
, std::string
*invalid_extension
)
11939 const struct processor
*arch
;
11942 ext
= strchr (to_parse
, '+');
11945 len
= ext
- to_parse
;
11947 len
= strlen (to_parse
);
11950 return AARCH64_PARSE_MISSING_ARG
;
11953 /* Loop through the list of supported ARCHes to find a match. */
11954 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
11956 if (strlen (arch
->name
) == len
11957 && strncmp (arch
->name
, to_parse
, len
) == 0)
11959 uint64_t isa_temp
= arch
->flags
;
11963 /* TO_PARSE string contains at least one extension. */
11964 enum aarch64_parse_opt_result ext_res
11965 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
11967 if (ext_res
!= AARCH64_PARSE_OK
)
11970 /* Extension parsing was successful. Confirm the result
11971 arch and ISA flags. */
11973 *isa_flags
= isa_temp
;
11974 return AARCH64_PARSE_OK
;
11978 /* ARCH name not found in list. */
11979 return AARCH64_PARSE_INVALID_ARG
;
11982 /* Parse the TO_PARSE string and put the result tuning in RES and the
11983 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11984 describing the parse result. If there is an error parsing, RES and
11985 ISA_FLAGS are left unchanged.
11986 When the TO_PARSE string contains an invalid extension,
11987 a copy of the string is created and stored to INVALID_EXTENSION. */
11989 static enum aarch64_parse_opt_result
11990 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
11991 uint64_t *isa_flags
, std::string
*invalid_extension
)
11994 const struct processor
*cpu
;
11997 ext
= strchr (to_parse
, '+');
12000 len
= ext
- to_parse
;
12002 len
= strlen (to_parse
);
12005 return AARCH64_PARSE_MISSING_ARG
;
12008 /* Loop through the list of supported CPUs to find a match. */
12009 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
12011 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
12013 uint64_t isa_temp
= cpu
->flags
;
12018 /* TO_PARSE string contains at least one extension. */
12019 enum aarch64_parse_opt_result ext_res
12020 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
12022 if (ext_res
!= AARCH64_PARSE_OK
)
12025 /* Extension parsing was successfull. Confirm the result
12026 cpu and ISA flags. */
12028 *isa_flags
= isa_temp
;
12029 return AARCH64_PARSE_OK
;
12033 /* CPU name not found in list. */
12034 return AARCH64_PARSE_INVALID_ARG
;
12037 /* Parse the TO_PARSE string and put the cpu it selects into RES.
12038 Return an aarch64_parse_opt_result describing the parse result.
12039 If the parsing fails the RES does not change. */
12041 static enum aarch64_parse_opt_result
12042 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
12044 const struct processor
*cpu
;
12046 /* Loop through the list of supported CPUs to find a match. */
12047 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
12049 if (strcmp (cpu
->name
, to_parse
) == 0)
12052 return AARCH64_PARSE_OK
;
12056 /* CPU name not found in list. */
12057 return AARCH64_PARSE_INVALID_ARG
;
12060 /* Parse TOKEN, which has length LENGTH to see if it is an option
12061 described in FLAG. If it is, return the index bit for that fusion type.
12062 If not, error (printing OPTION_NAME) and return zero. */
12064 static unsigned int
12065 aarch64_parse_one_option_token (const char *token
,
12067 const struct aarch64_flag_desc
*flag
,
12068 const char *option_name
)
12070 for (; flag
->name
!= NULL
; flag
++)
12072 if (length
== strlen (flag
->name
)
12073 && !strncmp (flag
->name
, token
, length
))
12077 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name
, token
);
12081 /* Parse OPTION which is a comma-separated list of flags to enable.
12082 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12083 default state we inherit from the CPU tuning structures. OPTION_NAME
12084 gives the top-level option we are parsing in the -moverride string,
12085 for use in error messages. */
12087 static unsigned int
12088 aarch64_parse_boolean_options (const char *option
,
12089 const struct aarch64_flag_desc
*flags
,
12090 unsigned int initial_state
,
12091 const char *option_name
)
12093 const char separator
= '.';
12094 const char* specs
= option
;
12095 const char* ntoken
= option
;
12096 unsigned int found_flags
= initial_state
;
12098 while ((ntoken
= strchr (specs
, separator
)))
12100 size_t token_length
= ntoken
- specs
;
12101 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
12105 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12106 in the token stream, reset the supported operations. So:
12108 adrp+add.cmp+branch.none.adrp+add
12110 would have the result of turning on only adrp+add fusion. */
12114 found_flags
|= token_ops
;
12118 /* We ended with a comma, print something. */
12121 error ("%s string ill-formed\n", option_name
);
12125 /* We still have one more token to parse. */
12126 size_t token_length
= strlen (specs
);
12127 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
12134 found_flags
|= token_ops
;
12135 return found_flags
;
12138 /* Support for overriding instruction fusion. */
12141 aarch64_parse_fuse_string (const char *fuse_string
,
12142 struct tune_params
*tune
)
12144 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
12145 aarch64_fusible_pairs
,
12150 /* Support for overriding other tuning flags. */
12153 aarch64_parse_tune_string (const char *tune_string
,
12154 struct tune_params
*tune
)
12156 tune
->extra_tuning_flags
12157 = aarch64_parse_boolean_options (tune_string
,
12158 aarch64_tuning_flags
,
12159 tune
->extra_tuning_flags
,
12163 /* Parse the sve_width tuning moverride string in TUNE_STRING.
12164 Accept the valid SVE vector widths allowed by
12165 aarch64_sve_vector_bits_enum and use it to override sve_width
12169 aarch64_parse_sve_width_string (const char *tune_string
,
12170 struct tune_params
*tune
)
12174 int n
= sscanf (tune_string
, "%d", &width
);
12177 error ("invalid format for sve_width");
12189 error ("invalid sve_width value: %d", width
);
12191 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
12194 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
12195 we understand. If it is, extract the option string and handoff to
12196 the appropriate function. */
12199 aarch64_parse_one_override_token (const char* token
,
12201 struct tune_params
*tune
)
12203 const struct aarch64_tuning_override_function
*fn
12204 = aarch64_tuning_override_functions
;
12206 const char *option_part
= strchr (token
, '=');
12209 error ("tuning string missing in option (%s)", token
);
12213 /* Get the length of the option name. */
12214 length
= option_part
- token
;
12215 /* Skip the '=' to get to the option string. */
12218 for (; fn
->name
!= NULL
; fn
++)
12220 if (!strncmp (fn
->name
, token
, length
))
12222 fn
->parse_override (option_part
, tune
);
12227 error ("unknown tuning option (%s)",token
);
12231 /* A checking mechanism for the implementation of the tls size. */
12234 initialize_aarch64_tls_size (struct gcc_options
*opts
)
12236 if (aarch64_tls_size
== 0)
12237 aarch64_tls_size
= 24;
12239 switch (opts
->x_aarch64_cmodel_var
)
12241 case AARCH64_CMODEL_TINY
:
12242 /* Both the default and maximum TLS size allowed under tiny is 1M which
12243 needs two instructions to address, so we clamp the size to 24. */
12244 if (aarch64_tls_size
> 24)
12245 aarch64_tls_size
= 24;
12247 case AARCH64_CMODEL_SMALL
:
12248 /* The maximum TLS size allowed under small is 4G. */
12249 if (aarch64_tls_size
> 32)
12250 aarch64_tls_size
= 32;
12252 case AARCH64_CMODEL_LARGE
:
12253 /* The maximum TLS size allowed under large is 16E.
12254 FIXME: 16E should be 64bit, we only support 48bit offset now. */
12255 if (aarch64_tls_size
> 48)
12256 aarch64_tls_size
= 48;
12259 gcc_unreachable ();
12265 /* Parse STRING looking for options in the format:
12266 string :: option:string
12267 option :: name=substring
12269 substring :: defined by option. */
12272 aarch64_parse_override_string (const char* input_string
,
12273 struct tune_params
* tune
)
12275 const char separator
= ':';
12276 size_t string_length
= strlen (input_string
) + 1;
12277 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
12278 char *string
= string_root
;
12279 strncpy (string
, input_string
, string_length
);
12280 string
[string_length
- 1] = '\0';
12282 char* ntoken
= string
;
12284 while ((ntoken
= strchr (string
, separator
)))
12286 size_t token_length
= ntoken
- string
;
12287 /* Make this substring look like a string. */
12289 aarch64_parse_one_override_token (string
, token_length
, tune
);
12293 /* One last option to parse. */
12294 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
12295 free (string_root
);
12300 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
12302 if (accepted_branch_protection_string
)
12304 opts
->x_aarch64_branch_protection_string
12305 = xstrdup (accepted_branch_protection_string
);
12308 /* PR 70044: We have to be careful about being called multiple times for the
12309 same function. This means all changes should be repeatable. */
12311 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12312 Disable the frame pointer flag so the mid-end will not use a frame
12313 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12314 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12315 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12316 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
12317 if (opts
->x_flag_omit_frame_pointer
== 0)
12318 opts
->x_flag_omit_frame_pointer
= 2;
12320 /* If not optimizing for size, set the default
12321 alignment to what the target wants. */
12322 if (!opts
->x_optimize_size
)
12324 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
12325 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
12326 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
12327 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
12328 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
12329 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
12332 /* We default to no pc-relative literal loads. */
12334 aarch64_pcrelative_literal_loads
= false;
12336 /* If -mpc-relative-literal-loads is set on the command line, this
12337 implies that the user asked for PC relative literal loads. */
12338 if (opts
->x_pcrelative_literal_loads
== 1)
12339 aarch64_pcrelative_literal_loads
= true;
12341 /* In the tiny memory model it makes no sense to disallow PC relative
12342 literal pool loads. */
12343 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
12344 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
12345 aarch64_pcrelative_literal_loads
= true;
12347 /* When enabling the lower precision Newton series for the square root, also
12348 enable it for the reciprocal square root, since the latter is an
12349 intermediary step for the former. */
12350 if (flag_mlow_precision_sqrt
)
12351 flag_mrecip_low_precision_sqrt
= true;
12354 /* 'Unpack' up the internal tuning structs and update the options
12355 in OPTS. The caller must have set up selected_tune and selected_arch
12356 as all the other target-specific codegen decisions are
12357 derived from them. */
12360 aarch64_override_options_internal (struct gcc_options
*opts
)
12362 aarch64_tune_flags
= selected_tune
->flags
;
12363 aarch64_tune
= selected_tune
->sched_core
;
12364 /* Make a copy of the tuning parameters attached to the core, which
12365 we may later overwrite. */
12366 aarch64_tune_params
= *(selected_tune
->tune
);
12367 aarch64_architecture_version
= selected_arch
->architecture_version
;
12369 if (opts
->x_aarch64_override_tune_string
)
12370 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
12371 &aarch64_tune_params
);
12373 /* This target defaults to strict volatile bitfields. */
12374 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
12375 opts
->x_flag_strict_volatile_bitfields
= 1;
12377 if (aarch64_stack_protector_guard
== SSP_GLOBAL
12378 && opts
->x_aarch64_stack_protector_guard_offset_str
)
12380 error ("incompatible options %<-mstack-protector-guard=global%> and "
12381 "%<-mstack-protector-guard-offset=%s%>",
12382 aarch64_stack_protector_guard_offset_str
);
12385 if (aarch64_stack_protector_guard
== SSP_SYSREG
12386 && !(opts
->x_aarch64_stack_protector_guard_offset_str
12387 && opts
->x_aarch64_stack_protector_guard_reg_str
))
12389 error ("both %<-mstack-protector-guard-offset%> and "
12390 "%<-mstack-protector-guard-reg%> must be used "
12391 "with %<-mstack-protector-guard=sysreg%>");
12394 if (opts
->x_aarch64_stack_protector_guard_reg_str
)
12396 if (strlen (opts
->x_aarch64_stack_protector_guard_reg_str
) > 100)
12397 error ("specify a system register with a small string length.");
12400 if (opts
->x_aarch64_stack_protector_guard_offset_str
)
12403 const char *str
= aarch64_stack_protector_guard_offset_str
;
12405 long offs
= strtol (aarch64_stack_protector_guard_offset_str
, &end
, 0);
12406 if (!*str
|| *end
|| errno
)
12407 error ("%qs is not a valid offset in %qs", str
,
12408 "-mstack-protector-guard-offset=");
12409 aarch64_stack_protector_guard_offset
= offs
;
12412 initialize_aarch64_code_model (opts
);
12413 initialize_aarch64_tls_size (opts
);
12415 int queue_depth
= 0;
12416 switch (aarch64_tune_params
.autoprefetcher_model
)
12418 case tune_params::AUTOPREFETCHER_OFF
:
12421 case tune_params::AUTOPREFETCHER_WEAK
:
12424 case tune_params::AUTOPREFETCHER_STRONG
:
12425 queue_depth
= max_insn_queue_index
+ 1;
12428 gcc_unreachable ();
12431 /* We don't mind passing in global_options_set here as we don't use
12432 the *options_set structs anyway. */
12433 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
,
12435 opts
->x_param_values
,
12436 global_options_set
.x_param_values
);
12438 /* Set up parameters to be used in prefetching algorithm. Do not
12439 override the defaults unless we are tuning for a core we have
12440 researched values for. */
12441 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
12442 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES
,
12443 aarch64_tune_params
.prefetch
->num_slots
,
12444 opts
->x_param_values
,
12445 global_options_set
.x_param_values
);
12446 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
12447 maybe_set_param_value (PARAM_L1_CACHE_SIZE
,
12448 aarch64_tune_params
.prefetch
->l1_cache_size
,
12449 opts
->x_param_values
,
12450 global_options_set
.x_param_values
);
12451 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
12452 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE
,
12453 aarch64_tune_params
.prefetch
->l1_cache_line_size
,
12454 opts
->x_param_values
,
12455 global_options_set
.x_param_values
);
12456 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
12457 maybe_set_param_value (PARAM_L2_CACHE_SIZE
,
12458 aarch64_tune_params
.prefetch
->l2_cache_size
,
12459 opts
->x_param_values
,
12460 global_options_set
.x_param_values
);
12461 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
12462 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES
,
12464 opts
->x_param_values
,
12465 global_options_set
.x_param_values
);
12466 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
12467 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE
,
12468 aarch64_tune_params
.prefetch
->minimum_stride
,
12469 opts
->x_param_values
,
12470 global_options_set
.x_param_values
);
12472 /* Use the alternative scheduling-pressure algorithm by default. */
12473 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM
, SCHED_PRESSURE_MODEL
,
12474 opts
->x_param_values
,
12475 global_options_set
.x_param_values
);
12477 /* If the user hasn't changed it via configure then set the default to 64 KB
12478 for the backend. */
12479 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
,
12480 DEFAULT_STK_CLASH_GUARD_SIZE
== 0
12481 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE
,
12482 opts
->x_param_values
,
12483 global_options_set
.x_param_values
);
12485 /* Validate the guard size. */
12486 int guard_size
= PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
12488 /* Enforce that interval is the same size as size so the mid-end does the
12490 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
,
12492 opts
->x_param_values
,
12493 global_options_set
.x_param_values
);
12495 /* The maybe_set calls won't update the value if the user has explicitly set
12496 one. Which means we need to validate that probing interval and guard size
12499 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
);
12500 if (guard_size
!= probe_interval
)
12501 error ("stack clash guard size %<%d%> must be equal to probing interval "
12502 "%<%d%>", guard_size
, probe_interval
);
12504 /* Enable sw prefetching at specified optimization level for
12505 CPUS that have prefetch. Lower optimization level threshold by 1
12506 when profiling is enabled. */
12507 if (opts
->x_flag_prefetch_loop_arrays
< 0
12508 && !opts
->x_optimize_size
12509 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
12510 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
12511 opts
->x_flag_prefetch_loop_arrays
= 1;
12513 if (opts
->x_aarch64_arch_string
== NULL
)
12514 opts
->x_aarch64_arch_string
= selected_arch
->name
;
12515 if (opts
->x_aarch64_cpu_string
== NULL
)
12516 opts
->x_aarch64_cpu_string
= selected_cpu
->name
;
12517 if (opts
->x_aarch64_tune_string
== NULL
)
12518 opts
->x_aarch64_tune_string
= selected_tune
->name
;
12520 aarch64_override_options_after_change_1 (opts
);
12523 /* Print a hint with a suggestion for a core or architecture name that
12524 most closely resembles what the user passed in STR. ARCH is true if
12525 the user is asking for an architecture name. ARCH is false if the user
12526 is asking for a core name. */
12529 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
12531 auto_vec
<const char *> candidates
;
12532 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
12533 for (; entry
->name
!= NULL
; entry
++)
12534 candidates
.safe_push (entry
->name
);
12536 #ifdef HAVE_LOCAL_CPU_DETECT
12537 /* Add also "native" as possible value. */
12539 candidates
.safe_push ("native");
12543 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
12545 inform (input_location
, "valid arguments are: %s;"
12546 " did you mean %qs?", s
, hint
);
12548 inform (input_location
, "valid arguments are: %s", s
);
12553 /* Print a hint with a suggestion for a core name that most closely resembles
12554 what the user passed in STR. */
12557 aarch64_print_hint_for_core (const char *str
)
12559 aarch64_print_hint_for_core_or_arch (str
, false);
12562 /* Print a hint with a suggestion for an architecture name that most closely
12563 resembles what the user passed in STR. */
12566 aarch64_print_hint_for_arch (const char *str
)
12568 aarch64_print_hint_for_core_or_arch (str
, true);
12572 /* Print a hint with a suggestion for an extension name
12573 that most closely resembles what the user passed in STR. */
12576 aarch64_print_hint_for_extensions (const std::string
&str
)
12578 auto_vec
<const char *> candidates
;
12579 aarch64_get_all_extension_candidates (&candidates
);
12581 const char *hint
= candidates_list_and_hint (str
.c_str (), s
, candidates
);
12583 inform (input_location
, "valid arguments are: %s;"
12584 " did you mean %qs?", s
, hint
);
12586 inform (input_location
, "valid arguments are: %s;", s
);
12591 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12592 specified in STR and throw errors if appropriate. Put the results if
12593 they are valid in RES and ISA_FLAGS. Return whether the option is
12597 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
12598 uint64_t *isa_flags
)
12600 std::string invalid_extension
;
12601 enum aarch64_parse_opt_result parse_res
12602 = aarch64_parse_cpu (str
, res
, isa_flags
, &invalid_extension
);
12604 if (parse_res
== AARCH64_PARSE_OK
)
12609 case AARCH64_PARSE_MISSING_ARG
:
12610 error ("missing cpu name in %<-mcpu=%s%>", str
);
12612 case AARCH64_PARSE_INVALID_ARG
:
12613 error ("unknown value %qs for %<-mcpu%>", str
);
12614 aarch64_print_hint_for_core (str
);
12616 case AARCH64_PARSE_INVALID_FEATURE
:
12617 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12618 invalid_extension
.c_str (), str
);
12619 aarch64_print_hint_for_extensions (invalid_extension
);
12622 gcc_unreachable ();
12628 /* Parses CONST_STR for branch protection features specified in
12629 aarch64_branch_protect_types, and set any global variables required. Returns
12630 the parsing result and assigns LAST_STR to the last processed token from
12631 CONST_STR so that it can be used for error reporting. */
12634 aarch64_parse_opt_result
aarch64_parse_branch_protection (const char *const_str
,
12637 char *str_root
= xstrdup (const_str
);
12638 char* token_save
= NULL
;
12639 char *str
= strtok_r (str_root
, "+", &token_save
);
12640 enum aarch64_parse_opt_result res
= AARCH64_PARSE_OK
;
12642 res
= AARCH64_PARSE_MISSING_ARG
;
12645 char *next_str
= strtok_r (NULL
, "+", &token_save
);
12646 /* Reset the branch protection features to their defaults. */
12647 aarch64_handle_no_branch_protection (NULL
, NULL
);
12649 while (str
&& res
== AARCH64_PARSE_OK
)
12651 const aarch64_branch_protect_type
* type
= aarch64_branch_protect_types
;
12652 bool found
= false;
12653 /* Search for this type. */
12654 while (type
&& type
->name
&& !found
&& res
== AARCH64_PARSE_OK
)
12656 if (strcmp (str
, type
->name
) == 0)
12659 res
= type
->handler (str
, next_str
);
12661 next_str
= strtok_r (NULL
, "+", &token_save
);
12666 if (found
&& res
== AARCH64_PARSE_OK
)
12668 bool found_subtype
= true;
12669 /* Loop through each token until we find one that isn't a
12671 while (found_subtype
)
12673 found_subtype
= false;
12674 const aarch64_branch_protect_type
*subtype
= type
->subtypes
;
12675 /* Search for the subtype. */
12676 while (str
&& subtype
&& subtype
->name
&& !found_subtype
12677 && res
== AARCH64_PARSE_OK
)
12679 if (strcmp (str
, subtype
->name
) == 0)
12681 found_subtype
= true;
12682 res
= subtype
->handler (str
, next_str
);
12684 next_str
= strtok_r (NULL
, "+", &token_save
);
12692 res
= AARCH64_PARSE_INVALID_ARG
;
12695 /* Copy the last processed token into the argument to pass it back.
12696 Used by option and attribute validation to print the offending token. */
12699 if (str
) strcpy (*last_str
, str
);
12700 else *last_str
= NULL
;
12702 if (res
== AARCH64_PARSE_OK
)
12704 /* If needed, alloc the accepted string then copy in const_str.
12705 Used by override_option_after_change_1. */
12706 if (!accepted_branch_protection_string
)
12707 accepted_branch_protection_string
= (char *) xmalloc (
12708 BRANCH_PROTECT_STR_MAX
12710 strncpy (accepted_branch_protection_string
, const_str
,
12711 BRANCH_PROTECT_STR_MAX
+ 1);
12712 /* Forcibly null-terminate. */
12713 accepted_branch_protection_string
[BRANCH_PROTECT_STR_MAX
] = '\0';
12719 aarch64_validate_mbranch_protection (const char *const_str
)
12721 char *str
= (char *) xmalloc (strlen (const_str
));
12722 enum aarch64_parse_opt_result res
=
12723 aarch64_parse_branch_protection (const_str
, &str
);
12724 if (res
== AARCH64_PARSE_INVALID_ARG
)
12725 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str
);
12726 else if (res
== AARCH64_PARSE_MISSING_ARG
)
12727 error ("missing argument for %<-mbranch-protection=%>");
12729 return res
== AARCH64_PARSE_OK
;
12732 /* Validate a command-line -march option. Parse the arch and extensions
12733 (if any) specified in STR and throw errors if appropriate. Put the
12734 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12735 option is valid. */
12738 aarch64_validate_march (const char *str
, const struct processor
**res
,
12739 uint64_t *isa_flags
)
12741 std::string invalid_extension
;
12742 enum aarch64_parse_opt_result parse_res
12743 = aarch64_parse_arch (str
, res
, isa_flags
, &invalid_extension
);
12745 if (parse_res
== AARCH64_PARSE_OK
)
12750 case AARCH64_PARSE_MISSING_ARG
:
12751 error ("missing arch name in %<-march=%s%>", str
);
12753 case AARCH64_PARSE_INVALID_ARG
:
12754 error ("unknown value %qs for %<-march%>", str
);
12755 aarch64_print_hint_for_arch (str
);
12757 case AARCH64_PARSE_INVALID_FEATURE
:
12758 error ("invalid feature modifier %qs in %<-march=%s%>",
12759 invalid_extension
.c_str (), str
);
12760 aarch64_print_hint_for_extensions (invalid_extension
);
12763 gcc_unreachable ();
12769 /* Validate a command-line -mtune option. Parse the cpu
12770 specified in STR and throw errors if appropriate. Put the
12771 result, if it is valid, in RES. Return whether the option is
12775 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
12777 enum aarch64_parse_opt_result parse_res
12778 = aarch64_parse_tune (str
, res
);
12780 if (parse_res
== AARCH64_PARSE_OK
)
12785 case AARCH64_PARSE_MISSING_ARG
:
12786 error ("missing cpu name in %<-mtune=%s%>", str
);
12788 case AARCH64_PARSE_INVALID_ARG
:
12789 error ("unknown value %qs for %<-mtune%>", str
);
12790 aarch64_print_hint_for_core (str
);
12793 gcc_unreachable ();
12798 /* Return the CPU corresponding to the enum CPU.
12799 If it doesn't specify a cpu, return the default. */
12801 static const struct processor
*
12802 aarch64_get_tune_cpu (enum aarch64_processor cpu
)
12804 if (cpu
!= aarch64_none
)
12805 return &all_cores
[cpu
];
12807 /* The & 0x3f is to extract the bottom 6 bits that encode the
12808 default cpu as selected by the --with-cpu GCC configure option
12810 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12811 flags mechanism should be reworked to make it more sane. */
12812 return &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
12815 /* Return the architecture corresponding to the enum ARCH.
12816 If it doesn't specify a valid architecture, return the default. */
12818 static const struct processor
*
12819 aarch64_get_arch (enum aarch64_arch arch
)
12821 if (arch
!= aarch64_no_arch
)
12822 return &all_architectures
[arch
];
12824 const struct processor
*cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
12826 return &all_architectures
[cpu
->arch
];
12829 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12832 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value
)
12834 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12835 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12836 deciding which .md file patterns to use and when deciding whether
12837 something is a legitimate address or constant. */
12838 if (value
== SVE_SCALABLE
|| value
== SVE_128
)
12839 return poly_uint16 (2, 2);
12841 return (int) value
/ 64;
12844 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12845 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12846 tuning structs. In particular it must set selected_tune and
12847 aarch64_isa_flags that define the available ISA features and tuning
12848 decisions. It must also set selected_arch as this will be used to
12849 output the .arch asm tags for each function. */
12852 aarch64_override_options (void)
12854 uint64_t cpu_isa
= 0;
12855 uint64_t arch_isa
= 0;
12856 aarch64_isa_flags
= 0;
12858 bool valid_cpu
= true;
12859 bool valid_tune
= true;
12860 bool valid_arch
= true;
12862 selected_cpu
= NULL
;
12863 selected_arch
= NULL
;
12864 selected_tune
= NULL
;
12866 if (aarch64_branch_protection_string
)
12867 aarch64_validate_mbranch_protection (aarch64_branch_protection_string
);
12869 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12870 If either of -march or -mtune is given, they override their
12871 respective component of -mcpu. */
12872 if (aarch64_cpu_string
)
12873 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
12876 if (aarch64_arch_string
)
12877 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
12880 if (aarch64_tune_string
)
12881 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
12883 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12884 SUBTARGET_OVERRIDE_OPTIONS
;
12887 /* If the user did not specify a processor, choose the default
12888 one for them. This will be the CPU set during configuration using
12889 --with-cpu, otherwise it is "generic". */
12894 selected_cpu
= &all_cores
[selected_arch
->ident
];
12895 aarch64_isa_flags
= arch_isa
;
12896 explicit_arch
= selected_arch
->arch
;
12900 /* Get default configure-time CPU. */
12901 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
12902 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
12906 explicit_tune_core
= selected_tune
->ident
;
12908 /* If both -mcpu and -march are specified check that they are architecturally
12909 compatible, warn if they're not and prefer the -march ISA flags. */
12910 else if (selected_arch
)
12912 if (selected_arch
->arch
!= selected_cpu
->arch
)
12914 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12915 all_architectures
[selected_cpu
->arch
].name
,
12916 selected_arch
->name
);
12918 aarch64_isa_flags
= arch_isa
;
12919 explicit_arch
= selected_arch
->arch
;
12920 explicit_tune_core
= selected_tune
? selected_tune
->ident
12921 : selected_cpu
->ident
;
12925 /* -mcpu but no -march. */
12926 aarch64_isa_flags
= cpu_isa
;
12927 explicit_tune_core
= selected_tune
? selected_tune
->ident
12928 : selected_cpu
->ident
;
12929 gcc_assert (selected_cpu
);
12930 selected_arch
= &all_architectures
[selected_cpu
->arch
];
12931 explicit_arch
= selected_arch
->arch
;
12934 /* Set the arch as well as we will need it when outputing
12935 the .arch directive in assembly. */
12936 if (!selected_arch
)
12938 gcc_assert (selected_cpu
);
12939 selected_arch
= &all_architectures
[selected_cpu
->arch
];
12942 if (!selected_tune
)
12943 selected_tune
= selected_cpu
;
12945 if (aarch64_enable_bti
== 2)
12947 #ifdef TARGET_ENABLE_BTI
12948 aarch64_enable_bti
= 1;
12950 aarch64_enable_bti
= 0;
12954 /* Return address signing is currently not supported for ILP32 targets. For
12955 LP64 targets use the configured option in the absence of a command-line
12956 option for -mbranch-protection. */
12957 if (!TARGET_ILP32
&& accepted_branch_protection_string
== NULL
)
12959 #ifdef TARGET_ENABLE_PAC_RET
12960 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
12962 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
12966 #ifndef HAVE_AS_MABI_OPTION
12967 /* The compiler may have been configured with 2.23.* binutils, which does
12968 not have support for ILP32. */
12970 error ("assembler does not support %<-mabi=ilp32%>");
12973 /* Convert -msve-vector-bits to a VG count. */
12974 aarch64_sve_vg
= aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits
);
12976 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
12977 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12979 /* Make sure we properly set up the explicit options. */
12980 if ((aarch64_cpu_string
&& valid_cpu
)
12981 || (aarch64_tune_string
&& valid_tune
))
12982 gcc_assert (explicit_tune_core
!= aarch64_none
);
12984 if ((aarch64_cpu_string
&& valid_cpu
)
12985 || (aarch64_arch_string
&& valid_arch
))
12986 gcc_assert (explicit_arch
!= aarch64_no_arch
);
12988 /* The pass to insert speculation tracking runs before
12989 shrink-wrapping and the latter does not know how to update the
12990 tracking status. So disable it in this case. */
12991 if (aarch64_track_speculation
)
12992 flag_shrink_wrap
= 0;
12994 aarch64_override_options_internal (&global_options
);
12996 /* Save these options as the default ones in case we push and pop them later
12997 while processing functions with potential target attributes. */
12998 target_option_default_node
= target_option_current_node
12999 = build_target_option_node (&global_options
);
13002 /* Implement targetm.override_options_after_change. */
13005 aarch64_override_options_after_change (void)
13007 aarch64_override_options_after_change_1 (&global_options
);
13010 static struct machine_function
*
13011 aarch64_init_machine_status (void)
13013 struct machine_function
*machine
;
13014 machine
= ggc_cleared_alloc
<machine_function
> ();
13019 aarch64_init_expanders (void)
13021 init_machine_status
= aarch64_init_machine_status
;
13024 /* A checking mechanism for the implementation of the various code models. */
13026 initialize_aarch64_code_model (struct gcc_options
*opts
)
13028 if (opts
->x_flag_pic
)
13030 switch (opts
->x_aarch64_cmodel_var
)
13032 case AARCH64_CMODEL_TINY
:
13033 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
13035 case AARCH64_CMODEL_SMALL
:
13036 #ifdef HAVE_AS_SMALL_PIC_RELOCS
13037 aarch64_cmodel
= (flag_pic
== 2
13038 ? AARCH64_CMODEL_SMALL_PIC
13039 : AARCH64_CMODEL_SMALL_SPIC
);
13041 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
13044 case AARCH64_CMODEL_LARGE
:
13045 sorry ("code model %qs with %<-f%s%>", "large",
13046 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
13049 gcc_unreachable ();
13053 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
13056 /* Implement TARGET_OPTION_SAVE. */
13059 aarch64_option_save (struct cl_target_option
*ptr
, struct gcc_options
*opts
)
13061 ptr
->x_aarch64_override_tune_string
= opts
->x_aarch64_override_tune_string
;
13062 ptr
->x_aarch64_branch_protection_string
13063 = opts
->x_aarch64_branch_protection_string
;
13066 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13067 using the information saved in PTR. */
13070 aarch64_option_restore (struct gcc_options
*opts
, struct cl_target_option
*ptr
)
13072 opts
->x_explicit_tune_core
= ptr
->x_explicit_tune_core
;
13073 selected_tune
= aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
13074 opts
->x_explicit_arch
= ptr
->x_explicit_arch
;
13075 selected_arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
13076 opts
->x_aarch64_override_tune_string
= ptr
->x_aarch64_override_tune_string
;
13077 opts
->x_aarch64_branch_protection_string
13078 = ptr
->x_aarch64_branch_protection_string
;
13079 if (opts
->x_aarch64_branch_protection_string
)
13081 aarch64_parse_branch_protection (opts
->x_aarch64_branch_protection_string
,
13085 aarch64_override_options_internal (opts
);
13088 /* Implement TARGET_OPTION_PRINT. */
13091 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
13093 const struct processor
*cpu
13094 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
13095 uint64_t isa_flags
= ptr
->x_aarch64_isa_flags
;
13096 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
13097 std::string extension
13098 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
13100 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
13101 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
13102 arch
->name
, extension
.c_str ());
13105 static GTY(()) tree aarch64_previous_fndecl
;
13108 aarch64_reset_previous_fndecl (void)
13110 aarch64_previous_fndecl
= NULL
;
13113 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13114 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13115 make sure optab availability predicates are recomputed when necessary. */
13118 aarch64_save_restore_target_globals (tree new_tree
)
13120 if (TREE_TARGET_GLOBALS (new_tree
))
13121 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
13122 else if (new_tree
== target_option_default_node
)
13123 restore_target_globals (&default_target_globals
);
13125 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
13128 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13129 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13130 of the function, if such exists. This function may be called multiple
13131 times on a single function so use aarch64_previous_fndecl to avoid
13132 setting up identical state. */
13135 aarch64_set_current_function (tree fndecl
)
13137 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
13140 tree old_tree
= (aarch64_previous_fndecl
13141 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
13144 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
13146 /* If current function has no attributes but the previous one did,
13147 use the default node. */
13148 if (!new_tree
&& old_tree
)
13149 new_tree
= target_option_default_node
;
13151 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13152 the default have been handled by aarch64_save_restore_target_globals from
13153 aarch64_pragma_target_parse. */
13154 if (old_tree
== new_tree
)
13157 aarch64_previous_fndecl
= fndecl
;
13159 /* First set the target options. */
13160 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
13162 aarch64_save_restore_target_globals (new_tree
);
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,    /* Attribute sets an enum variable.  */
  aarch64_attr_custom   /* Attribute requires a custom handling function.  */
};
13176 /* All the information needed to handle a target attribute.
13177 NAME is the name of the attribute.
13178 ATTR_TYPE specifies the type of behavior of the attribute as described
13179 in the definition of enum aarch64_attr_opt_type.
13180 ALLOW_NEG is true if the attribute supports a "no-" form.
13181 HANDLER is the function that takes the attribute string as an argument
13182 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13183 OPT_NUM is the enum specifying the option that the attribute modifies.
13184 This is needed for attributes that mirror the behavior of a command-line
13185 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
13186 aarch64_attr_enum. */
13188 struct aarch64_attribute_info
13191 enum aarch64_attr_opt_type attr_type
;
13193 bool (*handler
) (const char *);
13194 enum opt_code opt_num
;
13197 /* Handle the ARCH_STR argument to the arch= target attribute. */
13200 aarch64_handle_attr_arch (const char *str
)
13202 const struct processor
*tmp_arch
= NULL
;
13203 std::string invalid_extension
;
13204 enum aarch64_parse_opt_result parse_res
13205 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
, &invalid_extension
);
13207 if (parse_res
== AARCH64_PARSE_OK
)
13209 gcc_assert (tmp_arch
);
13210 selected_arch
= tmp_arch
;
13211 explicit_arch
= selected_arch
->arch
;
13217 case AARCH64_PARSE_MISSING_ARG
:
13218 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13220 case AARCH64_PARSE_INVALID_ARG
:
13221 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str
);
13222 aarch64_print_hint_for_arch (str
);
13224 case AARCH64_PARSE_INVALID_FEATURE
:
13225 error ("invalid feature modifier %s of value (\"%s\") in "
13226 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
13227 aarch64_print_hint_for_extensions (invalid_extension
);
13230 gcc_unreachable ();
13236 /* Handle the argument CPU_STR to the cpu= target attribute. */
13239 aarch64_handle_attr_cpu (const char *str
)
13241 const struct processor
*tmp_cpu
= NULL
;
13242 std::string invalid_extension
;
13243 enum aarch64_parse_opt_result parse_res
13244 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
, &invalid_extension
);
13246 if (parse_res
== AARCH64_PARSE_OK
)
13248 gcc_assert (tmp_cpu
);
13249 selected_tune
= tmp_cpu
;
13250 explicit_tune_core
= selected_tune
->ident
;
13252 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
13253 explicit_arch
= selected_arch
->arch
;
13259 case AARCH64_PARSE_MISSING_ARG
:
13260 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13262 case AARCH64_PARSE_INVALID_ARG
:
13263 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str
);
13264 aarch64_print_hint_for_core (str
);
13266 case AARCH64_PARSE_INVALID_FEATURE
:
13267 error ("invalid feature modifier %s of value (\"%s\") in "
13268 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
13269 aarch64_print_hint_for_extensions (invalid_extension
);
13272 gcc_unreachable ();
13278 /* Handle the argument STR to the branch-protection= attribute. */
13281 aarch64_handle_attr_branch_protection (const char* str
)
13283 char *err_str
= (char *) xmalloc (strlen (str
));
13284 enum aarch64_parse_opt_result res
= aarch64_parse_branch_protection (str
,
13286 bool success
= false;
13289 case AARCH64_PARSE_MISSING_ARG
:
13290 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13293 case AARCH64_PARSE_INVALID_ARG
:
13294 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13295 "=\")%> pragma or attribute", err_str
);
13297 case AARCH64_PARSE_OK
:
13299 /* Fall through. */
13300 case AARCH64_PARSE_INVALID_FEATURE
:
13303 gcc_unreachable ();
13309 /* Handle the argument STR to the tune= target attribute. */
13312 aarch64_handle_attr_tune (const char *str
)
13314 const struct processor
*tmp_tune
= NULL
;
13315 enum aarch64_parse_opt_result parse_res
13316 = aarch64_parse_tune (str
, &tmp_tune
);
13318 if (parse_res
== AARCH64_PARSE_OK
)
13320 gcc_assert (tmp_tune
);
13321 selected_tune
= tmp_tune
;
13322 explicit_tune_core
= selected_tune
->ident
;
13328 case AARCH64_PARSE_INVALID_ARG
:
13329 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str
);
13330 aarch64_print_hint_for_core (str
);
13333 gcc_unreachable ();
13339 /* Parse an architecture extensions target attribute string specified in STR.
13340 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13341 if successful. Update aarch64_isa_flags to reflect the ISA features
13345 aarch64_handle_attr_isa_flags (char *str
)
13347 enum aarch64_parse_opt_result parse_res
;
13348 uint64_t isa_flags
= aarch64_isa_flags
;
13350 /* We allow "+nothing" in the beginning to clear out all architectural
13351 features if the user wants to handpick specific features. */
13352 if (strncmp ("+nothing", str
, 8) == 0)
13358 std::string invalid_extension
;
13359 parse_res
= aarch64_parse_extension (str
, &isa_flags
, &invalid_extension
);
13361 if (parse_res
== AARCH64_PARSE_OK
)
13363 aarch64_isa_flags
= isa_flags
;
13369 case AARCH64_PARSE_MISSING_ARG
:
13370 error ("missing value in %<target()%> pragma or attribute");
13373 case AARCH64_PARSE_INVALID_FEATURE
:
13374 error ("invalid feature modifier %s of value (\"%s\") in "
13375 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
13379 gcc_unreachable ();
13385 /* The target attributes that we support. On top of these we also support just
13386 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13387 handled explicitly in aarch64_process_one_target_attr. */
13389 static const struct aarch64_attribute_info aarch64_attributes
[] =
13391 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
13392 OPT_mgeneral_regs_only
},
13393 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
13394 OPT_mfix_cortex_a53_835769
},
13395 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
13396 OPT_mfix_cortex_a53_843419
},
13397 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
13398 { "strict-align", aarch64_attr_mask
, true, NULL
, OPT_mstrict_align
},
13399 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
13400 OPT_momit_leaf_frame_pointer
},
13401 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
13402 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
13404 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
13405 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
13407 { "branch-protection", aarch64_attr_custom
, false,
13408 aarch64_handle_attr_branch_protection
, OPT_mbranch_protection_
},
13409 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
13410 OPT_msign_return_address_
},
13411 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
13414 /* Parse ARG_STR which contains the definition of one target attribute.
13415 Show appropriate errors if any or return true if the attribute is valid. */
13418 aarch64_process_one_target_attr (char *arg_str
)
13420 bool invert
= false;
13422 size_t len
= strlen (arg_str
);
13426 error ("malformed %<target()%> pragma or attribute");
13430 char *str_to_check
= (char *) alloca (len
+ 1);
13431 strcpy (str_to_check
, arg_str
);
13433 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13434 It is easier to detect and handle it explicitly here rather than going
13435 through the machinery for the rest of the target attributes in this
13437 if (*str_to_check
== '+')
13438 return aarch64_handle_attr_isa_flags (str_to_check
);
13440 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
13445 char *arg
= strchr (str_to_check
, '=');
13447 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13448 and point ARG to "foo". */
13454 const struct aarch64_attribute_info
*p_attr
;
13455 bool found
= false;
13456 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
13458 /* If the names don't match up, or the user has given an argument
13459 to an attribute that doesn't accept one, or didn't give an argument
13460 to an attribute that expects one, fail to match. */
13461 if (strcmp (str_to_check
, p_attr
->name
) != 0)
13465 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
13466 || p_attr
->attr_type
== aarch64_attr_enum
;
13468 if (attr_need_arg_p
^ (arg
!= NULL
))
13470 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
13474 /* If the name matches but the attribute does not allow "no-" versions
13475 then we can't match. */
13476 if (invert
&& !p_attr
->allow_neg
)
13478 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
13482 switch (p_attr
->attr_type
)
13484 /* Has a custom handler registered.
13485 For example, cpu=, arch=, tune=. */
13486 case aarch64_attr_custom
:
13487 gcc_assert (p_attr
->handler
);
13488 if (!p_attr
->handler (arg
))
13492 /* Either set or unset a boolean option. */
13493 case aarch64_attr_bool
:
13495 struct cl_decoded_option decoded
;
13497 generate_option (p_attr
->opt_num
, NULL
, !invert
,
13498 CL_TARGET
, &decoded
);
13499 aarch64_handle_option (&global_options
, &global_options_set
,
13500 &decoded
, input_location
);
13503 /* Set or unset a bit in the target_flags. aarch64_handle_option
13504 should know what mask to apply given the option number. */
13505 case aarch64_attr_mask
:
13507 struct cl_decoded_option decoded
;
13508 /* We only need to specify the option number.
13509 aarch64_handle_option will know which mask to apply. */
13510 decoded
.opt_index
= p_attr
->opt_num
;
13511 decoded
.value
= !invert
;
13512 aarch64_handle_option (&global_options
, &global_options_set
,
13513 &decoded
, input_location
);
13516 /* Use the option setting machinery to set an option to an enum. */
13517 case aarch64_attr_enum
:
13522 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
13523 &value
, CL_TARGET
);
13526 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
13527 NULL
, DK_UNSPECIFIED
, input_location
,
13532 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
13537 gcc_unreachable ();
13541 /* If we reached here we either have found an attribute and validated
13542 it or didn't match any. If we matched an attribute but its arguments
13543 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
        res++;

      str++;
    }

  return res;
}
13565 /* Parse the tree in ARGS that contains the target attribute information
13566 and update the global target options space. */
13569 aarch64_process_target_attr (tree args
)
13571 if (TREE_CODE (args
) == TREE_LIST
)
13575 tree head
= TREE_VALUE (args
);
13578 if (!aarch64_process_target_attr (head
))
13581 args
= TREE_CHAIN (args
);
13587 if (TREE_CODE (args
) != STRING_CST
)
13589 error ("attribute %<target%> argument not a string");
13593 size_t len
= strlen (TREE_STRING_POINTER (args
));
13594 char *str_to_check
= (char *) alloca (len
+ 1);
13595 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
13599 error ("malformed %<target()%> pragma or attribute");
13603 /* Used to catch empty spaces between commas i.e.
13604 attribute ((target ("attr1,,attr2"))). */
13605 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
13607 /* Handle multiple target attributes separated by ','. */
13608 char *token
= strtok_r (str_to_check
, ",", &str_to_check
);
13610 unsigned int num_attrs
= 0;
13614 if (!aarch64_process_one_target_attr (token
))
13616 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
13620 token
= strtok_r (NULL
, ",", &str_to_check
);
13623 if (num_attrs
!= num_commas
+ 1)
13625 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
13632 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13633 process attribute ((target ("..."))). */
13636 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
13638 struct cl_target_option cur_target
;
13641 tree new_target
, new_optimize
;
13642 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
13644 /* If what we're processing is the current pragma string then the
13645 target option node is already stored in target_option_current_node
13646 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13647 having to re-parse the string. This is especially useful to keep
13648 arm_neon.h compile times down since that header contains a lot
13649 of intrinsics enclosed in pragmas. */
13650 if (!existing_target
&& args
== current_target_pragma
)
13652 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
13655 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
13657 old_optimize
= build_optimization_node (&global_options
);
13658 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
13660 /* If the function changed the optimization levels as well as setting
13661 target options, start with the optimizations specified. */
13662 if (func_optimize
&& func_optimize
!= old_optimize
)
13663 cl_optimization_restore (&global_options
,
13664 TREE_OPTIMIZATION (func_optimize
));
13666 /* Save the current target options to restore at the end. */
13667 cl_target_option_save (&cur_target
, &global_options
);
13669 /* If fndecl already has some target attributes applied to it, unpack
13670 them so that we add this attribute on top of them, rather than
13671 overwriting them. */
13672 if (existing_target
)
13674 struct cl_target_option
*existing_options
13675 = TREE_TARGET_OPTION (existing_target
);
13677 if (existing_options
)
13678 cl_target_option_restore (&global_options
, existing_options
);
13681 cl_target_option_restore (&global_options
,
13682 TREE_TARGET_OPTION (target_option_current_node
));
13684 ret
= aarch64_process_target_attr (args
);
13686 /* Set up any additional state. */
13689 aarch64_override_options_internal (&global_options
);
13690 /* Initialize SIMD builtins if we haven't already.
13691 Set current_target_pragma to NULL for the duration so that
13692 the builtin initialization code doesn't try to tag the functions
13693 being built with the attributes specified by any current pragma, thus
13694 going into an infinite recursion. */
13697 tree saved_current_target_pragma
= current_target_pragma
;
13698 current_target_pragma
= NULL
;
13699 aarch64_init_simd_builtins ();
13700 current_target_pragma
= saved_current_target_pragma
;
13702 new_target
= build_target_option_node (&global_options
);
13707 new_optimize
= build_optimization_node (&global_options
);
13711 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
13713 if (old_optimize
!= new_optimize
)
13714 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
13717 cl_target_option_restore (&global_options
, &cur_target
);
13719 if (old_optimize
!= new_optimize
)
13720 cl_optimization_restore (&global_options
,
13721 TREE_OPTIMIZATION (old_optimize
));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  DONT_CARE is the value that
   encodes "don't care".  Return true if inlining is allowed.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
                                    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
13746 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13747 to inline CALLEE into CALLER based on target-specific info.
13748 Make sure that the caller and callee have compatible architectural
13749 features. Then go through the other possible target attributes
13750 and see if they can block inlining. Try not to reject always_inline
13751 callees unless they are incompatible architecturally. */
13754 aarch64_can_inline_p (tree caller
, tree callee
)
13756 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
13757 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
13759 struct cl_target_option
*caller_opts
13760 = TREE_TARGET_OPTION (caller_tree
? caller_tree
13761 : target_option_default_node
);
13763 struct cl_target_option
*callee_opts
13764 = TREE_TARGET_OPTION (callee_tree
? callee_tree
13765 : target_option_default_node
);
13767 /* Callee's ISA flags should be a subset of the caller's. */
13768 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
13769 != callee_opts
->x_aarch64_isa_flags
)
13772 /* Allow non-strict aligned functions inlining into strict
13774 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
13775 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
13776 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
13777 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
13780 bool always_inline
= lookup_attribute ("always_inline",
13781 DECL_ATTRIBUTES (callee
));
13783 /* If the architectural features match up and the callee is always_inline
13784 then the other attributes don't matter. */
13788 if (caller_opts
->x_aarch64_cmodel_var
13789 != callee_opts
->x_aarch64_cmodel_var
)
13792 if (caller_opts
->x_aarch64_tls_dialect
13793 != callee_opts
->x_aarch64_tls_dialect
)
13796 /* Honour explicit requests to workaround errata. */
13797 if (!aarch64_tribools_ok_for_inlining_p (
13798 caller_opts
->x_aarch64_fix_a53_err835769
,
13799 callee_opts
->x_aarch64_fix_a53_err835769
,
13800 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
13803 if (!aarch64_tribools_ok_for_inlining_p (
13804 caller_opts
->x_aarch64_fix_a53_err843419
,
13805 callee_opts
->x_aarch64_fix_a53_err843419
,
13806 2, TARGET_FIX_ERR_A53_843419
))
13809 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13810 caller and calle and they don't match up, reject inlining. */
13811 if (!aarch64_tribools_ok_for_inlining_p (
13812 caller_opts
->x_flag_omit_leaf_frame_pointer
,
13813 callee_opts
->x_flag_omit_leaf_frame_pointer
,
13817 /* If the callee has specific tuning overrides, respect them. */
13818 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
13819 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
13822 /* If the user specified tuning override strings for the
13823 caller and callee and they don't match up, reject inlining.
13824 We just do a string compare here, we don't analyze the meaning
13825 of the string, as it would be too costly for little gain. */
13826 if (callee_opts
->x_aarch64_override_tune_string
13827 && caller_opts
->x_aarch64_override_tune_string
13828 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
13829 caller_opts
->x_aarch64_override_tune_string
) != 0))
13835 /* Return true if SYMBOL_REF X binds locally.  When X has an
   associated decl (SYMBOL_REF_DECL), the answer is whatever
   targetm.binds_local_p reports for that decl; otherwise it is the
   SYMBOL_REF_LOCAL_P flag of X itself. */
13838 aarch64_symbol_binds_local_p (const_rtx x
)
13840 return (SYMBOL_REF_DECL (x
)
13841 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
13842 : SYMBOL_REF_LOCAL_P (x
));
13845 /* Return true if X is a thread-local SYMBOL_REF. */
13847 aarch64_tls_symbol_p (rtx x
)
13849 if (! TARGET_HAVE_TLS
)
13852 if (GET_CODE (x
) != SYMBOL_REF
)
13855 return SYMBOL_REF_TLS_MODEL (x
) != 0;
13858 /* Classify a TLS symbol into one of the TLS kinds. */
13859 enum aarch64_symbol_type
13860 aarch64_classify_tls_symbol (rtx x
)
13862 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
13866 case TLS_MODEL_GLOBAL_DYNAMIC
:
13867 case TLS_MODEL_LOCAL_DYNAMIC
:
13868 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
13870 case TLS_MODEL_INITIAL_EXEC
:
13871 switch (aarch64_cmodel
)
13873 case AARCH64_CMODEL_TINY
:
13874 case AARCH64_CMODEL_TINY_PIC
:
13875 return SYMBOL_TINY_TLSIE
;
13877 return SYMBOL_SMALL_TLSIE
;
13880 case TLS_MODEL_LOCAL_EXEC
:
13881 if (aarch64_tls_size
== 12)
13882 return SYMBOL_TLSLE12
;
13883 else if (aarch64_tls_size
== 24)
13884 return SYMBOL_TLSLE24
;
13885 else if (aarch64_tls_size
== 32)
13886 return SYMBOL_TLSLE32
;
13887 else if (aarch64_tls_size
== 48)
13888 return SYMBOL_TLSLE48
;
13890 gcc_unreachable ();
13892 case TLS_MODEL_EMULATED
:
13893 case TLS_MODEL_NONE
:
13894 return SYMBOL_FORCE_TO_MEM
;
13897 gcc_unreachable ();
13901 /* Return the correct method for accessing X + OFFSET, where X is either
13902 a SYMBOL_REF or LABEL_REF. */
13904 enum aarch64_symbol_type
13905 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
13907 if (GET_CODE (x
) == LABEL_REF
)
13909 switch (aarch64_cmodel
)
13911 case AARCH64_CMODEL_LARGE
:
13912 return SYMBOL_FORCE_TO_MEM
;
13914 case AARCH64_CMODEL_TINY_PIC
:
13915 case AARCH64_CMODEL_TINY
:
13916 return SYMBOL_TINY_ABSOLUTE
;
13918 case AARCH64_CMODEL_SMALL_SPIC
:
13919 case AARCH64_CMODEL_SMALL_PIC
:
13920 case AARCH64_CMODEL_SMALL
:
13921 return SYMBOL_SMALL_ABSOLUTE
;
13924 gcc_unreachable ();
13928 if (GET_CODE (x
) == SYMBOL_REF
)
13930 if (aarch64_tls_symbol_p (x
))
13931 return aarch64_classify_tls_symbol (x
);
13933 switch (aarch64_cmodel
)
13935 case AARCH64_CMODEL_TINY
:
13936 /* When we retrieve symbol + offset address, we have to make sure
13937 the offset does not cause overflow of the final address. But
13938 we have no way of knowing the address of symbol at compile time
13939 so we can't accurately say if the distance between the PC and
13940 symbol + offset is outside the addressable range of +/-1M in the
13941 TINY code model. So we rely on images not being greater than
13942 1M and cap the offset at 1M and anything beyond 1M will have to
13943 be loaded using an alternative mechanism. Furthermore if the
13944 symbol is a weak reference to something that isn't known to
13945 resolve to a symbol in this module, then force to memory. */
13946 if ((SYMBOL_REF_WEAK (x
)
13947 && !aarch64_symbol_binds_local_p (x
))
13948 || !IN_RANGE (offset
, -1048575, 1048575))
13949 return SYMBOL_FORCE_TO_MEM
;
13950 return SYMBOL_TINY_ABSOLUTE
;
13952 case AARCH64_CMODEL_SMALL
:
13953 /* Same reasoning as the tiny code model, but the offset cap here is 4G. */
13955 if ((SYMBOL_REF_WEAK (x
)
13956 && !aarch64_symbol_binds_local_p (x
))
13957 || !IN_RANGE (offset
, HOST_WIDE_INT_C (-4294967263),
13958 HOST_WIDE_INT_C (4294967264)))
13959 return SYMBOL_FORCE_TO_MEM
;
13960 return SYMBOL_SMALL_ABSOLUTE
;
13962 case AARCH64_CMODEL_TINY_PIC
:
13963 if (!aarch64_symbol_binds_local_p (x
))
13964 return SYMBOL_TINY_GOT
;
13965 return SYMBOL_TINY_ABSOLUTE
;
13967 case AARCH64_CMODEL_SMALL_SPIC
:
13968 case AARCH64_CMODEL_SMALL_PIC
:
13969 if (!aarch64_symbol_binds_local_p (x
))
13970 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
13971 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
13972 return SYMBOL_SMALL_ABSOLUTE
;
13974 case AARCH64_CMODEL_LARGE
:
13975 /* This is alright even in PIC code as the constant
13976 pool reference is always PC relative and within
13977 the same translation unit. */
13978 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
13979 return SYMBOL_SMALL_ABSOLUTE
;
13981 return SYMBOL_FORCE_TO_MEM
;
13984 gcc_unreachable ();
13988 /* By default push everything into the constant pool. */
13989 return SYMBOL_FORCE_TO_MEM
;
/* Return true if X satisfies CONSTANT_P and is also accepted by
   memory_address_p as a memory address for DImode. */
13993 aarch64_constant_address_p (rtx x
)
13995 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
/* Predicate on X for PIC code.  The visible test matches X when it is
   a bare SYMBOL_REF, or a CONST wrapping a PLUS whose first operand is
   a SYMBOL_REF (i.e. symbol + offset).
   NOTE(review): the statements that choose the result for the matching
   and non-matching cases are not visible in this excerpt -- confirm
   which outcome means "legitimate" before relying on this comment. */
13999 aarch64_legitimate_pic_operand_p (rtx x
)
14001 if (GET_CODE (x
) == SYMBOL_REF
14002 || (GET_CODE (x
) == CONST
14003 && GET_CODE (XEXP (x
, 0)) == PLUS
14004 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
14010 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
14011 that should be rematerialized rather than spilled. */
14014 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
14016 /* Support CSE and rematerialization of common constants. */
14017 if (CONST_INT_P (x
)
14018 || (CONST_DOUBLE_P (x
) && GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14019 || GET_CODE (x
) == CONST_VECTOR
)
14022 /* Do not allow vector struct mode constants for Advanced SIMD.
14023 We could support 0 and -1 easily, but they need support in
14024 aarch64-simd.md. */
14025 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14026 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
14029 /* Only accept variable-length vector constants if they can be handled directly.
14032 ??? It would be possible to handle rematerialization of other
14033 constants via secondary reloads. */
14034 if (vec_flags
& VEC_ANY_SVE
)
14035 return aarch64_simd_valid_immediate (x
, NULL
);
14037 if (GET_CODE (x
) == HIGH
)
14040 /* Accept polynomial constants that can be calculated by using the
14041 destination of a move as the sole temporary. Constants that
14042 require a second temporary cannot be rematerialized (they can't be
14043 forced to memory and also aren't legitimate constants). */
14045 if (poly_int_rtx_p (x
, &offset
))
14046 return aarch64_offset_temporaries (false, offset
) <= 1;
14048 /* If an offset is being added to something else, we need to allow the
14049 base to be moved into the destination register, meaning that there
14050 are no free temporaries for the offset. */
14051 x
= strip_offset (x
, &offset
);
14052 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
14055 /* Do not allow const (plus (anchor_symbol, const_int)). */
14056 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
14059 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
14060 so spilling them is better than rematerialization. */
14061 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
14064 /* Label references are always constant. */
14065 if (GET_CODE (x
) == LABEL_REF
)
/* Emit the aarch64_load_tp_hard pattern with TARGET as its operand.
   Before emitting, TARGET is replaced by a fresh Pmode pseudo
   (gen_reg_rtx) when it is unsuitable; the visible tests are that its
   mode is not Pmode or that it fails register_operand for Pmode.
   NOTE(review): the first operand of that condition and the function's
   return statement are not visible in this excerpt. */
14072 aarch64_load_tp (rtx target
)
14075 || GET_MODE (target
) != Pmode
14076 || !register_operand (target
, Pmode
))
14077 target
= gen_reg_rtx (Pmode
);
14079 /* Can return in any reg. */
14080 emit_insn (gen_aarch64_load_tp_hard (target
));
14084 /* On AAPCS systems, this is the "struct __va_list". */
14085 static GTY(()) tree va_list_type
;
14087 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14088 Return the type to use as __builtin_va_list.
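14090 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:

struct __va_list
{
void *__stack;
void *__gr_top;
void *__vr_top;
int __gr_offs;
int __vr_offs;
}; */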
14102 aarch64_build_builtin_va_list (void)
14105 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
14107 /* Create the type. */
14108 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
14109 /* Give it the required name. */
14110 va_list_name
= build_decl (BUILTINS_LOCATION
,
14112 get_identifier ("__va_list"),
14114 DECL_ARTIFICIAL (va_list_name
) = 1;
14115 TYPE_NAME (va_list_type
) = va_list_name
;
14116 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
14118 /* Create the fields. */
14119 f_stack
= build_decl (BUILTINS_LOCATION
,
14120 FIELD_DECL
, get_identifier ("__stack"),
14122 f_grtop
= build_decl (BUILTINS_LOCATION
,
14123 FIELD_DECL
, get_identifier ("__gr_top"),
14125 f_vrtop
= build_decl (BUILTINS_LOCATION
,
14126 FIELD_DECL
, get_identifier ("__vr_top"),
14128 f_groff
= build_decl (BUILTINS_LOCATION
,
14129 FIELD_DECL
, get_identifier ("__gr_offs"),
14130 integer_type_node
);
14131 f_vroff
= build_decl (BUILTINS_LOCATION
,
14132 FIELD_DECL
, get_identifier ("__vr_offs"),
14133 integer_type_node
);
14135 /* Tell tree-stdarg pass about our internal offset fields.
14136 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison,
14137 to identify whether the code is updating the va_list internal
14138 offset fields in an irregular way. */
14139 va_list_gpr_counter_field
= f_groff
;
14140 va_list_fpr_counter_field
= f_vroff
;
14142 DECL_ARTIFICIAL (f_stack
) = 1;
14143 DECL_ARTIFICIAL (f_grtop
) = 1;
14144 DECL_ARTIFICIAL (f_vrtop
) = 1;
14145 DECL_ARTIFICIAL (f_groff
) = 1;
14146 DECL_ARTIFICIAL (f_vroff
) = 1;
14148 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
14149 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
14150 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
14151 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
14152 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
14154 TYPE_FIELDS (va_list_type
) = f_stack
;
14155 DECL_CHAIN (f_stack
) = f_grtop
;
14156 DECL_CHAIN (f_grtop
) = f_vrtop
;
14157 DECL_CHAIN (f_vrtop
) = f_groff
;
14158 DECL_CHAIN (f_groff
) = f_vroff
;
14160 /* Compute its layout. */
14161 layout_type (va_list_type
);
14163 return va_list_type
;
14166 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14168 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
14170 const CUMULATIVE_ARGS
*cum
;
14171 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
14172 tree stack
, grtop
, vrtop
, groff
, vroff
;
14174 int gr_save_area_size
= cfun
->va_list_gpr_size
;
14175 int vr_save_area_size
= cfun
->va_list_fpr_size
;
14178 cum
= &crtl
->args
.info
;
14179 if (cfun
->va_list_gpr_size
)
14180 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
14181 cfun
->va_list_gpr_size
);
14182 if (cfun
->va_list_fpr_size
)
14183 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
14184 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
14188 gcc_assert (cum
->aapcs_nvrn
== 0);
14189 vr_save_area_size
= 0;
14192 f_stack
= TYPE_FIELDS (va_list_type_node
);
14193 f_grtop
= DECL_CHAIN (f_stack
);
14194 f_vrtop
= DECL_CHAIN (f_grtop
);
14195 f_groff
= DECL_CHAIN (f_vrtop
);
14196 f_vroff
= DECL_CHAIN (f_groff
);
14198 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
14200 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
14202 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
14204 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
14206 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
14209 /* Emit code to initialize STACK, which points to the next varargs stack
14210 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14211 by named arguments. STACK is 8-byte aligned. */
14212 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
14213 if (cum
->aapcs_stack_size
> 0)
14214 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
14215 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
14216 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14218 /* Emit code to initialize GRTOP, the top of the GR save area.
14219 virtual_incoming_args_rtx should have been 16 byte aligned. */
14220 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
14221 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
14222 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14224 /* Emit code to initialize VRTOP, the top of the VR save area.
14225 This address is gr_save_area_size bytes below GRTOP, rounded
14226 down to the next 16-byte boundary. */
14227 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
14228 vr_offset
= ROUND_UP (gr_save_area_size
,
14229 STACK_BOUNDARY
/ BITS_PER_UNIT
);
14232 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
14233 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
14234 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14236 /* Emit code to initialize GROFF, the offset from GRTOP of the
14237 next GPR argument. */
14238 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
14239 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
14240 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14242 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14243 of the next VR argument. */
14244 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
14245 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
14246 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14249 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14252 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
14253 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
14257 bool is_ha
; /* is HFA or HVA. */
14258 bool dw_align
; /* double-word align. */
14259 machine_mode ag_mode
= VOIDmode
;
14263 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
14264 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
14265 HOST_WIDE_INT size
, rsize
, adjust
, align
;
14266 tree t
, u
, cond1
, cond2
;
14268 indirect_p
= pass_va_arg_by_reference (type
);
14270 type
= build_pointer_type (type
);
14272 mode
= TYPE_MODE (type
);
14274 f_stack
= TYPE_FIELDS (va_list_type_node
);
14275 f_grtop
= DECL_CHAIN (f_stack
);
14276 f_vrtop
= DECL_CHAIN (f_grtop
);
14277 f_groff
= DECL_CHAIN (f_vrtop
);
14278 f_vroff
= DECL_CHAIN (f_groff
);
14280 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
14281 f_stack
, NULL_TREE
);
14282 size
= int_size_in_bytes (type
);
14286 = aarch64_function_arg_alignment (mode
, type
, &abi_break
) / BITS_PER_UNIT
;
14290 if (aarch64_vfp_is_call_or_return_candidate (mode
,
14296 /* No frontends can create types with variable-sized modes, so we
14297 shouldn't be asked to pass or return them. */
14298 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
14300 /* TYPE passed in fp/simd registers. */
14302 aarch64_err_no_fpadvsimd (mode
);
14304 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
14305 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
14306 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
14307 unshare_expr (valist
), f_vroff
, NULL_TREE
);
14309 rsize
= nregs
* UNITS_PER_VREG
;
14313 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
14314 adjust
= UNITS_PER_VREG
- ag_size
;
14316 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
14317 && size
< UNITS_PER_VREG
)
14319 adjust
= UNITS_PER_VREG
- size
;
14324 /* TYPE passed in general registers. */
14325 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
14326 unshare_expr (valist
), f_grtop
, NULL_TREE
);
14327 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
14328 unshare_expr (valist
), f_groff
, NULL_TREE
);
14329 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
14330 nregs
= rsize
/ UNITS_PER_WORD
;
14334 if (abi_break
&& warn_psabi
)
14335 inform (input_location
, "parameter passing for argument of type "
14336 "%qT changed in GCC 9.1", type
);
14340 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
14341 && size
< UNITS_PER_WORD
)
14343 adjust
= UNITS_PER_WORD
- size
;
14347 /* Get a local temporary for the field value. */
14348 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
14350 /* Emit code to branch if off >= 0. */
14351 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
14352 build_int_cst (TREE_TYPE (off
), 0));
14353 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
14357 /* Emit: offs = (offs + 15) & -16. */
14358 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
14359 build_int_cst (TREE_TYPE (off
), 15));
14360 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
14361 build_int_cst (TREE_TYPE (off
), -16));
14362 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
14367 /* Update ap.__[g|v]r_offs */
14368 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
14369 build_int_cst (TREE_TYPE (off
), rsize
));
14370 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
14374 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
14376 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14377 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
14378 build_int_cst (TREE_TYPE (f_off
), 0));
14379 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
14381 /* String up: make sure the assignment happens before the use. */
14382 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
14383 COND_EXPR_ELSE (cond1
) = t
;
14385 /* Prepare the trees handling the argument that is passed on the stack;
14386 the top-level node will be stored in ON_STACK. */
14387 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
14390 /* if (alignof(type) > 8) arg = (arg + 15) & -16; */
14391 t
= fold_build_pointer_plus_hwi (arg
, 15);
14392 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
14393 build_int_cst (TREE_TYPE (t
), -16));
14394 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
14398 /* Advance ap.__stack */
14399 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
14400 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
14401 build_int_cst (TREE_TYPE (t
), -8));
14402 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
14403 /* String up roundup and advance. */
14405 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
14406 /* String up with arg */
14407 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
14408 /* Big-endianness related address adjustment. */
14409 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
14410 && size
< UNITS_PER_WORD
)
14412 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
14413 size_int (UNITS_PER_WORD
- size
));
14414 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
14417 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
14418 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
14420 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14423 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
14424 build_int_cst (TREE_TYPE (off
), adjust
));
14426 t
= fold_convert (sizetype
, t
);
14427 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
14431 /* type ha; // treat as "struct {ftype field[n];}"
14432 ... [computing offs]
14433 for (i = 0; i < nregs; ++i, offs += 16)
14434 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
ha; */
14437 tree tmp_ha
, field_t
, field_ptr_t
;
14439 /* Declare a local variable. */
14440 tmp_ha
= create_tmp_var_raw (type
, "ha");
14441 gimple_add_tmp_var (tmp_ha
);
14443 /* Establish the base type. */
14447 field_t
= float_type_node
;
14448 field_ptr_t
= float_ptr_type_node
;
14451 field_t
= double_type_node
;
14452 field_ptr_t
= double_ptr_type_node
;
14455 field_t
= long_double_type_node
;
14456 field_ptr_t
= long_double_ptr_type_node
;
14459 field_t
= aarch64_fp16_type_node
;
14460 field_ptr_t
= aarch64_fp16_ptr_type_node
;
14465 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
14466 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
14467 field_ptr_t
= build_pointer_type (field_t
);
14474 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
14475 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
14477 t
= fold_convert (field_ptr_t
, addr
);
14478 t
= build2 (MODIFY_EXPR
, field_t
,
14479 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
14480 build1 (INDIRECT_REF
, field_t
, t
));
14482 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14483 for (i
= 1; i
< nregs
; ++i
)
14485 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
14486 u
= fold_convert (field_ptr_t
, addr
);
14487 u
= build2 (MODIFY_EXPR
, field_t
,
14488 build2 (MEM_REF
, field_t
, tmp_ha
,
14489 build_int_cst (field_ptr_t
,
14491 int_size_in_bytes (field_t
)))),
14492 build1 (INDIRECT_REF
, field_t
, u
));
14493 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
14496 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
14497 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
14500 COND_EXPR_ELSE (cond2
) = t
;
14501 addr
= fold_convert (build_pointer_type (type
), cond1
);
14502 addr
= build_va_arg_indirect_ref (addr
);
14505 addr
= build_va_arg_indirect_ref (addr
);
14510 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14513 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
,
14514 const function_arg_info
&arg
,
14515 int *pretend_size ATTRIBUTE_UNUSED
, int no_rtl
)
14517 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
14518 CUMULATIVE_ARGS local_cum
;
14519 int gr_saved
= cfun
->va_list_gpr_size
;
14520 int vr_saved
= cfun
->va_list_fpr_size
;
14522 /* The caller has advanced CUM up to, but not beyond, the last named
14523 argument. Advance a local copy of CUM past the last "real" named
14524 argument, to find out how many registers are left over. */
14526 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
),
14527 arg
.mode
, arg
.type
, arg
.named
);
14529 /* Find out how many registers we need to save.
14530 Honor tree-stdarg analysis results. */
14531 if (cfun
->va_list_gpr_size
)
14532 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
14533 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
14534 if (cfun
->va_list_fpr_size
)
14535 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
14536 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
14540 gcc_assert (local_cum
.aapcs_nvrn
== 0);
14550 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14551 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
14552 - gr_saved
* UNITS_PER_WORD
);
14553 mem
= gen_frame_mem (BLKmode
, ptr
);
14554 set_mem_alias_set (mem
, get_varargs_alias_set ());
14556 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
14561 /* We can't use move_block_from_reg, because it will use
14562 the wrong mode, storing D regs only. */
14563 machine_mode mode
= TImode
;
14564 int off
, i
, vr_start
;
14566 /* Set OFF to the offset from virtual_incoming_args_rtx of
14567 the first vector register. The VR save area lies below
14568 the GR one, and is aligned to 16 bytes. */
14569 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
14570 STACK_BOUNDARY
/ BITS_PER_UNIT
);
14571 off
-= vr_saved
* UNITS_PER_VREG
;
14573 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
14574 for (i
= 0; i
< vr_saved
; ++i
)
14578 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
14579 mem
= gen_frame_mem (mode
, ptr
);
14580 set_mem_alias_set (mem
, get_varargs_alias_set ());
14581 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
14582 off
+= UNITS_PER_VREG
;
14587 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14588 any complication of having crtl->args.pretend_args_size changed. */
14589 cfun
->machine
->frame
.saved_varargs_size
14590 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
14591 STACK_BOUNDARY
/ BITS_PER_UNIT
)
14592 + vr_saved
* UNITS_PER_VREG
);
/* Adjust the register-usage tables for the current target options.
   The visible code marks every register in V0..V31 and in P0..P15 as
   call-used, and, when aarch64_track_speculation is set, marks the
   speculation tracker and scratch registers as both fixed and
   call-used.
   NOTE(review): the conditions guarding the two loops, and any other
   statements in their bodies, are not visible in this excerpt --
   confirm when each loop actually runs. */
14596 aarch64_conditional_register_usage (void)
14601 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
14604 call_used_regs
[i
] = 1;
14608 for (i
= P0_REGNUM
; i
<= P15_REGNUM
; i
++)
14611 call_used_regs
[i
] = 1;
14614 /* When tracking speculation, we need a couple of call-clobbered registers
14615 to track the speculation state. It would be nice to just use
14616 IP0 and IP1, but currently there are numerous places that just
14617 assume these registers are free for other uses (e.g. pointer
14618 authentication). */
14619 if (aarch64_track_speculation
)
14621 fixed_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
14622 call_used_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
14623 fixed_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
14624 call_used_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
14628 /* Walk down the type tree of TYPE counting consecutive base elements.
14629 If *MODEP is VOIDmode, then set it to the first valid floating point
14630 type. If a non-floating point type is found, or if a floating point
14631 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14632 otherwise return the count in the sub-tree. */
14634 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
14637 HOST_WIDE_INT size
;
14639 switch (TREE_CODE (type
))
14642 mode
= TYPE_MODE (type
);
14643 if (mode
!= DFmode
&& mode
!= SFmode
14644 && mode
!= TFmode
&& mode
!= HFmode
)
14647 if (*modep
== VOIDmode
)
14650 if (*modep
== mode
)
14656 mode
= TYPE_MODE (TREE_TYPE (type
));
14657 if (mode
!= DFmode
&& mode
!= SFmode
14658 && mode
!= TFmode
&& mode
!= HFmode
)
14661 if (*modep
== VOIDmode
)
14664 if (*modep
== mode
)
14670 /* Use V2SImode and V4SImode as representatives of all 64-bit
14671 and 128-bit vector types. */
14672 size
= int_size_in_bytes (type
);
14685 if (*modep
== VOIDmode
)
14688 /* Vector modes are considered to be opaque: two vectors are
14689 equivalent for the purposes of being homogeneous aggregates
14690 if they are the same size. */
14691 if (*modep
== mode
)
14699 tree index
= TYPE_DOMAIN (type
);
14701 /* Can't handle incomplete types nor sizes that are not fixed. */
14703 if (!COMPLETE_TYPE_P (type
)
14704 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14707 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
14710 || !TYPE_MAX_VALUE (index
)
14711 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
14712 || !TYPE_MIN_VALUE (index
)
14713 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
14717 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
14718 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
14720 /* There must be no padding. */
14721 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
14722 count
* GET_MODE_BITSIZE (*modep
)))
14734 /* Can't handle incomplete types nor sizes that are not fixed. */
14736 if (!COMPLETE_TYPE_P (type
)
14737 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14740 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
14742 if (TREE_CODE (field
) != FIELD_DECL
)
14745 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
14748 count
+= sub_count
;
14751 /* There must be no padding. */
14752 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
14753 count
* GET_MODE_BITSIZE (*modep
)))
14760 case QUAL_UNION_TYPE
:
14762 /* These aren't very interesting except in a degenerate case. */
14767 /* Can't handle incomplete types nor sizes that are not fixed. */
14769 if (!COMPLETE_TYPE_P (type
)
14770 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14773 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
14775 if (TREE_CODE (field
) != FIELD_DECL
)
14778 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
14781 count
= count
> sub_count
? count
: sub_count
;
14784 /* There must be no padding. */
14785 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
14786 count
* GET_MODE_BITSIZE (*modep
)))
14799 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14800 type as described in AAPCS64 \S 4.1.2.
14802 See the comment above aarch64_composite_type_p for the notes on MODE. */
14805 aarch64_short_vector_p (const_tree type
,
14808 poly_int64 size
= -1;
14810 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
14811 size
= int_size_in_bytes (type
);
14812 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
14813 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
14814 size
= GET_MODE_SIZE (mode
);
14816 return known_eq (size
, 8) || known_eq (size
, 16);
14819 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14820 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14821 array types. The C99 floating-point complex types are also considered
14822 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14823 types, which are GCC extensions and out of the scope of AAPCS64, are
14824 treated as composite types here as well.
14826 Note that MODE itself is not sufficient in determining whether a type
14827 is such a composite type or not. This is because
14828 stor-layout.c:compute_record_mode may have already changed the MODE
14829 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14830 structure with only one field may have its MODE set to the mode of the
14831 field. Also an integer mode whose size matches the size of the
14832 RECORD_TYPE type may be used to substitute the original mode
14833 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14834 solely relied on. */
14837 aarch64_composite_type_p (const_tree type
,
14840 if (aarch64_short_vector_p (type
, mode
))
14843 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
14846 if (mode
== BLKmode
14847 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
14848 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
14854 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14855 shall be passed or returned in simd/fp register(s) (providing these
14856 parameter passing registers are available).
14858 Upon successful return, *COUNT returns the number of needed registers,
14859 *BASE_MODE returns the mode of the individual register and when IS_HA
14860 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14861 floating-point aggregate or a homogeneous short-vector aggregate. */
14864 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
14866 machine_mode
*base_mode
,
14870 machine_mode new_mode
= VOIDmode
;
14871 bool composite_p
= aarch64_composite_type_p (type
, mode
);
14873 if (is_ha
!= NULL
) *is_ha
= false;
14875 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14876 || aarch64_short_vector_p (type
, mode
))
14881 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
14883 if (is_ha
!= NULL
) *is_ha
= true;
14885 new_mode
= GET_MODE_INNER (mode
);
14887 else if (type
&& composite_p
)
14889 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
14891 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
14893 if (is_ha
!= NULL
) *is_ha
= true;
14902 *base_mode
= new_mode
;
14906 /* Implement TARGET_STRUCT_VALUE_RTX. */
14909 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
14910 int incoming ATTRIBUTE_UNUSED
)
14912 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
14915 /* Implements target hook vector_mode_supported_p.  MODE is reported
   as supported when aarch64_classify_vector_mode returns a nonzero
   set of flags that does not include VEC_STRUCT. */
14917 aarch64_vector_mode_supported_p (machine_mode mode
)
14919 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14920 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
14923 /* Return the full-width SVE vector mode for element mode MODE, if one exists. */
14926 aarch64_full_sve_mode (scalar_mode mode
)
14943 return VNx16QImode
;
14945 return opt_machine_mode ();
14949 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE, if it exists. */
14952 aarch64_vq_mode (scalar_mode mode
)
14971 return opt_machine_mode ();
14975 /* Return appropriate SIMD container
14976 for MODE within a vector of WIDTH bits. */
14977 static machine_mode
14978 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
14980 if (TARGET_SVE
&& known_eq (width
, BITS_PER_SVE_VECTOR
))
14981 return aarch64_full_sve_mode (mode
).else_mode (word_mode
);
14983 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
14986 if (known_eq (width
, 128))
14987 return aarch64_vq_mode (mode
).else_mode (word_mode
);
15008 /* Return the preferred SIMD mode for element mode MODE.  This is the
   container chosen by aarch64_simd_container_mode for a vector of
   BITS_PER_SVE_VECTOR bits when TARGET_SVE is set, and for a 128-bit
   vector otherwise. */
15009 static machine_mode
15010 aarch64_preferred_simd_mode (scalar_mode mode
)
15012 poly_int64 bits
= TARGET_SVE
? BITS_PER_SVE_VECTOR
: 128;
15013 return aarch64_simd_container_mode (mode
, bits
);
15016 /* Return a list of possible vector sizes for the vectorizer
15017 to iterate over. */
15019 aarch64_autovectorize_vector_sizes (vector_sizes
*sizes
, bool)
15022 sizes
->safe_push (BYTES_PER_SVE_VECTOR
);
15023 sizes
->safe_push (16);
15024 sizes
->safe_push (8);
15027 /* Implement TARGET_MANGLE_TYPE. */
15029 static const char *
15030 aarch64_mangle_type (const_tree type
)
15032 /* The AArch64 ABI documents say that "__va_list" has to be
15033 mangled as if it is in the "std" namespace. */
15034 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
15035 return "St9__va_list";
15037 /* Half-precision float. */
15038 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
15041 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
15043 if (TYPE_NAME (type
) != NULL
)
15044 return aarch64_mangle_builtin_type (type
);
15046 /* Use the default mangling. */
15050 /* Find the first rtx_insn before insn that will generate an assembly
15054 aarch64_prev_real_insn (rtx_insn
*insn
)
15061 insn
= prev_real_insn (insn
);
15063 while (insn
&& recog_memoized (insn
) < 0);
15069 is_madd_op (enum attr_type t1
)
15072 /* A number of these may be AArch32 only. */
15073 enum attr_type mlatypes
[] = {
15074 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
15075 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
15076 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
15079 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
15081 if (t1
== mlatypes
[i
])
15088 /* Check if there is a register dependency between a load and the insn
15089 for which we hold recog_data. */
15092 dep_between_memop_and_curr (rtx memop
)
15097 gcc_assert (GET_CODE (memop
) == SET
);
15099 if (!REG_P (SET_DEST (memop
)))
15102 load_reg
= SET_DEST (memop
);
15103 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
15105 rtx operand
= recog_data
.operand
[opno
];
15106 if (REG_P (operand
)
15107 && reg_overlap_mentioned_p (load_reg
, operand
))
15115 /* When working around the Cortex-A53 erratum 835769,
15116 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15117 instruction and has a preceding memory instruction such that a NOP
15118 should be inserted between them. */
15121 aarch64_madd_needs_nop (rtx_insn
* insn
)
15123 enum attr_type attr_type
;
15127 if (!TARGET_FIX_ERR_A53_835769
)
15130 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
15133 attr_type
= get_attr_type (insn
);
15134 if (!is_madd_op (attr_type
))
15137 prev
= aarch64_prev_real_insn (insn
);
15138 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15139 Restore recog state to INSN to avoid state corruption. */
15140 extract_constrain_insn_cached (insn
);
15142 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
15145 body
= single_set (prev
);
15147 /* If the previous insn is a memory op and there is no dependency between
15148 it and the DImode madd, emit a NOP between them. If body is NULL then we
15149 have a complex memory operation, probably a load/store pair.
15150 Be conservative for now and emit a NOP. */
15151 if (GET_MODE (recog_data
.operand
[0]) == DImode
15152 && (!body
|| !dep_between_memop_and_curr (body
)))
15160 /* Implement FINAL_PRESCAN_INSN. */
15163 aarch64_final_prescan_insn (rtx_insn
*insn
)
15165 if (aarch64_madd_needs_nop (insn
))
15166 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
15170 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15174 aarch64_sve_index_immediate_p (rtx base_or_step
)
15176 return (CONST_INT_P (base_or_step
)
15177 && IN_RANGE (INTVAL (base_or_step
), -16, 15));
15180 /* Return true if X is a valid immediate for the SVE ADD and SUB
15181 instructions. Negate X first if NEGATE_P is true. */
15184 aarch64_sve_arith_immediate_p (rtx x
, bool negate_p
)
15188 if (!const_vec_duplicate_p (x
, &elt
)
15189 || !CONST_INT_P (elt
))
15192 HOST_WIDE_INT val
= INTVAL (elt
);
15195 val
&= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x
)));
15198 return IN_RANGE (val
, 0, 0xff);
15199 return IN_RANGE (val
, 0, 0xff00);
15202 /* Return true if X is a valid immediate operand for an SVE logical
15203 instruction such as AND. */
15206 aarch64_sve_bitmask_immediate_p (rtx x
)
15210 return (const_vec_duplicate_p (x
, &elt
)
15211 && CONST_INT_P (elt
)
15212 && aarch64_bitmask_imm (INTVAL (elt
),
15213 GET_MODE_INNER (GET_MODE (x
))));
15216 /* Return true if X is a valid immediate for the SVE DUP and CPY
15220 aarch64_sve_dup_immediate_p (rtx x
)
15222 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
15223 if (!CONST_INT_P (x
))
15226 HOST_WIDE_INT val
= INTVAL (x
);
15228 return IN_RANGE (val
, -0x80, 0x7f);
15229 return IN_RANGE (val
, -0x8000, 0x7f00);
15232 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15233 SIGNED_P says whether the operand is signed rather than unsigned. */
15236 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
15240 return (const_vec_duplicate_p (x
, &elt
)
15241 && CONST_INT_P (elt
)
15243 ? IN_RANGE (INTVAL (elt
), -16, 15)
15244 : IN_RANGE (INTVAL (elt
), 0, 127)));
15247 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15248 instruction. Negate X first if NEGATE_P is true. */
15251 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
15256 if (!const_vec_duplicate_p (x
, &elt
)
15257 || GET_CODE (elt
) != CONST_DOUBLE
)
15260 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
15263 r
= real_value_negate (&r
);
15265 if (real_equal (&r
, &dconst1
))
15267 if (real_equal (&r
, &dconsthalf
))
15272 /* Return true if X is a valid immediate operand for an SVE FMUL
15276 aarch64_sve_float_mul_immediate_p (rtx x
)
15280 return (const_vec_duplicate_p (x
, &elt
)
15281 && GET_CODE (elt
) == CONST_DOUBLE
15282 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
)
15283 || real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconst2
)));
15286 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15287 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15288 is nonnull, use it to describe valid immediates. */
15290 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
15291 simd_immediate_info
*info
,
15292 enum simd_immediate_check which
,
15293 simd_immediate_info::insn_type insn
)
15295 /* Try a 4-byte immediate with LSL. */
15296 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
15297 if ((val32
& (0xff << shift
)) == val32
)
15300 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
15301 simd_immediate_info::LSL
, shift
);
15305 /* Try a 2-byte immediate with LSL. */
15306 unsigned int imm16
= val32
& 0xffff;
15307 if (imm16
== (val32
>> 16))
15308 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
15309 if ((imm16
& (0xff << shift
)) == imm16
)
15312 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
15313 simd_immediate_info::LSL
, shift
);
15317 /* Try a 4-byte immediate with MSL, except for cases that MVN
15319 if (which
== AARCH64_CHECK_MOV
)
15320 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
15322 unsigned int low
= (1 << shift
) - 1;
15323 if (((val32
& (0xff << shift
)) | low
) == val32
)
15326 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
15327 simd_immediate_info::MSL
, shift
);
15335 /* Return true if replicating VAL64 is a valid immediate for the
15336 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15337 use it to describe valid immediates. */
15339 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
15340 simd_immediate_info
*info
,
15341 enum simd_immediate_check which
)
15343 unsigned int val32
= val64
& 0xffffffff;
15344 unsigned int val16
= val64
& 0xffff;
15345 unsigned int val8
= val64
& 0xff;
15347 if (val32
== (val64
>> 32))
15349 if ((which
& AARCH64_CHECK_ORR
) != 0
15350 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
15351 simd_immediate_info::MOV
))
15354 if ((which
& AARCH64_CHECK_BIC
) != 0
15355 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
15356 simd_immediate_info::MVN
))
15359 /* Try using a replicated byte. */
15360 if (which
== AARCH64_CHECK_MOV
15361 && val16
== (val32
>> 16)
15362 && val8
== (val16
>> 8))
15365 *info
= simd_immediate_info (QImode
, val8
);
15370 /* Try using a bit-to-bytemask. */
15371 if (which
== AARCH64_CHECK_MOV
)
15374 for (i
= 0; i
< 64; i
+= 8)
15376 unsigned char byte
= (val64
>> i
) & 0xff;
15377 if (byte
!= 0 && byte
!= 0xff)
15383 *info
= simd_immediate_info (DImode
, val64
);
15390 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15391 instruction. If INFO is nonnull, use it to describe valid immediates. */
15394 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
15395 simd_immediate_info
*info
)
15397 scalar_int_mode mode
= DImode
;
15398 unsigned int val32
= val64
& 0xffffffff;
15399 if (val32
== (val64
>> 32))
15402 unsigned int val16
= val32
& 0xffff;
15403 if (val16
== (val32
>> 16))
15406 unsigned int val8
= val16
& 0xff;
15407 if (val8
== (val16
>> 8))
15411 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
15412 if (IN_RANGE (val
, -0x80, 0x7f))
15414 /* DUP with no shift. */
15416 *info
= simd_immediate_info (mode
, val
);
15419 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
15421 /* DUP with LSL #8. */
15423 *info
= simd_immediate_info (mode
, val
);
15426 if (aarch64_bitmask_imm (val64
, mode
))
15430 *info
= simd_immediate_info (mode
, val
);
15436 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15437 it to describe valid immediates. */
15440 aarch64_sve_pred_valid_immediate (rtx x
, simd_immediate_info
*info
)
15442 if (x
== CONST0_RTX (GET_MODE (x
)))
15445 *info
= simd_immediate_info (DImode
, 0);
15449 /* Analyze the value as a VNx16BImode. This should be relatively
15450 efficient, since rtx_vector_builder has enough built-in capacity
15451 to store all VLA predicate constants without needing the heap. */
15452 rtx_vector_builder builder
;
15453 if (!aarch64_get_sve_pred_bits (builder
, x
))
15456 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
15457 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
15459 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
15460 aarch64_svpattern pattern
= aarch64_svpattern_for_vl (mode
, vl
);
15461 if (pattern
!= AARCH64_NUM_SVPATTERNS
)
15465 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
15466 *info
= simd_immediate_info (int_mode
, pattern
);
15474 /* Return true if OP is a valid SIMD immediate for the operation
15475 described by WHICH. If INFO is nonnull, use it to describe valid
15478 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
15479 enum simd_immediate_check which
)
15481 machine_mode mode
= GET_MODE (op
);
15482 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
15483 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
15486 if (vec_flags
& VEC_SVE_PRED
)
15487 return aarch64_sve_pred_valid_immediate (op
, info
);
15489 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
15491 unsigned int n_elts
;
15492 if (GET_CODE (op
) == CONST_VECTOR
15493 && CONST_VECTOR_DUPLICATE_P (op
))
15494 n_elts
= CONST_VECTOR_NPATTERNS (op
);
15495 else if ((vec_flags
& VEC_SVE_DATA
)
15496 && const_vec_series_p (op
, &base
, &step
))
15498 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
15499 if (!aarch64_sve_index_immediate_p (base
)
15500 || !aarch64_sve_index_immediate_p (step
))
15504 *info
= simd_immediate_info (elt_mode
, base
, step
);
15507 else if (GET_CODE (op
) == CONST_VECTOR
15508 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
15509 /* N_ELTS set above. */;
15513 scalar_float_mode elt_float_mode
;
15515 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
15517 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
15518 if (aarch64_float_const_zero_rtx_p (elt
)
15519 || aarch64_float_const_representable_p (elt
))
15522 *info
= simd_immediate_info (elt_float_mode
, elt
);
15527 unsigned int elt_size
= GET_MODE_SIZE (elt_mode
);
15531 scalar_int_mode elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
15533 /* Expand the vector constant out into a byte vector, with the least
15534 significant byte of the register first. */
15535 auto_vec
<unsigned char, 16> bytes
;
15536 bytes
.reserve (n_elts
* elt_size
);
15537 for (unsigned int i
= 0; i
< n_elts
; i
++)
15539 /* The vector is provided in gcc endian-neutral fashion.
15540 For aarch64_be Advanced SIMD, it must be laid out in the vector
15541 register in reverse order. */
15542 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
15543 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
15545 if (elt_mode
!= elt_int_mode
)
15546 elt
= gen_lowpart (elt_int_mode
, elt
);
15548 if (!CONST_INT_P (elt
))
15551 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
15552 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
15554 bytes
.quick_push (elt_val
& 0xff);
15555 elt_val
>>= BITS_PER_UNIT
;
15559 /* The immediate must repeat every eight bytes. */
15560 unsigned int nbytes
= bytes
.length ();
15561 for (unsigned i
= 8; i
< nbytes
; ++i
)
15562 if (bytes
[i
] != bytes
[i
- 8])
15565 /* Get the repeating 8-byte value as an integer. No endian correction
15566 is needed here because bytes is already in lsb-first order. */
15567 unsigned HOST_WIDE_INT val64
= 0;
15568 for (unsigned int i
= 0; i
< 8; i
++)
15569 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
15570 << (i
* BITS_PER_UNIT
));
15572 if (vec_flags
& VEC_SVE_DATA
)
15573 return aarch64_sve_valid_immediate (val64
, info
);
15575 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
15578 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15579 has a step in the range of INDEX. Return the index expression if so,
15580 otherwise return null. */
15582 aarch64_check_zero_based_sve_index_immediate (rtx x
)
15585 if (const_vec_series_p (x
, &base
, &step
)
15586 && base
== const0_rtx
15587 && aarch64_sve_index_immediate_p (step
))
15592 /* Check if immediate shift constants are within range. */
15594 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
15596 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
15598 return aarch64_const_vec_all_same_in_range_p (x
, 0, bit_width
- 1);
15600 return aarch64_const_vec_all_same_in_range_p (x
, 1, bit_width
);
15603 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15604 operation of width WIDTH at bit position POS. */
/* Both WIDTH and POS are asserted to be CONST_INTs.  The mask is formed as
   WIDTH low-order one bits, which are then shifted left by POS before being
   wrapped with GEN_INT.
   NOTE(review): the `1 << UINTVAL (width)` shift is only well defined while
   WIDTH is smaller than the bit width of unsigned HOST_WIDE_INT; presumably
   callers never request a full-width extract -- confirm.  */
15607 aarch64_mask_from_zextract_ops (rtx width
, rtx pos
)
15609 gcc_assert (CONST_INT_P (width
));
15610 gcc_assert (CONST_INT_P (pos
));
15612 unsigned HOST_WIDE_INT mask
15613 = ((unsigned HOST_WIDE_INT
) 1 << UINTVAL (width
)) - 1;
15614 return GEN_INT (mask
<< UINTVAL (pos
));
15618 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
15620 if (GET_CODE (x
) == HIGH
15621 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
15624 if (CONST_INT_P (x
))
15627 if (VECTOR_MODE_P (GET_MODE (x
)))
15629 /* Require predicate constants to be VNx16BI before RA, so that we
15630 force everything to have a canonical form. */
15631 if (!lra_in_progress
15632 && !reload_completed
15633 && GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_BOOL
15634 && GET_MODE (x
) != VNx16BImode
)
15637 return aarch64_simd_valid_immediate (x
, NULL
);
15640 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
15643 if (aarch64_sve_cnt_immediate_p (x
))
15646 return aarch64_classify_symbolic_expression (x
)
15647 == SYMBOL_TINY_ABSOLUTE
;
15650 /* Return a const_int vector of VAL. */
15652 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
15654 rtx c
= gen_int_mode (val
, GET_MODE_INNER (mode
));
15655 return gen_const_vec_duplicate (mode
, c
);
15658 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15661 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
15663 machine_mode vmode
;
15665 vmode
= aarch64_simd_container_mode (mode
, 64);
15666 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
15667 return aarch64_simd_valid_immediate (op_v
, NULL
);
15670 /* Construct and return a PARALLEL RTX vector with elements numbering the
15671 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15672 the vector - from the perspective of the architecture. This does not
15673 line up with GCC's perspective on lane numbers, so we end up with
15674 different masks depending on our target endian-ness. The diagram
15675 below may help. We must draw the distinction when building masks
15676 which select one half of the vector. An instruction selecting
15677 architectural low-lanes for a big-endian target, must be described using
15678 a mask selecting GCC high-lanes.
15680 Big-Endian Little-Endian
15682 GCC 0 1 2 3 3 2 1 0
15683 | x | x | x | x | | x | x | x | x |
15684 Architecture 3 2 1 0 3 2 1 0
15686 Low Mask: { 2, 3 } { 0, 1 }
15687 High Mask: { 0, 1 } { 2, 3 }
15689 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15692 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
15694 rtvec v
= rtvec_alloc (nunits
/ 2);
15695 int high_base
= nunits
/ 2;
15701 if (BYTES_BIG_ENDIAN
)
15702 base
= high
? low_base
: high_base
;
15704 base
= high
? high_base
: low_base
;
15706 for (i
= 0; i
< nunits
/ 2; i
++)
15707 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
15709 t1
= gen_rtx_PARALLEL (mode
, v
);
15713 /* Check OP for validity as a PARALLEL RTX vector with elements
15714 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15715 from the perspective of the architecture. See the diagram above
15716 aarch64_simd_vect_par_cnst_half for more details. */
15719 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
15723 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
15726 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
15727 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
15728 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
15731 if (count_op
!= count_ideal
)
15734 for (i
= 0; i
< count_ideal
; i
++)
15736 rtx elt_op
= XVECEXP (op
, 0, i
);
15737 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
15739 if (!CONST_INT_P (elt_op
)
15740 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
15746 /* Return a PARALLEL containing NELTS elements, with element I equal
15747 to BASE + I * STEP. */
/* The result is a VOIDmode PARALLEL whose elements are created with
   gen_int_mode (..., DImode).
   NOTE(review): the loop index is `unsigned int`, so `base + i * step` is
   evaluated in unsigned int arithmetic.  That is harmless while every
   element value is nonnegative; a mathematically negative element would be
   passed on as a large unsigned value instead -- presumably callers only
   generate nonnegative lane numbers; confirm.  */
15750 aarch64_gen_stepped_int_parallel (unsigned int nelts
, int base
, int step
)
15752 rtvec vec
= rtvec_alloc (nelts
);
15753 for (unsigned int i
= 0; i
< nelts
; ++i
)
15754 RTVEC_ELT (vec
, i
) = gen_int_mode (base
+ i
* step
, DImode
);
15755 return gen_rtx_PARALLEL (VOIDmode
, vec
);
15758 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15759 series with step STEP. */
/* Element 0 supplies the base of the series; each later element I is then
   required to be a CONST_INT equal to BASE + I * STEP.  The comparison is
   done on unsigned HOST_WIDE_INT values, with `i * step` computed as an int
   and converted for the addition.
   NOTE(review): element 0 is read before any length check that is visible
   here, so this assumes the PARALLEL is non-empty -- confirm.  */
15762 aarch64_stepped_int_parallel_p (rtx op
, int step
)
15764 if (GET_CODE (op
) != PARALLEL
|| !CONST_INT_P (XVECEXP (op
, 0, 0)))
15767 unsigned HOST_WIDE_INT base
= UINTVAL (XVECEXP (op
, 0, 0));
15768 for (int i
= 1; i
< XVECLEN (op
, 0); ++i
)
15769 if (!CONST_INT_P (XVECEXP (op
, 0, i
))
15770 || UINTVAL (XVECEXP (op
, 0, i
)) != base
+ i
* step
)
15776 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15777 HIGH (exclusive). */
/* OPERAND is asserted to be a CONST_INT and its value is the lane number
   that gets checked.  Because HIGH is exclusive, both diagnostics below
   print `high - 1` so that the reported range is inclusive at both ends.
   Two forms of the error exist: one passes EXP for the %K directive and the
   other omits it; the condition selecting between them is not visible in
   this excerpt.  */
15779 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
15782 HOST_WIDE_INT lane
;
15783 gcc_assert (CONST_INT_P (operand
));
15784 lane
= INTVAL (operand
);
15786 if (lane
< low
|| lane
>= high
)
15789 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
15791 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
15795 /* Perform endian correction on lane number N, which indexes a vector
15796 of mode MODE, and return the result as an SImode rtx. */
15799 aarch64_endian_lane_rtx (machine_mode mode
, unsigned int n
)
15801 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode
), n
), SImode
);
15804 /* Return TRUE if OP is a valid vector addressing mode. */
/* OP qualifies only if it is a MEM whose address (XEXP (op, 0)) is either a
   POST_INC expression or a plain register; every other address form is
   rejected.  */
15807 aarch64_simd_mem_operand_p (rtx op
)
15809 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
15810 || REG_P (XEXP (op
, 0)));
15813 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15816 aarch64_sve_ld1r_operand_p (rtx op
)
15818 struct aarch64_address_info addr
;
15822 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
15823 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
15824 && addr
.type
== ADDRESS_REG_IMM
15825 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
15828 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15830 aarch64_sve_ld1rq_operand_p (rtx op
)
15832 struct aarch64_address_info addr
;
15833 scalar_mode elem_mode
= GET_MODE_INNER (GET_MODE (op
));
15835 || !aarch64_classify_address (&addr
, XEXP (op
, 0), elem_mode
, false))
15838 if (addr
.type
== ADDRESS_REG_IMM
)
15839 return offset_4bit_signed_scaled_p (TImode
, addr
.const_offset
);
15841 if (addr
.type
== ADDRESS_REG_REG
)
15842 return (1U << addr
.shift
) == GET_MODE_SIZE (elem_mode
);
15847 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15848 The conditions for STR are the same. */
15850 aarch64_sve_ldr_operand_p (rtx op
)
15852 struct aarch64_address_info addr
;
15855 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
15856 false, ADDR_QUERY_ANY
)
15857 && addr
.type
== ADDRESS_REG_IMM
);
15860 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15861 We need to be able to access the individual pieces, so the range
15862 is different from LD[234] and ST[234]. */
15864 aarch64_sve_struct_memory_operand_p (rtx op
)
15869 machine_mode mode
= GET_MODE (op
);
15870 struct aarch64_address_info addr
;
15871 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
15873 || addr
.type
!= ADDRESS_REG_IMM
)
15876 poly_int64 first
= addr
.const_offset
;
15877 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
15878 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
15879 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
15882 /* Emit a register copy from operand to operand, taking care not to
15883 early-clobber source registers in the process.
15885 COUNT is the number of components into which the copy needs to be
15888 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
15889 unsigned int count
)
15892 int rdest
= REGNO (operands
[0]);
15893 int rsrc
= REGNO (operands
[1]);
15895 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
15897 for (i
= 0; i
< count
; i
++)
15898 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
15899 gen_rtx_REG (mode
, rsrc
+ i
));
15901 for (i
= 0; i
< count
; i
++)
15902 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
15903 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
15906 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15907 one of VSTRUCT modes: OI, CI, or XI. */
/* The result is the number of UNITS_PER_VREG-sized pieces in MODE multiplied
   by 4.  The mode size is taken with to_constant (), so MODE must have a
   compile-time constant size.
   NOTE(review): the factor 4 is presumably the byte length of one
   instruction per register -- confirm.  */
15909 aarch64_simd_attr_length_rglist (machine_mode mode
)
15911 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15912 return (GET_MODE_SIZE (mode
).to_constant () / UNITS_PER_VREG
) * 4;
15915 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15916 alignment of a vector to 128 bits. SVE predicates have an alignment of
15918 static HOST_WIDE_INT
15919 aarch64_simd_vector_alignment (const_tree type
)
15921 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15922 be set for non-predicate vectors of booleans. Modes are the most
15923 direct way we have of identifying real SVE predicate types. */
15924 if (GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
)
15926 if (TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
15928 return wi::umin (wi::to_wide (TYPE_SIZE (type
)), 128).to_uhwi ();
15931 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15933 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
15935 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
15937 /* If the length of the vector is fixed, try to align to that length,
15938 otherwise don't try to align at all. */
15939 HOST_WIDE_INT result
;
15940 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
15941 result
= TYPE_ALIGN (TREE_TYPE (type
));
15944 return TYPE_ALIGN (type
);
15947 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15949 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
15954 /* For fixed-length vectors, check that the vectorizer will aim for
15955 full-vector alignment. This isn't true for generic GCC vectors
15956 that are wider than the ABI maximum of 128 bits. */
15957 poly_uint64 preferred_alignment
=
15958 aarch64_vectorize_preferred_vector_alignment (type
);
15959 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
15960 && maybe_ne (wi::to_widest (TYPE_SIZE (type
)),
15961 preferred_alignment
))
15964 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15968 /* Return true if the vector misalignment factor is supported by the
15971 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
15972 const_tree type
, int misalignment
,
15975 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
15977 /* Return if movmisalign pattern is not supported for this mode. */
15978 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
15981 /* Misalignment factor is unknown at compile time. */
15982 if (misalignment
== -1)
15985 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
15989 /* If VALS is a vector constant that can be loaded into a register
15990 using DUP, generate instructions to do so and return an RTX to
15991 assign to the register. Otherwise return NULL_RTX. */
15993 aarch64_simd_dup_constant (rtx vals
)
15995 machine_mode mode
= GET_MODE (vals
);
15996 machine_mode inner_mode
= GET_MODE_INNER (mode
);
15999 if (!const_vec_duplicate_p (vals
, &x
))
16002 /* We can load this constant by using DUP and a constant in a
16003 single ARM register. This will be cheaper than a vector
16005 x
= copy_to_mode_reg (inner_mode
, x
);
16006 return gen_vec_duplicate (mode
, x
);
16010 /* Generate code to load VALS, which is a PARALLEL containing only
16011 constants (for vec_init) or CONST_VECTOR, efficiently into a
16012 register. Returns an RTX to copy into the register, or NULL_RTX
16013 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
16015 aarch64_simd_make_constant (rtx vals
)
16017 machine_mode mode
= GET_MODE (vals
);
16019 rtx const_vec
= NULL_RTX
;
16023 if (GET_CODE (vals
) == CONST_VECTOR
)
16025 else if (GET_CODE (vals
) == PARALLEL
)
16027 /* A CONST_VECTOR must contain only CONST_INTs and
16028 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
16029 Only store valid constants in a CONST_VECTOR. */
16030 int n_elts
= XVECLEN (vals
, 0);
16031 for (i
= 0; i
< n_elts
; ++i
)
16033 rtx x
= XVECEXP (vals
, 0, i
);
16034 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
16037 if (n_const
== n_elts
)
16038 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
16041 gcc_unreachable ();
16043 if (const_vec
!= NULL_RTX
16044 && aarch64_simd_valid_immediate (const_vec
, NULL
))
16045 /* Load using MOVI/MVNI. */
16047 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
16048 /* Loaded using DUP. */
16050 else if (const_vec
!= NULL_RTX
)
16051 /* Load from constant pool. We cannot take advantage of single-cycle
16052 LD1 because we need a PC-relative addressing mode. */
16055 /* A PARALLEL containing something not valid inside CONST_VECTOR.
16056 We cannot construct an initializer. */
16060 /* Expand a vector initialisation sequence, such that TARGET is
16061 initialised to contain VALS. */
16064 aarch64_expand_vector_init (rtx target
, rtx vals
)
16066 machine_mode mode
= GET_MODE (target
);
16067 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
16068 /* The number of vector elements. */
16069 int n_elts
= XVECLEN (vals
, 0);
16070 /* The number of vector elements which are not constant. */
16072 rtx any_const
= NULL_RTX
;
16073 /* The first element of vals. */
16074 rtx v0
= XVECEXP (vals
, 0, 0);
16075 bool all_same
= true;
16077 /* This is a special vec_init<M><N> where N is not an element mode but a
16078 vector mode with half the elements of M. We expect to find two entries
16079 of mode N in VALS and we must put their concatenation into TARGET. */
16080 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
16082 gcc_assert (known_eq (GET_MODE_SIZE (mode
),
16083 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals
, 0, 0)))));
16084 rtx lo
= XVECEXP (vals
, 0, 0);
16085 rtx hi
= XVECEXP (vals
, 0, 1);
16086 machine_mode narrow_mode
= GET_MODE (lo
);
16087 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
);
16088 gcc_assert (narrow_mode
== GET_MODE (hi
));
16090 /* When we want to concatenate a half-width vector with zeroes we can
16091 use the aarch64_combinez[_be] patterns. Just make sure that the
16092 zeroes are in the right half. */
16093 if (BYTES_BIG_ENDIAN
16094 && aarch64_simd_imm_zero (lo
, narrow_mode
)
16095 && general_operand (hi
, narrow_mode
))
16096 emit_insn (gen_aarch64_combinez_be (narrow_mode
, target
, hi
, lo
));
16097 else if (!BYTES_BIG_ENDIAN
16098 && aarch64_simd_imm_zero (hi
, narrow_mode
)
16099 && general_operand (lo
, narrow_mode
))
16100 emit_insn (gen_aarch64_combinez (narrow_mode
, target
, lo
, hi
));
16103 /* Else create the two half-width registers and combine them. */
16105 lo
= force_reg (GET_MODE (lo
), lo
);
16107 hi
= force_reg (GET_MODE (hi
), hi
);
16109 if (BYTES_BIG_ENDIAN
)
16110 std::swap (lo
, hi
);
16111 emit_insn (gen_aarch64_simd_combine (narrow_mode
, target
, lo
, hi
));
16116 /* Count the number of variable elements to initialise. */
16117 for (int i
= 0; i
< n_elts
; ++i
)
16119 rtx x
= XVECEXP (vals
, 0, i
);
16120 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
16125 all_same
&= rtx_equal_p (x
, v0
);
16128 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16129 how best to handle this. */
16132 rtx constant
= aarch64_simd_make_constant (vals
);
16133 if (constant
!= NULL_RTX
)
16135 emit_move_insn (target
, constant
);
16140 /* Splat a single non-constant element if we can. */
16143 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
16144 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
16148 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
16149 gcc_assert (icode
!= CODE_FOR_nothing
);
16151 /* If there are only variable elements, try to optimize
16152 the insertion using dup for the most common element
16153 followed by insertions. */
16155 /* The algorithm will fill matches[*][0] with the earliest matching element,
16156 and matches[X][1] with the count of duplicate elements (if X is the
16157 earliest element which has duplicates). */
16159 if (n_var
== n_elts
&& n_elts
<= 16)
16161 int matches
[16][2] = {0};
16162 for (int i
= 0; i
< n_elts
; i
++)
16164 for (int j
= 0; j
<= i
; j
++)
16166 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
16174 int maxelement
= 0;
16176 for (int i
= 0; i
< n_elts
; i
++)
16177 if (matches
[i
][1] > maxv
)
16180 maxv
= matches
[i
][1];
16183 /* Create a duplicate of the most common element, unless all elements
16184 are equally useless to us, in which case just immediately set the
16185 vector register using the first element. */
16189 /* For vectors of two 64-bit elements, we can do even better. */
16191 && (inner_mode
== E_DImode
16192 || inner_mode
== E_DFmode
))
16195 rtx x0
= XVECEXP (vals
, 0, 0);
16196 rtx x1
= XVECEXP (vals
, 0, 1);
16197 /* Combine can pick up this case, but handling it directly
16198 here leaves clearer RTL.
16200 This is load_pair_lanes<mode>, and also gives us a clean-up
16201 for store_pair_lanes<mode>. */
16202 if (memory_operand (x0
, inner_mode
)
16203 && memory_operand (x1
, inner_mode
)
16204 && !STRICT_ALIGNMENT
16205 && rtx_equal_p (XEXP (x1
, 0),
16206 plus_constant (Pmode
,
16208 GET_MODE_SIZE (inner_mode
))))
16211 if (inner_mode
== DFmode
)
16212 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
16214 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
16219 /* The subreg-move sequence below will move into lane zero of the
16220 vector register. For big-endian we want that position to hold
16221 the last element of VALS. */
16222 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
16223 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
16224 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
16228 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
16229 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
16232 /* Insert the rest. */
16233 for (int i
= 0; i
< n_elts
; i
++)
16235 rtx x
= XVECEXP (vals
, 0, i
);
16236 if (matches
[i
][0] == maxelement
)
16238 x
= copy_to_mode_reg (inner_mode
, x
);
16239 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
16244 /* Initialise a vector which is part-variable. We want to first try
16245 to build those lanes which are constant in the most efficient way we
16247 if (n_var
!= n_elts
)
16249 rtx copy
= copy_rtx (vals
);
16251 /* Load constant part of vector. We really don't care what goes into the
16252 parts we will overwrite, but we're more likely to be able to load the
16253 constant efficiently if it has fewer, larger, repeating parts
16254 (see aarch64_simd_valid_immediate). */
16255 for (int i
= 0; i
< n_elts
; i
++)
16257 rtx x
= XVECEXP (vals
, 0, i
);
16258 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
16260 rtx subst
= any_const
;
16261 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
16263 /* Look in the copied vector, as more elements are const. */
16264 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
16265 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
16271 XVECEXP (copy
, 0, i
) = subst
;
16273 aarch64_expand_vector_init (target
, copy
);
16276 /* Insert the variable lanes directly. */
16277 for (int i
= 0; i
< n_elts
; i
++)
16279 rtx x
= XVECEXP (vals
, 0, i
);
16280 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
16282 x
= copy_to_mode_reg (inner_mode
, x
);
16283 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
16287 /* Emit RTL corresponding to:
16288 insr TARGET, ELEM.

   TARGET is used both as the destination and as the vector input of the
   insertion.  ELEM is first forced into a register of TARGET's element
   mode.  The vec_shl_insert pattern for TARGET's mode must exist.  */
16291 emit_insr (rtx target
, rtx elem
)
16293 machine_mode mode
= GET_MODE (target
);
16294 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
16295 elem
= force_reg (elem_mode
, elem
);
16297 insn_code icode
= optab_handler (vec_shl_insert_optab
, mode
);
16298 gcc_assert (icode
!= CODE_FOR_nothing
);
/* Operand order is (destination, source vector, new element).  */
16299 emit_insn (GEN_FCN (icode
) (target
, target
, elem
));
16302 /* Subroutine of aarch64_sve_expand_vector_init for handling
16303 trailing constants.
16304 This function works as follows:
16305 (a) Create a new vector consisting of trailing constants.
16306 (b) Initialize TARGET with the constant vector using emit_move_insn.
16307 (c) Insert remaining elements in TARGET using insr.
16308 NELTS is the total number of elements in the original vector,
16309 while NELTS_REQD is the number of leading elements of BUILDER that
   this call actually works with.
16312 ??? The heuristic used is to do above only if number of constants
16313 is at least half the total number of elements. May need fine tuning. */
16316 aarch64_sve_expand_vector_init_handle_trailing_constants
16317 (rtx target
, const rtx_vector_builder
&builder
, int nelts
, int nelts_reqd
)
16319 machine_mode mode
= GET_MODE (target
);
16320 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
16321 int n_trailing_constants
= 0;
/* Count the legitimate constants at the end of the first NELTS_REQD
   elements, scanning from element NELTS_REQD - 1.  */
16323 for (int i
= nelts_reqd
- 1;
16324 i
>= 0 && aarch64_legitimate_constant_p (elem_mode
, builder
.elt (i
));
16326 n_trailing_constants
++;
16328 if (n_trailing_constants
>= nelts_reqd
/ 2)
/* Build an NELTS-element vector whose element 0 is the first trailing
   constant, and move it into TARGET.  */
16330 rtx_vector_builder
v (mode
, 1, nelts
);
16331 for (int i
= 0; i
< nelts
; i
++)
16332 v
.quick_push (builder
.elt (i
+ nelts_reqd
- n_trailing_constants
));
16333 rtx const_vec
= v
.build ();
16334 emit_move_insn (target
, const_vec
);
/* Insert the elements that precede the trailing constants, last one
   first.  */
16336 for (int i
= nelts_reqd
- n_trailing_constants
- 1; i
>= 0; i
--)
16337 emit_insr (target
, builder
.elt (i
));
16345 /* Subroutine of aarch64_sve_expand_vector_init.
16347 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16348 (b) Skip trailing elements from BUILDER, which are the same as
16349 element NELTS_REQD - 1.
16350 (c) Insert earlier elements in reverse order in TARGET using insr.

   The vec_duplicate pattern for TARGET's mode must exist.  */
16353 aarch64_sve_expand_vector_init_insert_elems (rtx target
,
16354 const rtx_vector_builder
&builder
,
16357 machine_mode mode
= GET_MODE (target
);
16358 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
16360 struct expand_operand ops
[2];
16361 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
16362 gcc_assert (icode
!= CODE_FOR_nothing
);
/* Step (a): expand vec_duplicate with TARGET as the output operand and
   element NELTS_REQD - 1 as the input operand.  */
16364 create_output_operand (&ops
[0], target
, mode
);
16365 create_input_operand (&ops
[1], builder
.elt (nelts_reqd
- 1), elem_mode
);
16366 expand_insn (icode
, 2, ops
);
/* Steps (b) and (c): NDUPS comes from count_dups starting at element
   NELTS_REQD - 1; only the elements below index NELTS_REQD - NDUPS are
   inserted.  */
16368 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
16369 for (int i
= nelts_reqd
- ndups
- 1; i
>= 0; i
--)
16370 emit_insr (target
, builder
.elt (i
));
16373 /* Subroutine of aarch64_sve_expand_vector_init to handle case
16374 when all trailing elements of builder are same.
16375 This works as follows:
16376 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16377 (b) Insert remaining elements in TARGET using insr.
16379 ??? The heuristic used is to do above if number of same trailing elements
16380 is at least 3/4 of total number of elements, loosely based on
16381 heuristic from mostly_zeros_p. May need fine-tuning. */
16384 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16385 (rtx target
, const rtx_vector_builder
&builder
, int nelts_reqd
)
16387 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
16388 if (ndups
>= (3 * nelts_reqd
) / 4)
/* Pass NELTS_REQD - NDUPS + 1 so that the element broadcast by the
   callee is element NELTS_REQD - NDUPS, and only the elements before it
   are then inserted.  */
16390 aarch64_sve_expand_vector_init_insert_elems (target
, builder
,
16391 nelts_reqd
- ndups
+ 1);
16398 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16399 of elements in BUILDER.  NELTS_REQD is the number of leading elements of
   BUILDER that this call works with; the top-level caller passes NELTS and
   each recursive call passes NELTS_REQD / 2.
16401 The function tries to initialize TARGET from BUILDER if it fits one
16402 of the special cases outlined below.
16404 Failing that, the function divides BUILDER into two sub-vectors:
16405 v_even = even elements of BUILDER;
16406 v_odd = odd elements of BUILDER;
16408 and recursively calls itself with v_even and v_odd.
16410 if (recursive call succeeded for v_even or v_odd)
16411 TARGET = zip (v_even, v_odd)
16413 The function returns true if it managed to build TARGET from BUILDER
16414 with one of the special cases, false otherwise.
16416 Example: {a, 1, b, 2, c, 3, d, 4}
16418 The vector gets divided into:
16419 v_even = {a, b, c, d}
16420 v_odd = {1, 2, 3, 4}
16422 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16423 initializes tmp2 from constant vector v_odd using emit_move_insn.
16425 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
16426 4 elements, so we construct tmp1 from v_even using insr:
16433 TARGET = zip (tmp1, tmp2)
16434 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16437 aarch64_sve_expand_vector_init (rtx target
, const rtx_vector_builder
&builder
,
16438 int nelts
, int nelts_reqd
)
16440 machine_mode mode
= GET_MODE (target
);
16442 /* Case 1: Vector contains trailing constants. */
16444 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16445 (target
, builder
, nelts
, nelts_reqd
))
16448 /* Case 2: Vector contains leading constants. */
/* REV_BUILDER holds the first NELTS_REQD elements of BUILDER in reverse
   order, so leading constants become trailing ones; the result is
   reversed again afterwards with the SVE REV pattern.  */
16450 rtx_vector_builder
rev_builder (mode
, 1, nelts_reqd
);
16451 for (int i
= 0; i
< nelts_reqd
; i
++)
16452 rev_builder
.quick_push (builder
.elt (nelts_reqd
- i
- 1));
16453 rev_builder
.finalize ();
16455 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16456 (target
, rev_builder
, nelts
, nelts_reqd
))
16458 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
16462 /* Case 3: Vector contains trailing same element. */
16464 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16465 (target
, builder
, nelts_reqd
))
16468 /* Case 4: Vector contains leading same element. */
/* NOTE(review): the helper is called, and may emit instructions into
   TARGET, before NELTS_REQD == NELTS is tested.  */
16470 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16471 (target
, rev_builder
, nelts_reqd
) && nelts_reqd
== nelts
)
16473 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
16477 /* Avoid recursing below 4-elements.
16478 ??? The threshold 4 may need fine-tuning. */
16480 if (nelts_reqd
<= 4)
16483 rtx_vector_builder
v_even (mode
, 1, nelts
);
16484 rtx_vector_builder
v_odd (mode
, 1, nelts
);
/* Split BUILDER into its even-indexed and odd-indexed elements.  */
16486 for (int i
= 0; i
< nelts
* 2; i
+= 2)
16488 v_even
.quick_push (builder
.elt (i
));
16489 v_odd
.quick_push (builder
.elt (i
+ 1));
16492 v_even
.finalize ();
/* Try the special cases on each half, using fresh registers.  */
16495 rtx tmp1
= gen_reg_rtx (mode
);
16496 bool did_even_p
= aarch64_sve_expand_vector_init (tmp1
, v_even
,
16497 nelts
, nelts_reqd
/ 2);
16499 rtx tmp2
= gen_reg_rtx (mode
);
16500 bool did_odd_p
= aarch64_sve_expand_vector_init (tmp2
, v_odd
,
16501 nelts
, nelts_reqd
/ 2);
16503 if (!did_even_p
&& !did_odd_p
)
16506 /* Initialize v_even and v_odd using INSR if it didn't match any of the
16507 special cases and zip v_even, v_odd. */
16510 aarch64_sve_expand_vector_init_insert_elems (tmp1
, v_even
, nelts_reqd
/ 2);
16513 aarch64_sve_expand_vector_init_insert_elems (tmp2
, v_odd
, nelts_reqd
/ 2);
/* Interleave the two halves into TARGET with a ZIP1 unspec.  */
16515 rtvec v
= gen_rtvec (2, tmp1
, tmp2
);
16516 emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
16520 /* Initialize register TARGET from the elements in PARALLEL rtx VALS.
   The elements of VALS are copied into an rtx_vector_builder, which is
   then handed to the builder-based overload; if that does not succeed
   the elements are inserted one at a time.  */
16523 aarch64_sve_expand_vector_init (rtx target
, rtx vals
)
16525 machine_mode mode
= GET_MODE (target
);
16526 int nelts
= XVECLEN (vals
, 0);
16528 rtx_vector_builder
v (mode
, 1, nelts
);
16529 for (int i
= 0; i
< nelts
; i
++)
16530 v
.quick_push (XVECEXP (vals
, 0, i
));
16533 /* If neither sub-vectors of v could be initialized specially,
16534 then use INSR to insert all elements from v into TARGET.
16535 ??? This might not be optimal for vectors with large
16536 initializers like 16-element or above.
16537 For nelts < 4, it probably isn't useful to handle specially. */
16540 || !aarch64_sve_expand_vector_init (target
, v
, nelts
, nelts
))
16541 aarch64_sve_expand_vector_init_insert_elems (target
, v
, nelts
);
16544 /* Check whether VALUE is a vector constant in which every element
16545 is either a power of 2 or a negated power of 2. If so, return
16546 a constant vector of log2s, and flip CODE between PLUS and MINUS
16547 if VALUE contains negated powers of 2. Return NULL_RTX otherwise.

   CODE is passed by reference and is only modified on the path that
   reaches the final return.  Only the encoded elements of VALUE are
   examined; the result is built as a unary operation on VALUE.  */
16550 aarch64_convert_mult_to_shift (rtx value
, rtx_code
&code
)
16552 if (GET_CODE (value
) != CONST_VECTOR
)
16555 rtx_vector_builder builder
;
16556 if (!builder
.new_unary_operation (GET_MODE (value
), value
, false))
16559 scalar_mode int_mode
= GET_MODE_INNER (GET_MODE (value
));
16560 /* 1 if the result of the multiplication must be negated,
16561 0 if it mustn't, or -1 if we don't yet care. */
16563 unsigned int encoded_nelts
= const_vector_encoded_nelts (value
);
16564 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
16566 rtx elt
= CONST_VECTOR_ENCODED_ELT (value
, i
);
16567 if (!CONST_SCALAR_INT_P (elt
))
/* VAL pairs the element with its integer mode so that the wide-int
   routines can be applied to it.  */
16569 rtx_mode_t
val (elt
, int_mode
);
16570 wide_int pow2
= wi::neg (val
);
16573 /* It matters whether we negate or not. Make that choice,
16574 and make sure that it's consistent with previous elements. */
16575 if (negate
== !wi::neg_p (val
))
16577 negate
= wi::neg_p (val
);
16581 /* POW2 is now the value that we want to be a power of 2. */
16582 int shift
= wi::exact_log2 (pow2
);
/* Record the shift amount for this element in the element mode.  */
16585 builder
.quick_push (gen_int_mode (shift
, int_mode
));
16588 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
16590 else if (negate
== 1)
16591 code
= code
== PLUS
? MINUS
: PLUS
;
16592 return builder
.build ();
16595 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
16596 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
16597 operands array, in the same order as for fma_optab. Return true if
16598 the function emitted all the necessary instructions, false if the caller
16599 should generate the pattern normally with the new OPERANDS array. */
16602 aarch64_prepare_sve_int_fma (rtx
*operands
, rtx_code code
)
16604 machine_mode mode
= GET_MODE (operands
[0]);
/* If OPERANDS[2] converts to a vector of shift amounts, shift OPERANDS[1]
   left by them and combine the product with OPERANDS[3] using an add or
   a subtract, writing to OPERANDS[0].  CODE may have been flipped by the
   conversion.  */
16605 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[2], code
))
16607 rtx product
= expand_binop (mode
, vashl_optab
, operands
[1], shifts
,
16608 NULL_RTX
, true, OPTAB_DIRECT
);
16609 force_expand_binop (mode
, code
== PLUS
? add_optab
: sub_optab
,
16610 operands
[3], product
, operands
[0], true,
/* Otherwise make sure the multiplier OPERANDS[2] is in a register.  */
16614 operands
[2] = force_reg (mode
, operands
[2]);
16618 /* Likewise, but for a conditional pattern.  Here the multiplier is
   OPERANDS[3] and the value shifted in its place is OPERANDS[2].  */
16621 aarch64_prepare_sve_cond_int_fma (rtx
*operands
, rtx_code code
)
16623 machine_mode mode
= GET_MODE (operands
[0]);
/* If OPERANDS[3] converts to a vector of shift amounts, shift OPERANDS[2]
   left by them and emit the conditional CODE pattern on
   OPERANDS[0], OPERANDS[1], OPERANDS[4], the product and OPERANDS[5].  */
16624 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[3], code
))
16626 rtx product
= expand_binop (mode
, vashl_optab
, operands
[2], shifts
,
16627 NULL_RTX
, true, OPTAB_DIRECT
);
16628 emit_insn (gen_cond (code
, mode
, operands
[0], operands
[1],
16629 operands
[4], product
, operands
[5]));
/* Otherwise make sure the multiplier OPERANDS[3] is in a register.  */
16632 operands
[3] = force_reg (mode
, operands
[3]);
/* Return the mask applied to shift counts for MODE.  The return shown
   below gives the unit bit size of MODE minus one.
   NOTE(review): presumably a different mask is returned when
   SHIFT_COUNT_TRUNCATED is false or MODE is a vector data mode -- confirm
   what the guarded statement returns.  */
16636 static unsigned HOST_WIDE_INT
16637 aarch64_shift_truncation_mask (machine_mode mode
)
16639 if (!SHIFT_COUNT_TRUNCATED
|| aarch64_vector_data_mode_p (mode
))
16641 return GET_MODE_UNIT_BITSIZE (mode
) - 1;
16644 /* Select a format to encode pointers in exception handling data.
   CODE is unused.  The result is always PC-relative, is marked indirect
   when GLOBAL is nonzero, and uses a data size chosen from the current
   code model (aarch64_cmodel).  */
16646 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
16649 switch (aarch64_cmodel
)
16651 case AARCH64_CMODEL_TINY
:
16652 case AARCH64_CMODEL_TINY_PIC
:
16653 case AARCH64_CMODEL_SMALL
:
16654 case AARCH64_CMODEL_SMALL_PIC
:
16655 case AARCH64_CMODEL_SMALL_SPIC
:
16656 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16658 type
= DW_EH_PE_sdata4
;
16661 /* No assumptions here. 8-byte relocs required. */
16662 type
= DW_EH_PE_sdata8
;
16665 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
16668 /* Output .variant_pcs for aarch64_vector_pcs function symbols.
   When aarch64_simd_decl_p accepts DECL, write a tab-indented
   ".variant_pcs" directive followed by NAME and a newline to STREAM.  */
16671 aarch64_asm_output_variant_pcs (FILE *stream
, const tree decl
, const char* name
)
16673 if (aarch64_simd_decl_p (decl
))
16675 fprintf (stream
, "\t.variant_pcs\t");
16676 assemble_name (stream
, name
);
16677 fprintf (stream
, "\n");
16681 /* The last .arch and .tune assembly strings that we printed.  Both are
   reset by aarch64_start_file and updated by aarch64_declare_function_name
   so that a directive is only printed again when it changes.  */
16682 static std::string aarch64_last_printed_arch_string
;
16683 static std::string aarch64_last_printed_tune_string
;
16685 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16686 by the function fndecl.

   The target options come from FNDECL's function-specific target node,
   or else from target_option_current_node.  After the optional .arch
   and .tune lines, the .variant_pcs directive (if applicable), the ELF
   type directive and the label for NAME are written to STREAM.
   NOTE(review): the .arch and .tune lines are written to asm_out_file
   rather than to STREAM -- confirm the two are always the same file.  */
16689 aarch64_declare_function_name (FILE *stream
, const char* name
,
16692 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
16694 struct cl_target_option
*targ_options
;
16696 targ_options
= TREE_TARGET_OPTION (target_parts
);
16698 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
16699 gcc_assert (targ_options
);
16701 const struct processor
*this_arch
16702 = aarch64_get_arch (targ_options
->x_explicit_arch
);
16704 uint64_t isa_flags
= targ_options
->x_aarch64_isa_flags
;
16705 std::string extension
16706 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
16708 /* Only update the assembler .arch string if it is distinct from the last
16709 such string we printed. */
16710 std::string to_print
= this_arch
->name
+ extension
;
16711 if (to_print
!= aarch64_last_printed_arch_string
)
16713 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
16714 aarch64_last_printed_arch_string
= to_print
;
16717 /* Print the cpu name we're tuning for in the comments, might be
16718 useful to readers of the generated asm. Do it only when it changes
16719 from function to function and verbose assembly is requested. */
16720 const struct processor
*this_tune
16721 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
16723 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
16725 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
16727 aarch64_last_printed_tune_string
= this_tune
->name
;
16730 aarch64_asm_output_variant_pcs (stream
, fndecl
, name
);
16732 /* Don't forget the type directive for ELF. */
16733 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
16734 ASM_OUTPUT_LABEL (stream
, name
);
16737 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases.
   The alias name is taken from the rtl of DECL and the value from the
   identifier TARGET; the .variant_pcs directive (if applicable) is
   written before the definition itself.  */
16740 aarch64_asm_output_alias (FILE *stream
, const tree decl
, const tree target
)
16742 const char *name
= XSTR (XEXP (DECL_RTL (decl
), 0), 0);
16743 const char *value
= IDENTIFIER_POINTER (target
);
16744 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
16745 ASM_OUTPUT_DEF (stream
, name
, value
);
16748 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16749 function symbol references.  The default ELF handling runs first, then
   the .variant_pcs directive is written if it applies to DECL.  */
16752 aarch64_asm_output_external (FILE *stream
, tree decl
, const char* name
)
16754 default_elf_asm_output_external (stream
, decl
, name
);
16755 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
16758 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16759 Used to output the .cfi_b_key_frame directive when signing the current
16760 function with the B key.  The directive is written to F only when the
   frame has been laid out, return address signing is enabled and the
   signing key is AARCH64_KEY_B.  The second argument is unused.  */
16763 aarch64_post_cfi_startproc (FILE *f
, tree ignored ATTRIBUTE_UNUSED
)
16765 if (cfun
->machine
->frame
.laid_out
&& aarch64_return_address_signing_enabled ()
16766 && aarch64_ra_sign_key
== AARCH64_KEY_B
)
16767 asm_fprintf (f
, "\t.cfi_b_key_frame\n");
16770 /* Implements TARGET_ASM_FILE_START. Output the assembly header.
   Prints a .arch directive built from the default target options,
   remembers it as the last printed .arch string, clears the last printed
   .tune string, and then runs default_file_start.  */
16773 aarch64_start_file (void)
16775 struct cl_target_option
*default_options
16776 = TREE_TARGET_OPTION (target_option_default_node
);
16778 const struct processor
*default_arch
16779 = aarch64_get_arch (default_options
->x_explicit_arch
);
16780 uint64_t default_isa_flags
= default_options
->x_aarch64_isa_flags
;
16781 std::string extension
16782 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
16783 default_arch
->flags
);
/* The directive is the architecture name followed by the extension
   string computed above.  */
16785 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
16786 aarch64_last_printed_tune_string
= "";
16787 asm_fprintf (asm_out_file
, "\t.arch %s\n",
16788 aarch64_last_printed_arch_string
.c_str ());
16790 default_file_start ();
16793 /* Emit load exclusive.  MODE selects the load-exclusive pattern; RVAL,
   MEM and MODEL_RTX are passed to it unchanged, in that order.  */
16796 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
16797 rtx mem
, rtx model_rtx
)
16799 emit_insn (gen_aarch64_load_exclusive (mode
, rval
, mem
, model_rtx
));
16802 /* Emit store exclusive.  MODE selects the store-exclusive pattern; the
   remaining arguments are forwarded to it positionally.
   NOTE(review): the callers in this file pass the memory operand in the
   RVAL position and the value to store in the MEM position, so the
   parameter names here look swapped -- confirm against the pattern.  */
16805 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
16806 rtx rval
, rtx mem
, rtx model_rtx
)
16808 emit_insn (gen_aarch64_store_exclusive (mode
, bval
, rval
, mem
, model_rtx
));
16811 /* Emit INSN as a jump instruction and mark it as very unlikely to be
   taken by attaching a branch probability note.  */
16814 aarch64_emit_unlikely_jump (rtx insn
)
16816 rtx_insn
*jump
= emit_jump_insn (insn
);
16817 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
16820 /* Expand a compare and swap pattern.
   OPERANDS[0] is the success flag output (BVAL), OPERANDS[1] the output
   for the value read (RVAL), OPERANDS[3] the expected value, OPERANDS[4]
   the new value, OPERANDS[5] the weak flag, and OPERANDS[6] and
   OPERANDS[7] the success and failure memory models.  On exit RVAL has
   been copied to OPERANDS[1] and BVAL is set from an EQ test of the
   condition register against zero.  */
16823 aarch64_expand_compare_and_swap (rtx operands
[])
16825 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
, cc_reg
;
16826 machine_mode mode
, r_mode
;
16828 bval
= operands
[0];
16829 rval
= operands
[1];
16831 oldval
= operands
[3];
16832 newval
= operands
[4];
16833 is_weak
= operands
[5];
16834 mod_s
= operands
[6];
16835 mod_f
= operands
[7];
16836 mode
= GET_MODE (mem
);
16838 /* Normally the succ memory model must be stronger than fail, but in the
16839 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16840 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16841 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
16842 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
16843 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
/* For QImode and HImode the result is produced in a fresh register of
   mode R_MODE and narrowed back to MODE at the end.  */
16846 if (mode
== QImode
|| mode
== HImode
)
16849 rval
= gen_reg_rtx (r_mode
);
16854 /* The CAS insn requires oldval and rval overlap, but we need to
16855 have a copy of oldval saved across the operation to tell if
16856 the operation is successful. */
16857 if (reg_overlap_mentioned_p (rval
, oldval
))
16858 rval
= copy_to_mode_reg (r_mode
, oldval
);
16860 emit_move_insn (rval
, gen_lowpart (r_mode
, oldval
));
16862 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
,
/* After the LSE form, success is determined by comparing RVAL with
   OLDVAL.  */
16864 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
16868 /* The oldval predicate varies by mode. Test it and force to reg. */
16869 insn_code code
= code_for_aarch64_compare_and_swap (mode
);
16870 if (!insn_data
[code
].operand
[2].predicate (oldval
, mode
))
16871 oldval
= force_reg (mode
, oldval
);
16873 emit_insn (GEN_FCN (code
) (rval
, mem
, oldval
, newval
,
16874 is_weak
, mod_s
, mod_f
));
16875 cc_reg
= gen_rtx_REG (CCmode
, CC_REGNUM
);
16878 if (r_mode
!= mode
)
16879 rval
= gen_lowpart (mode
, rval
);
16880 emit_move_insn (operands
[1], rval
);
16882 x
= gen_rtx_EQ (SImode
, cc_reg
, const0_rtx
);
16883 emit_insn (gen_rtx_SET (bval
, x
));
16886 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
16887 sequence implementing an atomic operation.  A sequentially consistent
   thread fence is emitted only when MODEL is a __sync model whose base
   model is ACQUIRE, ACQ_REL or SEQ_CST; otherwise nothing is emitted.  */
16890 aarch64_emit_post_barrier (enum memmodel model
)
16892 const enum memmodel base_model
= memmodel_base (model
);
16894 if (is_mm_sync (model
)
16895 && (base_model
== MEMMODEL_ACQUIRE
16896 || base_model
== MEMMODEL_ACQ_REL
16897 || base_model
== MEMMODEL_SEQ_CST
))
16899 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
16903 /* Split a compare and swap pattern.
   OPERANDS[0] is the output for the value read (RVAL), OPERANDS[2] the
   expected value, OPERANDS[3] the new value, OPERANDS[4] the weak flag
   (nonzero means weak), OPERANDS[5] the memory model and OPERANDS[7] a
   scratch register that receives the store-exclusive status.  The
   sequence is a load-exclusive, a compare that branches to LABEL2 on
   mismatch, a store-exclusive, and a final barrier for __sync models.  */
16906 aarch64_split_compare_and_swap (rtx operands
[])
16908 rtx rval
, mem
, oldval
, newval
, scratch
;
16911 rtx_code_label
*label1
, *label2
;
16913 enum memmodel model
;
16916 rval
= operands
[0];
16918 oldval
= operands
[2];
16919 newval
= operands
[3];
16920 is_weak
= (operands
[4] != const0_rtx
);
16921 model_rtx
= operands
[5];
16922 scratch
= operands
[7];
16923 mode
= GET_MODE (mem
);
16924 model
= memmodel_from_int (INTVAL (model_rtx
));
16926 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16929 LD[A]XR rval, [mem]
16931 ST[L]XR scratch, newval, [mem]
16932 CBNZ scratch, .label1
16935 bool strong_zero_p
= !is_weak
&& oldval
== const0_rtx
;
16940 label1
= gen_label_rtx ();
16941 emit_label (label1
);
16943 label2
= gen_label_rtx ();
16945 /* The initial load can be relaxed for a __sync operation since a final
16946 barrier will be emitted to stop code hoisting. */
16947 if (is_mm_sync (model
))
16948 aarch64_emit_load_exclusive (mode
, rval
, mem
,
16949 GEN_INT (MEMMODEL_RELAXED
));
16951 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
16955 if (aarch64_track_speculation
)
16957 /* Emit an explicit compare instruction, so that we can correctly
16958 track the condition codes. */
16959 rtx cc_reg
= aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
16960 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
16963 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
/* Branch to LABEL2 (marked unlikely) when RVAL is nonzero.  */
16965 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16966 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
16967 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
/* General form: compare RVAL with OLDVAL and branch to LABEL2 (marked
   unlikely) when they differ.  */
16971 cond
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
16972 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
16973 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16974 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
16975 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
/* Attempt the store of NEWVAL; SCRATCH receives the status.  */
16978 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
16982 if (aarch64_track_speculation
)
16984 /* Emit an explicit compare instruction, so that we can correctly
16985 track the condition codes. */
16986 rtx cc_reg
= aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
16987 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
16990 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
/* Loop back to LABEL1 (marked unlikely) when SCRATCH is nonzero.  */
16992 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16993 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
16994 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
/* Otherwise set the condition register from a compare of SCRATCH with
   zero.  */
16998 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
16999 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
17000 emit_insn (gen_rtx_SET (cond
, x
));
17003 emit_label (label2
);
17004 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
17005 to set the condition flags. If this is not used it will be removed by
17009 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
17010 x
= gen_rtx_COMPARE (CCmode
, rval
, const0_rtx
);
17011 emit_insn (gen_rtx_SET (cond
, x
));
17013 /* Emit any final barrier needed for a __sync operation. */
17014 if (is_mm_sync (model
))
17015 aarch64_emit_post_barrier (model
);
17018 /* Split an atomic operation.
   CODE is the rtx operation to apply, OLD_OUT receives the value loaded
   from MEM, NEW_OUT receives the result of the operation, VALUE is the
   other operand, MODEL_RTX is the memory model and COND receives the
   store-exclusive status.  The emitted sequence is a loop: load-exclusive,
   compute, store-exclusive, and branch back to the start (marked
   unlikely) while COND is nonzero.  Arithmetic is done in WMODE, which
   is DImode for DImode memory and SImode otherwise.  */
17021 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
17022 rtx value
, rtx model_rtx
, rtx cond
)
17024 machine_mode mode
= GET_MODE (mem
);
17025 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
17026 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
17027 const bool is_sync
= is_mm_sync (model
);
17028 rtx_code_label
*label
;
17031 /* Split the atomic operation into a sequence. */
17032 label
= gen_label_rtx ();
17033 emit_label (label
);
/* View the outputs and VALUE in WMODE for the arithmetic below.  */
17036 new_out
= gen_lowpart (wmode
, new_out
);
17038 old_out
= gen_lowpart (wmode
, old_out
);
17041 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
17043 /* The initial load can be relaxed for a __sync operation since a final
17044 barrier will be emitted to stop code hoisting. */
17046 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
17047 GEN_INT (MEMMODEL_RELAXED
));
17049 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
/* NEW_OUT = ~(OLD_OUT & VALUE), emitted as an AND followed by a NOT.  */
17058 x
= gen_rtx_AND (wmode
, old_out
, value
);
17059 emit_insn (gen_rtx_SET (new_out
, x
));
17060 x
= gen_rtx_NOT (wmode
, new_out
);
17061 emit_insn (gen_rtx_SET (new_out
, x
));
/* A constant VALUE is negated before falling through to the generic
   two-operand form below.  */
17065 if (CONST_INT_P (value
))
17067 value
= GEN_INT (-INTVAL (value
));
17070 /* Fall through. */
17073 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
17074 emit_insn (gen_rtx_SET (new_out
, x
));
/* Store the low MODE part of NEW_OUT; COND receives the status.  */
17078 aarch64_emit_store_exclusive (mode
, cond
, mem
,
17079 gen_lowpart (mode
, new_out
), model_rtx
);
17081 if (aarch64_track_speculation
)
17083 /* Emit an explicit compare instruction, so that we can correctly
17084 track the condition codes. */
17085 rtx cc_reg
= aarch64_gen_compare_reg (NE
, cond
, const0_rtx
);
17086 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
17089 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
17091 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
17092 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
17093 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
17095 /* Emit any final barrier needed for a __sync operation. */
17097 aarch64_emit_post_barrier (model
);
/* Set up the library functions used for HFmode.  The conversions between
   HFmode and SFmode are given the __gnu_f2h_ieee and __gnu_h2f_ieee
   routines; the HFmode arithmetic and comparison optabs listed below are
   given NULL libfuncs.  */
17101 aarch64_init_libfuncs (void)
17103 /* Half-precision float operations. The compiler handles all operations
17104 with NULL libfuncs by converting to SFmode. */
17107 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
17108 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
/* Arithmetic: add, divide, multiply, negate and subtract.  */
17111 set_optab_libfunc (add_optab
, HFmode
, NULL
);
17112 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
17113 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
17114 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
17115 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
/* Comparisons, including the unordered test.  */
17118 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
17119 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
17120 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
17121 set_optab_libfunc (le_optab
, HFmode
, NULL
);
17122 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
17123 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
17124 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
17127 /* Target hook for c_mode_for_suffix.  Takes the suffix character SUFFIX
   and returns a machine mode.  */
17128 static machine_mode
17129 aarch64_c_mode_for_suffix (char suffix
)
17137 /* We can only represent floating point constants which will fit in
17138 "quarter-precision" values. These values are characterised by
17139 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
17142 (-1)^s * (n/16) * 2^r
17145 's' is the sign bit.
17146 'n' is an integer in the range 16 <= n <= 31.
17147 'r' is an integer in the range -3 <= r <= 4. */
17149 /* Return true iff X can be represented as a quarter-precision
17150 floating point immediate operand. Note, we cannot represent 0.0.
   X may be a CONST_DOUBLE or a vector duplicate of one; the duplicate is
   unwrapped before the value is examined.  */
17152 aarch64_float_const_representable_p (rtx x
)
17154 /* This represents our current view of how many bits
17155 make up the mantissa. */
17156 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
17158 unsigned HOST_WIDE_INT mantissa
, mask
;
17159 REAL_VALUE_TYPE r
, m
;
17162 x
= unwrap_const_vec_duplicate (x
);
17163 if (!CONST_DOUBLE_P (x
))
/* VOIDmode constants, and HFmode constants without TARGET_FP_F16INST,
   are tested for separately here.  */
17166 if (GET_MODE (x
) == VOIDmode
17167 || (GET_MODE (x
) == HFmode
&& !TARGET_FP_F16INST
))
17170 r
= *CONST_DOUBLE_REAL_VALUE (x
);
17172 /* We cannot represent infinities, NaNs or +/-zero. We won't
17173 know if we have +zero until we analyse the mantissa, but we
17174 can reject the other invalid values. */
17175 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
17176 || REAL_VALUE_MINUS_ZERO (r
))
17179 /* Extract exponent. */
17180 r
= real_value_abs (&r
);
17181 exponent
= REAL_EXP (&r
);
17183 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17184 highest (sign) bit, with a fixed binary point at bit point_pos.
17185 m1 holds the low part of the mantissa, m2 the high part.
17186 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17187 bits for the mantissa, this can fail (low bits will be lost). */
17188 real_ldexp (&m
, &r
, point_pos
- exponent
);
17189 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
17191 /* If the low part of the mantissa has bits set we cannot represent
17193 if (w
.ulow () != 0)
17195 /* We have rejected the lower HOST_WIDE_INT, so update our
17196 understanding of how many bits lie in the mantissa and
17197 look only at the high HOST_WIDE_INT. */
17198 mantissa
= w
.elt (1);
17199 point_pos
-= HOST_BITS_PER_WIDE_INT
;
17201 /* We can only represent values with a mantissa of the form 1.xxxx. */
17202 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
17203 if ((mantissa
& mask
) != 0)
17206 /* Having filtered unrepresentable values, we may now remove all
17207 but the highest 5 bits. */
17208 mantissa
>>= point_pos
- 5;
17210 /* We cannot represent the value 0.0, so reject it. This is handled
17215 /* Then, as bit 4 is always set, we can mask it off, leaving
17216 the mantissa in the range [0, 15]. */
17217 mantissa
&= ~(1 << 4);
17218 gcc_assert (mantissa
<= 15);
17220 /* GCC internally does not use IEEE754-like encoding (where normalized
17221 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
17222 Our mantissa values are shifted 4 places to the left relative to
17223 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17224 by 5 places to correct for GCC's representation. */
17225 exponent
= 5 - exponent
;
/* The adjusted exponent must fit in the range 0 to 7.  */
17227 return (exponent
>= 0 && exponent
<= 7);
17230 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17231 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17232 output MOVI/MVNI, ORR or BIC immediate. */
17234 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
17235 enum simd_immediate_check which
)
17238 static char templ
[40];
17239 const char *mnemonic
;
17240 const char *shift_op
;
17241 unsigned int lane_count
= 0;
17244 struct simd_immediate_info info
;
17246 /* This will return true to show const_vector is legal for use as either
17247 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17248 It will also update INFO to show how the immediate should be generated.
17249 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17250 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
17251 gcc_assert (is_valid
);
17253 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
17254 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
17256 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
17258 gcc_assert (info
.insn
== simd_immediate_info::MOV
17259 && info
.u
.mov
.shift
== 0);
17260 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17261 move immediate path. */
17262 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
17263 info
.u
.mov
.value
= GEN_INT (0);
17266 const unsigned int buf_size
= 20;
17267 char float_buf
[buf_size
] = {'\0'};
17268 real_to_decimal_for_mode (float_buf
,
17269 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
17270 buf_size
, buf_size
, 1, info
.elt_mode
);
17272 if (lane_count
== 1)
17273 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
17275 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
17276 lane_count
, element_char
, float_buf
);
17281 gcc_assert (CONST_INT_P (info
.u
.mov
.value
));
17283 if (which
== AARCH64_CHECK_MOV
)
17285 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
17286 shift_op
= (info
.u
.mov
.modifier
== simd_immediate_info::MSL
17288 if (lane_count
== 1)
17289 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
17290 mnemonic
, UINTVAL (info
.u
.mov
.value
));
17291 else if (info
.u
.mov
.shift
)
17292 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
17293 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
17294 element_char
, UINTVAL (info
.u
.mov
.value
), shift_op
,
17297 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
17298 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
17299 element_char
, UINTVAL (info
.u
.mov
.value
));
17303 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17304 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
17305 if (info
.u
.mov
.shift
)
17306 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
17307 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
17308 element_char
, UINTVAL (info
.u
.mov
.value
), "lsl",
17311 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
17312 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
17313 element_char
, UINTVAL (info
.u
.mov
.value
));
17319 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
17322 /* If a floating point number was passed and we desire to use it in an
17323 integer mode do the conversion to integer. */
17324 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
17326 unsigned HOST_WIDE_INT ival
;
17327 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
17328 gcc_unreachable ();
17329 immediate
= gen_int_mode (ival
, mode
);
17332 machine_mode vmode
;
17333 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
17334 a 128-bit vector mode. */
17335 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
17337 vmode
= aarch64_simd_container_mode (mode
, width
);
17338 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
17339 return aarch64_output_simd_mov_immediate (v_op
, width
);
17342 /* Return the output string to use for moving immediate CONST_VECTOR
17343 into an SVE register. */
17346 aarch64_output_sve_mov_immediate (rtx const_vector
)
17348 static char templ
[40];
17349 struct simd_immediate_info info
;
17352 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
17353 gcc_assert (is_valid
);
17355 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
17357 machine_mode vec_mode
= GET_MODE (const_vector
);
17358 if (aarch64_sve_pred_mode_p (vec_mode
))
17360 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
17361 if (info
.insn
== simd_immediate_info::MOV
)
17363 gcc_assert (info
.u
.mov
.value
== const0_rtx
);
17364 snprintf (buf
, sizeof (buf
), "pfalse\t%%0.b");
17368 gcc_assert (info
.insn
== simd_immediate_info::PTRUE
);
17369 unsigned int total_bytes
;
17370 if (info
.u
.pattern
== AARCH64_SV_ALL
17371 && BYTES_PER_SVE_VECTOR
.is_constant (&total_bytes
))
17372 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", element_char
,
17373 total_bytes
/ GET_MODE_SIZE (info
.elt_mode
));
17375 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, %s", element_char
,
17376 svpattern_token (info
.u
.pattern
));
17381 if (info
.insn
== simd_immediate_info::INDEX
)
17383 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
17384 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
17385 element_char
, INTVAL (info
.u
.index
.base
),
17386 INTVAL (info
.u
.index
.step
));
17390 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
17392 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
17393 info
.u
.mov
.value
= GEN_INT (0);
17396 const int buf_size
= 20;
17397 char float_buf
[buf_size
] = {};
17398 real_to_decimal_for_mode (float_buf
,
17399 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
17400 buf_size
, buf_size
, 1, info
.elt_mode
);
17402 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
17403 element_char
, float_buf
);
17408 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
17409 element_char
, INTVAL (info
.u
.mov
.value
));
17413 /* Split operands into moves from op[1] + op[2] into op[0].
operands[1] supplies the low half of the result and operands[2] the high
half: DESTLO starts at the register number of operands[0] and DESTHI
starts HALFREGS registers later.  The half mode is asserted to be
V16QImode.  */
17416 aarch64_split_combinev16qi (rtx operands
[3])
17418 unsigned int dest
= REGNO (operands
[0]);
17419 unsigned int src1
= REGNO (operands
[1]);
17420 unsigned int src2
= REGNO (operands
[2]);
17421 machine_mode halfmode
= GET_MODE (operands
[1]);
17422 unsigned int halfregs
= REG_NREGS (operands
[1]);
17423 rtx destlo
, desthi
;
17425 gcc_assert (halfmode
== V16QImode
);
17427 if (src1
== dest
&& src2
== dest
+ halfregs
)
17429 /* No-op move: both sources already occupy their destination halves.
Can't split to nothing; emit something. */
17430 emit_note (NOTE_INSN_DELETED
);
17434 /* Preserve register attributes for variable tracking. */
17435 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
17436 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
17437 GET_MODE_SIZE (halfmode
));
17439 /* Special case of reversed high/low parts: each source overlaps the
destination half that the other source must be written to, so the two
sources are exchanged in place with three XORs instead of going through
a temporary. */
17440 if (reg_overlap_mentioned_p (operands
[2], destlo
)
17441 && reg_overlap_mentioned_p (operands
[1], desthi
))
17443 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
17444 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
17445 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
17447 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
17449 /* operands[2] does not overlap the low half, so the low half may be
written first.  Try to avoid unnecessary moves if part of the result
17450 is in the right place already. */
17452 emit_move_insn (destlo
, operands
[1]);
17453 if (src2
!= dest
+ halfregs
)
17454 emit_move_insn (desthi
, operands
[2]);
/* Otherwise operands[2] overlaps the low half, so it is copied to the
high half before the low half is overwritten. */
17458 if (src2
!= dest
+ halfregs
)
17459 emit_move_insn (desthi
, operands
[2]);
17461 emit_move_insn (destlo
, operands
[1]);
17465 /* vec_perm support. */
17467 struct expand_vec_perm_d
17469 rtx target
, op0
, op1
;
17470 vec_perm_indices perm
;
17471 machine_mode vmode
;
17472 unsigned int vec_flags
;
17477 /* Generate a variable permutation. */
17480 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
17482 machine_mode vmode
= GET_MODE (target
);
17483 bool one_vector_p
= rtx_equal_p (op0
, op1
);
17485 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
17486 gcc_checking_assert (GET_MODE (op0
) == vmode
);
17487 gcc_checking_assert (GET_MODE (op1
) == vmode
);
17488 gcc_checking_assert (GET_MODE (sel
) == vmode
);
17489 gcc_checking_assert (TARGET_SIMD
);
17493 if (vmode
== V8QImode
)
17495 /* Expand the argument to a V16QI mode by duplicating it. */
17496 rtx pair
= gen_reg_rtx (V16QImode
);
17497 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
17498 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
17502 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
17509 if (vmode
== V8QImode
)
17511 pair
= gen_reg_rtx (V16QImode
);
17512 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
17513 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
17517 pair
= gen_reg_rtx (OImode
);
17518 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
17519 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
17524 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17525 NELT is the number of elements in the vector. */
17528 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
17531 machine_mode vmode
= GET_MODE (target
);
17532 bool one_vector_p
= rtx_equal_p (op0
, op1
);
17535 /* The TBL instruction does not use a modulo index, so we must take care
17536 of that ourselves. */
17537 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
17538 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
17539 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
17541 /* For big-endian, we also need to reverse the index within the vector
17542 (but not which vector). */
17543 if (BYTES_BIG_ENDIAN
)
17545 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17547 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
17548 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
17549 NULL
, 0, OPTAB_LIB_WIDEN
);
17551 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
17554 /* Generate (set TARGET (unspec [OP0 OP1] CODE)).
The UNSPEC is created in the mode of TARGET, with OP0 and OP1 as its two
operands in that order and CODE as the unspec number; the resulting SET
is emitted as an insn. */
17557 emit_unspec2 (rtx target
, int code
, rtx op0
, rtx op1
)
17559 emit_insn (gen_rtx_SET (target
,
17560 gen_rtx_UNSPEC (GET_MODE (target
),
17561 gen_rtvec (2, op0
, op1
), code
)));
17564 /* Expand an SVE vec_perm with the given operands. */
17567 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
17569 machine_mode data_mode
= GET_MODE (target
);
17570 machine_mode sel_mode
= GET_MODE (sel
);
17571 /* Enforced by the pattern condition. */
17572 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
17574 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17575 size of the two value vectors, i.e. the upper bits of the indices
17576 are effectively ignored. SVE TBL instead produces 0 for any
17577 out-of-range indices, so we need to modulo all the vec_perm indices
17578 to ensure they are all in range. */
17579 rtx sel_reg
= force_reg (sel_mode
, sel
);
17581 /* Check if the sel only references the first values vector. */
17582 if (GET_CODE (sel
) == CONST_VECTOR
17583 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
17585 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
17589 /* Check if the two values vectors are the same. */
17590 if (rtx_equal_p (op0
, op1
))
17592 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
17593 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
17594 NULL
, 0, OPTAB_DIRECT
);
17595 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
17599 /* Run TBL on each value vector and combine the results. */
17601 rtx res0
= gen_reg_rtx (data_mode
);
17602 rtx res1
= gen_reg_rtx (data_mode
);
17603 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
17604 if (GET_CODE (sel
) != CONST_VECTOR
17605 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
17607 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
17609 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
17610 NULL
, 0, OPTAB_DIRECT
);
17612 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
17613 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
17614 NULL
, 0, OPTAB_DIRECT
);
17615 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
17616 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
17617 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
17619 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
17622 /* Recognize patterns suitable for the TRN instructions. */
17624 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
17627 poly_uint64 nelt
= d
->perm
.length ();
17628 rtx out
, in0
, in1
, x
;
17629 machine_mode vmode
= d
->vmode
;
17631 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17634 /* Note that these are little-endian tests.
17635 We correct for big-endian later. */
17636 if (!d
->perm
[0].is_constant (&odd
)
17637 || (odd
!= 0 && odd
!= 1)
17638 || !d
->perm
.series_p (0, 2, odd
, 2)
17639 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
17648 /* We don't need a big-endian lane correction for SVE; see the comment
17649 at the head of aarch64-sve.md for details. */
17650 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17652 x
= in0
, in0
= in1
, in1
= x
;
17657 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17658 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
17662 /* Recognize patterns suitable for the UZP instructions. */
17664 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
17667 rtx out
, in0
, in1
, x
;
17668 machine_mode vmode
= d
->vmode
;
17670 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17673 /* Note that these are little-endian tests.
17674 We correct for big-endian later. */
17675 if (!d
->perm
[0].is_constant (&odd
)
17676 || (odd
!= 0 && odd
!= 1)
17677 || !d
->perm
.series_p (0, 1, odd
, 2))
17686 /* We don't need a big-endian lane correction for SVE; see the comment
17687 at the head of aarch64-sve.md for details. */
17688 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17690 x
= in0
, in0
= in1
, in1
= x
;
17695 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17696 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
17700 /* Recognize patterns suitable for the ZIP instructions. */
17702 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
17705 poly_uint64 nelt
= d
->perm
.length ();
17706 rtx out
, in0
, in1
, x
;
17707 machine_mode vmode
= d
->vmode
;
17709 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17712 /* Note that these are little-endian tests.
17713 We correct for big-endian later. */
17714 poly_uint64 first
= d
->perm
[0];
17715 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
17716 || !d
->perm
.series_p (0, 2, first
, 1)
17717 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
17719 high
= maybe_ne (first
, 0U);
17727 /* We don't need a big-endian lane correction for SVE; see the comment
17728 at the head of aarch64-sve.md for details. */
17729 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17731 x
= in0
, in0
= in1
, in1
= x
;
17736 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17737 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
17741 /* Recognize patterns for the EXT insn. */
17744 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
17746 HOST_WIDE_INT location
;
17749 /* The first element always refers to the first vector.
17750 Check if the extracted indices are increasing by one. */
17751 if (d
->vec_flags
== VEC_SVE_PRED
17752 || !d
->perm
[0].is_constant (&location
)
17753 || !d
->perm
.series_p (0, 1, location
, 1))
17760 /* The case where (location == 0) is a no-op for both big- and little-endian,
17761 and is removed by the mid-end at optimization levels -O1 and higher.
17763 We don't need a big-endian lane correction for SVE; see the comment
17764 at the head of aarch64-sve.md for details. */
17765 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
17767 /* After setup, we want the high elements of the first vector (stored
17768 at the LSB end of the register), and the low elements of the second
17769 vector (stored at the MSB end of the register). So swap. */
17770 std::swap (d
->op0
, d
->op1
);
17771 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17772 to_constant () is safe since this is restricted to Advanced SIMD
vectors. */
17774 location
= d
->perm
.length ().to_constant () - location
;
17777 offset
= GEN_INT (location
);
17778 emit_set_insn (d
->target
,
17779 gen_rtx_UNSPEC (d
->vmode
,
17780 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
17785 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17786 within each 64-bit, 32-bit or 16-bit granule. */
17789 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
17791 HOST_WIDE_INT diff
;
17792 unsigned int i
, size
, unspec
;
17793 machine_mode pred_mode
;
17795 if (d
->vec_flags
== VEC_SVE_PRED
17796 || !d
->one_vector_p
17797 || !d
->perm
[0].is_constant (&diff
))
17800 size
= (diff
+ 1) * GET_MODE_UNIT_SIZE (d
->vmode
);
17803 unspec
= UNSPEC_REV64
;
17804 pred_mode
= VNx2BImode
;
17806 else if (size
== 4)
17808 unspec
= UNSPEC_REV32
;
17809 pred_mode
= VNx4BImode
;
17811 else if (size
== 2)
17813 unspec
= UNSPEC_REV16
;
17814 pred_mode
= VNx8BImode
;
17819 unsigned int step
= diff
+ 1;
17820 for (i
= 0; i
< step
; ++i
)
17821 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
17828 if (d
->vec_flags
== VEC_SVE_DATA
)
17830 machine_mode int_mode
= aarch64_sve_int_mode (pred_mode
);
17831 rtx target
= gen_reg_rtx (int_mode
);
17832 if (BYTES_BIG_ENDIAN
)
17833 /* The act of taking a subreg between INT_MODE and d->vmode
17834 is itself a reversing operation on big-endian targets;
17835 see the comment at the head of aarch64-sve.md for details.
17836 First reinterpret OP0 as INT_MODE without using a subreg
17837 and without changing the contents. */
17838 emit_insn (gen_aarch64_sve_reinterpret (int_mode
, target
, d
->op0
));
17841 /* For SVE we use REV[BHW] unspecs derived from the element size
17842 of d->vmode and vector modes whose elements have SIZE bytes.
17843 This ensures that the vector modes match the predicate modes. */
17844 int unspec
= aarch64_sve_rev_unspec (d
->vmode
);
17845 rtx pred
= aarch64_ptrue_reg (pred_mode
);
17846 emit_insn (gen_aarch64_pred (unspec
, int_mode
, target
, pred
,
17847 gen_lowpart (int_mode
, d
->op0
)));
17849 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
17852 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
17853 emit_set_insn (d
->target
, src
);
17857 /* Recognize patterns for the REV insn, which reverses elements within
a full vector. */
17861 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
17863 poly_uint64 nelt
= d
->perm
.length ();
17865 if (!d
->one_vector_p
|| d
->vec_flags
!= VEC_SVE_DATA
)
17868 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
17875 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
17876 emit_set_insn (d
->target
, src
);
17881 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
17883 rtx out
= d
->target
;
17886 machine_mode vmode
= d
->vmode
;
17889 if (d
->vec_flags
== VEC_SVE_PRED
17890 || d
->perm
.encoding ().encoded_nelts () != 1
17891 || !d
->perm
[0].is_constant (&elt
))
17894 if (d
->vec_flags
== VEC_SVE_DATA
&& elt
>= 64 * GET_MODE_UNIT_SIZE (vmode
))
17901 /* The generic preparation in aarch64_expand_vec_perm_const_1
17902 swaps the operand order and the permute indices if it finds
17903 d->perm[0] to be in the second operand. Thus, we can always
17904 use d->op0 and need not do any extra arithmetic to get the
17905 correct lane number. */
17907 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
17909 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
17910 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
17911 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
17916 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
17918 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
17919 machine_mode vmode
= d
->vmode
;
17921 /* Make sure that the indices are constant. */
17922 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
17923 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
17924 if (!d
->perm
[i
].is_constant ())
17930 /* Generic code will try constant permutation twice. Once with the
17931 original mode and again with the elements lowered to QImode.
17932 So wait and don't do the selector expansion ourselves. */
17933 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
17936 /* to_constant is safe since this routine is specific to Advanced SIMD
vectors. */
17938 unsigned int nelt
= d
->perm
.length ().to_constant ();
17939 for (unsigned int i
= 0; i
< nelt
; ++i
)
17940 /* If big-endian and two vectors we end up with a weird mixed-endian
17941 mode on NEON. Reverse the index within each word but not the word
17942 itself. to_constant is safe because we checked is_constant above. */
17943 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
17944 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
17945 : d
->perm
[i
].to_constant ());
17947 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
17948 sel
= force_reg (vmode
, sel
);
17950 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
17954 /* Try to implement D using an SVE TBL instruction. */
17957 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
17959 unsigned HOST_WIDE_INT nelt
;
17961 /* Permuting two variable-length vectors could overflow the
index range. */
17963 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
17969 machine_mode sel_mode
= mode_for_int_vector (d
->vmode
).require ();
17970 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
17971 if (d
->one_vector_p
)
17972 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
17974 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
17979 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
17981 /* The pattern matching functions above are written to look for a small
17982 number to begin the sequence (0, 1, N/2). If we begin with an index
17983 from the second operand, we can swap the operands. */
17984 poly_int64 nelt
= d
->perm
.length ();
17985 if (known_ge (d
->perm
[0], nelt
))
17987 d
->perm
.rotate_inputs (1);
17988 std::swap (d
->op0
, d
->op1
);
17991 if ((d
->vec_flags
== VEC_ADVSIMD
17992 || d
->vec_flags
== VEC_SVE_DATA
17993 || d
->vec_flags
== VEC_SVE_PRED
)
17994 && known_gt (nelt
, 1))
17996 if (aarch64_evpc_rev_local (d
))
17998 else if (aarch64_evpc_rev_global (d
))
18000 else if (aarch64_evpc_ext (d
))
18002 else if (aarch64_evpc_dup (d
))
18004 else if (aarch64_evpc_zip (d
))
18006 else if (aarch64_evpc_uzp (d
))
18008 else if (aarch64_evpc_trn (d
))
18010 if (d
->vec_flags
== VEC_SVE_DATA
)
18011 return aarch64_evpc_sve_tbl (d
);
18012 else if (d
->vec_flags
== VEC_ADVSIMD
)
18013 return aarch64_evpc_tbl (d
);
18018 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
18021 aarch64_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
18022 rtx op1
, const vec_perm_indices
&sel
)
18024 struct expand_vec_perm_d d
;
18026 /* Check whether the mask can be applied to a single vector. */
18027 if (sel
.ninputs () == 1
18028 || (op0
&& rtx_equal_p (op0
, op1
)))
18029 d
.one_vector_p
= true;
18030 else if (sel
.all_from_input_p (0))
18032 d
.one_vector_p
= true;
18035 else if (sel
.all_from_input_p (1))
18037 d
.one_vector_p
= true;
18041 d
.one_vector_p
= false;
18043 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
18044 sel
.nelts_per_input ());
18046 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
18050 d
.testing_p
= !target
;
18053 return aarch64_expand_vec_perm_const_1 (&d
);
18055 rtx_insn
*last
= get_last_insn ();
18056 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
18057 gcc_assert (last
== get_last_insn ());
18062 /* Generate a byte permute mask for a register of mode MODE,
18063 which has NUNITS units.  The mask is a 16-element V16QImode constant in
which the byte indices of each unit appear in reverse order; it is
returned forced into a register. */
18066 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
18068 /* We have to reverse each vector because we don't have
18069 a permuted load that can reverse-load according to ABI rules. */
18071 rtvec v
= rtvec_alloc (16);
18073 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
/* This routine is only valid on big-endian targets and for modes
accepted by AARCH64_VALID_SIMD_QREG_MODE. */
18075 gcc_assert (BYTES_BIG_ENDIAN
);
18076 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
/* Byte J of unit I selects source byte (I + 1) * USIZE - 1 - J, i.e. the
bytes inside every USIZE-byte unit are mirrored while the units keep
their order. */
18078 for (i
= 0; i
< nunits
; i
++)
18079 for (j
= 0; j
< usize
; j
++)
18080 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
18081 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
18082 return force_reg (V16QImode
, mask
);
18085 /* Expand an SVE integer comparison using the SVE equivalent of:
18087 (set TARGET (CODE OP0 OP1)). */
18090 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
18092 machine_mode pred_mode
= GET_MODE (target
);
18093 machine_mode data_mode
= GET_MODE (op0
);
18094 rtx res
= aarch64_sve_emit_int_cmp (target
, pred_mode
, code
, data_mode
,
18096 if (!rtx_equal_p (target
, res
))
18097 emit_move_insn (target
, res
);
18100 /* Return the UNSPEC_COND_* code for comparison CODE. */
18102 static unsigned int
18103 aarch64_unspec_cond_code (rtx_code code
)
18108 return UNSPEC_COND_FCMNE
;
18110 return UNSPEC_COND_FCMEQ
;
18112 return UNSPEC_COND_FCMLT
;
18114 return UNSPEC_COND_FCMGT
;
18116 return UNSPEC_COND_FCMLE
;
18118 return UNSPEC_COND_FCMGE
;
18120 return UNSPEC_COND_FCMUO
;
18122 gcc_unreachable ();
/* Emit the SVE equivalent of:
18128 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18130 where <X> is the operation associated with comparison CODE.
18131 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18134 aarch64_emit_sve_fp_cond (rtx target
, rtx_code code
, rtx pred
,
18135 bool known_ptrue_p
, rtx op0
, rtx op1
)
18137 rtx flag
= gen_int_mode (known_ptrue_p
, SImode
);
18138 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (pred
),
18139 gen_rtvec (4, pred
, flag
, op0
, op1
),
18140 aarch64_unspec_cond_code (code
));
18141 emit_set_insn (target
, unspec
);
18144 /* Emit the SVE equivalent of:
18146 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
18147 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
18148 (set TARGET (ior:PRED_MODE TMP1 TMP2))
18150 where <Xi> is the operation associated with comparison CODEi.
18151 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18154 aarch64_emit_sve_or_fp_conds (rtx target
, rtx_code code1
, rtx_code code2
,
18155 rtx pred
, bool known_ptrue_p
, rtx op0
, rtx op1
)
18157 machine_mode pred_mode
= GET_MODE (pred
);
18158 rtx tmp1
= gen_reg_rtx (pred_mode
);
18159 aarch64_emit_sve_fp_cond (tmp1
, code1
, pred
, known_ptrue_p
, op0
, op1
);
18160 rtx tmp2
= gen_reg_rtx (pred_mode
);
18161 aarch64_emit_sve_fp_cond (tmp2
, code2
, pred
, known_ptrue_p
, op0
, op1
);
18162 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
18165 /* Emit the SVE equivalent of:
18167 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18168 (set TARGET (not TMP))
18170 where <X> is the operation associated with comparison CODE.
18171 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18174 aarch64_emit_sve_invert_fp_cond (rtx target
, rtx_code code
, rtx pred
,
18175 bool known_ptrue_p
, rtx op0
, rtx op1
)
18177 machine_mode pred_mode
= GET_MODE (pred
);
18178 rtx tmp
= gen_reg_rtx (pred_mode
);
18179 aarch64_emit_sve_fp_cond (tmp
, code
, pred
, known_ptrue_p
, op0
, op1
);
18180 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
18183 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18185 (set TARGET (CODE OP0 OP1))
18187 If CAN_INVERT_P is true, the caller can also handle inverted results;
18188 return true if the result is in fact inverted. */
18191 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
18192 rtx op0
, rtx op1
, bool can_invert_p
)
18194 machine_mode pred_mode
= GET_MODE (target
);
18195 machine_mode data_mode
= GET_MODE (op0
);
18197 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
18201 /* UNORDERED has no immediate form. */
18202 op1
= force_reg (data_mode
, op1
);
18211 /* There is native support for the comparison. */
18212 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
18217 /* This is a trapping operation (LT or GT). */
18218 aarch64_emit_sve_or_fp_conds (target
, LT
, GT
, ptrue
, true, op0
, op1
);
18222 if (!flag_trapping_math
)
18224 /* This would trap for signaling NaNs. */
18225 op1
= force_reg (data_mode
, op1
);
18226 aarch64_emit_sve_or_fp_conds (target
, UNORDERED
, EQ
,
18227 ptrue
, true, op0
, op1
);
18235 if (flag_trapping_math
)
18237 /* Work out which elements are ordered. */
18238 rtx ordered
= gen_reg_rtx (pred_mode
);
18239 op1
= force_reg (data_mode
, op1
);
18240 aarch64_emit_sve_invert_fp_cond (ordered
, UNORDERED
,
18241 ptrue
, true, op0
, op1
);
18243 /* Test the opposite condition for the ordered elements,
18244 then invert the result. */
18248 code
= reverse_condition_maybe_unordered (code
);
18251 aarch64_emit_sve_fp_cond (target
, code
,
18252 ordered
, false, op0
, op1
);
18255 aarch64_emit_sve_invert_fp_cond (target
, code
,
18256 ordered
, false, op0
, op1
);
18262 /* ORDERED has no immediate form. */
18263 op1
= force_reg (data_mode
, op1
);
18267 gcc_unreachable ();
18270 /* There is native support for the inverse comparison. */
18271 code
= reverse_condition_maybe_unordered (code
);
18274 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
18277 aarch64_emit_sve_invert_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
18281 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18282 of the data being selected and CMP_MODE is the mode of the values being
compared. */
18286 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
18289 machine_mode pred_mode
18290 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode
),
18291 GET_MODE_SIZE (cmp_mode
)).require ();
18292 rtx pred
= gen_reg_rtx (pred_mode
);
18293 if (FLOAT_MODE_P (cmp_mode
))
18295 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
18296 ops
[4], ops
[5], true))
18297 std::swap (ops
[1], ops
[2]);
18300 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
18302 if (!aarch64_sve_reg_or_dup_imm (ops
[1], data_mode
))
18303 ops
[1] = force_reg (data_mode
, ops
[1]);
18304 /* The "false" value can only be zero if the "true" value is a constant. */
18305 if (register_operand (ops
[1], data_mode
)
18306 || !aarch64_simd_reg_or_zero (ops
[2], data_mode
))
18307 ops
[2] = force_reg (data_mode
, ops
[2]);
18309 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
18310 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
18313 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18314 true. However due to issues with register allocation it is preferable
18315 to avoid tying integer scalar and FP scalar modes. Executing integer
18316 operations in general registers is better than treating them as scalar
18317 vector operations. This reduces latency and avoids redundant int<->FP
18318 moves. So tie modes if they are either the same class, or vector modes
18319 with other vector modes, vector structs or any scalar mode. */
18322 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
18324 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
18327 /* We specifically want to allow elements of "structure" modes to
18328 be tieable to the structure. This more general condition allows
18329 other rarer situations too. The reason we don't extend this to
18330 predicate modes is that there are no predicate structure modes
18331 nor any specific instructions for extracting part of a predicate
register. */
18333 if (aarch64_vector_data_mode_p (mode1
)
18334 && aarch64_vector_data_mode_p (mode2
))
18337 /* Also allow any scalar modes with vectors. */
18338 if (aarch64_vector_mode_supported_p (mode1
)
18339 || aarch64_vector_mode_supported_p (mode2
))
18345 /* Return a new RTX holding the result of moving POINTER forward by
AMOUNT bytes. */
18349 aarch64_move_pointer (rtx pointer
, poly_int64 amount
)
18351 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
18353 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
18357 /* Return a new RTX holding the result of moving POINTER forward by the
18358 size of the mode it points to. */
18361 aarch64_progress_pointer (rtx pointer
)
18363 return aarch64_move_pointer (pointer
, GET_MODE_SIZE (GET_MODE (pointer
)));
18366 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
the size of MODE. */
18370 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
18373 rtx reg
= gen_reg_rtx (mode
);
18375 /* "Cast" the pointers to the correct mode. */
18376 *src
= adjust_address (*src
, mode
, 0);
18377 *dst
= adjust_address (*dst
, mode
, 0);
18378 /* Emit the memcpy. */
18379 emit_move_insn (reg
, *src
);
18380 emit_move_insn (*dst
, reg
);
18381 /* Move the pointers forward. */
18382 *src
= aarch64_progress_pointer (*src
);
18383 *dst
= aarch64_progress_pointer (*dst
);
18386 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18387 we succeed, otherwise return false. */
18390 aarch64_expand_cpymem (rtx
*operands
)
18393 rtx dst
= operands
[0];
18394 rtx src
= operands
[1];
18396 machine_mode cur_mode
= BLKmode
, next_mode
;
18397 bool speed_p
= !optimize_function_for_size_p (cfun
);
18399 /* When optimizing for size, give a better estimate of the length of a
18400 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18401 will always require an even number of instructions to do now. And each
18402 operation requires both a load+store, so divide the max number by 2. */
18403 int max_num_moves
= (speed_p
? 16 : AARCH64_CALL_RATIO
) / 2;
18405 /* We can't do anything smart if the amount to copy is not constant. */
18406 if (!CONST_INT_P (operands
[2]))
18409 n
= INTVAL (operands
[2]);
18411 /* Try to keep the number of instructions low. For all cases we will do at
18412 most two moves for the residual amount, since we'll always overlap the
18414 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_num_moves
)
18417 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
18418 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
18420 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
18421 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
18423 /* Convert n to bits to make the rest of the code simpler. */
18424 n
= n
* BITS_PER_UNIT
;
18426 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18427 larger than TImode, but we should not use them for loads/stores here. */
18428 const int copy_limit
= GET_MODE_BITSIZE (TImode
);
18432 /* Find the largest mode in which to do the copy without over reading
18434 opt_scalar_int_mode mode_iter
;
18435 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
18436 if (GET_MODE_BITSIZE (mode_iter
.require ()) <= MIN (n
, copy_limit
))
18437 cur_mode
= mode_iter
.require ();
18439 gcc_assert (cur_mode
!= BLKmode
);
18441 mode_bits
= GET_MODE_BITSIZE (cur_mode
).to_constant ();
18442 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, cur_mode
);
18446 /* Do certain trailing copies as overlapping if it's going to be
18447 cheaper, i.e. fewer instructions to do so. For instance doing a 15
18448 byte copy it's more efficient to do two overlapping 8 byte copies than
18450 if (n
> 0 && n
<= 8 * BITS_PER_UNIT
)
18452 next_mode
= smallest_mode_for_size (n
, MODE_INT
);
18453 int n_bits
= GET_MODE_BITSIZE (next_mode
).to_constant ();
18454 src
= aarch64_move_pointer (src
, (n
- n_bits
) / BITS_PER_UNIT
);
18455 dst
= aarch64_move_pointer (dst
, (n
- n_bits
) / BITS_PER_UNIT
);
18463 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18464 SImode stores. Handle the case when the constant has identical
18465 bottom and top halves. This is beneficial when the two stores can be
18466 merged into an STP and we avoid synthesising potentially expensive
18467 immediates twice. Return true if such a split is possible. */
18470 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
18472 rtx lo
= gen_lowpart (SImode
, src
);
18473 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
18475 bool size_p
= optimize_function_for_size_p (cfun
);
18477 if (!rtx_equal_p (lo
, hi
))
18480 unsigned int orig_cost
18481 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
18482 unsigned int lo_cost
18483 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
18485 /* We want to transform:
18487 MOVK x1, 0x140, lsl 16
18488 MOVK x1, 0xc0da, lsl 32
18489 MOVK x1, 0x140, lsl 48
18493 MOVK w1, 0x140, lsl 16
18495 So we want to perform this only when we save two instructions
18496 or more. When optimizing for size, however, accept any code size
18498 if (size_p
&& orig_cost
<= lo_cost
)
18502 && (orig_cost
<= lo_cost
+ 1))
18505 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
18506 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
18509 rtx tmp_reg
= gen_reg_rtx (SImode
);
18510 aarch64_expand_mov_immediate (tmp_reg
, lo
);
18511 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
18512 /* Don't emit an explicit store pair as this may not be always profitable.
18513 Let the sched-fusion logic decide whether to merge them. */
18514 emit_move_insn (mem_lo
, tmp_reg
);
18515 emit_move_insn (mem_hi
, tmp_reg
);
18520 /* Generate RTL for a conditional branch with rtx comparison CODE in
18521 mode CC_MODE. The destination of the unlikely conditional branch
18525 aarch64_gen_unlikely_cbranch (enum rtx_code code
, machine_mode cc_mode
,
18529 x
= gen_rtx_fmt_ee (code
, VOIDmode
,
18530 gen_rtx_REG (cc_mode
, CC_REGNUM
),
18533 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
18534 gen_rtx_LABEL_REF (VOIDmode
, label_ref
),
18536 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
18539 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18541 OP1 represents the TImode source operand 1
18542 OP2 represents the TImode source operand 2
18543 LOW_DEST represents the low half (DImode) of TImode operand 0
18544 LOW_IN1 represents the low half (DImode) of TImode operand 1
18545 LOW_IN2 represents the low half (DImode) of TImode operand 2
18546 HIGH_DEST represents the high half (DImode) of TImode operand 0
18547 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18548 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18551 aarch64_addti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
18552 rtx
*low_in1
, rtx
*low_in2
,
18553 rtx
*high_dest
, rtx
*high_in1
,
18556 *low_dest
= gen_reg_rtx (DImode
);
18557 *low_in1
= gen_lowpart (DImode
, op1
);
18558 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18559 subreg_lowpart_offset (DImode
, TImode
));
18560 *high_dest
= gen_reg_rtx (DImode
);
18561 *high_in1
= gen_highpart (DImode
, op1
);
18562 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18563 subreg_highpart_offset (DImode
, TImode
));
18566 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18568 This function differs from 'aarch64_addti_scratch_regs' in that
18569 OP1 can be an immediate constant (zero). We must call
18570 subreg_highpart_offset with DImode and TImode arguments, otherwise
18571 VOIDmode will be used for the const_int which generates an internal
18572 error from subreg_size_highpart_offset which does not expect a size of zero.
18574 OP1 represents the TImode source operand 1
18575 OP2 represents the TImode source operand 2
18576 LOW_DEST represents the low half (DImode) of TImode operand 0
18577 LOW_IN1 represents the low half (DImode) of TImode operand 1
18578 LOW_IN2 represents the low half (DImode) of TImode operand 2
18579 HIGH_DEST represents the high half (DImode) of TImode operand 0
18580 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18581 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18585 aarch64_subvti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
18586 rtx
*low_in1
, rtx
*low_in2
,
18587 rtx
*high_dest
, rtx
*high_in1
,
18590 *low_dest
= gen_reg_rtx (DImode
);
18591 *low_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
18592 subreg_lowpart_offset (DImode
, TImode
));
18594 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18595 subreg_lowpart_offset (DImode
, TImode
));
18596 *high_dest
= gen_reg_rtx (DImode
);
18598 *high_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
18599 subreg_highpart_offset (DImode
, TImode
));
18600 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18601 subreg_highpart_offset (DImode
, TImode
));
18604 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18606 OP0 represents the TImode destination operand 0
18607 LOW_DEST represents the low half (DImode) of TImode operand 0
18608 LOW_IN1 represents the low half (DImode) of TImode operand 1
18609 LOW_IN2 represents the low half (DImode) of TImode operand 2
18610 HIGH_DEST represents the high half (DImode) of TImode operand 0
18611 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18612 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18613 UNSIGNED_P is true if the operation is being performed on unsigned
18616 aarch64_expand_subvti (rtx op0
, rtx low_dest
, rtx low_in1
,
18617 rtx low_in2
, rtx high_dest
, rtx high_in1
,
18618 rtx high_in2
, bool unsigned_p
)
18620 if (low_in2
== const0_rtx
)
18622 low_dest
= low_in1
;
18623 high_in2
= force_reg (DImode
, high_in2
);
18625 emit_insn (gen_subdi3_compare1 (high_dest
, high_in1
, high_in2
));
18627 emit_insn (gen_subvdi_insn (high_dest
, high_in1
, high_in2
));
18631 if (CONST_INT_P (low_in2
))
18633 high_in2
= force_reg (DImode
, high_in2
);
18634 emit_insn (gen_subdi3_compare1_imm (low_dest
, low_in1
, low_in2
,
18635 GEN_INT (-INTVAL (low_in2
))));
18638 emit_insn (gen_subdi3_compare1 (low_dest
, low_in1
, low_in2
));
18641 emit_insn (gen_usubdi3_carryinC (high_dest
, high_in1
, high_in2
));
18643 emit_insn (gen_subdi3_carryinV (high_dest
, high_in1
, high_in2
));
18646 emit_move_insn (gen_lowpart (DImode
, op0
), low_dest
);
18647 emit_move_insn (gen_highpart (DImode
, op0
), high_dest
);
18651 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18653 static unsigned HOST_WIDE_INT
18654 aarch64_asan_shadow_offset (void)
18657 return (HOST_WIDE_INT_1
<< 29);
18659 return (HOST_WIDE_INT_1
<< 36);
18663 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
18664 int code
, tree treeop0
, tree treeop1
)
18666 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
18668 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
18670 struct expand_operand ops
[4];
18673 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
18675 op_mode
= GET_MODE (op0
);
18676 if (op_mode
== VOIDmode
)
18677 op_mode
= GET_MODE (op1
);
18685 icode
= CODE_FOR_cmpsi
;
18690 icode
= CODE_FOR_cmpdi
;
18695 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
18696 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
18701 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
18702 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
18710 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
18711 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
18717 *prep_seq
= get_insns ();
18720 create_fixed_operand (&ops
[0], op0
);
18721 create_fixed_operand (&ops
[1], op1
);
18724 if (!maybe_expand_insn (icode
, 2, ops
))
18729 *gen_seq
= get_insns ();
18732 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
18733 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
18737 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
18738 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
18740 rtx op0
, op1
, target
;
18741 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
18742 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
18744 struct expand_operand ops
[6];
18747 push_to_sequence (*prep_seq
);
18748 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
18750 op_mode
= GET_MODE (op0
);
18751 if (op_mode
== VOIDmode
)
18752 op_mode
= GET_MODE (op1
);
18760 icode
= CODE_FOR_ccmpsi
;
18765 icode
= CODE_FOR_ccmpdi
;
18770 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
18771 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
18776 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
18777 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
18785 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
18786 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
18792 *prep_seq
= get_insns ();
18795 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
18796 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
18798 if (bit_code
!= AND
)
18800 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
18801 GET_MODE (XEXP (prev
, 0))),
18802 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
18803 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
18806 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
18807 create_fixed_operand (&ops
[1], target
);
18808 create_fixed_operand (&ops
[2], op0
);
18809 create_fixed_operand (&ops
[3], op1
);
18810 create_fixed_operand (&ops
[4], prev
);
18811 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
18813 push_to_sequence (*gen_seq
);
18814 if (!maybe_expand_insn (icode
, 6, ops
))
18820 *gen_seq
= get_insns ();
18823 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
18826 #undef TARGET_GEN_CCMP_FIRST
18827 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18829 #undef TARGET_GEN_CCMP_NEXT
18830 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18832 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18833 instruction fusion of some sort. */
18836 aarch64_macro_fusion_p (void)
18838 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
18842 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18843 should be kept together during scheduling. */
18846 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
18849 rtx prev_set
= single_set (prev
);
18850 rtx curr_set
= single_set (curr
);
18851 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
18852 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
18854 if (!aarch64_macro_fusion_p ())
18857 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
18859 /* We are trying to match:
18860 prev (mov) == (set (reg r0) (const_int imm16))
18861 curr (movk) == (set (zero_extract (reg r0)
18864 (const_int imm16_1)) */
18866 set_dest
= SET_DEST (curr_set
);
18868 if (GET_CODE (set_dest
) == ZERO_EXTRACT
18869 && CONST_INT_P (SET_SRC (curr_set
))
18870 && CONST_INT_P (SET_SRC (prev_set
))
18871 && CONST_INT_P (XEXP (set_dest
, 2))
18872 && INTVAL (XEXP (set_dest
, 2)) == 16
18873 && REG_P (XEXP (set_dest
, 0))
18874 && REG_P (SET_DEST (prev_set
))
18875 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
18881 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
18884 /* We're trying to match:
18885 prev (adrp) == (set (reg r1)
18886 (high (symbol_ref ("SYM"))))
18887 curr (add) == (set (reg r0)
18889 (symbol_ref ("SYM"))))
18890 Note that r0 need not necessarily be the same as r1, especially
18891 during pre-regalloc scheduling. */
18893 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
18894 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
18896 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
18897 && REG_P (XEXP (SET_SRC (curr_set
), 0))
18898 && REGNO (XEXP (SET_SRC (curr_set
), 0))
18899 == REGNO (SET_DEST (prev_set
))
18900 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
18901 XEXP (SET_SRC (curr_set
), 1)))
18906 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
18909 /* We're trying to match:
18910 prev (movk) == (set (zero_extract (reg r0)
18913 (const_int imm16_1))
18914 curr (movk) == (set (zero_extract (reg r0)
18917 (const_int imm16_2)) */
18919 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
18920 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
18921 && REG_P (XEXP (SET_DEST (prev_set
), 0))
18922 && REG_P (XEXP (SET_DEST (curr_set
), 0))
18923 && REGNO (XEXP (SET_DEST (prev_set
), 0))
18924 == REGNO (XEXP (SET_DEST (curr_set
), 0))
18925 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
18926 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
18927 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
18928 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
18929 && CONST_INT_P (SET_SRC (prev_set
))
18930 && CONST_INT_P (SET_SRC (curr_set
)))
18934 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
18936 /* We're trying to match:
18937 prev (adrp) == (set (reg r0)
18938 (high (symbol_ref ("SYM"))))
18939 curr (ldr) == (set (reg r1)
18940 (mem (lo_sum (reg r0)
18941 (symbol_ref ("SYM")))))
18943 curr (ldr) == (set (reg r1)
18946 (symbol_ref ("SYM")))))) */
18947 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
18948 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
18950 rtx curr_src
= SET_SRC (curr_set
);
18952 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
18953 curr_src
= XEXP (curr_src
, 0);
18955 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
18956 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
18957 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
18958 == REGNO (SET_DEST (prev_set
))
18959 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
18960 XEXP (SET_SRC (prev_set
), 0)))
18965 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
18966 && any_condjump_p (curr
))
18968 unsigned int condreg1
, condreg2
;
18970 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
18971 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
18973 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
18975 && modified_in_p (cc_reg_1
, prev
))
18977 enum attr_type prev_type
= get_attr_type (prev
);
18979 /* FIXME: this misses some which are considered simple arithmetic
18980 instructions for ThunderX. Simple shifts are missed here. */
18981 if (prev_type
== TYPE_ALUS_SREG
18982 || prev_type
== TYPE_ALUS_IMM
18983 || prev_type
== TYPE_LOGICS_REG
18984 || prev_type
== TYPE_LOGICS_IMM
)
18991 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
18992 && any_condjump_p (curr
))
18994 /* We're trying to match:
18995 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18996 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18998 (label_ref ("SYM"))
19000 if (SET_DEST (curr_set
) == (pc_rtx
)
19001 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
19002 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
19003 && REG_P (SET_DEST (prev_set
))
19004 && REGNO (SET_DEST (prev_set
))
19005 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
19007 /* Fuse ALU operations followed by conditional branch instruction. */
19008 switch (get_attr_type (prev
))
19011 case TYPE_ALU_SREG
:
19014 case TYPE_ADCS_REG
:
19015 case TYPE_ADCS_IMM
:
19016 case TYPE_LOGIC_REG
:
19017 case TYPE_LOGIC_IMM
:
19021 case TYPE_SHIFT_REG
:
19022 case TYPE_SHIFT_IMM
:
19037 /* Return true iff the instruction fusion described by OP is enabled. */
19040 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op
)
19042 return (aarch64_tune_params
.fusible_ops
& op
) != 0;
19045 /* If MEM is in the form of [base+offset], extract the two parts
19046 of address and set to BASE and OFFSET, otherwise return false
19047 after clearing BASE and OFFSET. */
19050 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
19054 gcc_assert (MEM_P (mem
));
19056 addr
= XEXP (mem
, 0);
19061 *offset
= const0_rtx
;
19065 if (GET_CODE (addr
) == PLUS
19066 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
19068 *base
= XEXP (addr
, 0);
19069 *offset
= XEXP (addr
, 1);
19074 *offset
= NULL_RTX
;
19079 /* Types for scheduling fusion. */
19080 enum sched_fusion_type
19082 SCHED_FUSION_NONE
= 0,
19083 SCHED_FUSION_LD_SIGN_EXTEND
,
19084 SCHED_FUSION_LD_ZERO_EXTEND
,
19090 /* If INSN is a load or store of address in the form of [base+offset],
19091 extract the two parts and set to BASE and OFFSET. Return scheduling
19092 fusion type this INSN is. */
19094 static enum sched_fusion_type
19095 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
19098 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
19100 gcc_assert (INSN_P (insn
));
19101 x
= PATTERN (insn
);
19102 if (GET_CODE (x
) != SET
)
19103 return SCHED_FUSION_NONE
;
19106 dest
= SET_DEST (x
);
19108 machine_mode dest_mode
= GET_MODE (dest
);
19110 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
19111 return SCHED_FUSION_NONE
;
19113 if (GET_CODE (src
) == SIGN_EXTEND
)
19115 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
19116 src
= XEXP (src
, 0);
19117 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
19118 return SCHED_FUSION_NONE
;
19120 else if (GET_CODE (src
) == ZERO_EXTEND
)
19122 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
19123 src
= XEXP (src
, 0);
19124 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
19125 return SCHED_FUSION_NONE
;
19128 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
19129 extract_base_offset_in_addr (src
, base
, offset
);
19130 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
19132 fusion
= SCHED_FUSION_ST
;
19133 extract_base_offset_in_addr (dest
, base
, offset
);
19136 return SCHED_FUSION_NONE
;
19138 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
19139 fusion
= SCHED_FUSION_NONE
;
19144 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
19146 Currently we only support fusing ldr or str instructions, so FUSION_PRI
19147 and PRI are only calculated for these instructions. For other instructions,
19148 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
19149 type instruction fusion can be added by returning different priorities.
19151 It's important that irrelevant instructions get the largest FUSION_PRI. */
19154 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
19155 int *fusion_pri
, int *pri
)
19159 enum sched_fusion_type fusion
;
19161 gcc_assert (INSN_P (insn
));
19164 fusion
= fusion_load_store (insn
, &base
, &offset
);
19165 if (fusion
== SCHED_FUSION_NONE
)
19172 /* Set FUSION_PRI according to fusion type and base register. */
19173 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
19175 /* Calculate PRI. */
19178 /* INSN with smaller offset goes first. */
19179 off_val
= (int)(INTVAL (offset
));
19181 tmp
-= (off_val
& 0xfffff);
19183 tmp
+= ((- off_val
) & 0xfffff);
19189 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19190 Adjust priority of sha1h instructions so they are scheduled before
19191 other SHA1 instructions. */
19194 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
19196 rtx x
= PATTERN (insn
);
19198 if (GET_CODE (x
) == SET
)
19202 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
19203 return priority
+ 10;
19209 /* Given OPERANDS of consecutive load/store, check if we can merge
19210 them into ldp/stp. LOAD is true if they are load instructions.
19211 MODE is the mode of memory operands. */
19214 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
19217 HOST_WIDE_INT offval_1
, offval_2
, msize
;
19218 enum reg_class rclass_1
, rclass_2
;
19219 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
19223 mem_1
= operands
[1];
19224 mem_2
= operands
[3];
19225 reg_1
= operands
[0];
19226 reg_2
= operands
[2];
19227 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
19228 if (REGNO (reg_1
) == REGNO (reg_2
))
19233 mem_1
= operands
[0];
19234 mem_2
= operands
[2];
19235 reg_1
= operands
[1];
19236 reg_2
= operands
[3];
19239 /* The mems cannot be volatile. */
19240 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
19243 /* If we have SImode and slow unaligned ldp,
19244 check the alignment to be at least 8 byte. */
19246 && (aarch64_tune_params
.extra_tuning_flags
19247 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
19249 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
19252 /* Check if the addresses are in the form of [base+offset]. */
19253 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
19254 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
19256 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
19257 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
19260 /* Check if the bases are same. */
19261 if (!rtx_equal_p (base_1
, base_2
))
19264 /* The operands must be of the same size. */
19265 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
19266 GET_MODE_SIZE (GET_MODE (mem_2
))));
19268 offval_1
= INTVAL (offset_1
);
19269 offval_2
= INTVAL (offset_2
);
19270 /* We should only be trying this for fixed-sized modes. There is no
19271 SVE LDP/STP instruction. */
19272 msize
= GET_MODE_SIZE (mode
).to_constant ();
19273 /* Check if the offsets are consecutive. */
19274 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
19277 /* Check if the addresses are clobbered by load. */
19280 if (reg_mentioned_p (reg_1
, mem_1
))
19283 /* In increasing order, the last load can clobber the address. */
19284 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
19288 /* One of the memory accesses must be a mempair operand.
19289 If it is not the first one, they need to be swapped by the
19291 if (!aarch64_mem_pair_operand (mem_1
, GET_MODE (mem_1
))
19292 && !aarch64_mem_pair_operand (mem_2
, GET_MODE (mem_2
)))
19295 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
19296 rclass_1
= FP_REGS
;
19298 rclass_1
= GENERAL_REGS
;
19300 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
19301 rclass_2
= FP_REGS
;
19303 rclass_2
= GENERAL_REGS
;
19305 /* Check if the registers are of same class. */
19306 if (rclass_1
!= rclass_2
)
19312 /* Given OPERANDS of consecutive load/store that can be merged,
19313 swap them if they are not in ascending order. */
19315 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
19317 rtx mem_1
, mem_2
, base_1
, base_2
, offset_1
, offset_2
;
19318 HOST_WIDE_INT offval_1
, offval_2
;
19322 mem_1
= operands
[1];
19323 mem_2
= operands
[3];
19327 mem_1
= operands
[0];
19328 mem_2
= operands
[2];
19331 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
19332 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
19334 offval_1
= INTVAL (offset_1
);
19335 offval_2
= INTVAL (offset_2
);
19337 if (offval_1
> offval_2
)
19339 /* Irrespective of whether this is a load or a store,
19340 we do the same swap. */
19341 std::swap (operands
[0], operands
[2]);
19342 std::swap (operands
[1], operands
[3]);
19346 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
19347 comparison between the two. */
19349 aarch64_host_wide_int_compare (const void *x
, const void *y
)
19351 return wi::cmps (* ((const HOST_WIDE_INT
*) x
),
19352 * ((const HOST_WIDE_INT
*) y
));
19355 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
19356 other pointing to a REG rtx containing an offset, compare the offsets
19361 1 iff offset (X) > offset (Y)
19362 0 iff offset (X) == offset (Y)
19363 -1 iff offset (X) < offset (Y) */
19365 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
19367 const rtx
* operands_1
= (const rtx
*) x
;
19368 const rtx
* operands_2
= (const rtx
*) y
;
19369 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
19371 if (MEM_P (operands_1
[0]))
19372 mem_1
= operands_1
[0];
19374 mem_1
= operands_1
[1];
19376 if (MEM_P (operands_2
[0]))
19377 mem_2
= operands_2
[0];
19379 mem_2
= operands_2
[1];
19381 /* Extract the offsets. */
19382 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
19383 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
19385 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
19387 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
19390 /* Given OPERANDS of consecutive load/store, check if we can merge
19391 them into ldp/stp by adjusting the offset. LOAD is true if they
19392 are load instructions. MODE is the mode of memory operands.
19394 Given below consecutive stores:
19396 str w1, [xb, 0x100]
19397 str w1, [xb, 0x104]
19398 str w1, [xb, 0x108]
19399 str w1, [xb, 0x10c]
19401 Though the offsets are out of the range supported by stp, we can
19402 still pair them after adjusting the offset, like:
19404 add scratch, xb, 0x100
19405 stp w1, w1, [scratch]
19406 stp w1, w1, [scratch, 0x8]
19408 The peephole patterns detecting this opportunity should guarantee
19409 the scratch register is available. */
19412 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
19415 const int num_insns
= 4;
19416 enum reg_class rclass
;
19417 HOST_WIDE_INT offvals
[num_insns
], msize
;
19418 rtx mem
[num_insns
], reg
[num_insns
], base
[num_insns
], offset
[num_insns
];
19422 for (int i
= 0; i
< num_insns
; i
++)
19424 reg
[i
] = operands
[2 * i
];
19425 mem
[i
] = operands
[2 * i
+ 1];
19427 gcc_assert (REG_P (reg
[i
]));
19430 /* Do not attempt to merge the loads if the loads clobber each other. */
19431 for (int i
= 0; i
< 8; i
+= 2)
19432 for (int j
= i
+ 2; j
< 8; j
+= 2)
19433 if (reg_overlap_mentioned_p (operands
[i
], operands
[j
]))
19437 for (int i
= 0; i
< num_insns
; i
++)
19439 mem
[i
] = operands
[2 * i
];
19440 reg
[i
] = operands
[2 * i
+ 1];
19443 /* Skip if memory operand is by itself valid for ldp/stp. */
19444 if (!MEM_P (mem
[0]) || aarch64_mem_pair_operand (mem
[0], mode
))
19447 for (int i
= 0; i
< num_insns
; i
++)
19449 /* The mems cannot be volatile. */
19450 if (MEM_VOLATILE_P (mem
[i
]))
19453 /* Check if the addresses are in the form of [base+offset]. */
19454 extract_base_offset_in_addr (mem
[i
], base
+ i
, offset
+ i
);
19455 if (base
[i
] == NULL_RTX
|| offset
[i
] == NULL_RTX
)
19459 /* Check if the registers are of same class. */
19460 rclass
= REG_P (reg
[0]) && FP_REGNUM_P (REGNO (reg
[0]))
19461 ? FP_REGS
: GENERAL_REGS
;
19463 for (int i
= 1; i
< num_insns
; i
++)
19464 if (REG_P (reg
[i
]) && FP_REGNUM_P (REGNO (reg
[i
])))
19466 if (rclass
!= FP_REGS
)
19471 if (rclass
!= GENERAL_REGS
)
19475 /* Only the last register in the order in which they occur
19476 may be clobbered by the load. */
19477 if (rclass
== GENERAL_REGS
&& load
)
19478 for (int i
= 0; i
< num_insns
- 1; i
++)
19479 if (reg_mentioned_p (reg
[i
], mem
[i
]))
19482 /* Check if the bases are same. */
19483 for (int i
= 0; i
< num_insns
- 1; i
++)
19484 if (!rtx_equal_p (base
[i
], base
[i
+ 1]))
19487 for (int i
= 0; i
< num_insns
; i
++)
19488 offvals
[i
] = INTVAL (offset
[i
]);
19490 msize
= GET_MODE_SIZE (mode
);
19492 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19493 qsort (offvals
, num_insns
, sizeof (HOST_WIDE_INT
),
19494 aarch64_host_wide_int_compare
);
19496 if (!(offvals
[1] == offvals
[0] + msize
19497 && offvals
[3] == offvals
[2] + msize
))
19500 /* Check that offsets are within range of each other. The ldp/stp
19501 instructions have 7 bit immediate offsets, so use 0x80. */
19502 if (offvals
[2] - offvals
[0] >= msize
* 0x80)
19505 /* The offsets must be aligned with respect to each other. */
19506 if (offvals
[0] % msize
!= offvals
[2] % msize
)
19509 /* If we have SImode and slow unaligned ldp,
19510 check the alignment to be at least 8 byte. */
19512 && (aarch64_tune_params
.extra_tuning_flags
19513 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
19515 && MEM_ALIGN (mem
[0]) < 8 * BITS_PER_UNIT
)
19521 /* Given OPERANDS of consecutive load/store, this function pairs them
19522 into LDP/STP after adjusting the offset. It depends on the fact
19523 that the operands can be sorted so the offsets are correct for STP.
19524 MODE is the mode of memory operands. CODE is the rtl operator
19525 which should be applied to all memory operands, it's SIGN_EXTEND,
19526 ZERO_EXTEND or UNKNOWN. */
19529 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
19530 scalar_mode mode
, RTX_CODE code
)
19532 rtx base
, offset_1
, offset_3
, t1
, t2
;
19533 rtx mem_1
, mem_2
, mem_3
, mem_4
;
19534 rtx temp_operands
[8];
19535 HOST_WIDE_INT off_val_1
, off_val_3
, base_off
, new_off_1
, new_off_3
,
19536 stp_off_upper_limit
, stp_off_lower_limit
, msize
;
19538 /* We make changes on a copy as we may still bail out. */
19539 for (int i
= 0; i
< 8; i
++)
19540 temp_operands
[i
] = operands
[i
];
19542 /* Sort the operands. */
19543 qsort (temp_operands
, 4, 2 * sizeof (rtx
*), aarch64_ldrstr_offset_compare
);
19545 /* Copy the memory operands so that if we have to bail for some
19546 reason the original addresses are unchanged. */
19549 mem_1
= copy_rtx (temp_operands
[1]);
19550 mem_2
= copy_rtx (temp_operands
[3]);
19551 mem_3
= copy_rtx (temp_operands
[5]);
19552 mem_4
= copy_rtx (temp_operands
[7]);
19556 mem_1
= copy_rtx (temp_operands
[0]);
19557 mem_2
= copy_rtx (temp_operands
[2]);
19558 mem_3
= copy_rtx (temp_operands
[4]);
19559 mem_4
= copy_rtx (temp_operands
[6]);
19560 gcc_assert (code
== UNKNOWN
);
19563 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
19564 extract_base_offset_in_addr (mem_3
, &base
, &offset_3
);
19565 gcc_assert (base
!= NULL_RTX
&& offset_1
!= NULL_RTX
19566 && offset_3
!= NULL_RTX
);
19568 /* Adjust offset so it can fit in LDP/STP instruction. */
19569 msize
= GET_MODE_SIZE (mode
);
19570 stp_off_upper_limit
= msize
* (0x40 - 1);
19571 stp_off_lower_limit
= - msize
* 0x40;
19573 off_val_1
= INTVAL (offset_1
);
19574 off_val_3
= INTVAL (offset_3
);
19576 /* The base offset is optimally half way between the two STP/LDP offsets. */
19578 base_off
= (off_val_1
+ off_val_3
) / 2;
19580 /* However, due to issues with negative LDP/STP offset generation for
19581 larger modes, for DF, DI and vector modes. we must not use negative
19582 addresses smaller than 9 signed unadjusted bits can store. This
19583 provides the most range in this case. */
19584 base_off
= off_val_1
;
19586 /* Adjust the base so that it is aligned with the addresses but still
19588 if (base_off
% msize
!= off_val_1
% msize
)
19589 /* Fix the offset, bearing in mind we want to make it bigger not
19591 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19592 else if (msize
<= 4)
19593 /* The negative range of LDP/STP is one larger than the positive range. */
19596 /* Check if base offset is too big or too small. We can attempt to resolve
19597 this issue by setting it to the maximum value and seeing if the offsets
19599 if (base_off
>= 0x1000)
19601 base_off
= 0x1000 - 1;
19602 /* We must still make sure that the base offset is aligned with respect
19603 to the address. But it may may not be made any bigger. */
19604 base_off
-= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19607 /* Likewise for the case where the base is too small. */
19608 if (base_off
<= -0x1000)
19610 base_off
= -0x1000 + 1;
19611 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19614 /* Offset of the first STP/LDP. */
19615 new_off_1
= off_val_1
- base_off
;
19617 /* Offset of the second STP/LDP. */
19618 new_off_3
= off_val_3
- base_off
;
19620 /* The offsets must be within the range of the LDP/STP instructions. */
19621 if (new_off_1
> stp_off_upper_limit
|| new_off_1
< stp_off_lower_limit
19622 || new_off_3
> stp_off_upper_limit
|| new_off_3
< stp_off_lower_limit
)
19625 replace_equiv_address_nv (mem_1
, plus_constant (Pmode
, operands
[8],
19627 replace_equiv_address_nv (mem_2
, plus_constant (Pmode
, operands
[8],
19628 new_off_1
+ msize
), true);
19629 replace_equiv_address_nv (mem_3
, plus_constant (Pmode
, operands
[8],
19631 replace_equiv_address_nv (mem_4
, plus_constant (Pmode
, operands
[8],
19632 new_off_3
+ msize
), true);
19634 if (!aarch64_mem_pair_operand (mem_1
, mode
)
19635 || !aarch64_mem_pair_operand (mem_3
, mode
))
19638 if (code
== ZERO_EXTEND
)
19640 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
19641 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
19642 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
19643 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
19645 else if (code
== SIGN_EXTEND
)
19647 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
19648 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
19649 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
19650 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
19655 operands
[0] = temp_operands
[0];
19656 operands
[1] = mem_1
;
19657 operands
[2] = temp_operands
[2];
19658 operands
[3] = mem_2
;
19659 operands
[4] = temp_operands
[4];
19660 operands
[5] = mem_3
;
19661 operands
[6] = temp_operands
[6];
19662 operands
[7] = mem_4
;
19666 operands
[0] = mem_1
;
19667 operands
[1] = temp_operands
[1];
19668 operands
[2] = mem_2
;
19669 operands
[3] = temp_operands
[3];
19670 operands
[4] = mem_3
;
19671 operands
[5] = temp_operands
[5];
19672 operands
[6] = mem_4
;
19673 operands
[7] = temp_operands
[7];
19676 /* Emit adjusting instruction. */
19677 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, base_off
)));
19678 /* Emit ldp/stp instructions. */
19679 t1
= gen_rtx_SET (operands
[0], operands
[1]);
19680 t2
= gen_rtx_SET (operands
[2], operands
[3]);
19681 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
19682 t1
= gen_rtx_SET (operands
[4], operands
[5]);
19683 t2
= gen_rtx_SET (operands
[6], operands
[7]);
19684 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
   it isn't worth branching around empty masked ops (including masked
   stores).  The unnamed argument is ignored.  */

static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}

19698 /* Return 1 if pseudo register should be created and used to hold
19699 GOT address for PIC code. */
19702 aarch64_use_pseudo_pic_reg (void)
19704 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
;
19707 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19710 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
19712 switch (XINT (x
, 1))
19714 case UNSPEC_GOTSMALLPIC
:
19715 case UNSPEC_GOTSMALLPIC28K
:
19716 case UNSPEC_GOTTINYPIC
:
19722 return default_unspec_may_trap_p (x
, flags
);
19726 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
19727 return the log2 of that value. Otherwise return -1. */
19730 aarch64_fpconst_pow_of_2 (rtx x
)
19732 const REAL_VALUE_TYPE
*r
;
19734 if (!CONST_DOUBLE_P (x
))
19737 r
= CONST_DOUBLE_REAL_VALUE (x
);
19739 if (REAL_VALUE_NEGATIVE (*r
)
19740 || REAL_VALUE_ISNAN (*r
)
19741 || REAL_VALUE_ISINF (*r
)
19742 || !real_isinteger (r
, DFmode
))
19745 return exact_log2 (real_to_integer (r
));
19748 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
19749 power of 2 (i.e 1/2^n) return the number of float bits. e.g. for x==(1/2^n)
19750 return n. Otherwise return -1. */
19753 aarch64_fpconst_pow2_recip (rtx x
)
19755 REAL_VALUE_TYPE r0
;
19757 if (!CONST_DOUBLE_P (x
))
19760 r0
= *CONST_DOUBLE_REAL_VALUE (x
);
19761 if (exact_real_inverse (DFmode
, &r0
)
19762 && !REAL_VALUE_NEGATIVE (r0
))
19764 int ret
= exact_log2 (real_to_integer (&r0
));
19765 if (ret
>= 1 && ret
<= 32)
19771 /* If X is a vector of equal CONST_DOUBLE values and that value is
19772 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19775 aarch64_vec_fpconst_pow_of_2 (rtx x
)
19778 if (GET_CODE (x
) != CONST_VECTOR
19779 || !CONST_VECTOR_NUNITS (x
).is_constant (&nelts
))
19782 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
19785 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
19789 for (int i
= 1; i
< nelts
; i
++)
19790 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
19796 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19799 __fp16 always promotes through this hook.
19800 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19801 through the generic excess precision logic rather than here. */
19804 aarch64_promoted_type (const_tree t
)
19806 if (SCALAR_FLOAT_TYPE_P (t
)
19807 && TYPE_MAIN_VARIANT (t
) == aarch64_fp16_type_node
)
19808 return float_type_node
;
19813 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19816 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
19817 optimization_type opt_type
)
19822 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
19829 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19831 static unsigned int
19832 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
19835 /* Polynomial invariant 1 == (VG / 2) - 1. */
19836 gcc_assert (i
== 1);
19839 return AARCH64_DWARF_VG
;
19842 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
19843 if MODE is HFmode, and punt to the generic implementation otherwise. */
19846 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
19848 return (mode
== HFmode
19850 : default_libgcc_floating_mode_supported_p (mode
));
19853 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19854 if MODE is HFmode, and punt to the generic implementation otherwise. */
19857 aarch64_scalar_mode_supported_p (scalar_mode mode
)
19859 return (mode
== HFmode
19861 : default_scalar_mode_supported_p (mode
));
19864 /* Set the value of FLT_EVAL_METHOD.
19865 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19867 0: evaluate all operations and constants, whose semantic type has at
19868 most the range and precision of type float, to the range and
19869 precision of float; evaluate all other operations and constants to
19870 the range and precision of the semantic type;
19872 N, where _FloatN is a supported interchange floating type
19873 evaluate all operations and constants, whose semantic type has at
19874 most the range and precision of _FloatN type, to the range and
19875 precision of the _FloatN type; evaluate all other operations and
19876 constants to the range and precision of the semantic type;
19878 If we have the ARMv8.2-A extensions then we support _Float16 in native
19879 precision, so we should set this to 16. Otherwise, we support the type,
19880 but want to evaluate expressions in float precision, so set this to
19883 static enum flt_eval_method
19884 aarch64_excess_precision (enum excess_precision_type type
)
19888 case EXCESS_PRECISION_TYPE_FAST
:
19889 case EXCESS_PRECISION_TYPE_STANDARD
:
19890 /* We can calculate either in 16-bit range and precision or
19891 32-bit range and precision. Make that decision based on whether
19892 we have native support for the ARMv8.2-A 16-bit floating-point
19893 instructions or not. */
19894 return (TARGET_FP_F16INST
19895 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19896 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
19897 case EXCESS_PRECISION_TYPE_IMPLICIT
:
19898 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
19900 gcc_unreachable ();
19902 return FLT_EVAL_METHOD_UNPREDICTABLE
;
19905 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19906 scheduled for speculative execution. Reject the long-running division
19907 and square-root instructions. */
19910 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
19912 switch (get_attr_type (insn
))
19920 case TYPE_NEON_FP_SQRT_S
:
19921 case TYPE_NEON_FP_SQRT_D
:
19922 case TYPE_NEON_FP_SQRT_S_Q
:
19923 case TYPE_NEON_FP_SQRT_D_Q
:
19924 case TYPE_NEON_FP_DIV_S
:
19925 case TYPE_NEON_FP_DIV_D
:
19926 case TYPE_NEON_FP_DIV_S_Q
:
19927 case TYPE_NEON_FP_DIV_D_Q
:
19934 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19937 aarch64_compute_pressure_classes (reg_class
*classes
)
19940 classes
[i
++] = GENERAL_REGS
;
19941 classes
[i
++] = FP_REGS
;
19942 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19943 registers need to go in PR_LO_REGS at some point during their
19944 lifetime. Splitting it into two halves has the effect of making
19945 all predicates count against PR_LO_REGS, so that we try whenever
19946 possible to restrict the number of live predicates to 8. This
19947 greatly reduces the amount of spilling in certain loops. */
19948 classes
[i
++] = PR_LO_REGS
;
19949 classes
[i
++] = PR_HI_REGS
;
19953 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19956 aarch64_can_change_mode_class (machine_mode from
,
19957 machine_mode to
, reg_class_t
)
19959 if (BYTES_BIG_ENDIAN
)
19961 bool from_sve_p
= aarch64_sve_data_mode_p (from
);
19962 bool to_sve_p
= aarch64_sve_data_mode_p (to
);
19964 /* Don't allow changes between SVE data modes and non-SVE modes.
19965 See the comment at the head of aarch64-sve.md for details. */
19966 if (from_sve_p
!= to_sve_p
)
19969 /* Don't allow changes in element size: lane 0 of the new vector
19970 would not then be lane 0 of the old vector. See the comment
19971 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19974 In the worst case, this forces a register to be spilled in
19975 one mode and reloaded in the other, which handles the
19976 endianness correctly. */
19977 if (from_sve_p
&& GET_MODE_UNIT_SIZE (from
) != GET_MODE_UNIT_SIZE (to
))
19983 /* Implement TARGET_EARLY_REMAT_MODES. */
19986 aarch64_select_early_remat_modes (sbitmap modes
)
19988 /* SVE values are not normally live across a call, so it should be
19989 worth doing early rematerialization even in VL-specific mode. */
19990 for (int i
= 0; i
< NUM_MACHINE_MODES
; ++i
)
19991 if (aarch64_sve_mode_p ((machine_mode
) i
))
19992 bitmap_set_bit (modes
, i
);
19995 /* Override the default target speculation_safe_value. */
19997 aarch64_speculation_safe_value (machine_mode mode
,
19998 rtx result
, rtx val
, rtx failval
)
20000 /* Maybe we should warn if falling back to hard barriers. They are
20001 likely to be noticably more expensive than the alternative below. */
20002 if (!aarch64_track_speculation
)
20003 return default_speculation_safe_value (mode
, result
, val
, failval
);
20006 val
= copy_to_mode_reg (mode
, val
);
20008 if (!aarch64_reg_or_zero (failval
, mode
))
20009 failval
= copy_to_mode_reg (mode
, failval
);
20011 emit_insn (gen_despeculate_copy (mode
, result
, val
, failval
));
20015 /* Implement TARGET_ESTIMATED_POLY_VALUE.
20016 Look into the tuning structure for an estimate.
20017 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
20018 Advanced SIMD 128 bits. */
20020 static HOST_WIDE_INT
20021 aarch64_estimated_poly_value (poly_int64 val
)
20023 enum aarch64_sve_vector_bits_enum width_source
20024 = aarch64_tune_params
.sve_width
;
20026 /* If we still don't have an estimate, use the default. */
20027 if (width_source
== SVE_SCALABLE
)
20028 return default_estimated_poly_value (val
);
20030 HOST_WIDE_INT over_128
= width_source
- 128;
20031 return val
.coeffs
[0] + val
.coeffs
[1] * over_128
/ 128;
20035 /* Return true for types that could be supported as SIMD return or
20039 supported_simd_type (tree t
)
20041 if (SCALAR_FLOAT_TYPE_P (t
) || INTEGRAL_TYPE_P (t
) || POINTER_TYPE_P (t
))
20043 HOST_WIDE_INT s
= tree_to_shwi (TYPE_SIZE_UNIT (t
));
20044 return s
== 1 || s
== 2 || s
== 4 || s
== 8;
20049 /* Return true for types that currently are supported as SIMD return
20050 or argument types. */
20053 currently_supported_simd_type (tree t
, tree b
)
20055 if (COMPLEX_FLOAT_TYPE_P (t
))
20058 if (TYPE_SIZE (t
) != TYPE_SIZE (b
))
20061 return supported_simd_type (t
);
20064 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
20067 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node
*node
,
20068 struct cgraph_simd_clone
*clonei
,
20069 tree base_type
, int num
)
20071 tree t
, ret_type
, arg_type
;
20072 unsigned int elt_bits
, vec_bits
, count
;
20077 if (clonei
->simdlen
20078 && (clonei
->simdlen
< 2
20079 || clonei
->simdlen
> 1024
20080 || (clonei
->simdlen
& (clonei
->simdlen
- 1)) != 0))
20082 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20083 "unsupported simdlen %d", clonei
->simdlen
);
20087 ret_type
= TREE_TYPE (TREE_TYPE (node
->decl
));
20088 if (TREE_CODE (ret_type
) != VOID_TYPE
20089 && !currently_supported_simd_type (ret_type
, base_type
))
20091 if (TYPE_SIZE (ret_type
) != TYPE_SIZE (base_type
))
20092 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20093 "GCC does not currently support mixed size types "
20094 "for %<simd%> functions");
20095 else if (supported_simd_type (ret_type
))
20096 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20097 "GCC does not currently support return type %qT "
20098 "for %<simd%> functions", ret_type
);
20100 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20101 "unsupported return type %qT for %<simd%> functions",
20106 for (t
= DECL_ARGUMENTS (node
->decl
); t
; t
= DECL_CHAIN (t
))
20108 arg_type
= TREE_TYPE (t
);
20110 if (!currently_supported_simd_type (arg_type
, base_type
))
20112 if (TYPE_SIZE (arg_type
) != TYPE_SIZE (base_type
))
20113 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20114 "GCC does not currently support mixed size types "
20115 "for %<simd%> functions");
20117 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20118 "GCC does not currently support argument type %qT "
20119 "for %<simd%> functions", arg_type
);
20124 clonei
->vecsize_mangle
= 'n';
20125 clonei
->mask_mode
= VOIDmode
;
20126 elt_bits
= GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type
));
20127 if (clonei
->simdlen
== 0)
20130 vec_bits
= (num
== 0 ? 64 : 128);
20131 clonei
->simdlen
= vec_bits
/ elt_bits
;
20136 vec_bits
= clonei
->simdlen
* elt_bits
;
20137 if (vec_bits
!= 64 && vec_bits
!= 128)
20139 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20140 "GCC does not currently support simdlen %d for type %qT",
20141 clonei
->simdlen
, base_type
);
20145 clonei
->vecsize_int
= vec_bits
;
20146 clonei
->vecsize_float
= vec_bits
;
20150 /* Implement TARGET_SIMD_CLONE_ADJUST. */
20153 aarch64_simd_clone_adjust (struct cgraph_node
*node
)
20155 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
20156 use the correct ABI. */
20158 tree t
= TREE_TYPE (node
->decl
);
20159 TYPE_ATTRIBUTES (t
) = make_attribute ("aarch64_vector_pcs", "default",
20160 TYPE_ATTRIBUTES (t
));
20163 /* Implement TARGET_SIMD_CLONE_USABLE. */
20166 aarch64_simd_clone_usable (struct cgraph_node
*node
)
20168 switch (node
->simdclone
->vecsize_mangle
)
20175 gcc_unreachable ();
20179 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
20182 aarch64_comp_type_attributes (const_tree type1
, const_tree type2
)
20184 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1
))
20185 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2
)))
20190 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
20192 static const char *
20193 aarch64_get_multilib_abi_name (void)
20195 if (TARGET_BIG_END
)
20196 return TARGET_ILP32
? "aarch64_be_ilp32" : "aarch64_be";
20197 return TARGET_ILP32
? "aarch64_ilp32" : "aarch64";
20200 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
20201 global variable based guard use the default else
20202 return a null tree. */
20204 aarch64_stack_protect_guard (void)
20206 if (aarch64_stack_protector_guard
== SSP_GLOBAL
)
20207 return default_stack_protect_guard ();
20212 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
20213 section at the end if needed. */
20214 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
20215 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
20216 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
20218 aarch64_file_end_indicate_exec_stack ()
20220 file_end_indicate_exec_stack ();
20222 unsigned feature_1_and
= 0;
20223 if (aarch64_bti_enabled ())
20224 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_BTI
;
20226 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
)
20227 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_PAC
;
20231 /* Generate .note.gnu.property section. */
20232 switch_to_section (get_section (".note.gnu.property",
20233 SECTION_NOTYPE
, NULL
));
20235 /* PT_NOTE header: namesz, descsz, type.
20236 namesz = 4 ("GNU\0")
20237 descsz = 16 (Size of the program property array)
20238 [(12 + padding) * Number of array elements]
20239 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20240 assemble_align (POINTER_SIZE
);
20241 assemble_integer (GEN_INT (4), 4, 32, 1);
20242 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES
)), 4, 32, 1);
20243 assemble_integer (GEN_INT (5), 4, 32, 1);
20245 /* PT_NOTE name. */
20246 assemble_string ("GNU", 4);
20248 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20249 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20251 data = feature_1_and. */
20252 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND
), 4, 32, 1);
20253 assemble_integer (GEN_INT (4), 4, 32, 1);
20254 assemble_integer (GEN_INT (feature_1_and
), 4, 32, 1);
20256 /* Pad the size of the note to the required alignment. */
20257 assemble_align (POINTER_SIZE
);
20260 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20261 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20262 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  /* Spot-check two insns of the loaded function by UID.  */
  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */

#undef TARGET_STACK_PROTECT_GUARD
#define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

/* Default to big-endian when the configuration asks for it.  */
#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef  TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef  TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

/* The same helper serves both the raw result and raw argument hooks.  */
#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef  TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS  aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef  TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

/* Separate shrink-wrapping hooks.  */
#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

/* Defined with an empty expansion.  */
#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

20617 /* Section anchor support. */
/* These two macros are integer constants, not function names; together they
   bound the anchor offset to the range [-256, 4095].
   NOTE(review): no rationale is recorded here for -256; it presumably
   mirrors the lowest offset of a signed 9-bit unscaled load/store --
   confirm before changing.  */
20619 #undef TARGET_MIN_ANCHOR_OFFSET
20620 #define TARGET_MIN_ANCHOR_OFFSET -256
20622 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20623 byte offset; we can do much more for larger data types, but have no way
20624 to determine the size of the access. We assume accesses are aligned. */
20625 #undef TARGET_MAX_ANCHOR_OFFSET
20626 #define TARGET_MAX_ANCHOR_OFFSET 4095
/* Vector alignment hooks: the required alignment, the vectorizer's preferred
   alignment, and the reachability query each get their own aarch64 routine.  */
20628 #undef TARGET_VECTOR_ALIGNMENT
20629 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20631 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20632 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20633 aarch64_vectorize_preferred_vector_alignment
20634 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20635 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20636 aarch64_simd_vector_alignment_reachable
20638 /* vec_perm support. */
20640 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20641 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20642 aarch64_vectorize_vec_perm_const
/* Mask-related hooks: the mask mode query, the empty-mask cost query and
   the preferred "else" value are all redirected to aarch64 routines.  */
20644 #undef TARGET_VECTORIZE_GET_MASK_MODE
20645 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20646 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20647 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20648 aarch64_empty_mask_is_expensive
20649 #undef TARGET_PREFERRED_ELSE_VALUE
20650 #define TARGET_PREFERRED_ELSE_VALUE \
20651 aarch64_preferred_else_value
/* Miscellaneous hooks.  Most entries name an aarch64_* routine; a few are
   plain constants (CC_REGNUM, true, 4) and are called out below.  */
20653 #undef TARGET_INIT_LIBFUNCS
20654 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20656 #undef TARGET_FIXED_CONDITION_CODE_REGS
20657 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
/* A register number constant rather than a function.  */
20659 #undef TARGET_FLAGS_REGNUM
20660 #define TARGET_FLAGS_REGNUM CC_REGNUM
/* A boolean constant rather than a function.  */
20662 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20663 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20665 #undef TARGET_ASAN_SHADOW_OFFSET
20666 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20668 #undef TARGET_LEGITIMIZE_ADDRESS
20669 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20671 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20672 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
/* NOTE(review): the implementation named here has no aarch64_ prefix, so it
   is presumably a generic helper declared elsewhere -- confirm.  */
20674 #undef TARGET_CAN_USE_DOLOOP_P
20675 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20677 #undef TARGET_SCHED_ADJUST_PRIORITY
20678 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20680 #undef TARGET_SCHED_MACRO_FUSION_P
20681 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
/* NOTE(review): the prefix below is aarch_, not aarch64_.  This looks
   deliberate (presumably a helper shared with another back end) rather than
   a typo -- confirm before "fixing" the name.  */
20683 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20684 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20686 #undef TARGET_SCHED_FUSION_PRIORITY
20687 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20689 #undef TARGET_UNSPEC_MAY_TRAP_P
20690 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20692 #undef TARGET_USE_PSEUDO_PIC_REG
20693 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20695 #undef TARGET_PRINT_OPERAND
20696 #define TARGET_PRINT_OPERAND aarch64_print_operand
20698 #undef TARGET_PRINT_OPERAND_ADDRESS
20699 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20701 #undef TARGET_OPTAB_SUPPORTED_P
20702 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
/* A boolean constant rather than a function.  */
20704 #undef TARGET_OMIT_STRUCT_RETURN_REG
20705 #define TARGET_OMIT_STRUCT_RETURN_REG true
20707 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20708 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20709 aarch64_dwarf_poly_indeterminate_value
20711 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
/* The value 4 is that bit expressed as a mask (1 << 2).  */
20712 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20713 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
/* Hard-register and machine-mode hooks, each mapped to the aarch64_* routine
   of the matching name.  */
20715 #undef TARGET_HARD_REGNO_NREGS
20716 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20717 #undef TARGET_HARD_REGNO_MODE_OK
20718 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20720 #undef TARGET_MODES_TIEABLE_P
20721 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
/* Call-clobber related hooks.  */
20723 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20724 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20725 aarch64_hard_regno_call_part_clobbered
20727 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20728 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20729 aarch64_remove_extra_call_preserved_regs
20731 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20732 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20733 aarch64_return_call_with_max_clobbers
20735 #undef TARGET_CONSTANT_ALIGNMENT
20736 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20738 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20739 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20740 aarch64_stack_clash_protection_alloca_probe_range
20742 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20743 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20745 #undef TARGET_CAN_CHANGE_MODE_CLASS
20746 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20748 #undef TARGET_SELECT_EARLY_REMAT_MODES
20749 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20751 #undef TARGET_SPECULATION_SAFE_VALUE
20752 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20754 #undef TARGET_ESTIMATED_POLY_VALUE
20755 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
/* This macro names a table object (aarch64_attribute_table); whether it is
   an array or a pointer is not visible from here.  */
20757 #undef TARGET_ATTRIBUTE_TABLE
20758 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
/* SIMD clone hooks: size computation, adjustment and usability query.  */
20760 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20761 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20762 aarch64_simd_clone_compute_vecsize_and_simdlen
20764 #undef TARGET_SIMD_CLONE_ADJUST
20765 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20767 #undef TARGET_SIMD_CLONE_USABLE
20768 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20770 #undef TARGET_COMP_TYPE_ATTRIBUTES
20771 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20773 #undef TARGET_GET_MULTILIB_ABI_NAME
20774 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
/* Hook up the aarch64 self-tests.  The override is wrapped in CHECKING_P so
   that selftest::aarch64_run_selftests is only referenced when CHECKING_P is
   nonzero; the opening #if pairs with the #endif below, which was otherwise
   left without a matching conditional.  */
20776 #if CHECKING_P
20777 #undef TARGET_RUN_TARGET_SELFTESTS
20778 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20779 #endif /* #if CHECKING_P */
/* Assembly-output hook: replace any earlier definition of
   TARGET_ASM_POST_CFI_STARTPROC with aarch64_post_cfi_startproc.  */
20781 #undef TARGET_ASM_POST_CFI_STARTPROC
20782 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
/* Define the target hook structure for this back end.  It is initialized
   from TARGET_INITIALIZER, which is expected to pick up the TARGET_* macro
   overrides made earlier in this file (see target-def.h) -- so this
   definition must stay after all of them.  */
20784 struct gcc_target targetm = TARGET_INITIALIZER;
20786 #include "gt-aarch64.h"