1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2020 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
24 #define INCLUDE_STRING
26 #include "coretypes.h"
37 #include "stringpool.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
56 #include "langhooks.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
76 #include "function-abi.h"
78 /* This file should be included last. */
79 #include "target-def.h"
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
87 enum insn_type
{ MOV
, MVN
, INDEX
, PTRUE
};
88 enum modifier_type
{ LSL
, MSL
};
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode
, rtx
);
92 simd_immediate_info (scalar_int_mode
, unsigned HOST_WIDE_INT
,
93 insn_type
= MOV
, modifier_type
= LSL
,
95 simd_immediate_info (scalar_mode
, rtx
, rtx
);
96 simd_immediate_info (scalar_int_mode
, aarch64_svpattern
);
98 /* The mode of the elements. */
101 /* The instruction to use to move the immediate into a vector. */
106 /* For MOV and MVN. */
109 /* The value of each element. */
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier
;
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
127 aarch64_svpattern pattern
;
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in
, rtx value_in
)
135 : elt_mode (elt_mode_in
), insn (MOV
)
137 u
.mov
.value
= value_in
;
138 u
.mov
.modifier
= LSL
;
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
147 unsigned HOST_WIDE_INT value_in
,
148 insn_type insn_in
, modifier_type modifier_in
,
149 unsigned int shift_in
)
150 : elt_mode (elt_mode_in
), insn (insn_in
)
152 u
.mov
.value
= gen_int_mode (value_in
, elt_mode_in
);
153 u
.mov
.modifier
= modifier_in
;
154 u
.mov
.shift
= shift_in
;
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in
, rtx base_in
, rtx step_in
)
161 : elt_mode (elt_mode_in
), insn (INDEX
)
163 u
.index
.base
= base_in
;
164 u
.index
.step
= step_in
;
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
171 aarch64_svpattern pattern_in
)
172 : elt_mode (elt_mode_in
), insn (PTRUE
)
174 u
.pattern
= pattern_in
;
177 /* The current code model. */
178 enum aarch64_code_model aarch64_cmodel
;
180 /* The number of 64-bit elements in an SVE vector. */
181 poly_uint16 aarch64_sve_vg
;
184 #undef TARGET_HAVE_TLS
185 #define TARGET_HAVE_TLS 1
188 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
189 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
191 machine_mode
*, int *,
193 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
194 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
195 static void aarch64_override_options_after_change (void);
196 static bool aarch64_vector_mode_supported_p (machine_mode
);
197 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
198 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
202 static machine_mode
aarch64_simd_container_mode (scalar_mode
, poly_int64
);
203 static bool aarch64_print_address_internal (FILE*, machine_mode
, rtx
,
204 aarch64_addr_query_type
);
205 static HOST_WIDE_INT
aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
);
207 /* Major revision number of the ARM Architecture implemented by the target. */
208 unsigned aarch64_architecture_version
;
210 /* The processor for which instructions should be scheduled. */
211 enum aarch64_processor aarch64_tune
= cortexa53
;
213 /* Mask to specify which instruction scheduling options should be used. */
214 uint64_t aarch64_tune_flags
= 0;
216 /* Global flag for PC relative loads. */
217 bool aarch64_pcrelative_literal_loads
;
219 /* Global flag for whether frame pointer is enabled. */
220 bool aarch64_use_frame_pointer
;
222 #define BRANCH_PROTECT_STR_MAX 255
223 char *accepted_branch_protection_string
= NULL
;
225 static enum aarch64_parse_opt_result
226 aarch64_parse_branch_protection (const char*, char**);
228 /* Support for command line parsing of boolean flags in the tuning
230 struct aarch64_flag_desc
236 #define AARCH64_FUSION_PAIR(name, internal_name) \
237 { name, AARCH64_FUSE_##internal_name },
238 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
240 { "none", AARCH64_FUSE_NOTHING
},
241 #include "aarch64-fusion-pairs.def"
242 { "all", AARCH64_FUSE_ALL
},
243 { NULL
, AARCH64_FUSE_NOTHING
}
246 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
247 { name, AARCH64_EXTRA_TUNE_##internal_name },
248 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
250 { "none", AARCH64_EXTRA_TUNE_NONE
},
251 #include "aarch64-tuning-flags.def"
252 { "all", AARCH64_EXTRA_TUNE_ALL
},
253 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
256 /* Tuning parameters. */
258 static const struct cpu_addrcost_table generic_addrcost_table
=
268 0, /* register_offset */
269 0, /* register_sextend */
270 0, /* register_zextend */
274 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
284 1, /* register_offset */
285 1, /* register_sextend */
286 2, /* register_zextend */
290 static const struct cpu_addrcost_table xgene1_addrcost_table
=
300 0, /* register_offset */
301 1, /* register_sextend */
302 1, /* register_zextend */
306 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
316 2, /* register_offset */
317 3, /* register_sextend */
318 3, /* register_zextend */
322 static const struct cpu_addrcost_table tsv110_addrcost_table
=
332 0, /* register_offset */
333 1, /* register_sextend */
334 1, /* register_zextend */
338 static const struct cpu_addrcost_table qdf24xx_addrcost_table
=
348 3, /* register_offset */
349 3, /* register_sextend */
350 3, /* register_zextend */
354 static const struct cpu_regmove_cost generic_regmove_cost
=
357 /* Avoid the use of slow int<->fp moves for spilling by setting
358 their cost higher than memmov_cost. */
364 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
367 /* Avoid the use of slow int<->fp moves for spilling by setting
368 their cost higher than memmov_cost. */
374 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
377 /* Avoid the use of slow int<->fp moves for spilling by setting
378 their cost higher than memmov_cost. */
384 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost (actual, 4 and 9). */
394 static const struct cpu_regmove_cost thunderx_regmove_cost
=
402 static const struct cpu_regmove_cost xgene1_regmove_cost
=
405 /* Avoid the use of slow int<->fp moves for spilling by setting
406 their cost higher than memmov_cost. */
412 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
415 /* Avoid the use of int<->fp moves for spilling. */
421 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
424 /* Avoid the use of int<->fp moves for spilling. */
430 static const struct cpu_regmove_cost tsv110_regmove_cost
=
433 /* Avoid the use of slow int<->fp moves for spilling by setting
434 their cost higher than memmov_cost. */
440 /* Generic costs for vector insn classes. */
441 static const struct cpu_vector_cost generic_vector_cost
=
443 1, /* scalar_int_stmt_cost */
444 1, /* scalar_fp_stmt_cost */
445 1, /* scalar_load_cost */
446 1, /* scalar_store_cost */
447 1, /* vec_int_stmt_cost */
448 1, /* vec_fp_stmt_cost */
449 2, /* vec_permute_cost */
450 2, /* vec_to_scalar_cost */
451 1, /* scalar_to_vec_cost */
452 1, /* vec_align_load_cost */
453 1, /* vec_unalign_load_cost */
454 1, /* vec_unalign_store_cost */
455 1, /* vec_store_cost */
456 3, /* cond_taken_branch_cost */
457 1 /* cond_not_taken_branch_cost */
460 /* QDF24XX costs for vector insn classes. */
461 static const struct cpu_vector_cost qdf24xx_vector_cost
=
463 1, /* scalar_int_stmt_cost */
464 1, /* scalar_fp_stmt_cost */
465 1, /* scalar_load_cost */
466 1, /* scalar_store_cost */
467 1, /* vec_int_stmt_cost */
468 3, /* vec_fp_stmt_cost */
469 2, /* vec_permute_cost */
470 1, /* vec_to_scalar_cost */
471 1, /* scalar_to_vec_cost */
472 1, /* vec_align_load_cost */
473 1, /* vec_unalign_load_cost */
474 1, /* vec_unalign_store_cost */
475 1, /* vec_store_cost */
476 3, /* cond_taken_branch_cost */
477 1 /* cond_not_taken_branch_cost */
480 /* ThunderX costs for vector insn classes. */
481 static const struct cpu_vector_cost thunderx_vector_cost
=
483 1, /* scalar_int_stmt_cost */
484 1, /* scalar_fp_stmt_cost */
485 3, /* scalar_load_cost */
486 1, /* scalar_store_cost */
487 4, /* vec_int_stmt_cost */
488 1, /* vec_fp_stmt_cost */
489 4, /* vec_permute_cost */
490 2, /* vec_to_scalar_cost */
491 2, /* scalar_to_vec_cost */
492 3, /* vec_align_load_cost */
493 5, /* vec_unalign_load_cost */
494 5, /* vec_unalign_store_cost */
495 1, /* vec_store_cost */
496 3, /* cond_taken_branch_cost */
497 3 /* cond_not_taken_branch_cost */
500 static const struct cpu_vector_cost tsv110_vector_cost
=
502 1, /* scalar_int_stmt_cost */
503 1, /* scalar_fp_stmt_cost */
504 5, /* scalar_load_cost */
505 1, /* scalar_store_cost */
506 2, /* vec_int_stmt_cost */
507 2, /* vec_fp_stmt_cost */
508 2, /* vec_permute_cost */
509 3, /* vec_to_scalar_cost */
510 2, /* scalar_to_vec_cost */
511 5, /* vec_align_load_cost */
512 5, /* vec_unalign_load_cost */
513 1, /* vec_unalign_store_cost */
514 1, /* vec_store_cost */
515 1, /* cond_taken_branch_cost */
516 1 /* cond_not_taken_branch_cost */
519 /* Generic costs for vector insn classes. */
520 static const struct cpu_vector_cost cortexa57_vector_cost
=
522 1, /* scalar_int_stmt_cost */
523 1, /* scalar_fp_stmt_cost */
524 4, /* scalar_load_cost */
525 1, /* scalar_store_cost */
526 2, /* vec_int_stmt_cost */
527 2, /* vec_fp_stmt_cost */
528 3, /* vec_permute_cost */
529 8, /* vec_to_scalar_cost */
530 8, /* scalar_to_vec_cost */
531 4, /* vec_align_load_cost */
532 4, /* vec_unalign_load_cost */
533 1, /* vec_unalign_store_cost */
534 1, /* vec_store_cost */
535 1, /* cond_taken_branch_cost */
536 1 /* cond_not_taken_branch_cost */
539 static const struct cpu_vector_cost exynosm1_vector_cost
=
541 1, /* scalar_int_stmt_cost */
542 1, /* scalar_fp_stmt_cost */
543 5, /* scalar_load_cost */
544 1, /* scalar_store_cost */
545 3, /* vec_int_stmt_cost */
546 3, /* vec_fp_stmt_cost */
547 3, /* vec_permute_cost */
548 3, /* vec_to_scalar_cost */
549 3, /* scalar_to_vec_cost */
550 5, /* vec_align_load_cost */
551 5, /* vec_unalign_load_cost */
552 1, /* vec_unalign_store_cost */
553 1, /* vec_store_cost */
554 1, /* cond_taken_branch_cost */
555 1 /* cond_not_taken_branch_cost */
558 /* Generic costs for vector insn classes. */
559 static const struct cpu_vector_cost xgene1_vector_cost
=
561 1, /* scalar_int_stmt_cost */
562 1, /* scalar_fp_stmt_cost */
563 5, /* scalar_load_cost */
564 1, /* scalar_store_cost */
565 2, /* vec_int_stmt_cost */
566 2, /* vec_fp_stmt_cost */
567 2, /* vec_permute_cost */
568 4, /* vec_to_scalar_cost */
569 4, /* scalar_to_vec_cost */
570 10, /* vec_align_load_cost */
571 10, /* vec_unalign_load_cost */
572 2, /* vec_unalign_store_cost */
573 2, /* vec_store_cost */
574 2, /* cond_taken_branch_cost */
575 1 /* cond_not_taken_branch_cost */
578 /* Costs for vector insn classes for Vulcan. */
579 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
581 1, /* scalar_int_stmt_cost */
582 6, /* scalar_fp_stmt_cost */
583 4, /* scalar_load_cost */
584 1, /* scalar_store_cost */
585 5, /* vec_int_stmt_cost */
586 6, /* vec_fp_stmt_cost */
587 10, /* vec_permute_cost */
588 6, /* vec_to_scalar_cost */
589 5, /* scalar_to_vec_cost */
590 8, /* vec_align_load_cost */
591 8, /* vec_unalign_load_cost */
592 4, /* vec_unalign_store_cost */
593 4, /* vec_store_cost */
594 2, /* cond_taken_branch_cost */
595 1 /* cond_not_taken_branch_cost */
598 /* Generic costs for branch instructions. */
599 static const struct cpu_branch_cost generic_branch_cost
=
601 1, /* Predictable. */
602 3 /* Unpredictable. */
605 /* Generic approximation modes. */
606 static const cpu_approx_modes generic_approx_modes
=
608 AARCH64_APPROX_NONE
, /* division */
609 AARCH64_APPROX_NONE
, /* sqrt */
610 AARCH64_APPROX_NONE
/* recip_sqrt */
613 /* Approximation modes for Exynos M1. */
614 static const cpu_approx_modes exynosm1_approx_modes
=
616 AARCH64_APPROX_NONE
, /* division */
617 AARCH64_APPROX_ALL
, /* sqrt */
618 AARCH64_APPROX_ALL
/* recip_sqrt */
621 /* Approximation modes for X-Gene 1. */
622 static const cpu_approx_modes xgene1_approx_modes
=
624 AARCH64_APPROX_NONE
, /* division */
625 AARCH64_APPROX_NONE
, /* sqrt */
626 AARCH64_APPROX_ALL
/* recip_sqrt */
629 /* Generic prefetch settings (which disable prefetch). */
630 static const cpu_prefetch_tune generic_prefetch_tune
=
633 -1, /* l1_cache_size */
634 -1, /* l1_cache_line_size */
635 -1, /* l2_cache_size */
636 true, /* prefetch_dynamic_strides */
637 -1, /* minimum_stride */
638 -1 /* default_opt_level */
641 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
644 -1, /* l1_cache_size */
645 64, /* l1_cache_line_size */
646 -1, /* l2_cache_size */
647 true, /* prefetch_dynamic_strides */
648 -1, /* minimum_stride */
649 -1 /* default_opt_level */
652 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
655 32, /* l1_cache_size */
656 64, /* l1_cache_line_size */
657 512, /* l2_cache_size */
658 false, /* prefetch_dynamic_strides */
659 2048, /* minimum_stride */
660 3 /* default_opt_level */
663 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
666 32, /* l1_cache_size */
667 128, /* l1_cache_line_size */
668 16*1024, /* l2_cache_size */
669 true, /* prefetch_dynamic_strides */
670 -1, /* minimum_stride */
671 3 /* default_opt_level */
674 static const cpu_prefetch_tune thunderx_prefetch_tune
=
677 32, /* l1_cache_size */
678 128, /* l1_cache_line_size */
679 -1, /* l2_cache_size */
680 true, /* prefetch_dynamic_strides */
681 -1, /* minimum_stride */
682 -1 /* default_opt_level */
685 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
688 32, /* l1_cache_size */
689 64, /* l1_cache_line_size */
690 256, /* l2_cache_size */
691 true, /* prefetch_dynamic_strides */
692 -1, /* minimum_stride */
693 -1 /* default_opt_level */
696 static const cpu_prefetch_tune tsv110_prefetch_tune
=
699 64, /* l1_cache_size */
700 64, /* l1_cache_line_size */
701 512, /* l2_cache_size */
702 true, /* prefetch_dynamic_strides */
703 -1, /* minimum_stride */
704 -1 /* default_opt_level */
707 static const cpu_prefetch_tune xgene1_prefetch_tune
=
710 32, /* l1_cache_size */
711 64, /* l1_cache_line_size */
712 256, /* l2_cache_size */
713 true, /* prefetch_dynamic_strides */
714 -1, /* minimum_stride */
715 -1 /* default_opt_level */
718 static const struct tune_params generic_tunings
=
720 &cortexa57_extra_costs
,
721 &generic_addrcost_table
,
722 &generic_regmove_cost
,
723 &generic_vector_cost
,
724 &generic_branch_cost
,
725 &generic_approx_modes
,
726 SVE_NOT_IMPLEMENTED
, /* sve_width */
729 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_CMP_BRANCH
), /* fusible_ops */
730 "16:12", /* function_align. */
731 "4", /* jump_align. */
732 "8", /* loop_align. */
733 2, /* int_reassoc_width. */
734 4, /* fp_reassoc_width. */
735 1, /* vec_reassoc_width. */
736 2, /* min_div_recip_mul_sf. */
737 2, /* min_div_recip_mul_df. */
738 0, /* max_case_values. */
739 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
741 &generic_prefetch_tune
744 static const struct tune_params cortexa35_tunings
=
746 &cortexa53_extra_costs
,
747 &generic_addrcost_table
,
748 &cortexa53_regmove_cost
,
749 &generic_vector_cost
,
750 &generic_branch_cost
,
751 &generic_approx_modes
,
752 SVE_NOT_IMPLEMENTED
, /* sve_width */
755 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
756 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
757 "16", /* function_align. */
758 "4", /* jump_align. */
759 "8", /* loop_align. */
760 2, /* int_reassoc_width. */
761 4, /* fp_reassoc_width. */
762 1, /* vec_reassoc_width. */
763 2, /* min_div_recip_mul_sf. */
764 2, /* min_div_recip_mul_df. */
765 0, /* max_case_values. */
766 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
767 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
768 &generic_prefetch_tune
771 static const struct tune_params cortexa53_tunings
=
773 &cortexa53_extra_costs
,
774 &generic_addrcost_table
,
775 &cortexa53_regmove_cost
,
776 &generic_vector_cost
,
777 &generic_branch_cost
,
778 &generic_approx_modes
,
779 SVE_NOT_IMPLEMENTED
, /* sve_width */
782 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
783 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
784 "16", /* function_align. */
785 "4", /* jump_align. */
786 "8", /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
795 &generic_prefetch_tune
798 static const struct tune_params cortexa57_tunings
=
800 &cortexa57_extra_costs
,
801 &generic_addrcost_table
,
802 &cortexa57_regmove_cost
,
803 &cortexa57_vector_cost
,
804 &generic_branch_cost
,
805 &generic_approx_modes
,
806 SVE_NOT_IMPLEMENTED
, /* sve_width */
809 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
811 "16", /* function_align. */
812 "4", /* jump_align. */
813 "8", /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
822 &generic_prefetch_tune
825 static const struct tune_params cortexa72_tunings
=
827 &cortexa57_extra_costs
,
828 &generic_addrcost_table
,
829 &cortexa57_regmove_cost
,
830 &cortexa57_vector_cost
,
831 &generic_branch_cost
,
832 &generic_approx_modes
,
833 SVE_NOT_IMPLEMENTED
, /* sve_width */
836 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
837 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
838 "16", /* function_align. */
839 "4", /* jump_align. */
840 "8", /* loop_align. */
841 2, /* int_reassoc_width. */
842 4, /* fp_reassoc_width. */
843 1, /* vec_reassoc_width. */
844 2, /* min_div_recip_mul_sf. */
845 2, /* min_div_recip_mul_df. */
846 0, /* max_case_values. */
847 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
848 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
849 &generic_prefetch_tune
852 static const struct tune_params cortexa73_tunings
=
854 &cortexa57_extra_costs
,
855 &generic_addrcost_table
,
856 &cortexa57_regmove_cost
,
857 &cortexa57_vector_cost
,
858 &generic_branch_cost
,
859 &generic_approx_modes
,
860 SVE_NOT_IMPLEMENTED
, /* sve_width */
861 4, /* memmov_cost. */
863 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
864 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
865 "16", /* function_align. */
866 "4", /* jump_align. */
867 "8", /* loop_align. */
868 2, /* int_reassoc_width. */
869 4, /* fp_reassoc_width. */
870 1, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
876 &generic_prefetch_tune
881 static const struct tune_params exynosm1_tunings
=
883 &exynosm1_extra_costs
,
884 &exynosm1_addrcost_table
,
885 &exynosm1_regmove_cost
,
886 &exynosm1_vector_cost
,
887 &generic_branch_cost
,
888 &exynosm1_approx_modes
,
889 SVE_NOT_IMPLEMENTED
, /* sve_width */
892 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
893 "4", /* function_align. */
894 "4", /* jump_align. */
895 "4", /* loop_align. */
896 2, /* int_reassoc_width. */
897 4, /* fp_reassoc_width. */
898 1, /* vec_reassoc_width. */
899 2, /* min_div_recip_mul_sf. */
900 2, /* min_div_recip_mul_df. */
901 48, /* max_case_values. */
902 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
903 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
904 &exynosm1_prefetch_tune
907 static const struct tune_params thunderxt88_tunings
=
909 &thunderx_extra_costs
,
910 &generic_addrcost_table
,
911 &thunderx_regmove_cost
,
912 &thunderx_vector_cost
,
913 &generic_branch_cost
,
914 &generic_approx_modes
,
915 SVE_NOT_IMPLEMENTED
, /* sve_width */
918 AARCH64_FUSE_ALU_BRANCH
, /* fusible_ops */
919 "8", /* function_align. */
920 "8", /* jump_align. */
921 "8", /* loop_align. */
922 2, /* int_reassoc_width. */
923 4, /* fp_reassoc_width. */
924 1, /* vec_reassoc_width. */
925 2, /* min_div_recip_mul_sf. */
926 2, /* min_div_recip_mul_df. */
927 0, /* max_case_values. */
928 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
929 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
930 &thunderxt88_prefetch_tune
933 static const struct tune_params thunderx_tunings
=
935 &thunderx_extra_costs
,
936 &generic_addrcost_table
,
937 &thunderx_regmove_cost
,
938 &thunderx_vector_cost
,
939 &generic_branch_cost
,
940 &generic_approx_modes
,
941 SVE_NOT_IMPLEMENTED
, /* sve_width */
944 AARCH64_FUSE_ALU_BRANCH
, /* fusible_ops */
945 "8", /* function_align. */
946 "8", /* jump_align. */
947 "8", /* loop_align. */
948 2, /* int_reassoc_width. */
949 4, /* fp_reassoc_width. */
950 1, /* vec_reassoc_width. */
951 2, /* min_div_recip_mul_sf. */
952 2, /* min_div_recip_mul_df. */
953 0, /* max_case_values. */
954 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
955 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
956 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
957 &thunderx_prefetch_tune
960 static const struct tune_params tsv110_tunings
=
963 &tsv110_addrcost_table
,
964 &tsv110_regmove_cost
,
966 &generic_branch_cost
,
967 &generic_approx_modes
,
968 SVE_NOT_IMPLEMENTED
, /* sve_width */
971 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_ALU_BRANCH
972 | AARCH64_FUSE_ALU_CBZ
), /* fusible_ops */
973 "16", /* function_align. */
974 "4", /* jump_align. */
975 "8", /* loop_align. */
976 2, /* int_reassoc_width. */
977 4, /* fp_reassoc_width. */
978 1, /* vec_reassoc_width. */
979 2, /* min_div_recip_mul_sf. */
980 2, /* min_div_recip_mul_df. */
981 0, /* max_case_values. */
982 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
983 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
984 &tsv110_prefetch_tune
987 static const struct tune_params xgene1_tunings
=
990 &xgene1_addrcost_table
,
991 &xgene1_regmove_cost
,
993 &generic_branch_cost
,
994 &xgene1_approx_modes
,
995 SVE_NOT_IMPLEMENTED
, /* sve_width */
998 AARCH64_FUSE_NOTHING
, /* fusible_ops */
999 "16", /* function_align. */
1000 "16", /* jump_align. */
1001 "16", /* loop_align. */
1002 2, /* int_reassoc_width. */
1003 4, /* fp_reassoc_width. */
1004 1, /* vec_reassoc_width. */
1005 2, /* min_div_recip_mul_sf. */
1006 2, /* min_div_recip_mul_df. */
1007 17, /* max_case_values. */
1008 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1009 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1010 &xgene1_prefetch_tune
1013 static const struct tune_params emag_tunings
=
1015 &xgene1_extra_costs
,
1016 &xgene1_addrcost_table
,
1017 &xgene1_regmove_cost
,
1018 &xgene1_vector_cost
,
1019 &generic_branch_cost
,
1020 &xgene1_approx_modes
,
1021 SVE_NOT_IMPLEMENTED
,
1022 6, /* memmov_cost */
1024 AARCH64_FUSE_NOTHING
, /* fusible_ops */
1025 "16", /* function_align. */
1026 "16", /* jump_align. */
1027 "16", /* loop_align. */
1028 2, /* int_reassoc_width. */
1029 4, /* fp_reassoc_width. */
1030 1, /* vec_reassoc_width. */
1031 2, /* min_div_recip_mul_sf. */
1032 2, /* min_div_recip_mul_df. */
1033 17, /* max_case_values. */
1034 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1035 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1036 &xgene1_prefetch_tune
1039 static const struct tune_params qdf24xx_tunings
=
1041 &qdf24xx_extra_costs
,
1042 &qdf24xx_addrcost_table
,
1043 &qdf24xx_regmove_cost
,
1044 &qdf24xx_vector_cost
,
1045 &generic_branch_cost
,
1046 &generic_approx_modes
,
1047 SVE_NOT_IMPLEMENTED
, /* sve_width */
1048 4, /* memmov_cost */
1050 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1051 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
1052 "16", /* function_align. */
1053 "8", /* jump_align. */
1054 "16", /* loop_align. */
1055 2, /* int_reassoc_width. */
1056 4, /* fp_reassoc_width. */
1057 1, /* vec_reassoc_width. */
1058 2, /* min_div_recip_mul_sf. */
1059 2, /* min_div_recip_mul_df. */
1060 0, /* max_case_values. */
1061 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1062 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS
, /* tune_flags. */
1063 &qdf24xx_prefetch_tune
1066 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1068 static const struct tune_params saphira_tunings
=
1070 &generic_extra_costs
,
1071 &generic_addrcost_table
,
1072 &generic_regmove_cost
,
1073 &generic_vector_cost
,
1074 &generic_branch_cost
,
1075 &generic_approx_modes
,
1076 SVE_NOT_IMPLEMENTED
, /* sve_width */
1077 4, /* memmov_cost */
1079 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1080 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
1081 "16", /* function_align. */
1082 "8", /* jump_align. */
1083 "16", /* loop_align. */
1084 2, /* int_reassoc_width. */
1085 4, /* fp_reassoc_width. */
1086 1, /* vec_reassoc_width. */
1087 2, /* min_div_recip_mul_sf. */
1088 2, /* min_div_recip_mul_df. */
1089 0, /* max_case_values. */
1090 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1091 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1092 &generic_prefetch_tune
1095 static const struct tune_params thunderx2t99_tunings
=
1097 &thunderx2t99_extra_costs
,
1098 &thunderx2t99_addrcost_table
,
1099 &thunderx2t99_regmove_cost
,
1100 &thunderx2t99_vector_cost
,
1101 &generic_branch_cost
,
1102 &generic_approx_modes
,
1103 SVE_NOT_IMPLEMENTED
, /* sve_width */
1104 4, /* memmov_cost. */
1105 4, /* issue_rate. */
1106 (AARCH64_FUSE_ALU_BRANCH
| AARCH64_FUSE_AES_AESMC
1107 | AARCH64_FUSE_ALU_CBZ
), /* fusible_ops */
1108 "16", /* function_align. */
1109 "8", /* jump_align. */
1110 "16", /* loop_align. */
1111 3, /* int_reassoc_width. */
1112 2, /* fp_reassoc_width. */
1113 2, /* vec_reassoc_width. */
1114 2, /* min_div_recip_mul_sf. */
1115 2, /* min_div_recip_mul_df. */
1116 0, /* max_case_values. */
1117 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1118 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1119 &thunderx2t99_prefetch_tune
1122 static const struct tune_params neoversen1_tunings
=
1124 &cortexa57_extra_costs
,
1125 &generic_addrcost_table
,
1126 &generic_regmove_cost
,
1127 &cortexa57_vector_cost
,
1128 &generic_branch_cost
,
1129 &generic_approx_modes
,
1130 SVE_NOT_IMPLEMENTED
, /* sve_width */
1131 4, /* memmov_cost */
1133 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_CMP_BRANCH
), /* fusible_ops */
1134 "32:16", /* function_align. */
1135 "4", /* jump_align. */
1136 "32:16", /* loop_align. */
1137 2, /* int_reassoc_width. */
1138 4, /* fp_reassoc_width. */
1139 2, /* vec_reassoc_width. */
1140 2, /* min_div_recip_mul_sf. */
1141 2, /* min_div_recip_mul_df. */
1142 0, /* max_case_values. */
1143 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1144 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1145 &generic_prefetch_tune
1148 /* Support for fine-grained override of the tuning structures. */
1149 struct aarch64_tuning_override_function
1152 void (*parse_override
)(const char*, struct tune_params
*);
1155 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
1156 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
1157 static void aarch64_parse_sve_width_string (const char*, struct tune_params
*);
1159 static const struct aarch64_tuning_override_function
1160 aarch64_tuning_override_functions
[] =
1162 { "fuse", aarch64_parse_fuse_string
},
1163 { "tune", aarch64_parse_tune_string
},
1164 { "sve_width", aarch64_parse_sve_width_string
},
1168 /* A processor implementing AArch64. */
1171 const char *const name
;
1172 enum aarch64_processor ident
;
1173 enum aarch64_processor sched_core
;
1174 enum aarch64_arch arch
;
1175 unsigned architecture_version
;
1176 const uint64_t flags
;
1177 const struct tune_params
*const tune
;
1180 /* Architectures implementing AArch64. */
1181 static const struct processor all_architectures
[] =
1183 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1184 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1185 #include "aarch64-arches.def"
1186 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1189 /* Processor cores implementing AArch64. */
1190 static const struct processor all_cores
[] =
1192 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1193 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1194 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1195 FLAGS, &COSTS##_tunings},
1196 #include "aarch64-cores.def"
1197 {"generic", generic
, cortexa53
, AARCH64_ARCH_8A
, 8,
1198 AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
1199 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1203 /* Target specification. These are populated by the -march, -mtune, -mcpu
1204 handling code or by target attributes. */
1205 static const struct processor
*selected_arch
;
1206 static const struct processor
*selected_cpu
;
1207 static const struct processor
*selected_tune
;
1209 enum aarch64_key_type aarch64_ra_sign_key
= AARCH64_KEY_A
;
1211 /* The current tuning set. */
1212 struct tune_params aarch64_tune_params
= generic_tunings
;
/* NOTE(review): heavily garbled fragment of
   handle_aarch64_vector_pcs_attribute.  Missing from this view: the
   return type line, braces, the ARM_PCS_SIMD/ARM_PCS_SVE case labels,
   the final return and gcc_unreachable.  The visible logic rejects the
   attribute on SVE function types (setting *no_add_attrs) — presumably
   the SVE case; verify against upstream before editing.  Left
   byte-identical.  */
1214 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
1217 handle_aarch64_vector_pcs_attribute (tree
*node
, tree name
, tree
,
1218 int, bool *no_add_attrs
)
1220 /* Since we set fn_type_req to true, the caller should have checked
1222 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node
));
1223 switch ((arm_pcs
) fntype_abi (*node
).id ())
1225 case ARM_PCS_AAPCS64
:
1230 error ("the %qE attribute cannot be applied to an SVE function type",
1232 *no_add_attrs
= true;
1235 case ARM_PCS_TLSDESC
:
1236 case ARM_PCS_UNKNOWN
:
1242 /* Table of machine attributes. */
1243 static const struct attribute_spec aarch64_attribute_table
[] =
1245 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1246 affects_type_identity, handler, exclude } */
1247 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
1248 handle_aarch64_vector_pcs_attribute
, NULL
},
1249 { "SVE type", 3, 3, false, true, false, true, NULL
, NULL
},
1250 { NULL
, 0, 0, false, false, false, false, NULL
, NULL
}
/* NOTE(review): garbled fragment — default-flags macro plus the field list
   of `struct aarch64_option_extension' (the `{'/`};' of the struct are
   missing from this view).  flags_on/flags_off are the ISA flag masks the
   extension enables/disables.  Left byte-identical.  */
1253 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1255 /* An ISA extension in the co-processor and main instruction set space. */
1256 struct aarch64_option_extension
1258 const char *const name
;
1259 const unsigned long flags_on
;
1260 const unsigned long flags_off
;
/* NOTE(review): garbled fragment of the aarch64_cond_code enum (A64
   condition codes in encoding order) — the opening `{' and the closing
   `} aarch64_cc;' are missing from this view.  The XOR-by-1 inverse macro
   relies on this exact ordering (paired inverse conditions differ in the
   low bit).  Left byte-identical.  */
1263 typedef enum aarch64_cond_code
1265 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
1266 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
1267 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
1271 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
/* NOTE(review): garbled fragment of `struct aarch64_branch_protect_type'
   used to parse -mbranch-protection= — the `name' field line and the
   struct braces are missing from this view.  Left byte-identical apart
   from a comment typo fix.  */
1273 struct aarch64_branch_protect_type
1275 /* The type's name that the user passes to the branch-protection option
1278 /* Function to handle the protection type and set global variables.
1279 First argument is the string token corresponding with this type and the
1280 second argument is the next token in the option string.
1282 * AARCH64_PARSE_OK: Handling was successful.
1283 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1284 should print an error.
1285 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1287 enum aarch64_parse_opt_result (*handler
)(char*, char*);
1288 /* A list of types that can follow this type in the option string. */
1289 const aarch64_branch_protect_type
* subtypes
;
1290 unsigned int num_subtypes
;
1293 static enum aarch64_parse_opt_result
1294 aarch64_handle_no_branch_protection (char* str
, char* rest
)
1296 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
1297 aarch64_enable_bti
= 0;
1300 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1301 return AARCH64_PARSE_INVALID_FEATURE
;
1303 return AARCH64_PARSE_OK
;
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_standard_branch_protection (char* str
, char* rest
)
1309 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1310 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1311 aarch64_enable_bti
= 1;
1314 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1315 return AARCH64_PARSE_INVALID_FEATURE
;
1317 return AARCH64_PARSE_OK
;
1320 static enum aarch64_parse_opt_result
1321 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED
,
1322 char* rest ATTRIBUTE_UNUSED
)
1324 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1325 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1326 return AARCH64_PARSE_OK
;
1329 static enum aarch64_parse_opt_result
1330 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED
,
1331 char* rest ATTRIBUTE_UNUSED
)
1333 aarch64_ra_sign_scope
= AARCH64_FUNCTION_ALL
;
1334 return AARCH64_PARSE_OK
;
1337 static enum aarch64_parse_opt_result
1338 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED
,
1339 char* rest ATTRIBUTE_UNUSED
)
1341 aarch64_ra_sign_key
= AARCH64_KEY_B
;
1342 return AARCH64_PARSE_OK
;
1345 static enum aarch64_parse_opt_result
1346 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED
,
1347 char* rest ATTRIBUTE_UNUSED
)
1349 aarch64_enable_bti
= 1;
1350 return AARCH64_PARSE_OK
;
1353 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes
[] = {
1354 { "leaf", aarch64_handle_pac_ret_leaf
, NULL
, 0 },
1355 { "b-key", aarch64_handle_pac_ret_b_key
, NULL
, 0 },
1356 { NULL
, NULL
, NULL
, 0 }
1359 static const struct aarch64_branch_protect_type aarch64_branch_protect_types
[] = {
1360 { "none", aarch64_handle_no_branch_protection
, NULL
, 0 },
1361 { "standard", aarch64_handle_standard_branch_protection
, NULL
, 0 },
1362 { "pac-ret", aarch64_handle_pac_ret_protection
, aarch64_pac_ret_subtypes
,
1363 ARRAY_SIZE (aarch64_pac_ret_subtypes
) },
1364 { "bti", aarch64_handle_bti_protection
, NULL
, 0 },
1365 { NULL
, NULL
, NULL
, 0 }
/* The condition codes of the processor, and the inverse function.
   Indexed by aarch64_cond_code; paired inverses differ in the low bit.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
/* The preferred condition codes for SVE conditions, in the same
   encoding order as aarch64_condition_codes.  */
static const char *const aarch64_sve_condition_codes[] =
{
  "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
  "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
};
/* NOTE(review): garbled fragment of svpattern_token.  Missing from this
   view: the return type line (presumably `static const char *'), the
   `switch (pattern)' header, the `#undef CASE', and the function's tail
   (likely a gcc_unreachable for AARCH64_NUM_SVPATTERNS) — verify against
   upstream before editing.  The CASE macro maps each AARCH64_SV_* value
   to its lower-case assembly token via AARCH64_FOR_SVPATTERN.  Left
   byte-identical.  */
1382 /* Return the assembly token for svpattern value VALUE. */
1385 svpattern_token (enum aarch64_svpattern pattern
)
1389 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1390 AARCH64_FOR_SVPATTERN (CASE
)
1392 case AARCH64_NUM_SVPATTERNS
:
1398 /* Return the descriptor of the SIMD ABI. */
1400 static const predefined_function_abi
&
1401 aarch64_simd_abi (void)
1403 predefined_function_abi
&simd_abi
= function_abis
[ARM_PCS_SIMD
];
1404 if (!simd_abi
.initialized_p ())
1406 HARD_REG_SET full_reg_clobbers
1407 = default_function_abi
.full_reg_clobbers ();
1408 for (int regno
= 0; regno
< FIRST_PSEUDO_REGISTER
; regno
++)
1409 if (FP_SIMD_SAVED_REGNUM_P (regno
))
1410 CLEAR_HARD_REG_BIT (full_reg_clobbers
, regno
);
1411 simd_abi
.initialize (ARM_PCS_SIMD
, full_reg_clobbers
);
1416 /* Return the descriptor of the SVE PCS. */
1418 static const predefined_function_abi
&
1419 aarch64_sve_abi (void)
1421 predefined_function_abi
&sve_abi
= function_abis
[ARM_PCS_SVE
];
1422 if (!sve_abi
.initialized_p ())
1424 HARD_REG_SET full_reg_clobbers
1425 = default_function_abi
.full_reg_clobbers ();
1426 for (int regno
= V8_REGNUM
; regno
<= V23_REGNUM
; ++regno
)
1427 CLEAR_HARD_REG_BIT (full_reg_clobbers
, regno
);
1428 for (int regno
= P4_REGNUM
; regno
<= P15_REGNUM
; ++regno
)
1429 CLEAR_HARD_REG_BIT (full_reg_clobbers
, regno
);
1430 sve_abi
.initialize (ARM_PCS_SVE
, full_reg_clobbers
);
1435 /* Generate code to enable conditional branches in functions over 1 MiB. */
1437 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
1438 const char * branch_format
)
1440 rtx_code_label
* tmp_label
= gen_label_rtx ();
1441 char label_buf
[256];
1443 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
1444 CODE_LABEL_NUMBER (tmp_label
));
1445 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
1446 rtx dest_label
= operands
[pos_label
];
1447 operands
[pos_label
] = tmp_label
;
1449 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
1450 output_asm_insn (buffer
, operands
);
1452 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
1453 operands
[pos_label
] = dest_label
;
1454 output_asm_insn (buffer
, operands
);
1459 aarch64_err_no_fpadvsimd (machine_mode mode
)
1461 if (TARGET_GENERAL_REGS_ONLY
)
1462 if (FLOAT_MODE_P (mode
))
1463 error ("%qs is incompatible with the use of floating-point types",
1464 "-mgeneral-regs-only");
1466 error ("%qs is incompatible with the use of vector types",
1467 "-mgeneral-regs-only");
1469 if (FLOAT_MODE_P (mode
))
1470 error ("%qs feature modifier is incompatible with the use of"
1471 " floating-point types", "+nofp");
1473 error ("%qs feature modifier is incompatible with the use of"
1474 " vector types", "+nofp");
1477 /* Report when we try to do something that requires SVE when SVE is disabled.
1478 This is an error of last resort and isn't very high-quality. It usually
1479 involves attempts to measure the vector length in some way. */
1481 aarch64_report_sve_required (void)
1483 static bool reported_p
= false;
1485 /* Avoid reporting a slew of messages for a single oversight. */
1489 error ("this operation requires the SVE ISA extension");
1490 inform (input_location
, "you can enable SVE using the command-line"
1491 " option %<-march%>, or by using the %<target%>"
1492 " attribute or pragma");
1496 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1499 pr_or_ffr_regnum_p (unsigned int regno
)
1501 return PR_REGNUM_P (regno
) || regno
== FFR_REGNUM
|| regno
== FFRT_REGNUM
;
1504 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1505 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1506 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1507 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1508 and GENERAL_REGS is lower than the memory cost (in this case the best class
1509 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1510 cost results in bad allocations with many redundant int<->FP moves which
1511 are expensive on various cores.
1512 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1513 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1514 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1515 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1516 The result of this is that it is no longer inefficient to have a higher
1517 memory move cost than the register move cost.
1521 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
1522 reg_class_t best_class
)
1526 if (!reg_class_subset_p (GENERAL_REGS
, allocno_class
)
1527 || !reg_class_subset_p (FP_REGS
, allocno_class
))
1528 return allocno_class
;
1530 if (!reg_class_subset_p (GENERAL_REGS
, best_class
)
1531 || !reg_class_subset_p (FP_REGS
, best_class
))
1534 mode
= PSEUDO_REGNO_MODE (regno
);
1535 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
1539 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1541 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1542 return aarch64_tune_params
.min_div_recip_mul_sf
;
1543 return aarch64_tune_params
.min_div_recip_mul_df
;
1546 /* Return the reassociation width of treeop OPC with mode MODE. */
1548 aarch64_reassociation_width (unsigned opc
, machine_mode mode
)
1550 if (VECTOR_MODE_P (mode
))
1551 return aarch64_tune_params
.vec_reassoc_width
;
1552 if (INTEGRAL_MODE_P (mode
))
1553 return aarch64_tune_params
.int_reassoc_width
;
1554 /* Avoid reassociating floating point addition so we emit more FMAs. */
1555 if (FLOAT_MODE_P (mode
) && opc
!= PLUS_EXPR
)
1556 return aarch64_tune_params
.fp_reassoc_width
;
1560 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1562 aarch64_dbx_register_number (unsigned regno
)
1564 if (GP_REGNUM_P (regno
))
1565 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1566 else if (regno
== SP_REGNUM
)
1567 return AARCH64_DWARF_SP
;
1568 else if (FP_REGNUM_P (regno
))
1569 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1570 else if (PR_REGNUM_P (regno
))
1571 return AARCH64_DWARF_P0
+ regno
- P0_REGNUM
;
1572 else if (regno
== VG_REGNUM
)
1573 return AARCH64_DWARF_VG
;
1575 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1576 equivalent DWARF register. */
1577 return DWARF_FRAME_REGISTERS
;
1580 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1581 integer, otherwise return X unmodified. */
1583 aarch64_bit_representation (rtx x
)
1585 if (CONST_DOUBLE_P (x
))
1586 x
= gen_lowpart (int_mode_for_mode (GET_MODE (x
)).require (), x
);
1590 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1592 aarch64_advsimd_struct_mode_p (machine_mode mode
)
1595 && (mode
== OImode
|| mode
== CImode
|| mode
== XImode
));
1598 /* Return true if MODE is an SVE predicate mode. */
1600 aarch64_sve_pred_mode_p (machine_mode mode
)
1603 && (mode
== VNx16BImode
1604 || mode
== VNx8BImode
1605 || mode
== VNx4BImode
1606 || mode
== VNx2BImode
));
/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
   a structure of 2, 3 or 4 vectors.  */
const unsigned int VEC_STRUCT   = 8;
/* Can be used in combination with VEC_SVE_DATA to indicate that the
   vector has fewer significant bytes than a full SVE vector.  */
const unsigned int VEC_PARTIAL  = 16;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
/* NOTE(review): heavily garbled fragment of aarch64_classify_vector_mode.
   Missing from this view: the return type line, the switch header and
   every case label (the E_VNx*/E_V* mode lists) — only the comments and
   the shared return expressions survive.  Do not edit without the full
   upstream switch.  Left byte-identical.  */
1623 /* Return a set of flags describing the vector properties of mode MODE.
1624 Ignore modes that are not supported by the current target. */
1626 aarch64_classify_vector_mode (machine_mode mode
)
1628 if (aarch64_advsimd_struct_mode_p (mode
))
1629 return VEC_ADVSIMD
| VEC_STRUCT
;
1631 if (aarch64_sve_pred_mode_p (mode
))
1632 return VEC_SVE_PRED
;
1634 /* Make the decision based on the mode's enum value rather than its
1635 properties, so that we keep the correct classification regardless
1636 of -msve-vector-bits. */
1639 /* Partial SVE QI vectors. */
1643 /* Partial SVE HI vectors. */
1646 /* Partial SVE SI vector. */
1648 /* Partial SVE HF vectors. */
1651 /* Partial SVE SF vector. */
1653 return TARGET_SVE
? VEC_SVE_DATA
| VEC_PARTIAL
: 0;
1663 return TARGET_SVE
? VEC_SVE_DATA
: 0;
1665 /* x2 SVE vectors. */
1674 /* x3 SVE vectors. */
1683 /* x4 SVE vectors. */
1692 return TARGET_SVE
? VEC_SVE_DATA
| VEC_STRUCT
: 0;
1694 /* 64-bit Advanced SIMD vectors. */
1698 /* ...E_V1DImode doesn't exist. */
1703 /* 128-bit Advanced SIMD vectors. */
1712 return TARGET_SIMD
? VEC_ADVSIMD
: 0;
1719 /* Return true if MODE is any of the data vector modes, including
1722 aarch64_vector_data_mode_p (machine_mode mode
)
1724 return aarch64_classify_vector_mode (mode
) & VEC_ANY_DATA
;
1727 /* Return true if MODE is any form of SVE mode, including predicates,
1728 vectors and structures. */
1730 aarch64_sve_mode_p (machine_mode mode
)
1732 return aarch64_classify_vector_mode (mode
) & VEC_ANY_SVE
;
1735 /* Return true if MODE is an SVE data vector mode; either a single vector
1736 or a structure of vectors. */
1738 aarch64_sve_data_mode_p (machine_mode mode
)
1740 return aarch64_classify_vector_mode (mode
) & VEC_SVE_DATA
;
1743 /* Return the number of defined bytes in one constituent vector of
1744 SVE mode MODE, which has vector flags VEC_FLAGS. */
1746 aarch64_vl_bytes (machine_mode mode
, unsigned int vec_flags
)
1748 if (vec_flags
& VEC_PARTIAL
)
1749 /* A single partial vector. */
1750 return GET_MODE_SIZE (mode
);
1752 if (vec_flags
& VEC_SVE_DATA
)
1753 /* A single vector or a tuple. */
1754 return BYTES_PER_SVE_VECTOR
;
1756 /* A single predicate. */
1757 gcc_assert (vec_flags
& VEC_SVE_PRED
);
1758 return BYTES_PER_SVE_PRED
;
1761 /* Implement target hook TARGET_ARRAY_MODE. */
1762 static opt_machine_mode
1763 aarch64_array_mode (machine_mode mode
, unsigned HOST_WIDE_INT nelems
)
1765 if (aarch64_classify_vector_mode (mode
) == VEC_SVE_DATA
1766 && IN_RANGE (nelems
, 2, 4))
1767 return mode_for_vector (GET_MODE_INNER (mode
),
1768 GET_MODE_NUNITS (mode
) * nelems
);
1770 return opt_machine_mode ();
/* NOTE(review): garbled fragment of aarch64_array_mode_supported_p.
   Missing from this view: the return type line and the first line of the
   return expression (presumably a TARGET_SIMD guard ANDed with the
   visible conditions) — verify against upstream before editing.  Left
   byte-identical.  */
1773 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1775 aarch64_array_mode_supported_p (machine_mode mode
,
1776 unsigned HOST_WIDE_INT nelems
)
1779 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1780 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1781 && (nelems
>= 2 && nelems
<= 4))
1787 /* MODE is some form of SVE vector mode. For data modes, return the number
1788 of vector register bits that each element of MODE occupies, such as 64
1789 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
1790 in a 64-bit container). For predicate modes, return the number of
1791 data bits controlled by each significant predicate bit. */
1794 aarch64_sve_container_bits (machine_mode mode
)
1796 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1797 poly_uint64 vector_bits
= (vec_flags
& (VEC_PARTIAL
| VEC_SVE_PRED
)
1798 ? BITS_PER_SVE_VECTOR
1799 : GET_MODE_BITSIZE (mode
));
1800 return vector_element_size (vector_bits
, GET_MODE_NUNITS (mode
));
/* NOTE(review): garbled fragment of the elem_nbytes overload of
   aarch64_sve_pred_mode.  Missing from this view: the return type line,
   an enclosing TARGET_SVE guard (if any), and the `return VNx16BImode'
   etc. lines selected by each size test — verify against upstream before
   editing.  Left byte-identical.  */
1803 /* Return the SVE predicate mode to use for elements that have
1804 ELEM_NBYTES bytes, if such a mode exists. */
1807 aarch64_sve_pred_mode (unsigned int elem_nbytes
)
1811 if (elem_nbytes
== 1)
1813 if (elem_nbytes
== 2)
1815 if (elem_nbytes
== 4)
1817 if (elem_nbytes
== 8)
1820 return opt_machine_mode ();
1823 /* Return the SVE predicate mode that should be used to control
1827 aarch64_sve_pred_mode (machine_mode mode
)
1829 unsigned int bits
= aarch64_sve_container_bits (mode
);
1830 return aarch64_sve_pred_mode (bits
/ BITS_PER_UNIT
).require ();
1833 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1835 static opt_machine_mode
1836 aarch64_get_mask_mode (machine_mode mode
)
1838 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1839 if (vec_flags
& VEC_SVE_DATA
)
1840 return aarch64_sve_pred_mode (mode
);
1842 return default_get_mask_mode (mode
);
1845 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1848 aarch64_sve_data_mode (scalar_mode inner_mode
, poly_uint64 nunits
)
1850 enum mode_class mclass
= (is_a
<scalar_float_mode
> (inner_mode
)
1851 ? MODE_VECTOR_FLOAT
: MODE_VECTOR_INT
);
1853 FOR_EACH_MODE_IN_CLASS (mode
, mclass
)
1854 if (inner_mode
== GET_MODE_INNER (mode
)
1855 && known_eq (nunits
, GET_MODE_NUNITS (mode
))
1856 && aarch64_sve_data_mode_p (mode
))
1858 return opt_machine_mode ();
1861 /* Return the integer element mode associated with SVE mode MODE. */
1863 static scalar_int_mode
1864 aarch64_sve_element_int_mode (machine_mode mode
)
1866 poly_uint64 vector_bits
= (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
1867 ? BITS_PER_SVE_VECTOR
1868 : GET_MODE_BITSIZE (mode
));
1869 unsigned int elt_bits
= vector_element_size (vector_bits
,
1870 GET_MODE_NUNITS (mode
));
1871 return int_mode_for_size (elt_bits
, 0).require ();
1874 /* Return an integer element mode that contains exactly
1875 aarch64_sve_container_bits (MODE) bits. This is wider than
1876 aarch64_sve_element_int_mode if MODE is a partial vector,
1877 otherwise it's the same. */
1879 static scalar_int_mode
1880 aarch64_sve_container_int_mode (machine_mode mode
)
1882 return int_mode_for_size (aarch64_sve_container_bits (mode
), 0).require ();
1885 /* Return the integer vector mode associated with SVE mode MODE.
1886 Unlike related_int_vector_mode, this can handle the case in which
1887 MODE is a predicate (and thus has a different total size). */
1890 aarch64_sve_int_mode (machine_mode mode
)
1892 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
1893 return aarch64_sve_data_mode (int_mode
, GET_MODE_NUNITS (mode
)).require ();
/* NOTE(review): heavily garbled fragment of
   aarch64_vectorize_related_mode.  Missing from this view: the third
   parameter line (presumably `poly_uint64 nunits'), braces, the
   `return sve_mode;' / `return res;' statements on the success paths,
   and parts of the "exactly NUNITS units" comment — verify against
   upstream before editing.  Left byte-identical.  */
1896 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
1898 static opt_machine_mode
1899 aarch64_vectorize_related_mode (machine_mode vector_mode
,
1900 scalar_mode element_mode
,
1903 unsigned int vec_flags
= aarch64_classify_vector_mode (vector_mode
);
1905 /* If we're operating on SVE vectors, try to return an SVE mode. */
1906 poly_uint64 sve_nunits
;
1907 if ((vec_flags
& VEC_SVE_DATA
)
1908 && multiple_p (BYTES_PER_SVE_VECTOR
,
1909 GET_MODE_SIZE (element_mode
), &sve_nunits
))
1911 machine_mode sve_mode
;
1912 if (maybe_ne (nunits
, 0U))
1914 /* Try to find a full or partial SVE mode with exactly
1916 if (multiple_p (sve_nunits
, nunits
)
1917 && aarch64_sve_data_mode (element_mode
,
1918 nunits
).exists (&sve_mode
))
1923 /* Take the preferred number of units from the number of bytes
1924 that fit in VECTOR_MODE. We always start by "autodetecting"
1925 a full vector mode with preferred_simd_mode, so vectors
1926 chosen here will also be full vector modes. Then
1927 autovectorize_vector_modes tries smaller starting modes
1928 and thus smaller preferred numbers of units. */
1929 sve_nunits
= ordered_min (sve_nunits
, GET_MODE_SIZE (vector_mode
));
1930 if (aarch64_sve_data_mode (element_mode
,
1931 sve_nunits
).exists (&sve_mode
))
1936 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
1937 if ((vec_flags
& VEC_ADVSIMD
)
1938 && known_eq (nunits
, 0U)
1939 && known_eq (GET_MODE_BITSIZE (vector_mode
), 64U)
1940 && maybe_ge (GET_MODE_BITSIZE (element_mode
)
1941 * GET_MODE_NUNITS (vector_mode
), 128U))
1943 machine_mode res
= aarch64_simd_container_mode (element_mode
, 128);
1944 if (VECTOR_MODE_P (res
))
1948 return default_vectorize_related_mode (vector_mode
, element_mode
, nunits
);
1951 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1952 prefer to use the first arithmetic operand as the else value if
1953 the else value doesn't matter, since that exactly matches the SVE
1954 destructive merging form. For ternary operations we could either
1955 pick the first operand and use FMAD-like instructions or the last
1956 operand and use FMLA-like instructions; the latter seems more
1960 aarch64_preferred_else_value (unsigned, tree
, unsigned int nops
, tree
*ops
)
1962 return nops
== 3 ? ops
[2] : ops
[0];
/* NOTE(review): heavily garbled fragment covering TWO hook
   implementations, aarch64_hard_regno_nregs and
   aarch64_hard_regno_mode_ok.  Missing from this view: return type
   lines, braces, most switch case labels (FP_REGS etc.), several
   `return' statements (e.g. the result when pr_or_ffr_regnum_p, the
   GP-register true/false tails, the final `return false;') and the
   TARGET_SIMD/TARGET_FLOAT guards — verify against upstream before
   editing.  Left byte-identical.  */
1965 /* Implement TARGET_HARD_REGNO_NREGS. */
1968 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1970 /* ??? Logically we should only need to provide a value when
1971 HARD_REGNO_MODE_OK says that the combination is valid,
1972 but at the moment we need to handle all modes. Just ignore
1973 any runtime parts for registers that can't store them. */
1974 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
1975 switch (aarch64_regno_regclass (regno
))
1981 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1982 if (vec_flags
& VEC_SVE_DATA
)
1983 return exact_div (GET_MODE_SIZE (mode
),
1984 aarch64_vl_bytes (mode
, vec_flags
)).to_constant ();
1985 return CEIL (lowest_size
, UNITS_PER_VREG
);
1991 case PR_AND_FFR_REGS
:
1994 return CEIL (lowest_size
, UNITS_PER_WORD
);
1999 /* Implement TARGET_HARD_REGNO_MODE_OK. */
2002 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
2004 if (GET_MODE_CLASS (mode
) == MODE_CC
)
2005 return regno
== CC_REGNUM
;
2007 if (regno
== VG_REGNUM
)
2008 /* This must have the same size as _Unwind_Word. */
2009 return mode
== DImode
;
2011 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
2012 if (vec_flags
& VEC_SVE_PRED
)
2013 return pr_or_ffr_regnum_p (regno
);
2015 if (pr_or_ffr_regnum_p (regno
))
2018 if (regno
== SP_REGNUM
)
2019 /* The purpose of comparing with ptr_mode is to support the
2020 global register variable associated with the stack pointer
2021 register via the syntax of asm ("wsp") in ILP32. */
2022 return mode
== Pmode
|| mode
== ptr_mode
;
2024 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
2025 return mode
== Pmode
;
2027 if (GP_REGNUM_P (regno
))
2029 if (vec_flags
& VEC_ANY_SVE
)
2031 if (known_le (GET_MODE_SIZE (mode
), 8))
2033 if (known_le (GET_MODE_SIZE (mode
), 16))
2034 return (regno
& 1) == 0;
2036 else if (FP_REGNUM_P (regno
))
2038 if (vec_flags
& VEC_STRUCT
)
2039 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
2041 return !VECTOR_MODE_P (mode
) || vec_flags
!= 0;
2047 /* Return true if TYPE is a type that should be passed or returned in
2048 SVE registers, assuming enough registers are available. When returning
2049 true, set *NUM_ZR and *NUM_PR to the number of required Z and P registers
2052 /* Return true if a function with type FNTYPE returns its value in
2053 SVE vector or predicate registers. */
2056 aarch64_returns_value_in_sve_regs_p (const_tree fntype
)
2058 tree return_type
= TREE_TYPE (fntype
);
2059 return (return_type
!= error_mark_node
2060 && aarch64_sve::builtin_type_p (return_type
));
2063 /* Return true if a function with type FNTYPE takes arguments in
2064 SVE vector or predicate registers. */
2067 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype
)
2069 CUMULATIVE_ARGS args_so_far_v
;
2070 aarch64_init_cumulative_args (&args_so_far_v
, NULL_TREE
, NULL_RTX
,
2071 NULL_TREE
, 0, true);
2072 cumulative_args_t args_so_far
= pack_cumulative_args (&args_so_far_v
);
2074 for (tree chain
= TYPE_ARG_TYPES (fntype
);
2075 chain
&& chain
!= void_list_node
;
2076 chain
= TREE_CHAIN (chain
))
2078 tree arg_type
= TREE_VALUE (chain
);
2079 if (arg_type
== error_mark_node
)
2082 function_arg_info
arg (arg_type
, /*named=*/true);
2083 apply_pass_by_reference_rules (&args_so_far_v
, arg
);
2084 if (aarch64_sve::builtin_type_p (arg
.type
))
2087 targetm
.calls
.function_arg_advance (args_so_far
, arg
);
2092 /* Implement TARGET_FNTYPE_ABI. */
2094 static const predefined_function_abi
&
2095 aarch64_fntype_abi (const_tree fntype
)
2097 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype
)))
2098 return aarch64_simd_abi ();
2100 if (aarch64_returns_value_in_sve_regs_p (fntype
)
2101 || aarch64_takes_arguments_in_sve_regs_p (fntype
))
2102 return aarch64_sve_abi ();
2104 return default_function_abi
;
2107 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2110 aarch64_compatible_vector_types_p (const_tree type1
, const_tree type2
)
2112 return (aarch64_sve::builtin_type_p (type1
)
2113 == aarch64_sve::builtin_type_p (type2
));
2116 /* Return true if we should emit CFI for register REGNO. */
2119 aarch64_emit_cfi_for_reg_p (unsigned int regno
)
2121 return (GP_REGNUM_P (regno
)
2122 || !default_function_abi
.clobbers_full_reg_p (regno
));
/* NOTE(review): heavily garbled fragment of aarch64_reg_save_mode.
   Missing from this view: the return type line, braces, and every
   `return <mode>' statement (the DImode/vector-mode choices described by
   the surviving comments, the predicate mode, and the gcc_unreachable
   tail) — verify against upstream before editing.  Left byte-identical.  */
2125 /* Return the mode we should use to save and restore register REGNO. */
2128 aarch64_reg_save_mode (unsigned int regno
)
2130 if (GP_REGNUM_P (regno
))
2133 if (FP_REGNUM_P (regno
))
2134 switch (crtl
->abi
->id ())
2136 case ARM_PCS_AAPCS64
:
2137 /* Only the low 64 bits are saved by the base PCS. */
2141 /* The vector PCS saves the low 128 bits (which is the full
2142 register on non-SVE targets). */
2146 /* Use vectors of DImode for registers that need frame
2147 information, so that the first 64 bytes of the save slot
2148 are always the equivalent of what storing D<n> would give. */
2149 if (aarch64_emit_cfi_for_reg_p (regno
))
2152 /* Use vectors of bytes otherwise, so that the layout is
2153 endian-agnostic, and so that we can use LDR and STR for
2154 big-endian targets. */
2157 case ARM_PCS_TLSDESC
:
2158 case ARM_PCS_UNKNOWN
:
2162 if (PR_REGNUM_P (regno
))
2163 /* Save the full predicate register. */
2169 /* Implement TARGET_INSN_CALLEE_ABI. */
2171 const predefined_function_abi
&
2172 aarch64_insn_callee_abi (const rtx_insn
*insn
)
2174 rtx pat
= PATTERN (insn
);
2175 gcc_assert (GET_CODE (pat
) == PARALLEL
);
2176 rtx unspec
= XVECEXP (pat
, 0, 1);
2177 gcc_assert (GET_CODE (unspec
) == UNSPEC
2178 && XINT (unspec
, 1) == UNSPEC_CALLEE_ABI
);
2179 return function_abis
[INTVAL (XVECEXP (unspec
, 0, 0))];
/* NOTE(review): garbled fragment of
   aarch64_hard_regno_call_part_clobbered.  Missing from this view: the
   return type line, the remaining parameter lines (presumably
   `unsigned int regno, machine_mode mode'), braces, and the final
   `return false;' for non-FP registers / the SVE ABI — verify against
   upstream before editing.  Left byte-identical.  */
2182 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2183 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2184 clobbers the top 64 bits when restoring the bottom 64 bits. */
2187 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id
,
2191 if (FP_REGNUM_P (regno
) && abi_id
!= ARM_PCS_SVE
)
2193 poly_int64 per_register_size
= GET_MODE_SIZE (mode
);
2194 unsigned int nregs
= hard_regno_nregs (regno
, mode
);
2196 per_register_size
= exact_div (per_register_size
, nregs
);
2197 if (abi_id
== ARM_PCS_SIMD
|| abi_id
== ARM_PCS_TLSDESC
)
2198 return maybe_gt (per_register_size
, 16);
2199 return maybe_gt (per_register_size
, 8);
2204 /* Implement REGMODE_NATURAL_SIZE. */
2206 aarch64_regmode_natural_size (machine_mode mode
)
2208 /* The natural size for SVE data modes is one SVE data vector,
2209 and similarly for predicates. We can't independently modify
2210 anything smaller than that. */
2211 /* ??? For now, only do this for variable-width SVE registers.
2212 Doing it for constant-sized registers breaks lower-subreg.c. */
2213 /* ??? And once that's fixed, we should probably have similar
2214 code for Advanced SIMD. */
2215 if (!aarch64_sve_vg
.is_constant ())
2217 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
2218 if (vec_flags
& VEC_SVE_PRED
)
2219 return BYTES_PER_SVE_PRED
;
2220 if (vec_flags
& VEC_SVE_DATA
)
2221 return BYTES_PER_SVE_VECTOR
;
2223 return UNITS_PER_WORD
;
/* NOTE(review): garbled fragment of aarch64_hard_regno_caller_save_mode.
   Missing from this view: the return type line, the remaining parameter
   lines (presumably `machine_mode mode'), and every `return <mode>'
   statement selected by the visible tests — verify against upstream
   before editing.  Left byte-identical.  */
2226 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2228 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned,
2231 /* The predicate mode determines which bits are significant and
2232 which are "don't care". Decreasing the number of lanes would
2233 lose data while increasing the number of lanes would make bits
2234 unnecessarily significant. */
2235 if (PR_REGNUM_P (regno
))
2237 if (known_ge (GET_MODE_SIZE (mode
), 4))
2243 /* Return true if I's bits are consecutive ones from the MSB. */
2245 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i
)
2247 return exact_log2 (-i
) != HOST_WIDE_INT_M1
;
2250 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2251 that strcpy from constants will be faster. */
2253 static HOST_WIDE_INT
2254 aarch64_constant_alignment (const_tree exp
, HOST_WIDE_INT align
)
2256 if (TREE_CODE (exp
) == STRING_CST
&& !optimize_size
)
2257 return MAX (align
, BITS_PER_WORD
);
2261 /* Return true if calls to DECL should be treated as
2262 long-calls (ie called via a register). */
2264 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
2269 /* Return true if calls to symbol-ref SYM should be treated as
2270 long-calls (ie called via a register). */
2272 aarch64_is_long_call_p (rtx sym
)
2274 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
/* NOTE(review): garbled fragment covering aarch64_is_noplt_call_p and
   aarch64_is_extend_from_extract.  Missing from this view: return type
   lines, the leading conditions of the noplt test (presumably
   flag_pic/flag_plt checks ANDed with the visible lookup_attribute and
   binds_local_p terms), the `extract_imm' parameter line, the early
   `return false', and the head of the final compound condition —
   verify against upstream before editing.  Left byte-identical.  */
2277 /* Return true if calls to symbol-ref SYM should not go through
2281 aarch64_is_noplt_call_p (rtx sym
)
2283 const_tree decl
= SYMBOL_REF_DECL (sym
);
2288 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
2289 && !targetm
.binds_local_p (decl
))
2295 /* Return true if the offsets to a zero/sign-extract operation
2296 represent an expression that matches an extend operation. The
2297 operands represent the parameters from
2299 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2301 aarch64_is_extend_from_extract (scalar_int_mode mode
, rtx mult_imm
,
2304 HOST_WIDE_INT mult_val
, extract_val
;
2306 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
2309 mult_val
= INTVAL (mult_imm
);
2310 extract_val
= INTVAL (extract_imm
);
2313 && extract_val
< GET_MODE_BITSIZE (mode
)
2314 && exact_log2 (extract_val
& ~7) > 0
2315 && (extract_val
& 7) <= 4
2316 && mult_val
== (1 << (extract_val
& 7)))
2322 /* Emit an insn that's a simple single-set. Both the operands must be
2323 known to be valid. */
2324 inline static rtx_insn
*
2325 emit_set_insn (rtx x
, rtx y
)
2327 return emit_insn (gen_rtx_SET (x
, y
));
/* NOTE(review): heavily garbled fragment covering aarch64_gen_compare_reg
   and aarch64_gen_compare_reg_maybe_ze.  Missing from this view: return
   type lines, the `rtx cc_reg;'/`rtx t;' declarations, the CC-mode
   assignment in the TImode path, the else structure around the
   zero-extend path, and the final `return cc_reg;' statements — verify
   against upstream before editing.  Left byte-identical.  */
2330 /* X and Y are two things to compare using CODE. Emit the compare insn and
2331 return the rtx for register 0 in the proper mode. */
2333 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
2335 machine_mode cmp_mode
= GET_MODE (x
);
2336 machine_mode cc_mode
;
2339 if (cmp_mode
== TImode
)
2341 gcc_assert (code
== NE
);
2344 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2346 rtx x_lo
= operand_subword (x
, 0, 0, TImode
);
2347 rtx y_lo
= operand_subword (y
, 0, 0, TImode
);
2348 emit_set_insn (cc_reg
, gen_rtx_COMPARE (cc_mode
, x_lo
, y_lo
));
2350 rtx x_hi
= operand_subword (x
, 1, 0, TImode
);
2351 rtx y_hi
= operand_subword (y
, 1, 0, TImode
);
2352 emit_insn (gen_ccmpccdi (cc_reg
, cc_reg
, x_hi
, y_hi
,
2353 gen_rtx_EQ (cc_mode
, cc_reg
, const0_rtx
),
2354 GEN_INT (AARCH64_EQ
)));
2358 cc_mode
= SELECT_CC_MODE (code
, x
, y
);
2359 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2360 emit_set_insn (cc_reg
, gen_rtx_COMPARE (cc_mode
, x
, y
));
/* NOTE(review): second function starts here; QI/HI comparisons against a
   non-plus-operand Y zero-extend Y to SImode and use a swapped compare.  */
2365 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2368 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code
, rtx x
, rtx y
,
2369 machine_mode y_mode
)
2371 if (y_mode
== E_QImode
|| y_mode
== E_HImode
)
2373 if (CONST_INT_P (y
))
2374 y
= GEN_INT (INTVAL (y
) & GET_MODE_MASK (y_mode
));
2378 machine_mode cc_mode
;
2380 t
= gen_rtx_ZERO_EXTEND (SImode
, y
);
2381 t
= gen_rtx_COMPARE (CC_SWPmode
, t
, x
);
2382 cc_mode
= CC_SWPmode
;
2383 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2384 emit_set_insn (cc_reg
, t
);
2389 if (!aarch64_plus_operand (y
, y_mode
))
2390 y
= force_reg (y_mode
, y
);
2392 return aarch64_gen_compare_reg (code
, x
, y
);
2395 /* Build the SYMBOL_REF for __tls_get_addr. */
2397 static GTY(()) rtx tls_get_addr_libfunc
;
2400 aarch64_tls_get_addr (void)
2402 if (!tls_get_addr_libfunc
)
2403 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
2404 return tls_get_addr_libfunc
;
2407 /* Return the TLS model to use for ADDR. */
2409 static enum tls_model
2410 tls_symbolic_operand_type (rtx addr
)
2412 enum tls_model tls_kind
= TLS_MODEL_NONE
;
2413 if (GET_CODE (addr
) == CONST
)
2416 rtx sym
= strip_offset (addr
, &addend
);
2417 if (GET_CODE (sym
) == SYMBOL_REF
)
2418 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
2420 else if (GET_CODE (addr
) == SYMBOL_REF
)
2421 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
2426 /* We'll allow lo_sum's in addresses in our legitimate addresses
2427 so that combine would take care of combining addresses where
2428 necessary, but for generation purposes, we'll generate the address
2431 tmp = hi (symbol_ref); adrp x1, foo
2432 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2436 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2437 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2441 Load TLS symbol, depending on TLS mechanism and TLS access model.
2443 Global Dynamic - Traditional TLS:
2444 adrp tmp, :tlsgd:imm
2445 add dest, tmp, #:tlsgd_lo12:imm
2448 Global Dynamic - TLS Descriptors:
2449 adrp dest, :tlsdesc:imm
2450 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2451 add dest, dest, #:tlsdesc_lo12:imm
2458 adrp tmp, :gottprel:imm
2459 ldr dest, [tmp, #:gottprel_lo12:imm]
2464 add t0, tp, #:tprel_hi12:imm, lsl #12
2465 add t0, t0, #:tprel_lo12_nc:imm
2469 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
2470 enum aarch64_symbol_type type
)
2474 case SYMBOL_SMALL_ABSOLUTE
:
2476 /* In ILP32, the mode of dest can be either SImode or DImode. */
2478 machine_mode mode
= GET_MODE (dest
);
2480 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2482 if (can_create_pseudo_p ())
2483 tmp_reg
= gen_reg_rtx (mode
);
2485 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2486 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
2490 case SYMBOL_TINY_ABSOLUTE
:
2491 emit_insn (gen_rtx_SET (dest
, imm
));
2494 case SYMBOL_SMALL_GOT_28K
:
2496 machine_mode mode
= GET_MODE (dest
);
2497 rtx gp_rtx
= pic_offset_table_rtx
;
2501 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2502 here before rtl expand. Tree IVOPT will generate rtl pattern to
2503 decide rtx costs, in which case pic_offset_table_rtx is not
2504 initialized. For that case no need to generate the first adrp
2505 instruction as the final cost for global variable access is
2509 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
2510 using the page base as GOT base, the first page may be wasted,
2511 in the worst scenario, there is only 28K space for GOT).
2513 The generate instruction sequence for accessing global variable
2516 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2518 Only one instruction needed. But we must initialize
2519 pic_offset_table_rtx properly. We generate initialize insn for
2520 every global access, and allow CSE to remove all redundant.
2522 The final instruction sequences will look like the following
2523 for multiply global variables access.
2525 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2527 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2528 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2529 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2532 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
2533 crtl
->uses_pic_offset_table
= 1;
2534 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
2536 if (mode
!= GET_MODE (gp_rtx
))
2537 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
2541 if (mode
== ptr_mode
)
2544 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
2546 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
2548 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2552 gcc_assert (mode
== Pmode
);
2554 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
2555 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2558 /* The operand is expected to be MEM. Whenever the related insn
2559 pattern changed, above code which calculate mem should be
2561 gcc_assert (GET_CODE (mem
) == MEM
);
2562 MEM_READONLY_P (mem
) = 1;
2563 MEM_NOTRAP_P (mem
) = 1;
2568 case SYMBOL_SMALL_GOT_4G
:
2570 /* In ILP32, the mode of dest can be either SImode or DImode,
2571 while the got entry is always of SImode size. The mode of
2572 dest depends on how dest is used: if dest is assigned to a
2573 pointer (e.g. in the memory), it has SImode; it may have
2574 DImode if dest is dereferenced to access the memeory.
2575 This is why we have to handle three different ldr_got_small
2576 patterns here (two patterns for ILP32). */
2581 machine_mode mode
= GET_MODE (dest
);
2583 if (can_create_pseudo_p ())
2584 tmp_reg
= gen_reg_rtx (mode
);
2586 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2587 if (mode
== ptr_mode
)
2590 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
2592 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
2594 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2598 gcc_assert (mode
== Pmode
);
2600 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
2601 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2604 gcc_assert (GET_CODE (mem
) == MEM
);
2605 MEM_READONLY_P (mem
) = 1;
2606 MEM_NOTRAP_P (mem
) = 1;
2611 case SYMBOL_SMALL_TLSGD
:
2614 /* The return type of __tls_get_addr is the C pointer type
2616 rtx result
= gen_rtx_REG (ptr_mode
, R0_REGNUM
);
2619 if (GET_MODE (dest
) != ptr_mode
)
2620 tmp_reg
= can_create_pseudo_p () ? gen_reg_rtx (ptr_mode
) : result
;
2623 if (ptr_mode
== SImode
)
2624 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
2626 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
2627 insns
= get_insns ();
2630 RTL_CONST_CALL_P (insns
) = 1;
2631 emit_libcall_block (insns
, tmp_reg
, result
, imm
);
2632 /* Convert back to the mode of the dest adding a zero_extend
2633 from SImode (ptr_mode) to DImode (Pmode). */
2634 if (dest
!= tmp_reg
)
2635 convert_move (dest
, tmp_reg
, true);
2639 case SYMBOL_SMALL_TLSDESC
:
2641 machine_mode mode
= GET_MODE (dest
);
2642 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
2645 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2647 /* In ILP32, the got entry is always of SImode size. Unlike
2648 small GOT, the dest is fixed at reg 0. */
2650 emit_insn (gen_tlsdesc_small_si (imm
));
2652 emit_insn (gen_tlsdesc_small_di (imm
));
2653 tp
= aarch64_load_tp (NULL
);
2656 tp
= gen_lowpart (mode
, tp
);
2658 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
2660 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2664 case SYMBOL_SMALL_TLSIE
:
2666 /* In ILP32, the mode of dest can be either SImode or DImode,
2667 while the got entry is always of SImode size. The mode of
2668 dest depends on how dest is used: if dest is assigned to a
2669 pointer (e.g. in the memory), it has SImode; it may have
2670 DImode if dest is dereferenced to access the memeory.
2671 This is why we have to handle three different tlsie_small
2672 patterns here (two patterns for ILP32). */
2673 machine_mode mode
= GET_MODE (dest
);
2674 rtx tmp_reg
= gen_reg_rtx (mode
);
2675 rtx tp
= aarch64_load_tp (NULL
);
2677 if (mode
== ptr_mode
)
2680 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
2683 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
2684 tp
= gen_lowpart (mode
, tp
);
2689 gcc_assert (mode
== Pmode
);
2690 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
2693 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
2695 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2699 case SYMBOL_TLSLE12
:
2700 case SYMBOL_TLSLE24
:
2701 case SYMBOL_TLSLE32
:
2702 case SYMBOL_TLSLE48
:
2704 machine_mode mode
= GET_MODE (dest
);
2705 rtx tp
= aarch64_load_tp (NULL
);
2708 tp
= gen_lowpart (mode
, tp
);
2712 case SYMBOL_TLSLE12
:
2713 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
2716 case SYMBOL_TLSLE24
:
2717 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
2720 case SYMBOL_TLSLE32
:
2721 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
2723 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2726 case SYMBOL_TLSLE48
:
2727 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
2729 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2737 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2741 case SYMBOL_TINY_GOT
:
2742 emit_insn (gen_ldr_got_tiny (dest
, imm
));
2745 case SYMBOL_TINY_TLSIE
:
2747 machine_mode mode
= GET_MODE (dest
);
2748 rtx tp
= aarch64_load_tp (NULL
);
2750 if (mode
== ptr_mode
)
2753 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
2756 tp
= gen_lowpart (mode
, tp
);
2757 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
2762 gcc_assert (mode
== Pmode
);
2763 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
2767 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2776 /* Emit a move from SRC to DEST. Assume that the move expanders can
2777 handle all moves if !can_create_pseudo_p (). The distinction is
2778 important because, unlike emit_move_insn, the move expanders know
2779 how to force Pmode objects into the constant pool even when the
2780 constant pool address is not itself legitimate. */
2782 aarch64_emit_move (rtx dest
, rtx src
)
2784 return (can_create_pseudo_p ()
2785 ? emit_move_insn (dest
, src
)
2786 : emit_move_insn_1 (dest
, src
));
2789 /* Apply UNOPTAB to OP and store the result in DEST. */
2792 aarch64_emit_unop (rtx dest
, optab unoptab
, rtx op
)
2794 rtx tmp
= expand_unop (GET_MODE (dest
), unoptab
, op
, dest
, 0);
2796 emit_move_insn (dest
, tmp
);
2799 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2802 aarch64_emit_binop (rtx dest
, optab binoptab
, rtx op0
, rtx op1
)
2804 rtx tmp
= expand_binop (GET_MODE (dest
), binoptab
, op0
, op1
, dest
, 0,
2807 emit_move_insn (dest
, tmp
);
2810 /* Split a 128-bit move operation into two 64-bit move operations,
2811 taking care to handle partial overlap of register to register
2812 copies. Special cases are needed when moving between GP regs and
2813 FP regs. SRC can be a register, constant or memory; DST a register
2814 or memory. If either operand is memory it must not have any side
2817 aarch64_split_128bit_move (rtx dst
, rtx src
)
2822 machine_mode mode
= GET_MODE (dst
);
2824 gcc_assert (mode
== TImode
|| mode
== TFmode
);
2825 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
2826 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
2828 if (REG_P (dst
) && REG_P (src
))
2830 int src_regno
= REGNO (src
);
2831 int dst_regno
= REGNO (dst
);
2833 /* Handle FP <-> GP regs. */
2834 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
2836 src_lo
= gen_lowpart (word_mode
, src
);
2837 src_hi
= gen_highpart (word_mode
, src
);
2839 emit_insn (gen_aarch64_movlow_di (mode
, dst
, src_lo
));
2840 emit_insn (gen_aarch64_movhigh_di (mode
, dst
, src_hi
));
2843 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
2845 dst_lo
= gen_lowpart (word_mode
, dst
);
2846 dst_hi
= gen_highpart (word_mode
, dst
);
2848 emit_insn (gen_aarch64_movdi_low (mode
, dst_lo
, src
));
2849 emit_insn (gen_aarch64_movdi_high (mode
, dst_hi
, src
));
2854 dst_lo
= gen_lowpart (word_mode
, dst
);
2855 dst_hi
= gen_highpart (word_mode
, dst
);
2856 src_lo
= gen_lowpart (word_mode
, src
);
2857 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
2859 /* At most one pairing may overlap. */
2860 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
2862 aarch64_emit_move (dst_hi
, src_hi
);
2863 aarch64_emit_move (dst_lo
, src_lo
);
2867 aarch64_emit_move (dst_lo
, src_lo
);
2868 aarch64_emit_move (dst_hi
, src_hi
);
2873 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
2875 return (! REG_P (src
)
2876 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
2879 /* Split a complex SIMD combine. */
2882 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
2884 machine_mode src_mode
= GET_MODE (src1
);
2885 machine_mode dst_mode
= GET_MODE (dst
);
2887 gcc_assert (VECTOR_MODE_P (dst_mode
));
2888 gcc_assert (register_operand (dst
, dst_mode
)
2889 && register_operand (src1
, src_mode
)
2890 && register_operand (src2
, src_mode
));
2892 emit_insn (gen_aarch64_simd_combine (src_mode
, dst
, src1
, src2
));
2896 /* Split a complex SIMD move. */
2899 aarch64_split_simd_move (rtx dst
, rtx src
)
2901 machine_mode src_mode
= GET_MODE (src
);
2902 machine_mode dst_mode
= GET_MODE (dst
);
2904 gcc_assert (VECTOR_MODE_P (dst_mode
));
2906 if (REG_P (dst
) && REG_P (src
))
2908 gcc_assert (VECTOR_MODE_P (src_mode
));
2909 emit_insn (gen_aarch64_split_simd_mov (src_mode
, dst
, src
));
2914 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
2915 machine_mode ymode
, rtx y
)
2917 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
2918 gcc_assert (r
!= NULL
);
2919 return rtx_equal_p (x
, r
);
2922 /* Return TARGET if it is nonnull and a register of mode MODE.
2923 Otherwise, return a fresh register of mode MODE if we can,
2924 or TARGET reinterpreted as MODE if we can't. */
2927 aarch64_target_reg (rtx target
, machine_mode mode
)
2929 if (target
&& REG_P (target
) && GET_MODE (target
) == mode
)
2931 if (!can_create_pseudo_p ())
2933 gcc_assert (target
);
2934 return gen_lowpart (mode
, target
);
2936 return gen_reg_rtx (mode
);
2939 /* Return a register that contains the constant in BUILDER, given that
2940 the constant is a legitimate move operand. Use TARGET as the register
2941 if it is nonnull and convenient. */
2944 aarch64_emit_set_immediate (rtx target
, rtx_vector_builder
&builder
)
2946 rtx src
= builder
.build ();
2947 target
= aarch64_target_reg (target
, GET_MODE (src
));
2948 emit_insn (gen_rtx_SET (target
, src
));
2953 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
2955 if (can_create_pseudo_p ())
2956 return force_reg (mode
, value
);
2960 aarch64_emit_move (x
, value
);
2965 /* Return true if predicate value X is a constant in which every element
2966 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2967 value, i.e. as a predicate in which all bits are significant. */
2970 aarch64_get_sve_pred_bits (rtx_vector_builder
&builder
, rtx x
)
2972 if (GET_CODE (x
) != CONST_VECTOR
)
2975 unsigned int factor
= vector_element_size (GET_MODE_NUNITS (VNx16BImode
),
2976 GET_MODE_NUNITS (GET_MODE (x
)));
2977 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (x
) * factor
;
2978 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (x
);
2979 builder
.new_vector (VNx16BImode
, npatterns
, nelts_per_pattern
);
2981 unsigned int nelts
= const_vector_encoded_nelts (x
);
2982 for (unsigned int i
= 0; i
< nelts
; ++i
)
2984 rtx elt
= CONST_VECTOR_ENCODED_ELT (x
, i
);
2985 if (!CONST_INT_P (elt
))
2988 builder
.quick_push (elt
);
2989 for (unsigned int j
= 1; j
< factor
; ++j
)
2990 builder
.quick_push (const0_rtx
);
2992 builder
.finalize ();
2996 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2997 widest predicate element size it can have (that is, the largest size
2998 for which each element would still be 0 or 1). */
3001 aarch64_widest_sve_pred_elt_size (rtx_vector_builder
&builder
)
3003 /* Start with the most optimistic assumption: that we only need
3004 one bit per pattern. This is what we will use if only the first
3005 bit in each pattern is ever set. */
3006 unsigned int mask
= GET_MODE_SIZE (DImode
);
3007 mask
|= builder
.npatterns ();
3009 /* Look for set bits. */
3010 unsigned int nelts
= builder
.encoded_nelts ();
3011 for (unsigned int i
= 1; i
< nelts
; ++i
)
3012 if (INTVAL (builder
.elt (i
)) != 0)
3018 return mask
& -mask
;
3021 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3022 return that predicate mode, otherwise return opt_machine_mode (). */
3025 aarch64_ptrue_all_mode (rtx x
)
3027 gcc_assert (GET_MODE (x
) == VNx16BImode
);
3028 if (GET_CODE (x
) != CONST_VECTOR
3029 || !CONST_VECTOR_DUPLICATE_P (x
)
3030 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x
, 0))
3031 || INTVAL (CONST_VECTOR_ENCODED_ELT (x
, 0)) == 0)
3032 return opt_machine_mode ();
3034 unsigned int nelts
= const_vector_encoded_nelts (x
);
3035 for (unsigned int i
= 1; i
< nelts
; ++i
)
3036 if (CONST_VECTOR_ENCODED_ELT (x
, i
) != const0_rtx
)
3037 return opt_machine_mode ();
3039 return aarch64_sve_pred_mode (nelts
);
3042 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3043 that the constant would have with predicate element size ELT_SIZE
3044 (ignoring the upper bits in each element) and return:
3046 * -1 if all bits are set
3047 * N if the predicate has N leading set bits followed by all clear bits
3048 * 0 if the predicate does not have any of these forms. */
3051 aarch64_partial_ptrue_length (rtx_vector_builder
&builder
,
3052 unsigned int elt_size
)
3054 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3055 followed by set bits. */
3056 if (builder
.nelts_per_pattern () == 3)
3059 /* Skip over leading set bits. */
3060 unsigned int nelts
= builder
.encoded_nelts ();
3062 for (; i
< nelts
; i
+= elt_size
)
3063 if (INTVAL (builder
.elt (i
)) == 0)
3065 unsigned int vl
= i
/ elt_size
;
3067 /* Check for the all-true case. */
3071 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3072 repeating pattern of set bits followed by clear bits. */
3073 if (builder
.nelts_per_pattern () != 2)
3076 /* We have a "foreground" value and a duplicated "background" value.
3077 If the background might repeat and the last set bit belongs to it,
3078 we might have set bits followed by clear bits followed by set bits. */
3079 if (i
> builder
.npatterns () && maybe_ne (nelts
, builder
.full_nelts ()))
3082 /* Make sure that the rest are all clear. */
3083 for (; i
< nelts
; i
+= elt_size
)
3084 if (INTVAL (builder
.elt (i
)) != 0)
3090 /* See if there is an svpattern that encodes an SVE predicate of mode
3091 PRED_MODE in which the first VL bits are set and the rest are clear.
3092 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3093 A VL of -1 indicates an all-true vector. */
3096 aarch64_svpattern_for_vl (machine_mode pred_mode
, int vl
)
3099 return AARCH64_SV_ALL
;
3101 if (maybe_gt (vl
, GET_MODE_NUNITS (pred_mode
)))
3102 return AARCH64_NUM_SVPATTERNS
;
3104 if (vl
>= 1 && vl
<= 8)
3105 return aarch64_svpattern (AARCH64_SV_VL1
+ (vl
- 1));
3107 if (vl
>= 16 && vl
<= 256 && pow2p_hwi (vl
))
3108 return aarch64_svpattern (AARCH64_SV_VL16
+ (exact_log2 (vl
) - 4));
3111 if (GET_MODE_NUNITS (pred_mode
).is_constant (&max_vl
))
3113 if (vl
== (max_vl
/ 3) * 3)
3114 return AARCH64_SV_MUL3
;
3115 /* These would only trigger for non-power-of-2 lengths. */
3116 if (vl
== (max_vl
& -4))
3117 return AARCH64_SV_MUL4
;
3118 if (vl
== (1 << floor_log2 (max_vl
)))
3119 return AARCH64_SV_POW2
;
3121 return AARCH64_SV_ALL
;
3123 return AARCH64_NUM_SVPATTERNS
;
3126 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3127 bits has the lowest bit set and the upper bits clear. This is the
3128 VNx16BImode equivalent of a PTRUE for controlling elements of
3129 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3130 all bits are significant, even the upper zeros. */
3133 aarch64_ptrue_all (unsigned int elt_size
)
3135 rtx_vector_builder
builder (VNx16BImode
, elt_size
, 1);
3136 builder
.quick_push (const1_rtx
);
3137 for (unsigned int i
= 1; i
< elt_size
; ++i
)
3138 builder
.quick_push (const0_rtx
);
3139 return builder
.build ();
3142 /* Return an all-true predicate register of mode MODE. */
3145 aarch64_ptrue_reg (machine_mode mode
)
3147 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
);
3148 rtx reg
= force_reg (VNx16BImode
, CONSTM1_RTX (VNx16BImode
));
3149 return gen_lowpart (mode
, reg
);
3152 /* Return an all-false predicate register of mode MODE. */
3155 aarch64_pfalse_reg (machine_mode mode
)
3157 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
);
3158 rtx reg
= force_reg (VNx16BImode
, CONST0_RTX (VNx16BImode
));
3159 return gen_lowpart (mode
, reg
);
3162 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
3163 true, or alternatively if we know that the operation predicated by
3164 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
3165 aarch64_sve_gp_strictness operand that describes the operation
3166 predicated by PRED1[0]. */
3169 aarch64_sve_pred_dominates_p (rtx
*pred1
, rtx pred2
)
3171 machine_mode mode
= GET_MODE (pred2
);
3172 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
3173 && mode
== GET_MODE (pred1
[0])
3174 && aarch64_sve_gp_strictness (pred1
[1], SImode
));
3175 return (pred1
[0] == CONSTM1_RTX (mode
)
3176 || INTVAL (pred1
[1]) == SVE_RELAXED_GP
3177 || rtx_equal_p (pred1
[0], pred2
));
3180 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3181 for it. PRED2[0] is the predicate for the instruction whose result
3182 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3183 for it. Return true if we can prove that the two predicates are
3184 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3185 with PRED1[0] without changing behavior. */
3188 aarch64_sve_same_pred_for_ptest_p (rtx
*pred1
, rtx
*pred2
)
3190 machine_mode mode
= GET_MODE (pred1
[0]);
3191 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
3192 && mode
== GET_MODE (pred2
[0])
3193 && aarch64_sve_ptrue_flag (pred1
[1], SImode
)
3194 && aarch64_sve_ptrue_flag (pred2
[1], SImode
));
3196 bool ptrue1_p
= (pred1
[0] == CONSTM1_RTX (mode
)
3197 || INTVAL (pred1
[1]) == SVE_KNOWN_PTRUE
);
3198 bool ptrue2_p
= (pred2
[0] == CONSTM1_RTX (mode
)
3199 || INTVAL (pred2
[1]) == SVE_KNOWN_PTRUE
);
3200 return (ptrue1_p
&& ptrue2_p
) || rtx_equal_p (pred1
[0], pred2
[0]);
3203 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
3204 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3205 Use TARGET as the target register if nonnull and convenient. */
3208 aarch64_sve_emit_int_cmp (rtx target
, machine_mode pred_mode
, rtx_code cmp
,
3209 machine_mode data_mode
, rtx op1
, rtx op2
)
3211 insn_code icode
= code_for_aarch64_pred_cmp (cmp
, data_mode
);
3212 expand_operand ops
[5];
3213 create_output_operand (&ops
[0], target
, pred_mode
);
3214 create_input_operand (&ops
[1], CONSTM1_RTX (pred_mode
), pred_mode
);
3215 create_integer_operand (&ops
[2], SVE_KNOWN_PTRUE
);
3216 create_input_operand (&ops
[3], op1
, data_mode
);
3217 create_input_operand (&ops
[4], op2
, data_mode
);
3218 expand_insn (icode
, 5, ops
);
3219 return ops
[0].value
;
3222 /* Use a comparison to convert integer vector SRC into MODE, which is
3223 the corresponding SVE predicate mode. Use TARGET for the result
3224 if it's nonnull and convenient. */
3227 aarch64_convert_sve_data_to_pred (rtx target
, machine_mode mode
, rtx src
)
3229 machine_mode src_mode
= GET_MODE (src
);
3230 return aarch64_sve_emit_int_cmp (target
, mode
, NE
, src_mode
,
3231 src
, CONST0_RTX (src_mode
));
3234 /* Return the assembly token for svprfop value PRFOP. */
3237 svprfop_token (enum aarch64_svprfop prfop
)
3241 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3242 AARCH64_FOR_SVPRFOP (CASE
)
3244 case AARCH64_NUM_SVPRFOPS
:
3250 /* Return the assembly string for an SVE prefetch operation with
3251 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3252 and that SUFFIX is the format for the remaining operands. */
3255 aarch64_output_sve_prefetch (const char *mnemonic
, rtx prfop_rtx
,
3258 static char buffer
[128];
3259 aarch64_svprfop prfop
= (aarch64_svprfop
) INTVAL (prfop_rtx
);
3260 unsigned int written
= snprintf (buffer
, sizeof (buffer
), "%s\t%s, %s",
3261 mnemonic
, svprfop_token (prfop
), suffix
);
3262 gcc_assert (written
< sizeof (buffer
));
3266 /* Check whether we can calculate the number of elements in PATTERN
3267 at compile time, given that there are NELTS_PER_VQ elements per
3268 128-bit block. Return the value if so, otherwise return -1. */
3271 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern
, unsigned int nelts_per_vq
)
3273 unsigned int vl
, const_vg
;
3274 if (pattern
>= AARCH64_SV_VL1
&& pattern
<= AARCH64_SV_VL8
)
3275 vl
= 1 + (pattern
- AARCH64_SV_VL1
);
3276 else if (pattern
>= AARCH64_SV_VL16
&& pattern
<= AARCH64_SV_VL256
)
3277 vl
= 16 << (pattern
- AARCH64_SV_VL16
);
3278 else if (aarch64_sve_vg
.is_constant (&const_vg
))
3280 /* There are two vector granules per quadword. */
3281 unsigned int nelts
= (const_vg
/ 2) * nelts_per_vq
;
3284 case AARCH64_SV_POW2
: return 1 << floor_log2 (nelts
);
3285 case AARCH64_SV_MUL4
: return nelts
& -4;
3286 case AARCH64_SV_MUL3
: return (nelts
/ 3) * 3;
3287 case AARCH64_SV_ALL
: return nelts
;
3288 default: gcc_unreachable ();
3294 /* There are two vector granules per quadword. */
3295 poly_uint64 nelts_all
= exact_div (aarch64_sve_vg
, 2) * nelts_per_vq
;
3296 if (known_le (vl
, nelts_all
))
3299 /* Requesting more elements than are available results in a PFALSE. */
3300 if (known_gt (vl
, nelts_all
))
3306 /* Return true if we can move VALUE into a register using a single
3307 CNT[BHWD] instruction. */
3310 aarch64_sve_cnt_immediate_p (poly_int64 value
)
3312 HOST_WIDE_INT factor
= value
.coeffs
[0];
3313 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3314 return (value
.coeffs
[1] == factor
3315 && IN_RANGE (factor
, 2, 16 * 16)
3316 && (factor
& 1) == 0
3317 && factor
<= 16 * (factor
& -factor
));
3320 /* Likewise for rtx X. */
3323 aarch64_sve_cnt_immediate_p (rtx x
)
3326 return poly_int_rtx_p (x
, &value
) && aarch64_sve_cnt_immediate_p (value
);
3329 /* Return the asm string for an instruction with a CNT-like vector size
3330 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3331 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3332 first part of the operands template (the part that comes before the
3333 vector size itself). PATTERN is the pattern to use. FACTOR is the
3334 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3335 in each quadword. If it is zero, we can use any element size. */
3338 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
3339 aarch64_svpattern pattern
,
3340 unsigned int factor
,
3341 unsigned int nelts_per_vq
)
3343 static char buffer
[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3345 if (nelts_per_vq
== 0)
3346 /* There is some overlap in the ranges of the four CNT instructions.
3347 Here we always use the smallest possible element size, so that the
3348 multiplier is 1 whereever possible. */
3349 nelts_per_vq
= factor
& -factor
;
3350 int shift
= std::min (exact_log2 (nelts_per_vq
), 4);
3351 gcc_assert (IN_RANGE (shift
, 1, 4));
3352 char suffix
= "dwhb"[shift
- 1];
3355 unsigned int written
;
3356 if (pattern
== AARCH64_SV_ALL
&& factor
== 1)
3357 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s",
3358 prefix
, suffix
, operands
);
3359 else if (factor
== 1)
3360 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, %s",
3361 prefix
, suffix
, operands
, svpattern_token (pattern
));
3363 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, %s, mul #%d",
3364 prefix
, suffix
, operands
, svpattern_token (pattern
),
3366 gcc_assert (written
< sizeof (buffer
));
3370 /* Return the asm string for an instruction with a CNT-like vector size
3371 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3372 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3373 first part of the operands template (the part that comes before the
3374 vector size itself). X is the value of the vector size operand,
3375 as a polynomial integer rtx; we need to convert this into an "all"
3376 pattern with a multiplier. */
3379 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
3382 poly_int64 value
= rtx_to_poly_int64 (x
);
3383 gcc_assert (aarch64_sve_cnt_immediate_p (value
));
3384 return aarch64_output_sve_cnt_immediate (prefix
, operands
, AARCH64_SV_ALL
,
3385 value
.coeffs
[1], 0);
3388 /* Return the asm string for an instruction with a CNT-like vector size
3389 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3390 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3391 first part of the operands template (the part that comes before the
3392 vector size itself). CNT_PAT[0..2] are the operands of the
3393 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3396 aarch64_output_sve_cnt_pat_immediate (const char *prefix
,
3397 const char *operands
, rtx
*cnt_pat
)
3399 aarch64_svpattern pattern
= (aarch64_svpattern
) INTVAL (cnt_pat
[0]);
3400 unsigned int nelts_per_vq
= INTVAL (cnt_pat
[1]);
3401 unsigned int factor
= INTVAL (cnt_pat
[2]) * nelts_per_vq
;
3402 return aarch64_output_sve_cnt_immediate (prefix
, operands
, pattern
,
3403 factor
, nelts_per_vq
);
3406 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3409 aarch64_sve_scalar_inc_dec_immediate_p (rtx x
)
3412 return (poly_int_rtx_p (x
, &value
)
3413 && (aarch64_sve_cnt_immediate_p (value
)
3414 || aarch64_sve_cnt_immediate_p (-value
)));
3417 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3421 aarch64_output_sve_scalar_inc_dec (rtx offset
)
3423 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
3424 gcc_assert (offset_value
.coeffs
[0] == offset_value
.coeffs
[1]);
3425 if (offset_value
.coeffs
[1] > 0)
3426 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL
,
3427 offset_value
.coeffs
[1], 0);
3429 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL
,
3430 -offset_value
.coeffs
[1], 0);
3433 /* Return true if we can add VALUE to a register using a single ADDVL
3434 or ADDPL instruction. */
3437 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value
)
3439 HOST_WIDE_INT factor
= value
.coeffs
[0];
3440 if (factor
== 0 || value
.coeffs
[1] != factor
)
3442 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3443 and a value of 16 is one vector width. */
3444 return (((factor
& 15) == 0 && IN_RANGE (factor
, -32 * 16, 31 * 16))
3445 || ((factor
& 1) == 0 && IN_RANGE (factor
, -32 * 2, 31 * 2)));
3448 /* Likewise for rtx X. */
3451 aarch64_sve_addvl_addpl_immediate_p (rtx x
)
3454 return (poly_int_rtx_p (x
, &value
)
3455 && aarch64_sve_addvl_addpl_immediate_p (value
));
3458 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3459 to operand 1 and storing the result in operand 0. */
3462 aarch64_output_sve_addvl_addpl (rtx offset
)
3464 static char buffer
[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3465 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
3466 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value
));
3468 int factor
= offset_value
.coeffs
[1];
3469 if ((factor
& 15) == 0)
3470 snprintf (buffer
, sizeof (buffer
), "addvl\t%%x0, %%x1, #%d", factor
/ 16);
3472 snprintf (buffer
, sizeof (buffer
), "addpl\t%%x0, %%x1, #%d", factor
/ 2);
3476 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3477 instruction. If it is, store the number of elements in each vector
3478 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3479 factor in *FACTOR_OUT (if nonnull). */
3482 aarch64_sve_vector_inc_dec_immediate_p (rtx x
, int *factor_out
,
3483 unsigned int *nelts_per_vq_out
)
3488 if (!const_vec_duplicate_p (x
, &elt
)
3489 || !poly_int_rtx_p (elt
, &value
))
3492 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x
));
3493 if (nelts_per_vq
!= 8 && nelts_per_vq
!= 4 && nelts_per_vq
!= 2)
3494 /* There's no vector INCB. */
3497 HOST_WIDE_INT factor
= value
.coeffs
[0];
3498 if (value
.coeffs
[1] != factor
)
3501 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3502 if ((factor
% nelts_per_vq
) != 0
3503 || !IN_RANGE (abs (factor
), nelts_per_vq
, 16 * nelts_per_vq
))
3507 *factor_out
= factor
;
3508 if (nelts_per_vq_out
)
3509 *nelts_per_vq_out
= nelts_per_vq
;
3513 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3517 aarch64_sve_vector_inc_dec_immediate_p (rtx x
)
3519 return aarch64_sve_vector_inc_dec_immediate_p (x
, NULL
, NULL
);
3522 /* Return the asm template for an SVE vector INC or DEC instruction.
3523 OPERANDS gives the operands before the vector count and X is the
3524 value of the vector count operand itself. */
3527 aarch64_output_sve_vector_inc_dec (const char *operands
, rtx x
)
3530 unsigned int nelts_per_vq
;
3531 if (!aarch64_sve_vector_inc_dec_immediate_p (x
, &factor
, &nelts_per_vq
))
3534 return aarch64_output_sve_cnt_immediate ("dec", operands
, AARCH64_SV_ALL
,
3535 -factor
, nelts_per_vq
);
3537 return aarch64_output_sve_cnt_immediate ("inc", operands
, AARCH64_SV_ALL
,
3538 factor
, nelts_per_vq
);
3542 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
3543 scalar_int_mode mode
)
3546 unsigned HOST_WIDE_INT val
, val2
, mask
;
3547 int one_match
, zero_match
;
3552 if (aarch64_move_imm (val
, mode
))
3555 emit_insn (gen_rtx_SET (dest
, imm
));
3559 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3560 (with XXXX non-zero). In that case check to see if the move can be done in
3562 val2
= val
& 0xffffffff;
3564 && aarch64_move_imm (val2
, SImode
)
3565 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
3568 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
3570 /* Check if we have to emit a second instruction by checking to see
3571 if any of the upper 32 bits of the original DI mode value is set. */
3575 i
= (val
>> 48) ? 48 : 32;
3578 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3579 GEN_INT ((val
>> i
) & 0xffff)));
3584 if ((val
>> 32) == 0 || mode
== SImode
)
3588 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
3590 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
3591 GEN_INT ((val
>> 16) & 0xffff)));
3593 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
3594 GEN_INT ((val
>> 16) & 0xffff)));
3599 /* Remaining cases are all for DImode. */
3602 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
3603 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
3604 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
3605 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
3607 if (zero_match
!= 2 && one_match
!= 2)
3609 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3610 For a 64-bit bitmask try whether changing 16 bits to all ones or
3611 zeroes creates a valid bitmask. To check any repeated bitmask,
3612 try using 16 bits from the other 32-bit half of val. */
3614 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
3617 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3620 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3622 val2
= val2
& ~mask
;
3623 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
3624 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3631 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
3632 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3633 GEN_INT ((val
>> i
) & 0xffff)));
3639 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3640 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3641 otherwise skip zero bits. */
3645 val2
= one_match
> zero_match
? ~val
: val
;
3646 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
3649 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
3650 ? (val
| ~(mask
<< i
))
3651 : (val
& (mask
<< i
)))));
3652 for (i
+= 16; i
< 64; i
+= 16)
3654 if ((val2
& (mask
<< i
)) == 0)
3657 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3658 GEN_INT ((val
>> i
) & 0xffff)));
3665 /* Return whether imm is a 128-bit immediate which is simple enough to
3668 aarch64_mov128_immediate (rtx imm
)
3670 if (GET_CODE (imm
) == CONST_INT
)
3673 gcc_assert (CONST_WIDE_INT_NUNITS (imm
) == 2);
3675 rtx lo
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 0));
3676 rtx hi
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 1));
3678 return aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, DImode
)
3679 + aarch64_internal_mov_immediate (NULL_RTX
, hi
, false, DImode
) <= 4;
3683 /* Return the number of temporary registers that aarch64_add_offset_1
3684 would need to add OFFSET to a register. */
3687 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset
)
3689 return abs_hwi (offset
) < 0x1000000 ? 0 : 1;
3692 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3693 a non-polynomial OFFSET. MODE is the mode of the addition.
3694 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3695 be set and CFA adjustments added to the generated instructions.
3697 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3698 temporary if register allocation is already complete. This temporary
3699 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3700 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3701 the immediate again.
3703 Since this function may be used to adjust the stack pointer, we must
3704 ensure that it cannot cause transient stack deallocation (for example
3705 by first incrementing SP and then decrementing when adjusting by a
3706 large immediate). */
3709 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
3710 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
3711 bool frame_related_p
, bool emit_move_imm
)
3713 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3714 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
3716 unsigned HOST_WIDE_INT moffset
= absu_hwi (offset
);
3721 if (!rtx_equal_p (dest
, src
))
3723 insn
= emit_insn (gen_rtx_SET (dest
, src
));
3724 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3729 /* Single instruction adjustment. */
3730 if (aarch64_uimm12_shift (moffset
))
3732 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
3733 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3737 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3740 a) the offset cannot be loaded by a 16-bit move or
3741 b) there is no spare register into which we can move it. */
3742 if (moffset
< 0x1000000
3743 && ((!temp1
&& !can_create_pseudo_p ())
3744 || !aarch64_move_imm (moffset
, mode
)))
3746 HOST_WIDE_INT low_off
= moffset
& 0xfff;
3748 low_off
= offset
< 0 ? -low_off
: low_off
;
3749 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
3750 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3751 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
3752 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3756 /* Emit a move immediate if required and an addition/subtraction. */
3759 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
3760 temp1
= aarch64_force_temporary (mode
, temp1
,
3761 gen_int_mode (moffset
, mode
));
3763 insn
= emit_insn (offset
< 0
3764 ? gen_sub3_insn (dest
, src
, temp1
)
3765 : gen_add3_insn (dest
, src
, temp1
));
3766 if (frame_related_p
)
3768 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3769 rtx adj
= plus_constant (mode
, src
, offset
);
3770 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
3774 /* Return the number of temporary registers that aarch64_add_offset
3775 would need to move OFFSET into a register or add OFFSET to a register;
3776 ADD_P is true if we want the latter rather than the former. */
3779 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
3781 /* This follows the same structure as aarch64_add_offset. */
3782 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
3785 unsigned int count
= 0;
3786 HOST_WIDE_INT factor
= offset
.coeffs
[1];
3787 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
3788 poly_int64
poly_offset (factor
, factor
);
3789 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
3790 /* Need one register for the ADDVL/ADDPL result. */
3792 else if (factor
!= 0)
3794 factor
= abs (factor
);
3795 if (factor
> 16 * (factor
& -factor
))
3796 /* Need one register for the CNT result and one for the multiplication
3797 factor. If necessary, the second temporary can be reused for the
3798 constant part of the offset. */
3800 /* Need one register for the CNT result (which might then
3804 return count
+ aarch64_add_offset_1_temporaries (constant
);
3807 /* If X can be represented as a poly_int64, return the number
3808 of temporaries that are required to add it to a register.
3809 Return -1 otherwise. */
3812 aarch64_add_offset_temporaries (rtx x
)
3815 if (!poly_int_rtx_p (x
, &offset
))
3817 return aarch64_offset_temporaries (true, offset
);
3820 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3821 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3822 be set and CFA adjustments added to the generated instructions.
3824 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3825 temporary if register allocation is already complete. This temporary
3826 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3827 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3828 false to avoid emitting the immediate again.
3830 TEMP2, if nonnull, is a second temporary register that doesn't
3831 overlap either DEST or REG.
3833 Since this function may be used to adjust the stack pointer, we must
3834 ensure that it cannot cause transient stack deallocation (for example
3835 by first incrementing SP and then decrementing when adjusting by a
3836 large immediate). */
3839 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
3840 poly_int64 offset
, rtx temp1
, rtx temp2
,
3841 bool frame_related_p
, bool emit_move_imm
= true)
3843 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3844 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
3845 gcc_assert (temp1
== NULL_RTX
3847 || !reg_overlap_mentioned_p (temp1
, dest
));
3848 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
3850 /* Try using ADDVL or ADDPL to add the whole value. */
3851 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
3853 rtx offset_rtx
= gen_int_mode (offset
, mode
);
3854 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3855 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3859 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3860 SVE vector register, over and above the minimum size of 128 bits.
3861 This is equivalent to half the value returned by CNTD with a
3862 vector shape of ALL. */
3863 HOST_WIDE_INT factor
= offset
.coeffs
[1];
3864 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
3866 /* Try using ADDVL or ADDPL to add the VG-based part. */
3867 poly_int64
poly_offset (factor
, factor
);
3868 if (src
!= const0_rtx
3869 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
3871 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
3872 if (frame_related_p
)
3874 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3875 RTX_FRAME_RELATED_P (insn
) = true;
3880 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
3881 src
= aarch64_force_temporary (mode
, temp1
, addr
);
3886 /* Otherwise use a CNT-based sequence. */
3887 else if (factor
!= 0)
3889 /* Use a subtraction if we have a negative factor. */
3890 rtx_code code
= PLUS
;
3897 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3898 into the multiplication. */
3902 /* Use a right shift by 1. */
3906 HOST_WIDE_INT low_bit
= factor
& -factor
;
3907 if (factor
<= 16 * low_bit
)
3909 if (factor
> 16 * 8)
3911 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3912 the value with the minimum multiplier and shift it into
3914 int extra_shift
= exact_log2 (low_bit
);
3915 shift
+= extra_shift
;
3916 factor
>>= extra_shift
;
3918 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
3922 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3923 directly, since that should increase the chances of being
3924 able to use a shift and add sequence. If LOW_BIT itself
3925 is out of range, just use CNTD. */
3926 if (low_bit
<= 16 * 8)
3931 val
= gen_int_mode (poly_int64 (low_bit
* 2, low_bit
* 2), mode
);
3932 val
= aarch64_force_temporary (mode
, temp1
, val
);
3934 if (can_create_pseudo_p ())
3936 rtx coeff1
= gen_int_mode (factor
, mode
);
3937 val
= expand_mult (mode
, val
, coeff1
, NULL_RTX
, false, true);
3941 /* Go back to using a negative multiplication factor if we have
3942 no register from which to subtract. */
3943 if (code
== MINUS
&& src
== const0_rtx
)
3948 rtx coeff1
= gen_int_mode (factor
, mode
);
3949 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
3950 val
= gen_rtx_MULT (mode
, val
, coeff1
);
3956 /* Multiply by 1 << SHIFT. */
3957 val
= aarch64_force_temporary (mode
, temp1
, val
);
3958 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
3960 else if (shift
== -1)
3963 val
= aarch64_force_temporary (mode
, temp1
, val
);
3964 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
3967 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3968 if (src
!= const0_rtx
)
3970 val
= aarch64_force_temporary (mode
, temp1
, val
);
3971 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
3973 else if (code
== MINUS
)
3975 val
= aarch64_force_temporary (mode
, temp1
, val
);
3976 val
= gen_rtx_NEG (mode
, val
);
3979 if (constant
== 0 || frame_related_p
)
3981 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
3982 if (frame_related_p
)
3984 RTX_FRAME_RELATED_P (insn
) = true;
3985 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
3986 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
3995 src
= aarch64_force_temporary (mode
, temp1
, val
);
4000 emit_move_imm
= true;
4003 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
4004 frame_related_p
, emit_move_imm
);
4007 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4008 than a poly_int64. */
4011 aarch64_split_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
4012 rtx offset_rtx
, rtx temp1
, rtx temp2
)
4014 aarch64_add_offset (mode
, dest
, src
, rtx_to_poly_int64 (offset_rtx
),
4015 temp1
, temp2
, false);
4018 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4019 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
4020 if TEMP1 already contains abs (DELTA). */
4023 aarch64_add_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool emit_move_imm
)
4025 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, delta
,
4026 temp1
, temp2
, true, emit_move_imm
);
4029 /* Subtract DELTA from the stack pointer, marking the instructions
4030 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
4034 aarch64_sub_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool frame_related_p
,
4035 bool emit_move_imm
= true)
4037 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, -delta
,
4038 temp1
, temp2
, frame_related_p
, emit_move_imm
);
4041 /* Set DEST to (vec_series BASE STEP). */
4044 aarch64_expand_vec_series (rtx dest
, rtx base
, rtx step
)
4046 machine_mode mode
= GET_MODE (dest
);
4047 scalar_mode inner
= GET_MODE_INNER (mode
);
4049 /* Each operand can be a register or an immediate in the range [-16, 15]. */
4050 if (!aarch64_sve_index_immediate_p (base
))
4051 base
= force_reg (inner
, base
);
4052 if (!aarch64_sve_index_immediate_p (step
))
4053 step
= force_reg (inner
, step
);
4055 emit_set_insn (dest
, gen_rtx_VEC_SERIES (mode
, base
, step
));
4058 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4059 register of mode MODE. Use TARGET for the result if it's nonnull
4062 The two vector modes must have the same element mode. The behavior
4063 is to duplicate architectural lane N of SRC into architectural lanes
4064 N + I * STEP of the result. On big-endian targets, architectural
4065 lane 0 of an Advanced SIMD vector is the last element of the vector
4066 in memory layout, so for big-endian targets this operation has the
4067 effect of reversing SRC before duplicating it. Callers need to
4068 account for this. */
4071 aarch64_expand_sve_dupq (rtx target
, machine_mode mode
, rtx src
)
4073 machine_mode src_mode
= GET_MODE (src
);
4074 gcc_assert (GET_MODE_INNER (mode
) == GET_MODE_INNER (src_mode
));
4075 insn_code icode
= (BYTES_BIG_ENDIAN
4076 ? code_for_aarch64_vec_duplicate_vq_be (mode
)
4077 : code_for_aarch64_vec_duplicate_vq_le (mode
));
4080 expand_operand ops
[3];
4081 create_output_operand (&ops
[i
++], target
, mode
);
4082 create_output_operand (&ops
[i
++], src
, src_mode
);
4083 if (BYTES_BIG_ENDIAN
)
4085 /* Create a PARALLEL describing the reversal of SRC. */
4086 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (mode
);
4087 rtx sel
= aarch64_gen_stepped_int_parallel (nelts_per_vq
,
4088 nelts_per_vq
- 1, -1);
4089 create_fixed_operand (&ops
[i
++], sel
);
4091 expand_insn (icode
, i
, ops
);
4092 return ops
[0].value
;
4095 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4096 the memory image into DEST. Return true on success. */
4099 aarch64_expand_sve_ld1rq (rtx dest
, rtx src
)
4101 src
= force_const_mem (GET_MODE (src
), src
);
4105 /* Make sure that the address is legitimate. */
4106 if (!aarch64_sve_ld1rq_operand_p (src
))
4108 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
4109 src
= replace_equiv_address (src
, addr
);
4112 machine_mode mode
= GET_MODE (dest
);
4113 machine_mode pred_mode
= aarch64_sve_pred_mode (mode
);
4114 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
4115 emit_insn (gen_aarch64_sve_ld1rq (mode
, dest
, src
, ptrue
));
4119 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4120 SVE data mode and isn't a legitimate constant. Use TARGET for the
4121 result if convenient.
4123 The returned register can have whatever mode seems most natural
4124 given the contents of SRC. */
4127 aarch64_expand_sve_const_vector (rtx target
, rtx src
)
4129 machine_mode mode
= GET_MODE (src
);
4130 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
4131 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
4132 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
4133 unsigned int elt_bits
= GET_MODE_BITSIZE (elt_mode
);
4134 unsigned int container_bits
= aarch64_sve_container_bits (mode
);
4135 unsigned int encoded_bits
= npatterns
* nelts_per_pattern
* container_bits
;
4137 if (nelts_per_pattern
== 1
4138 && encoded_bits
<= 128
4139 && container_bits
!= elt_bits
)
4141 /* We have a partial vector mode and a constant whose full-vector
4142 equivalent would occupy a repeating 128-bit sequence. Build that
4143 full-vector equivalent instead, so that we have the option of
4144 using LD1RQ and Advanced SIMD operations. */
4145 unsigned int repeat
= container_bits
/ elt_bits
;
4146 machine_mode full_mode
= aarch64_full_sve_mode (elt_mode
).require ();
4147 rtx_vector_builder
builder (full_mode
, npatterns
* repeat
, 1);
4148 for (unsigned int i
= 0; i
< npatterns
; ++i
)
4149 for (unsigned int j
= 0; j
< repeat
; ++j
)
4150 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
));
4151 target
= aarch64_target_reg (target
, full_mode
);
4152 return aarch64_expand_sve_const_vector (target
, builder
.build ());
4155 if (nelts_per_pattern
== 1 && encoded_bits
== 128)
4157 /* The constant is a duplicated quadword but can't be narrowed
4158 beyond a quadword. Get the memory image of the first quadword
4159 as a 128-bit vector and try using LD1RQ to load it from memory.
4161 The effect for both endiannesses is to load memory lane N into
4162 architectural lanes N + I * STEP of the result. On big-endian
4163 targets, the layout of the 128-bit vector in an Advanced SIMD
4164 register would be different from its layout in an SVE register,
4165 but this 128-bit vector is a memory value only. */
4166 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
4167 rtx vq_value
= simplify_gen_subreg (vq_mode
, src
, mode
, 0);
4168 if (vq_value
&& aarch64_expand_sve_ld1rq (target
, vq_value
))
4172 if (nelts_per_pattern
== 1 && encoded_bits
< 128)
4174 /* The vector is a repeating sequence of 64 bits or fewer.
4175 See if we can load them using an Advanced SIMD move and then
4176 duplicate it to fill a vector. This is better than using a GPR
4177 move because it keeps everything in the same register file. */
4178 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
4179 rtx_vector_builder
builder (vq_mode
, npatterns
, 1);
4180 for (unsigned int i
= 0; i
< npatterns
; ++i
)
4182 /* We want memory lane N to go into architectural lane N,
4183 so reverse for big-endian targets. The DUP .Q pattern
4184 has a compensating reverse built-in. */
4185 unsigned int srci
= BYTES_BIG_ENDIAN
? npatterns
- i
- 1 : i
;
4186 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, srci
));
4188 rtx vq_src
= builder
.build ();
4189 if (aarch64_simd_valid_immediate (vq_src
, NULL
))
4191 vq_src
= force_reg (vq_mode
, vq_src
);
4192 return aarch64_expand_sve_dupq (target
, mode
, vq_src
);
4195 /* Get an integer representation of the repeating part of Advanced
4196 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4197 which for big-endian targets is lane-swapped wrt a normal
4198 Advanced SIMD vector. This means that for both endiannesses,
4199 memory lane N of SVE vector SRC corresponds to architectural
4200 lane N of a register holding VQ_SRC. This in turn means that
4201 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4202 as a single 128-bit value) and thus that memory lane 0 of SRC is
4203 in the lsb of the integer. Duplicating the integer therefore
4204 ensures that memory lane N of SRC goes into architectural lane
4205 N + I * INDEX of the SVE register. */
4206 scalar_mode int_mode
= int_mode_for_size (encoded_bits
, 0).require ();
4207 rtx elt_value
= simplify_gen_subreg (int_mode
, vq_src
, vq_mode
, 0);
4210 /* Pretend that we had a vector of INT_MODE to start with. */
4211 elt_mode
= int_mode
;
4212 mode
= aarch64_full_sve_mode (int_mode
).require ();
4214 /* If the integer can be moved into a general register by a
4215 single instruction, do that and duplicate the result. */
4216 if (CONST_INT_P (elt_value
)
4217 && aarch64_move_imm (INTVAL (elt_value
), elt_mode
))
4219 elt_value
= force_reg (elt_mode
, elt_value
);
4220 return expand_vector_broadcast (mode
, elt_value
);
4223 else if (npatterns
== 1)
4224 /* We're duplicating a single value, but can't do better than
4225 force it to memory and load from there. This handles things
4226 like symbolic constants. */
4227 elt_value
= CONST_VECTOR_ENCODED_ELT (src
, 0);
4231 /* Load the element from memory if we can, otherwise move it into
4232 a register and use a DUP. */
4233 rtx op
= force_const_mem (elt_mode
, elt_value
);
4235 op
= force_reg (elt_mode
, elt_value
);
4236 return expand_vector_broadcast (mode
, op
);
4240 /* Try using INDEX. */
4242 if (const_vec_series_p (src
, &base
, &step
))
4244 aarch64_expand_vec_series (target
, base
, step
);
4248 /* From here on, it's better to force the whole constant to memory
4250 if (GET_MODE_NUNITS (mode
).is_constant ())
4253 /* Expand each pattern individually. */
4254 gcc_assert (npatterns
> 1);
4255 rtx_vector_builder builder
;
4256 auto_vec
<rtx
, 16> vectors (npatterns
);
4257 for (unsigned int i
= 0; i
< npatterns
; ++i
)
4259 builder
.new_vector (mode
, 1, nelts_per_pattern
);
4260 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
4261 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
4262 vectors
.quick_push (force_reg (mode
, builder
.build ()));
4265 /* Use permutes to interleave the separate vectors. */
4266 while (npatterns
> 1)
4269 for (unsigned int i
= 0; i
< npatterns
; ++i
)
4271 rtx tmp
= (npatterns
== 1 ? target
: gen_reg_rtx (mode
));
4272 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
4273 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
4277 gcc_assert (vectors
[0] == target
);
4281 /* Use WHILE to set a predicate register of mode MODE in which the first
4282 VL bits are set and the rest are clear. Use TARGET for the register
4283 if it's nonnull and convenient. */
4286 aarch64_sve_move_pred_via_while (rtx target
, machine_mode mode
,
4289 rtx limit
= force_reg (DImode
, gen_int_mode (vl
, DImode
));
4290 target
= aarch64_target_reg (target
, mode
);
4291 emit_insn (gen_while (UNSPEC_WHILELO
, DImode
, mode
,
4292 target
, const0_rtx
, limit
));
4297 aarch64_expand_sve_const_pred_1 (rtx
, rtx_vector_builder
&, bool);
4299 /* BUILDER is a constant predicate in which the index of every set bit
4300 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4301 by inverting every element at a multiple of ELT_SIZE and EORing the
4302 result with an ELT_SIZE PTRUE.
4304 Return a register that contains the constant on success, otherwise
4305 return null. Use TARGET as the register if it is nonnull and
4309 aarch64_expand_sve_const_pred_eor (rtx target
, rtx_vector_builder
&builder
,
4310 unsigned int elt_size
)
4312 /* Invert every element at a multiple of ELT_SIZE, keeping the
4314 rtx_vector_builder
inv_builder (VNx16BImode
, builder
.npatterns (),
4315 builder
.nelts_per_pattern ());
4316 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
4317 if ((i
& (elt_size
- 1)) == 0 && INTVAL (builder
.elt (i
)) == 0)
4318 inv_builder
.quick_push (const1_rtx
);
4320 inv_builder
.quick_push (const0_rtx
);
4321 inv_builder
.finalize ();
4323 /* See if we can load the constant cheaply. */
4324 rtx inv
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, inv_builder
, false);
4328 /* EOR the result with an ELT_SIZE PTRUE. */
4329 rtx mask
= aarch64_ptrue_all (elt_size
);
4330 mask
= force_reg (VNx16BImode
, mask
);
4331 target
= aarch64_target_reg (target
, VNx16BImode
);
4332 emit_insn (gen_aarch64_pred_z (XOR
, VNx16BImode
, target
, mask
, inv
, mask
));
4336 /* BUILDER is a constant predicate in which the index of every set bit
4337 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4338 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4339 register on success, otherwise return null. Use TARGET as the register
4340 if nonnull and convenient. */
4343 aarch64_expand_sve_const_pred_trn (rtx target
, rtx_vector_builder
&builder
,
4344 unsigned int elt_size
,
4345 unsigned int permute_size
)
4347 /* We're going to split the constant into two new constants A and B,
4348 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4349 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4351 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4352 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4354 where _ indicates elements that will be discarded by the permute.
4356 First calculate the ELT_SIZEs for A and B. */
4357 unsigned int a_elt_size
= GET_MODE_SIZE (DImode
);
4358 unsigned int b_elt_size
= GET_MODE_SIZE (DImode
);
4359 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); i
+= elt_size
)
4360 if (INTVAL (builder
.elt (i
)) != 0)
4362 if (i
& permute_size
)
4363 b_elt_size
|= i
- permute_size
;
4367 a_elt_size
&= -a_elt_size
;
4368 b_elt_size
&= -b_elt_size
;
4370 /* Now construct the vectors themselves. */
4371 rtx_vector_builder
a_builder (VNx16BImode
, builder
.npatterns (),
4372 builder
.nelts_per_pattern ());
4373 rtx_vector_builder
b_builder (VNx16BImode
, builder
.npatterns (),
4374 builder
.nelts_per_pattern ());
4375 unsigned int nelts
= builder
.encoded_nelts ();
4376 for (unsigned int i
= 0; i
< nelts
; ++i
)
4377 if (i
& (elt_size
- 1))
4379 a_builder
.quick_push (const0_rtx
);
4380 b_builder
.quick_push (const0_rtx
);
4382 else if ((i
& permute_size
) == 0)
4384 /* The A and B elements are significant. */
4385 a_builder
.quick_push (builder
.elt (i
));
4386 b_builder
.quick_push (builder
.elt (i
+ permute_size
));
4390 /* The A and B elements are going to be discarded, so pick whatever
4391 is likely to give a nice constant. We are targeting element
4392 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
4393 with the aim of each being a sequence of ones followed by
4394 a sequence of zeros. So:
4396 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
4397 duplicate the last X_ELT_SIZE element, to extend the
4398 current sequence of ones or zeros.
4400 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
4401 zero, so that the constant really does have X_ELT_SIZE and
4402 not a smaller size. */
4403 if (a_elt_size
> permute_size
)
4404 a_builder
.quick_push (const0_rtx
);
4406 a_builder
.quick_push (a_builder
.elt (i
- a_elt_size
));
4407 if (b_elt_size
> permute_size
)
4408 b_builder
.quick_push (const0_rtx
);
4410 b_builder
.quick_push (b_builder
.elt (i
- b_elt_size
));
4412 a_builder
.finalize ();
4413 b_builder
.finalize ();
4415 /* Try loading A into a register. */
4416 rtx_insn
*last
= get_last_insn ();
4417 rtx a
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, a_builder
, false);
4421 /* Try loading B into a register. */
4423 if (a_builder
!= b_builder
)
4425 b
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, b_builder
, false);
4428 delete_insns_since (last
);
4433 /* Emit the TRN1 itself. */
4434 machine_mode mode
= aarch64_sve_pred_mode (permute_size
).require ();
4435 target
= aarch64_target_reg (target
, mode
);
4436 emit_insn (gen_aarch64_sve (UNSPEC_TRN1
, mode
, target
,
4437 gen_lowpart (mode
, a
),
4438 gen_lowpart (mode
, b
)));
4442 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4443 constant in BUILDER into an SVE predicate register. Return the register
4444 on success, otherwise return null. Use TARGET for the register if
4445 nonnull and convenient.
4447 ALLOW_RECURSE_P is true if we can use methods that would call this
4448 function recursively. */
4451 aarch64_expand_sve_const_pred_1 (rtx target
, rtx_vector_builder
&builder
,
4452 bool allow_recurse_p
)
4454 if (builder
.encoded_nelts () == 1)
4455 /* A PFALSE or a PTRUE .B ALL. */
4456 return aarch64_emit_set_immediate (target
, builder
);
4458 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
4459 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
4461 /* If we can load the constant using PTRUE, use it as-is. */
4462 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
4463 if (aarch64_svpattern_for_vl (mode
, vl
) != AARCH64_NUM_SVPATTERNS
)
4464 return aarch64_emit_set_immediate (target
, builder
);
4466 /* Otherwise use WHILE to set the first VL bits. */
4467 return aarch64_sve_move_pred_via_while (target
, mode
, vl
);
4470 if (!allow_recurse_p
)
4473 /* Try inverting the vector in element size ELT_SIZE and then EORing
4474 the result with an ELT_SIZE PTRUE. */
4475 if (INTVAL (builder
.elt (0)) == 0)
4476 if (rtx res
= aarch64_expand_sve_const_pred_eor (target
, builder
,
4480 /* Try using TRN1 to permute two simpler constants. */
4481 for (unsigned int i
= elt_size
; i
<= 8; i
*= 2)
4482 if (rtx res
= aarch64_expand_sve_const_pred_trn (target
, builder
,
4489 /* Return an SVE predicate register that contains the VNx16BImode
4490 constant in BUILDER, without going through the move expanders.
4492 The returned register can have whatever mode seems most natural
4493 given the contents of BUILDER. Use TARGET for the result if
4497 aarch64_expand_sve_const_pred (rtx target
, rtx_vector_builder
&builder
)
4499 /* Try loading the constant using pure predicate operations. */
4500 if (rtx res
= aarch64_expand_sve_const_pred_1 (target
, builder
, true))
4503 /* Try forcing the constant to memory. */
4504 if (builder
.full_nelts ().is_constant ())
4505 if (rtx mem
= force_const_mem (VNx16BImode
, builder
.build ()))
4507 target
= aarch64_target_reg (target
, VNx16BImode
);
4508 emit_move_insn (target
, mem
);
4512 /* The last resort is to load the constant as an integer and then
4513 compare it against zero. Use -1 for set bits in order to increase
4514 the changes of using SVE DUPM or an Advanced SIMD byte mask. */
4515 rtx_vector_builder
int_builder (VNx16QImode
, builder
.npatterns (),
4516 builder
.nelts_per_pattern ());
4517 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
4518 int_builder
.quick_push (INTVAL (builder
.elt (i
))
4519 ? constm1_rtx
: const0_rtx
);
4520 return aarch64_convert_sve_data_to_pred (target
, VNx16BImode
,
4521 int_builder
.build ());
4524 /* Set DEST to immediate IMM. */
4527 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
4529 machine_mode mode
= GET_MODE (dest
);
4531 /* Check on what type of symbol it is. */
4532 scalar_int_mode int_mode
;
4533 if ((GET_CODE (imm
) == SYMBOL_REF
4534 || GET_CODE (imm
) == LABEL_REF
4535 || GET_CODE (imm
) == CONST
4536 || GET_CODE (imm
) == CONST_POLY_INT
)
4537 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
4541 HOST_WIDE_INT const_offset
;
4542 enum aarch64_symbol_type sty
;
4544 /* If we have (const (plus symbol offset)), separate out the offset
4545 before we start classifying the symbol. */
4546 rtx base
= strip_offset (imm
, &offset
);
4548 /* We must always add an offset involving VL separately, rather than
4549 folding it into the relocation. */
4550 if (!offset
.is_constant (&const_offset
))
4554 aarch64_report_sve_required ();
4557 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
4558 emit_insn (gen_rtx_SET (dest
, imm
));
4561 /* Do arithmetic on 32-bit values if the result is smaller
4563 if (partial_subreg_p (int_mode
, SImode
))
4565 /* It is invalid to do symbol calculations in modes
4566 narrower than SImode. */
4567 gcc_assert (base
== const0_rtx
);
4568 dest
= gen_lowpart (SImode
, dest
);
4571 if (base
!= const0_rtx
)
4573 base
= aarch64_force_temporary (int_mode
, dest
, base
);
4574 aarch64_add_offset (int_mode
, dest
, base
, offset
,
4575 NULL_RTX
, NULL_RTX
, false);
4578 aarch64_add_offset (int_mode
, dest
, base
, offset
,
4579 dest
, NULL_RTX
, false);
4584 sty
= aarch64_classify_symbol (base
, const_offset
);
4587 case SYMBOL_FORCE_TO_MEM
:
4588 if (const_offset
!= 0
4589 && targetm
.cannot_force_const_mem (int_mode
, imm
))
4591 gcc_assert (can_create_pseudo_p ());
4592 base
= aarch64_force_temporary (int_mode
, dest
, base
);
4593 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
4594 NULL_RTX
, NULL_RTX
, false);
4598 mem
= force_const_mem (ptr_mode
, imm
);
4601 /* If we aren't generating PC relative literals, then
4602 we need to expand the literal pool access carefully.
4603 This is something that needs to be done in a number
4604 of places, so could well live as a separate function. */
4605 if (!aarch64_pcrelative_literal_loads
)
4607 gcc_assert (can_create_pseudo_p ());
4608 base
= gen_reg_rtx (ptr_mode
);
4609 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
4610 if (ptr_mode
!= Pmode
)
4611 base
= convert_memory_address (Pmode
, base
);
4612 mem
= gen_rtx_MEM (ptr_mode
, base
);
4615 if (int_mode
!= ptr_mode
)
4616 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
4618 emit_insn (gen_rtx_SET (dest
, mem
));
4622 case SYMBOL_SMALL_TLSGD
:
4623 case SYMBOL_SMALL_TLSDESC
:
4624 case SYMBOL_SMALL_TLSIE
:
4625 case SYMBOL_SMALL_GOT_28K
:
4626 case SYMBOL_SMALL_GOT_4G
:
4627 case SYMBOL_TINY_GOT
:
4628 case SYMBOL_TINY_TLSIE
:
4629 if (const_offset
!= 0)
4631 gcc_assert(can_create_pseudo_p ());
4632 base
= aarch64_force_temporary (int_mode
, dest
, base
);
4633 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
4634 NULL_RTX
, NULL_RTX
, false);
4639 case SYMBOL_SMALL_ABSOLUTE
:
4640 case SYMBOL_TINY_ABSOLUTE
:
4641 case SYMBOL_TLSLE12
:
4642 case SYMBOL_TLSLE24
:
4643 case SYMBOL_TLSLE32
:
4644 case SYMBOL_TLSLE48
:
4645 aarch64_load_symref_appropriately (dest
, imm
, sty
);
4653 if (!CONST_INT_P (imm
))
4655 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
)
4657 /* Only the low bit of each .H, .S and .D element is defined,
4658 so we can set the upper bits to whatever we like. If the
4659 predicate is all-true in MODE, prefer to set all the undefined
4660 bits as well, so that we can share a single .B predicate for
4662 if (imm
== CONSTM1_RTX (mode
))
4663 imm
= CONSTM1_RTX (VNx16BImode
);
4665 /* All methods for constructing predicate modes wider than VNx16BI
4666 will set the upper bits of each element to zero. Expose this
4667 by moving such constants as a VNx16BI, so that all bits are
4668 significant and so that constants for different modes can be
4669 shared. The wider constant will still be available as a
4671 rtx_vector_builder builder
;
4672 if (aarch64_get_sve_pred_bits (builder
, imm
))
4674 rtx res
= aarch64_expand_sve_const_pred (dest
, builder
);
4676 emit_move_insn (dest
, gen_lowpart (mode
, res
));
4681 if (GET_CODE (imm
) == HIGH
4682 || aarch64_simd_valid_immediate (imm
, NULL
))
4684 emit_insn (gen_rtx_SET (dest
, imm
));
4688 if (GET_CODE (imm
) == CONST_VECTOR
&& aarch64_sve_data_mode_p (mode
))
4689 if (rtx res
= aarch64_expand_sve_const_vector (dest
, imm
))
4692 emit_insn (gen_aarch64_sve_reinterpret (mode
, dest
, res
));
4696 rtx mem
= force_const_mem (mode
, imm
);
4698 emit_move_insn (dest
, mem
);
4702 aarch64_internal_mov_immediate (dest
, imm
, true,
4703 as_a
<scalar_int_mode
> (mode
));
4706 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4707 that is known to contain PTRUE. */
4710 aarch64_emit_sve_pred_move (rtx dest
, rtx pred
, rtx src
)
4712 expand_operand ops
[3];
4713 machine_mode mode
= GET_MODE (dest
);
4714 create_output_operand (&ops
[0], dest
, mode
);
4715 create_input_operand (&ops
[1], pred
, GET_MODE(pred
));
4716 create_input_operand (&ops
[2], src
, mode
);
4717 temporary_volatile_ok
v (true);
4718 expand_insn (code_for_aarch64_pred_mov (mode
), 3, ops
);
4721 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4722 operand is in memory. In this case we need to use the predicated LD1
4723 and ST1 instead of LDR and STR, both for correctness on big-endian
4724 targets and because LD1 and ST1 support a wider range of addressing modes.
4725 PRED_MODE is the mode of the predicate.
4727 See the comment at the head of aarch64-sve.md for details about the
4728 big-endian handling. */
4731 aarch64_expand_sve_mem_move (rtx dest
, rtx src
, machine_mode pred_mode
)
4733 machine_mode mode
= GET_MODE (dest
);
4734 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
4735 if (!register_operand (src
, mode
)
4736 && !register_operand (dest
, mode
))
4738 rtx tmp
= gen_reg_rtx (mode
);
4740 aarch64_emit_sve_pred_move (tmp
, ptrue
, src
);
4742 emit_move_insn (tmp
, src
);
4745 aarch64_emit_sve_pred_move (dest
, ptrue
, src
);
4748 /* Called only on big-endian targets. See whether an SVE vector move
4749 from SRC to DEST is effectively a REV[BHW] instruction, because at
4750 least one operand is a subreg of an SVE vector that has wider or
4751 narrower elements. Return true and emit the instruction if so.
4755 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4757 represents a VIEW_CONVERT between the following vectors, viewed
4760 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4761 R1: { [0], [1], [2], [3], ... }
4763 The high part of lane X in R2 should therefore correspond to lane X*2
4764 of R1, but the register representations are:
4767 R2: ...... [1].high [1].low [0].high [0].low
4768 R1: ...... [3] [2] [1] [0]
4770 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4771 We therefore need a reverse operation to swap the high and low values
4774 This is purely an optimization. Without it we would spill the
4775 subreg operand to the stack in one mode and reload it in the
4776 other mode, which has the same effect as the REV. */
4779 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
4781 gcc_assert (BYTES_BIG_ENDIAN
);
4782 if (GET_CODE (dest
) == SUBREG
)
4783 dest
= SUBREG_REG (dest
);
4784 if (GET_CODE (src
) == SUBREG
)
4785 src
= SUBREG_REG (src
);
4787 /* The optimization handles two single SVE REGs with different element
4791 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
4792 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
4793 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
4794 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
4797 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4798 rtx ptrue
= aarch64_ptrue_reg (VNx16BImode
);
4799 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
4801 emit_insn (gen_rtx_SET (dest
, unspec
));
4805 /* Return a copy of X with mode MODE, without changing its other
4806 attributes. Unlike gen_lowpart, this doesn't care whether the
4807 mode change is valid. */
4810 aarch64_replace_reg_mode (rtx x
, machine_mode mode
)
4812 if (GET_MODE (x
) == mode
)
4815 x
= shallow_copy_rtx (x
);
4816 set_mode_and_regno (x
, mode
, REGNO (x
));
4820 /* Return the SVE REV[BHW] unspec for reversing quantites of mode MODE
4821 stored in wider integer containers. */
4824 aarch64_sve_rev_unspec (machine_mode mode
)
4826 switch (GET_MODE_UNIT_SIZE (mode
))
4828 case 1: return UNSPEC_REVB
;
4829 case 2: return UNSPEC_REVH
;
4830 case 4: return UNSPEC_REVW
;
4835 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4839 aarch64_split_sve_subreg_move (rtx dest
, rtx ptrue
, rtx src
)
4841 /* Decide which REV operation we need. The mode with wider elements
4842 determines the mode of the operands and the mode with the narrower
4843 elements determines the reverse width. */
4844 machine_mode mode_with_wider_elts
= aarch64_sve_int_mode (GET_MODE (dest
));
4845 machine_mode mode_with_narrower_elts
= aarch64_sve_int_mode (GET_MODE (src
));
4846 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts
)
4847 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts
))
4848 std::swap (mode_with_wider_elts
, mode_with_narrower_elts
);
4850 unsigned int unspec
= aarch64_sve_rev_unspec (mode_with_narrower_elts
);
4851 machine_mode pred_mode
= aarch64_sve_pred_mode (mode_with_wider_elts
);
4853 /* Get the operands in the appropriate modes and emit the instruction. */
4854 ptrue
= gen_lowpart (pred_mode
, ptrue
);
4855 dest
= aarch64_replace_reg_mode (dest
, mode_with_wider_elts
);
4856 src
= aarch64_replace_reg_mode (src
, mode_with_wider_elts
);
4857 emit_insn (gen_aarch64_pred (unspec
, mode_with_wider_elts
,
4862 aarch64_function_ok_for_sibcall (tree
, tree exp
)
4864 if (crtl
->abi
->id () != expr_callee_abi (exp
).id ())
4870 /* Implement TARGET_PASS_BY_REFERENCE. */
4873 aarch64_pass_by_reference (cumulative_args_t pcum_v
,
4874 const function_arg_info
&arg
)
4876 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4878 machine_mode dummymode
;
4881 unsigned int num_zr
, num_pr
;
4882 if (arg
.type
&& aarch64_sve::builtin_type_p (arg
.type
, &num_zr
, &num_pr
))
4884 if (pcum
&& !pcum
->silent_p
&& !TARGET_SVE
)
4885 /* We can't gracefully recover at this point, so make this a
4887 fatal_error (input_location
, "arguments of type %qT require"
4888 " the SVE ISA extension", arg
.type
);
4890 /* Variadic SVE types are passed by reference. Normal non-variadic
4891 arguments are too if we've run out of registers. */
4893 || pcum
->aapcs_nvrn
+ num_zr
> NUM_FP_ARG_REGS
4894 || pcum
->aapcs_nprn
+ num_pr
> NUM_PR_ARG_REGS
);
4897 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4898 if (arg
.mode
== BLKmode
&& arg
.type
)
4899 size
= int_size_in_bytes (arg
.type
);
4901 /* No frontends can create types with variable-sized modes, so we
4902 shouldn't be asked to pass or return them. */
4903 size
= GET_MODE_SIZE (arg
.mode
).to_constant ();
4905 /* Aggregates are passed by reference based on their size. */
4906 if (arg
.aggregate_type_p ())
4907 size
= int_size_in_bytes (arg
.type
);
4909 /* Variable sized arguments are always returned by reference. */
4913 /* Can this be a candidate to be passed in fp/simd register(s)? */
4914 if (aarch64_vfp_is_call_or_return_candidate (arg
.mode
, arg
.type
,
4919 /* Arguments which are variable sized or larger than 2 registers are
4920 passed by reference unless they are a homogenous floating point
4922 return size
> 2 * UNITS_PER_WORD
;
4925 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4927 aarch64_return_in_msb (const_tree valtype
)
4929 machine_mode dummy_mode
;
4932 /* Never happens in little-endian mode. */
4933 if (!BYTES_BIG_ENDIAN
)
4936 /* Only composite types smaller than or equal to 16 bytes can
4937 be potentially returned in registers. */
4938 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
4939 || int_size_in_bytes (valtype
) <= 0
4940 || int_size_in_bytes (valtype
) > 16)
4943 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4944 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4945 is always passed/returned in the least significant bits of fp/simd
4947 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
4948 &dummy_mode
, &dummy_int
, NULL
))
4954 /* Subroutine of aarch64_function_value. MODE is the mode of the argument
4955 after promotion, and after partial SVE types have been replaced by
4956 their integer equivalents. */
4958 aarch64_function_value_1 (const_tree type
, machine_mode mode
)
4960 unsigned int num_zr
, num_pr
;
4961 if (type
&& aarch64_sve::builtin_type_p (type
, &num_zr
, &num_pr
))
4963 /* Don't raise an error here if we're called when SVE is disabled,
4964 since this is really just a query function. Other code must
4965 do that where appropriate. */
4966 mode
= TYPE_MODE_RAW (type
);
4967 gcc_assert (VECTOR_MODE_P (mode
)
4968 && (!TARGET_SVE
|| aarch64_sve_mode_p (mode
)));
4970 if (num_zr
> 0 && num_pr
== 0)
4971 return gen_rtx_REG (mode
, V0_REGNUM
);
4973 if (num_zr
== 0 && num_pr
== 1)
4974 return gen_rtx_REG (mode
, P0_REGNUM
);
4979 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
4980 returned in memory, not by value. */
4981 gcc_assert (!aarch64_sve_mode_p (mode
));
4983 if (aarch64_return_in_msb (type
))
4985 HOST_WIDE_INT size
= int_size_in_bytes (type
);
4987 if (size
% UNITS_PER_WORD
!= 0)
4989 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
4990 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
4995 machine_mode ag_mode
;
4996 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
4997 &ag_mode
, &count
, NULL
))
4999 if (!aarch64_composite_type_p (type
, mode
))
5001 gcc_assert (count
== 1 && mode
== ag_mode
);
5002 return gen_rtx_REG (mode
, V0_REGNUM
);
5009 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
5010 for (i
= 0; i
< count
; i
++)
5012 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
5013 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
5014 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
5015 XVECEXP (par
, 0, i
) = tmp
;
5021 return gen_rtx_REG (mode
, R0_REGNUM
);
5024 /* Implement TARGET_FUNCTION_VALUE.
5025 Define how to find the value returned by a function. */
5028 aarch64_function_value (const_tree type
, const_tree func
,
5029 bool outgoing ATTRIBUTE_UNUSED
)
5034 mode
= TYPE_MODE (type
);
5035 if (INTEGRAL_TYPE_P (type
))
5036 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
5038 /* Vector types can acquire a partial SVE mode using things like
5039 __attribute__((vector_size(N))), and this is potentially useful.
5040 However, the choice of mode doesn't affect the type's ABI identity,
5041 so we should treat the types as though they had the associated
5042 integer mode, just like they did before SVE was introduced.
5044 We know that the vector must be 128 bits or smaller, otherwise we'd
5045 have returned it in memory instead. */
5046 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
5047 if ((vec_flags
& VEC_ANY_SVE
) && (vec_flags
& VEC_PARTIAL
))
5049 scalar_int_mode int_mode
= int_mode_for_mode (mode
).require ();
5050 rtx reg
= aarch64_function_value_1 (type
, int_mode
);
5051 /* Vector types are never returned in the MSB and are never split. */
5052 gcc_assert (REG_P (reg
) && GET_MODE (reg
) == int_mode
);
5053 rtx pair
= gen_rtx_EXPR_LIST (VOIDmode
, reg
, const0_rtx
);
5054 return gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, pair
));
5057 return aarch64_function_value_1 (type
, mode
);
5060 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
5061 Return true if REGNO is the number of a hard register in which the values
5062 of called function may come back. */
5065 aarch64_function_value_regno_p (const unsigned int regno
)
5067 /* Maximum of 16 bytes can be returned in the general registers. Examples
5068 of 16-byte return values are: 128-bit integers and 16-byte small
5069 structures (excluding homogeneous floating-point aggregates). */
5070 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
5073 /* Up to four fp/simd registers can return a function value, e.g. a
5074 homogeneous floating-point aggregate having four members. */
5075 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
5076 return TARGET_FLOAT
;
5081 /* Implement TARGET_RETURN_IN_MEMORY.
5083 If the type T of the result of a function is such that
5085 would require that arg be passed as a value in a register (or set of
5086 registers) according to the parameter passing rules, then the result
5087 is returned in the same registers as would be used for such an
5091 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
5094 machine_mode ag_mode
;
5097 if (!AGGREGATE_TYPE_P (type
)
5098 && TREE_CODE (type
) != COMPLEX_TYPE
5099 && TREE_CODE (type
) != VECTOR_TYPE
)
5100 /* Simple scalar types always returned in registers. */
5103 unsigned int num_zr
, num_pr
;
5104 if (type
&& aarch64_sve::builtin_type_p (type
, &num_zr
, &num_pr
))
5106 /* All SVE types we support fit in registers. For example, it isn't
5107 yet possible to define an aggregate of 9+ SVE vectors or 5+ SVE
5109 gcc_assert (num_zr
<= NUM_FP_ARG_REGS
&& num_pr
<= NUM_PR_ARG_REGS
);
5113 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
5120 /* Types larger than 2 registers returned in memory. */
5121 size
= int_size_in_bytes (type
);
5122 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
5126 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
5127 const_tree type
, int *nregs
)
5129 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
5130 return aarch64_vfp_is_call_or_return_candidate (mode
,
5132 &pcum
->aapcs_vfp_rmode
,
5137 /* Given MODE and TYPE of a function argument, return the alignment in
5138 bits. The idea is to suppress any stronger alignment requested by
5139 the user and opt for the natural alignment (specified in AAPCS64 \S
5140 4.1). ABI_BREAK is set to true if the alignment was incorrectly
5141 calculated in versions of GCC prior to GCC-9. This is a helper
5142 function for local use only. */
5145 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
,
5150 return GET_MODE_ALIGNMENT (mode
);
5152 if (integer_zerop (TYPE_SIZE (type
)))
5155 gcc_assert (TYPE_MODE (type
) == mode
);
5157 if (!AGGREGATE_TYPE_P (type
))
5158 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
5160 if (TREE_CODE (type
) == ARRAY_TYPE
)
5161 return TYPE_ALIGN (TREE_TYPE (type
));
5163 unsigned int alignment
= 0;
5164 unsigned int bitfield_alignment
= 0;
5165 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
5166 if (TREE_CODE (field
) == FIELD_DECL
)
5168 alignment
= std::max (alignment
, DECL_ALIGN (field
));
5169 if (DECL_BIT_FIELD_TYPE (field
))
5171 = std::max (bitfield_alignment
,
5172 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field
)));
5175 if (bitfield_alignment
> alignment
)
5178 return bitfield_alignment
;
5184 /* Layout a function argument according to the AAPCS64 rules. The rule
5185 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
5186 mode that was originally given to us by the target hook, whereas the
5187 mode in ARG might be the result of replacing partial SVE modes with
5188 the equivalent integer mode. */
5191 aarch64_layout_arg (cumulative_args_t pcum_v
, const function_arg_info
&arg
,
5192 machine_mode orig_mode
)
5194 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
5195 tree type
= arg
.type
;
5196 machine_mode mode
= arg
.mode
;
5197 int ncrn
, nvrn
, nregs
;
5198 bool allocate_ncrn
, allocate_nvrn
;
5202 /* We need to do this once per argument. */
5203 if (pcum
->aapcs_arg_processed
)
5206 /* Vector types can acquire a partial SVE mode using things like
5207 __attribute__((vector_size(N))), and this is potentially useful.
5208 However, the choice of mode doesn't affect the type's ABI identity,
5209 so we should treat the types as though they had the associated
5210 integer mode, just like they did before SVE was introduced.
5212 We know that the vector must be 128 bits or smaller, otherwise we'd
5213 have passed it by reference instead. */
5214 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
5215 if ((vec_flags
& VEC_ANY_SVE
) && (vec_flags
& VEC_PARTIAL
))
5217 function_arg_info tmp_arg
= arg
;
5218 tmp_arg
.mode
= int_mode_for_mode (mode
).require ();
5219 aarch64_layout_arg (pcum_v
, tmp_arg
, orig_mode
);
5220 if (rtx reg
= pcum
->aapcs_reg
)
5222 gcc_assert (REG_P (reg
) && GET_MODE (reg
) == tmp_arg
.mode
);
5223 rtx pair
= gen_rtx_EXPR_LIST (VOIDmode
, reg
, const0_rtx
);
5224 pcum
->aapcs_reg
= gen_rtx_PARALLEL (mode
, gen_rtvec (1, pair
));
5229 pcum
->aapcs_arg_processed
= true;
5231 unsigned int num_zr
, num_pr
;
5232 if (type
&& aarch64_sve::builtin_type_p (type
, &num_zr
, &num_pr
))
5234 /* The PCS says that it is invalid to pass an SVE value to an
5235 unprototyped function. There is no ABI-defined location we
5236 can return in this case, so we have no real choice but to raise
5237 an error immediately, even though this is only a query function. */
5238 if (arg
.named
&& pcum
->pcs_variant
!= ARM_PCS_SVE
)
5240 gcc_assert (!pcum
->silent_p
);
5241 error ("SVE type %qT cannot be passed to an unprototyped function",
5243 /* Avoid repeating the message, and avoid tripping the assert
5245 pcum
->pcs_variant
= ARM_PCS_SVE
;
5248 /* We would have converted the argument into pass-by-reference
5249 form if it didn't fit in registers. */
5250 pcum
->aapcs_nextnvrn
= pcum
->aapcs_nvrn
+ num_zr
;
5251 pcum
->aapcs_nextnprn
= pcum
->aapcs_nprn
+ num_pr
;
5252 gcc_assert (arg
.named
5253 && pcum
->pcs_variant
== ARM_PCS_SVE
5254 && aarch64_sve_mode_p (mode
)
5255 && pcum
->aapcs_nextnvrn
<= NUM_FP_ARG_REGS
5256 && pcum
->aapcs_nextnprn
<= NUM_PR_ARG_REGS
);
5258 if (num_zr
> 0 && num_pr
== 0)
5259 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ pcum
->aapcs_nvrn
);
5260 else if (num_zr
== 0 && num_pr
== 1)
5261 pcum
->aapcs_reg
= gen_rtx_REG (mode
, P0_REGNUM
+ pcum
->aapcs_nprn
);
5267 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
5268 passed by reference, not by value. */
5269 gcc_assert (!aarch64_sve_mode_p (mode
));
5271 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
5273 size
= int_size_in_bytes (type
);
5275 /* No frontends can create types with variable-sized modes, so we
5276 shouldn't be asked to pass or return them. */
5277 size
= GET_MODE_SIZE (mode
).to_constant ();
5278 size
= ROUND_UP (size
, UNITS_PER_WORD
);
5280 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
5281 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
5286 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
5287 The following code thus handles passing by SIMD/FP registers first. */
5289 nvrn
= pcum
->aapcs_nvrn
;
5291 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
5292 and homogenous short-vector aggregates (HVA). */
5295 if (!pcum
->silent_p
&& !TARGET_FLOAT
)
5296 aarch64_err_no_fpadvsimd (mode
);
5298 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
5300 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
5301 if (!aarch64_composite_type_p (type
, mode
))
5303 gcc_assert (nregs
== 1);
5304 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
5310 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
5311 for (i
= 0; i
< nregs
; i
++)
5313 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
5314 V0_REGNUM
+ nvrn
+ i
);
5315 rtx offset
= gen_int_mode
5316 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
5317 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
5318 XVECEXP (par
, 0, i
) = tmp
;
5320 pcum
->aapcs_reg
= par
;
5326 /* C.3 NSRN is set to 8. */
5327 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
5332 ncrn
= pcum
->aapcs_ncrn
;
5333 nregs
= size
/ UNITS_PER_WORD
;
5335 /* C6 - C9. though the sign and zero extension semantics are
5336 handled elsewhere. This is the case where the argument fits
5337 entirely general registers. */
5338 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
5340 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
5342 /* C.8 if the argument has an alignment of 16 then the NGRN is
5343 rounded up to the next even number. */
5346 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
5347 comparison is there because for > 16 * BITS_PER_UNIT
5348 alignment nregs should be > 2 and therefore it should be
5349 passed by reference rather than value. */
5350 && (aarch64_function_arg_alignment (orig_mode
, type
, &abi_break
)
5351 == 16 * BITS_PER_UNIT
))
5353 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
5354 inform (input_location
, "parameter passing for argument of type "
5355 "%qT changed in GCC 9.1", type
);
5357 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
5360 /* NREGS can be 0 when e.g. an empty structure is to be passed.
5361 A reg is still generated for it, but the caller should be smart
5362 enough not to use it. */
5363 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
5364 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
5370 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
5371 for (i
= 0; i
< nregs
; i
++)
5373 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
5374 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
5375 GEN_INT (i
* UNITS_PER_WORD
));
5376 XVECEXP (par
, 0, i
) = tmp
;
5378 pcum
->aapcs_reg
= par
;
5381 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
5386 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
5388 /* The argument is passed on stack; record the needed number of words for
5389 this argument and align the total size if necessary. */
5391 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
5393 if (aarch64_function_arg_alignment (orig_mode
, type
, &abi_break
)
5394 == 16 * BITS_PER_UNIT
)
5396 int new_size
= ROUND_UP (pcum
->aapcs_stack_size
, 16 / UNITS_PER_WORD
);
5397 if (pcum
->aapcs_stack_size
!= new_size
)
5399 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
5400 inform (input_location
, "parameter passing for argument of type "
5401 "%qT changed in GCC 9.1", type
);
5402 pcum
->aapcs_stack_size
= new_size
;
5408 /* Implement TARGET_FUNCTION_ARG. */
5411 aarch64_function_arg (cumulative_args_t pcum_v
, const function_arg_info
&arg
)
5413 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
5414 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
5415 || pcum
->pcs_variant
== ARM_PCS_SIMD
5416 || pcum
->pcs_variant
== ARM_PCS_SVE
);
5418 if (arg
.end_marker_p ())
5419 return gen_int_mode (pcum
->pcs_variant
, DImode
);
5421 aarch64_layout_arg (pcum_v
, arg
, arg
.mode
);
5422 return pcum
->aapcs_reg
;
5426 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
5428 rtx libname ATTRIBUTE_UNUSED
,
5429 const_tree fndecl ATTRIBUTE_UNUSED
,
5430 unsigned n_named ATTRIBUTE_UNUSED
,
5433 pcum
->aapcs_ncrn
= 0;
5434 pcum
->aapcs_nvrn
= 0;
5435 pcum
->aapcs_nprn
= 0;
5436 pcum
->aapcs_nextncrn
= 0;
5437 pcum
->aapcs_nextnvrn
= 0;
5438 pcum
->aapcs_nextnprn
= 0;
5440 pcum
->pcs_variant
= (arm_pcs
) fntype_abi (fntype
).id ();
5442 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
5443 pcum
->aapcs_reg
= NULL_RTX
;
5444 pcum
->aapcs_arg_processed
= false;
5445 pcum
->aapcs_stack_words
= 0;
5446 pcum
->aapcs_stack_size
= 0;
5447 pcum
->silent_p
= silent_p
;
5451 && fndecl
&& TREE_PUBLIC (fndecl
)
5452 && fntype
&& fntype
!= error_mark_node
)
5454 const_tree type
= TREE_TYPE (fntype
);
5455 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
5456 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
5457 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
5458 &mode
, &nregs
, NULL
))
5459 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
5464 && pcum
->pcs_variant
== ARM_PCS_SVE
)
5466 /* We can't gracefully recover at this point, so make this a
5469 fatal_error (input_location
, "%qE requires the SVE ISA extension",
5472 fatal_error (input_location
, "calls to functions of type %qT require"
5473 " the SVE ISA extension", fntype
);
5478 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
5479 const function_arg_info
&arg
)
5481 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
5482 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
5483 || pcum
->pcs_variant
== ARM_PCS_SIMD
5484 || pcum
->pcs_variant
== ARM_PCS_SVE
)
5486 aarch64_layout_arg (pcum_v
, arg
, arg
.mode
);
5487 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
5488 != (pcum
->aapcs_stack_words
!= 0));
5489 pcum
->aapcs_arg_processed
= false;
5490 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
5491 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
5492 pcum
->aapcs_nprn
= pcum
->aapcs_nextnprn
;
5493 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
5494 pcum
->aapcs_stack_words
= 0;
5495 pcum
->aapcs_reg
= NULL_RTX
;
5500 aarch64_function_arg_regno_p (unsigned regno
)
5502 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
5503 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
5506 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
5507 PARM_BOUNDARY bits of alignment, but will be given anything up
5508 to STACK_BOUNDARY bits if the type requires it. This makes sure
5509 that both before and after the layout of each argument, the Next
5510 Stacked Argument Address (NSAA) will have a minimum alignment of
5514 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
5517 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
,
5519 if (abi_break
& warn_psabi
)
5520 inform (input_location
, "parameter passing for argument of type "
5521 "%qT changed in GCC 9.1", type
);
5523 return MIN (MAX (alignment
, PARM_BOUNDARY
), STACK_BOUNDARY
);
5526 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
5528 static fixed_size_mode
5529 aarch64_get_reg_raw_mode (int regno
)
5531 if (TARGET_SVE
&& FP_REGNUM_P (regno
))
5532 /* Don't use the SVE part of the register for __builtin_apply and
5533 __builtin_return. The SVE registers aren't used by the normal PCS,
5534 so using them there would be a waste of time. The PCS extensions
5535 for SVE types are fundamentally incompatible with the
5536 __builtin_return/__builtin_apply interface. */
5537 return as_a
<fixed_size_mode
> (V16QImode
);
5538 return default_get_reg_raw_mode (regno
);
5541 /* Implement TARGET_FUNCTION_ARG_PADDING.
5543 Small aggregate types are placed in the lowest memory address.
5545 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
5547 static pad_direction
5548 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
5550 /* On little-endian targets, the least significant byte of every stack
5551 argument is passed at the lowest byte address of the stack slot. */
5552 if (!BYTES_BIG_ENDIAN
)
5555 /* Otherwise, integral, floating-point and pointer types are padded downward:
5556 the least significant byte of a stack argument is passed at the highest
5557 byte address of the stack slot. */
5559 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
5560 || POINTER_TYPE_P (type
))
5561 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
5562 return PAD_DOWNWARD
;
5564 /* Everything else padded upward, i.e. data in first byte of stack slot. */
5568 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
5570 It specifies padding for the last (may also be the only)
5571 element of a block move between registers and memory. If
5572 assuming the block is in the memory, padding upward means that
5573 the last element is padded after its highest significant byte,
5574 while in downward padding, the last element is padded at the
5575 its least significant byte side.
5577 Small aggregates and small complex types are always padded
5580 We don't need to worry about homogeneous floating-point or
5581 short-vector aggregates; their move is not affected by the
5582 padding direction determined here. Regardless of endianness,
5583 each element of such an aggregate is put in the least
5584 significant bits of a fp/simd register.
5586 Return !BYTES_BIG_ENDIAN if the least significant byte of the
5587 register has useful data, and return the opposite if the most
5588 significant byte does. */
5591 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
5592 bool first ATTRIBUTE_UNUSED
)
5595 /* Small composite types are always padded upward. */
5596 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
5600 size
= int_size_in_bytes (type
);
5602 /* No frontends can create types with variable-sized modes, so we
5603 shouldn't be asked to pass or return them. */
5604 size
= GET_MODE_SIZE (mode
).to_constant ();
5605 if (size
< 2 * UNITS_PER_WORD
)
5609 /* Otherwise, use the default padding. */
5610 return !BYTES_BIG_ENDIAN
;
5613 static scalar_int_mode
5614 aarch64_libgcc_cmp_return_mode (void)
5619 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5621 /* We use the 12-bit shifted immediate arithmetic instructions so values
5622 must be multiple of (1 << 12), i.e. 4096. */
5623 #define ARITH_FACTOR 4096
5625 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5626 #error Cannot use simple address calculation for stack probing
5629 /* The pair of scratch registers used for stack probing. */
5630 #define PROBE_STACK_FIRST_REG R9_REGNUM
5631 #define PROBE_STACK_SECOND_REG R10_REGNUM
5633 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5634 inclusive. These are offsets from the current stack pointer. */
5637 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
5640 if (!poly_size
.is_constant (&size
))
5642 sorry ("stack probes for SVE frames");
5646 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
5648 /* See the same assertion on PROBE_INTERVAL above. */
5649 gcc_assert ((first
% ARITH_FACTOR
) == 0);
5651 /* See if we have a constant small number of probes to generate. If so,
5652 that's the easy case. */
5653 if (size
<= PROBE_INTERVAL
)
5655 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
5657 emit_set_insn (reg1
,
5658 plus_constant (Pmode
,
5659 stack_pointer_rtx
, -(first
+ base
)));
5660 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
5663 /* The run-time loop is made up of 8 insns in the generic case while the
5664 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
5665 else if (size
<= 4 * PROBE_INTERVAL
)
5667 HOST_WIDE_INT i
, rem
;
5669 emit_set_insn (reg1
,
5670 plus_constant (Pmode
,
5672 -(first
+ PROBE_INTERVAL
)));
5673 emit_stack_probe (reg1
);
5675 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5676 it exceeds SIZE. If only two probes are needed, this will not
5677 generate any code. Then probe at FIRST + SIZE. */
5678 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
5680 emit_set_insn (reg1
,
5681 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
5682 emit_stack_probe (reg1
);
5685 rem
= size
- (i
- PROBE_INTERVAL
);
5688 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
5690 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
5691 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
5694 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
5697 /* Otherwise, do the same as above, but in a loop. Note that we must be
5698 extra careful with variables wrapping around because we might be at
5699 the very top (or the very bottom) of the address space and we have
5700 to be able to handle this case properly; in particular, we use an
5701 equality test for the loop condition. */
5704 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
5706 /* Step 1: round SIZE to the previous multiple of the interval. */
5708 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
5711 /* Step 2: compute initial and final value of the loop counter. */
5713 /* TEST_ADDR = SP + FIRST. */
5714 emit_set_insn (reg1
,
5715 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
5717 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5718 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
5719 if (! aarch64_uimm12_shift (adjustment
))
5721 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
5723 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
5726 emit_set_insn (reg2
,
5727 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
5733 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5736 while (TEST_ADDR != LAST_ADDR)
5738 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5739 until it is equal to ROUNDED_SIZE. */
5741 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
5744 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5745 that SIZE is equal to ROUNDED_SIZE. */
5747 if (size
!= rounded_size
)
5749 HOST_WIDE_INT rem
= size
- rounded_size
;
5753 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
5755 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
5756 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
5759 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
5763 /* Make sure nothing is scheduled before we are done. */
5764 emit_insn (gen_blockage ());
5767 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5768 absolute addresses. */
5771 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
5773 static int labelno
= 0;
5777 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
5780 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
5782 HOST_WIDE_INT stack_clash_probe_interval
5783 = 1 << param_stack_clash_protection_guard_size
;
5785 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5787 HOST_WIDE_INT interval
;
5788 if (flag_stack_clash_protection
)
5789 interval
= stack_clash_probe_interval
;
5791 interval
= PROBE_INTERVAL
;
5793 gcc_assert (aarch64_uimm12_shift (interval
));
5794 xops
[1] = GEN_INT (interval
);
5796 output_asm_insn ("sub\t%0, %0, %1", xops
);
5798 /* If doing stack clash protection then we probe up by the ABI specified
5799 amount. We do this because we're dropping full pages at a time in the
5800 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5801 if (flag_stack_clash_protection
)
5802 xops
[1] = GEN_INT (STACK_CLASH_CALLER_GUARD
);
5804 xops
[1] = CONST0_RTX (GET_MODE (xops
[1]));
5806 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5807 by this amount for each iteration. */
5808 output_asm_insn ("str\txzr, [%0, %1]", xops
);
5810 /* Test if TEST_ADDR == LAST_ADDR. */
5812 output_asm_insn ("cmp\t%0, %1", xops
);
5815 fputs ("\tb.ne\t", asm_out_file
);
5816 assemble_name_raw (asm_out_file
, loop_lab
);
5817 fputc ('\n', asm_out_file
);
5822 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5823 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5824 of GUARD_SIZE. When a probe is emitted it is done at most
5825 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5826 at most MIN_PROBE_THRESHOLD. By the end of this function
5827 BASE = BASE - ADJUSTMENT. */
5830 aarch64_output_probe_sve_stack_clash (rtx base
, rtx adjustment
,
5831 rtx min_probe_threshold
, rtx guard_size
)
5833 /* This function is not allowed to use any instruction generation function
5834 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5835 so instead emit the code you want using output_asm_insn. */
5836 gcc_assert (flag_stack_clash_protection
);
5837 gcc_assert (CONST_INT_P (min_probe_threshold
) && CONST_INT_P (guard_size
));
5838 gcc_assert (INTVAL (guard_size
) > INTVAL (min_probe_threshold
));
5840 /* The minimum required allocation before the residual requires probing. */
5841 HOST_WIDE_INT residual_probe_guard
= INTVAL (min_probe_threshold
);
5843 /* Clamp the value down to the nearest value that can be used with a cmp. */
5844 residual_probe_guard
= aarch64_clamp_to_uimm12_shift (residual_probe_guard
);
5845 rtx probe_offset_value_rtx
= gen_int_mode (residual_probe_guard
, Pmode
);
5847 gcc_assert (INTVAL (min_probe_threshold
) >= residual_probe_guard
);
5848 gcc_assert (aarch64_uimm12_shift (residual_probe_guard
));
5850 static int labelno
= 0;
5851 char loop_start_lab
[32];
5852 char loop_end_lab
[32];
5855 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab
, "SVLPSPL", labelno
);
5856 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab
, "SVLPEND", labelno
++);
5858 /* Emit loop start label. */
5859 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_start_lab
);
5861 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5862 xops
[0] = adjustment
;
5863 xops
[1] = probe_offset_value_rtx
;
5864 output_asm_insn ("cmp\t%0, %1", xops
);
5866 /* Branch to end if not enough adjustment to probe. */
5867 fputs ("\tb.lt\t", asm_out_file
);
5868 assemble_name_raw (asm_out_file
, loop_end_lab
);
5869 fputc ('\n', asm_out_file
);
5871 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5873 xops
[1] = probe_offset_value_rtx
;
5874 output_asm_insn ("sub\t%0, %0, %1", xops
);
5876 /* Probe at BASE. */
5877 xops
[1] = const0_rtx
;
5878 output_asm_insn ("str\txzr, [%0, %1]", xops
);
5880 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5881 xops
[0] = adjustment
;
5882 xops
[1] = probe_offset_value_rtx
;
5883 output_asm_insn ("sub\t%0, %0, %1", xops
);
5885 /* Branch to start if still more bytes to allocate. */
5886 fputs ("\tb\t", asm_out_file
);
5887 assemble_name_raw (asm_out_file
, loop_start_lab
);
5888 fputc ('\n', asm_out_file
);
5890 /* No probe leave. */
5891 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_end_lab
);
5893 /* BASE = BASE - ADJUSTMENT. */
5895 xops
[1] = adjustment
;
5896 output_asm_insn ("sub\t%0, %0, %1", xops
);
5900 /* Determine whether a frame chain needs to be generated. */
5902 aarch64_needs_frame_chain (void)
5904 /* Force a frame chain for EH returns so the return address is at FP+8. */
5905 if (frame_pointer_needed
|| crtl
->calls_eh_return
)
5908 /* A leaf function cannot have calls or write LR. */
5909 bool is_leaf
= crtl
->is_leaf
&& !df_regs_ever_live_p (LR_REGNUM
);
5911 /* Don't use a frame chain in leaf functions if leaf frame pointers
5913 if (flag_omit_leaf_frame_pointer
&& is_leaf
)
5916 return aarch64_use_frame_pointer
;
5919 /* Mark the registers that need to be saved by the callee and calculate
5920 the size of the callee-saved registers area and frame record (both FP
5921 and LR may be omitted). */
5923 aarch64_layout_frame (void)
5925 poly_int64 offset
= 0;
5926 int regno
, last_fp_reg
= INVALID_REGNUM
;
5927 machine_mode vector_save_mode
= aarch64_reg_save_mode (V8_REGNUM
);
5928 poly_int64 vector_save_size
= GET_MODE_SIZE (vector_save_mode
);
5929 bool frame_related_fp_reg_p
= false;
5930 aarch64_frame
&frame
= cfun
->machine
->frame
;
5932 frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
5934 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5935 the mid-end is doing. */
5936 crtl
->outgoing_args_size
= STACK_DYNAMIC_OFFSET (cfun
);
5938 #define SLOT_NOT_REQUIRED (-2)
5939 #define SLOT_REQUIRED (-1)
5941 frame
.wb_candidate1
= INVALID_REGNUM
;
5942 frame
.wb_candidate2
= INVALID_REGNUM
;
5943 frame
.spare_pred_reg
= INVALID_REGNUM
;
5945 /* First mark all the registers that really need to be saved... */
5946 for (regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
5947 frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
5949 /* ... that includes the eh data registers (if needed)... */
5950 if (crtl
->calls_eh_return
)
5951 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
5952 frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)] = SLOT_REQUIRED
;
5954 /* ... and any callee saved register that dataflow says is live. */
5955 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
5956 if (df_regs_ever_live_p (regno
)
5957 && !fixed_regs
[regno
]
5958 && (regno
== R30_REGNUM
5959 || !crtl
->abi
->clobbers_full_reg_p (regno
)))
5960 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
5962 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
5963 if (df_regs_ever_live_p (regno
)
5964 && !fixed_regs
[regno
]
5965 && !crtl
->abi
->clobbers_full_reg_p (regno
))
5967 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
5968 last_fp_reg
= regno
;
5969 if (aarch64_emit_cfi_for_reg_p (regno
))
5970 frame_related_fp_reg_p
= true;
5973 /* Big-endian SVE frames need a spare predicate register in order
5974 to save Z8-Z15. Decide which register they should use. Prefer
5975 an unused argument register if possible, so that we don't force P4
5976 to be saved unnecessarily. */
5977 if (frame_related_fp_reg_p
5978 && crtl
->abi
->id () == ARM_PCS_SVE
5979 && BYTES_BIG_ENDIAN
)
5981 bitmap live1
= df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun
));
5982 bitmap live2
= df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun
));
5983 for (regno
= P0_REGNUM
; regno
<= P7_REGNUM
; regno
++)
5984 if (!bitmap_bit_p (live1
, regno
) && !bitmap_bit_p (live2
, regno
))
5986 gcc_assert (regno
<= P7_REGNUM
);
5987 frame
.spare_pred_reg
= regno
;
5988 df_set_regs_ever_live (regno
, true);
5991 for (regno
= P0_REGNUM
; regno
<= P15_REGNUM
; regno
++)
5992 if (df_regs_ever_live_p (regno
)
5993 && !fixed_regs
[regno
]
5994 && !crtl
->abi
->clobbers_full_reg_p (regno
))
5995 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
5997 /* With stack-clash, LR must be saved in non-leaf functions. */
5998 gcc_assert (crtl
->is_leaf
5999 || maybe_ne (frame
.reg_offset
[R30_REGNUM
], SLOT_NOT_REQUIRED
));
6001 /* Now assign stack slots for the registers. Start with the predicate
6002 registers, since predicate LDR and STR have a relatively small
6003 offset range. These saves happen below the hard frame pointer. */
6004 for (regno
= P0_REGNUM
; regno
<= P15_REGNUM
; regno
++)
6005 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
6007 frame
.reg_offset
[regno
] = offset
;
6008 offset
+= BYTES_PER_SVE_PRED
;
6011 if (maybe_ne (offset
, 0))
6013 /* If we have any vector registers to save above the predicate registers,
6014 the offset of the vector register save slots need to be a multiple
6015 of the vector size. This lets us use the immediate forms of LDR/STR
6016 (or LD1/ST1 for big-endian).
6018 A vector register is 8 times the size of a predicate register,
6019 and we need to save a maximum of 12 predicate registers, so the
6020 first vector register will be at either #1, MUL VL or #2, MUL VL.
6022 If we don't have any vector registers to save, and we know how
6023 big the predicate save area is, we can just round it up to the
6024 next 16-byte boundary. */
6025 if (last_fp_reg
== (int) INVALID_REGNUM
&& offset
.is_constant ())
6026 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
6029 if (known_le (offset
, vector_save_size
))
6030 offset
= vector_save_size
;
6031 else if (known_le (offset
, vector_save_size
* 2))
6032 offset
= vector_save_size
* 2;
6038 /* If we need to save any SVE vector registers, add them next. */
6039 if (last_fp_reg
!= (int) INVALID_REGNUM
&& crtl
->abi
->id () == ARM_PCS_SVE
)
6040 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
6041 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
6043 frame
.reg_offset
[regno
] = offset
;
6044 offset
+= vector_save_size
;
6047 /* OFFSET is now the offset of the hard frame pointer from the bottom
6048 of the callee save area. */
6049 bool saves_below_hard_fp_p
= maybe_ne (offset
, 0);
6050 frame
.below_hard_fp_saved_regs_size
= offset
;
6051 if (frame
.emit_frame_chain
)
6053 /* FP and LR are placed in the linkage record. */
6054 frame
.reg_offset
[R29_REGNUM
] = offset
;
6055 frame
.wb_candidate1
= R29_REGNUM
;
6056 frame
.reg_offset
[R30_REGNUM
] = offset
+ UNITS_PER_WORD
;
6057 frame
.wb_candidate2
= R30_REGNUM
;
6058 offset
+= 2 * UNITS_PER_WORD
;
6061 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
6062 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
6064 frame
.reg_offset
[regno
] = offset
;
6065 if (frame
.wb_candidate1
== INVALID_REGNUM
)
6066 frame
.wb_candidate1
= regno
;
6067 else if (frame
.wb_candidate2
== INVALID_REGNUM
)
6068 frame
.wb_candidate2
= regno
;
6069 offset
+= UNITS_PER_WORD
;
6072 poly_int64 max_int_offset
= offset
;
6073 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
6074 bool has_align_gap
= maybe_ne (offset
, max_int_offset
);
6076 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
6077 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
6079 /* If there is an alignment gap between integer and fp callee-saves,
6080 allocate the last fp register to it if possible. */
6081 if (regno
== last_fp_reg
6083 && known_eq (vector_save_size
, 8)
6084 && multiple_p (offset
, 16))
6086 frame
.reg_offset
[regno
] = max_int_offset
;
6090 frame
.reg_offset
[regno
] = offset
;
6091 if (frame
.wb_candidate1
== INVALID_REGNUM
)
6092 frame
.wb_candidate1
= regno
;
6093 else if (frame
.wb_candidate2
== INVALID_REGNUM
6094 && frame
.wb_candidate1
>= V0_REGNUM
)
6095 frame
.wb_candidate2
= regno
;
6096 offset
+= vector_save_size
;
6099 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
6101 frame
.saved_regs_size
= offset
;
6103 poly_int64 varargs_and_saved_regs_size
= offset
+ frame
.saved_varargs_size
;
6105 poly_int64 above_outgoing_args
6106 = aligned_upper_bound (varargs_and_saved_regs_size
6107 + get_frame_size (),
6108 STACK_BOUNDARY
/ BITS_PER_UNIT
);
6110 frame
.hard_fp_offset
6111 = above_outgoing_args
- frame
.below_hard_fp_saved_regs_size
;
6113 /* Both these values are already aligned. */
6114 gcc_assert (multiple_p (crtl
->outgoing_args_size
,
6115 STACK_BOUNDARY
/ BITS_PER_UNIT
));
6116 frame
.frame_size
= above_outgoing_args
+ crtl
->outgoing_args_size
;
6118 frame
.locals_offset
= frame
.saved_varargs_size
;
6120 frame
.initial_adjust
= 0;
6121 frame
.final_adjust
= 0;
6122 frame
.callee_adjust
= 0;
6123 frame
.sve_callee_adjust
= 0;
6124 frame
.callee_offset
= 0;
6126 HOST_WIDE_INT max_push_offset
= 0;
6127 if (frame
.wb_candidate2
!= INVALID_REGNUM
)
6128 max_push_offset
= 512;
6129 else if (frame
.wb_candidate1
!= INVALID_REGNUM
)
6130 max_push_offset
= 256;
6132 HOST_WIDE_INT const_size
, const_outgoing_args_size
, const_fp_offset
;
6133 HOST_WIDE_INT const_saved_regs_size
;
6134 if (frame
.frame_size
.is_constant (&const_size
)
6135 && const_size
< max_push_offset
6136 && known_eq (frame
.hard_fp_offset
, const_size
))
6138 /* Simple, small frame with no outgoing arguments:
6140 stp reg1, reg2, [sp, -frame_size]!
6141 stp reg3, reg4, [sp, 16] */
6142 frame
.callee_adjust
= const_size
;
6144 else if (crtl
->outgoing_args_size
.is_constant (&const_outgoing_args_size
)
6145 && frame
.saved_regs_size
.is_constant (&const_saved_regs_size
)
6146 && const_outgoing_args_size
+ const_saved_regs_size
< 512
6147 /* We could handle this case even with outgoing args, provided
6148 that the number of args left us with valid offsets for all
6149 predicate and vector save slots. It's such a rare case that
6150 it hardly seems worth the effort though. */
6151 && (!saves_below_hard_fp_p
|| const_outgoing_args_size
== 0)
6152 && !(cfun
->calls_alloca
6153 && frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
6154 && const_fp_offset
< max_push_offset
))
6156 /* Frame with small outgoing arguments:
6158 sub sp, sp, frame_size
6159 stp reg1, reg2, [sp, outgoing_args_size]
6160 stp reg3, reg4, [sp, outgoing_args_size + 16] */
6161 frame
.initial_adjust
= frame
.frame_size
;
6162 frame
.callee_offset
= const_outgoing_args_size
;
6164 else if (saves_below_hard_fp_p
6165 && known_eq (frame
.saved_regs_size
,
6166 frame
.below_hard_fp_saved_regs_size
))
6168 /* Frame in which all saves are SVE saves:
6170 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6171 save SVE registers relative to SP
6172 sub sp, sp, outgoing_args_size */
6173 frame
.initial_adjust
= (frame
.hard_fp_offset
6174 + frame
.below_hard_fp_saved_regs_size
);
6175 frame
.final_adjust
= crtl
->outgoing_args_size
;
6177 else if (frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
6178 && const_fp_offset
< max_push_offset
)
6180 /* Frame with large outgoing arguments or SVE saves, but with
6183 stp reg1, reg2, [sp, -hard_fp_offset]!
6184 stp reg3, reg4, [sp, 16]
6185 [sub sp, sp, below_hard_fp_saved_regs_size]
6186 [save SVE registers relative to SP]
6187 sub sp, sp, outgoing_args_size */
6188 frame
.callee_adjust
= const_fp_offset
;
6189 frame
.sve_callee_adjust
= frame
.below_hard_fp_saved_regs_size
;
6190 frame
.final_adjust
= crtl
->outgoing_args_size
;
6194 /* Frame with large local area and outgoing arguments or SVE saves,
6195 using frame pointer:
6197 sub sp, sp, hard_fp_offset
6198 stp x29, x30, [sp, 0]
6200 stp reg3, reg4, [sp, 16]
6201 [sub sp, sp, below_hard_fp_saved_regs_size]
6202 [save SVE registers relative to SP]
6203 sub sp, sp, outgoing_args_size */
6204 frame
.initial_adjust
= frame
.hard_fp_offset
;
6205 frame
.sve_callee_adjust
= frame
.below_hard_fp_saved_regs_size
;
6206 frame
.final_adjust
= crtl
->outgoing_args_size
;
6209 /* Make sure the individual adjustments add up to the full frame size. */
6210 gcc_assert (known_eq (frame
.initial_adjust
6211 + frame
.callee_adjust
6212 + frame
.sve_callee_adjust
6213 + frame
.final_adjust
, frame
.frame_size
));
6215 frame
.laid_out
= true;
6218 /* Return true if the register REGNO is saved on entry to
6219 the current function. */
6222 aarch64_register_saved_on_entry (int regno
)
6224 return known_ge (cfun
->machine
->frame
.reg_offset
[regno
], 0);
6227 /* Return the next register up from REGNO up to LIMIT for the callee
6231 aarch64_next_callee_save (unsigned regno
, unsigned limit
)
6233 while (regno
<= limit
&& !aarch64_register_saved_on_entry (regno
))
6238 /* Push the register number REGNO of mode MODE to the stack with write-back
6239 adjusting the stack by ADJUSTMENT. */
6242 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
6243 HOST_WIDE_INT adjustment
)
6245 rtx base_rtx
= stack_pointer_rtx
;
6248 reg
= gen_rtx_REG (mode
, regno
);
6249 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
6250 plus_constant (Pmode
, base_rtx
, -adjustment
));
6251 mem
= gen_frame_mem (mode
, mem
);
6253 insn
= emit_move_insn (mem
, reg
);
6254 RTX_FRAME_RELATED_P (insn
) = 1;
6257 /* Generate and return an instruction to store the pair of registers
6258 REG and REG2 of mode MODE to location BASE with write-back adjusting
6259 the stack location BASE by ADJUSTMENT. */
6262 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
6263 HOST_WIDE_INT adjustment
)
6268 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
6269 GEN_INT (-adjustment
),
6270 GEN_INT (UNITS_PER_WORD
- adjustment
));
6272 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
6273 GEN_INT (-adjustment
),
6274 GEN_INT (UNITS_PER_WORD
- adjustment
));
6276 return gen_storewb_pairtf_di (base
, base
, reg
, reg2
,
6277 GEN_INT (-adjustment
),
6278 GEN_INT (UNITS_PER_VREG
- adjustment
));
6284 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
6285 stack pointer by ADJUSTMENT. */
6288 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
6291 machine_mode mode
= aarch64_reg_save_mode (regno1
);
6293 if (regno2
== INVALID_REGNUM
)
6294 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
6296 rtx reg1
= gen_rtx_REG (mode
, regno1
);
6297 rtx reg2
= gen_rtx_REG (mode
, regno2
);
6299 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
6301 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
6302 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
6303 RTX_FRAME_RELATED_P (insn
) = 1;
6306 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
6307 adjusting it by ADJUSTMENT afterwards. */
6310 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
6311 HOST_WIDE_INT adjustment
)
6316 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
6317 GEN_INT (UNITS_PER_WORD
));
6319 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
6320 GEN_INT (UNITS_PER_WORD
));
6322 return gen_loadwb_pairtf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
6323 GEN_INT (UNITS_PER_VREG
));
6329 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
6330 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
6334 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
6337 machine_mode mode
= aarch64_reg_save_mode (regno1
);
6338 rtx reg1
= gen_rtx_REG (mode
, regno1
);
6340 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
6342 if (regno2
== INVALID_REGNUM
)
6344 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
6345 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
6346 emit_move_insn (reg1
, gen_frame_mem (mode
, mem
));
6350 rtx reg2
= gen_rtx_REG (mode
, regno2
);
6351 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
6352 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
6357 /* Generate and return a store pair instruction of mode MODE to store
6358 register REG1 to MEM1 and register REG2 to MEM2. */
6361 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
6367 return gen_store_pair_dw_didi (mem1
, reg1
, mem2
, reg2
);
6370 return gen_store_pair_dw_dfdf (mem1
, reg1
, mem2
, reg2
);
6373 return gen_store_pair_dw_tftf (mem1
, reg1
, mem2
, reg2
);
6380 /* Generate and regurn a load pair isntruction of mode MODE to load register
6381 REG1 from MEM1 and register REG2 from MEM2. */
6384 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
6390 return gen_load_pair_dw_didi (reg1
, mem1
, reg2
, mem2
);
6393 return gen_load_pair_dw_dfdf (reg1
, mem1
, reg2
, mem2
);
6396 return gen_load_pair_dw_tftf (reg1
, mem1
, reg2
, mem2
);
6403 /* Return TRUE if return address signing should be enabled for the current
6404 function, otherwise return FALSE. */
6407 aarch64_return_address_signing_enabled (void)
6409 /* This function should only be called after frame laid out. */
6410 gcc_assert (cfun
->machine
->frame
.laid_out
);
6412 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
6413 if its LR is pushed onto stack. */
6414 return (aarch64_ra_sign_scope
== AARCH64_FUNCTION_ALL
6415 || (aarch64_ra_sign_scope
== AARCH64_FUNCTION_NON_LEAF
6416 && known_ge (cfun
->machine
->frame
.reg_offset
[LR_REGNUM
], 0)));
6419 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
6421 aarch64_bti_enabled (void)
6423 return (aarch64_enable_bti
== 1);
6426 /* The caller is going to use ST1D or LD1D to save or restore an SVE
6427 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
6428 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
6430 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
6433 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
6434 if the variable isn't already nonnull
6436 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
6437 Handle this case using a temporary base register that is suitable for
6438 all offsets in that range. Use ANCHOR_REG as this base register if it
6439 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
6442 aarch64_adjust_sve_callee_save_base (machine_mode mode
, rtx
&base_rtx
,
6443 rtx
&anchor_reg
, poly_int64
&offset
,
6446 if (maybe_ge (offset
, 8 * GET_MODE_SIZE (mode
)))
6448 /* This is the maximum valid offset of the anchor from the base.
6449 Lower values would be valid too. */
6450 poly_int64 anchor_offset
= 16 * GET_MODE_SIZE (mode
);
6453 anchor_reg
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
6454 emit_insn (gen_add3_insn (anchor_reg
, base_rtx
,
6455 gen_int_mode (anchor_offset
, Pmode
)));
6457 base_rtx
= anchor_reg
;
6458 offset
-= anchor_offset
;
6462 int pred_reg
= cfun
->machine
->frame
.spare_pred_reg
;
6463 emit_move_insn (gen_rtx_REG (VNx16BImode
, pred_reg
),
6464 CONSTM1_RTX (VNx16BImode
));
6465 ptrue
= gen_rtx_REG (VNx2BImode
, pred_reg
);
6469 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6470 is saved at BASE + OFFSET. */
6473 aarch64_add_cfa_expression (rtx_insn
*insn
, rtx reg
,
6474 rtx base
, poly_int64 offset
)
6476 rtx mem
= gen_frame_mem (GET_MODE (reg
),
6477 plus_constant (Pmode
, base
, offset
));
6478 add_reg_note (insn
, REG_CFA_EXPRESSION
, gen_rtx_SET (mem
, reg
));
6481 /* Emit code to save the callee-saved registers from register number START
6482 to LIMIT to the stack at the location starting at offset START_OFFSET,
6483 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
6484 is true if the hard frame pointer has been set up. */
6487 aarch64_save_callee_saves (poly_int64 start_offset
,
6488 unsigned start
, unsigned limit
, bool skip_wb
,
6489 bool hard_fp_valid_p
)
6494 rtx anchor_reg
= NULL_RTX
, ptrue
= NULL_RTX
;
6496 for (regno
= aarch64_next_callee_save (start
, limit
);
6498 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
6502 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
6505 && (regno
== cfun
->machine
->frame
.wb_candidate1
6506 || regno
== cfun
->machine
->frame
.wb_candidate2
))
6509 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
6512 machine_mode mode
= aarch64_reg_save_mode (regno
);
6513 reg
= gen_rtx_REG (mode
, regno
);
6514 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
6515 rtx base_rtx
= stack_pointer_rtx
;
6516 poly_int64 sp_offset
= offset
;
6518 HOST_WIDE_INT const_offset
;
6519 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
6520 aarch64_adjust_sve_callee_save_base (mode
, base_rtx
, anchor_reg
,
6522 else if (GP_REGNUM_P (regno
)
6523 && (!offset
.is_constant (&const_offset
) || const_offset
>= 512))
6525 gcc_assert (known_eq (start_offset
, 0));
6526 poly_int64 fp_offset
6527 = cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
6528 if (hard_fp_valid_p
)
6529 base_rtx
= hard_frame_pointer_rtx
;
6534 anchor_reg
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
6535 emit_insn (gen_add3_insn (anchor_reg
, base_rtx
,
6536 gen_int_mode (fp_offset
, Pmode
)));
6538 base_rtx
= anchor_reg
;
6540 offset
-= fp_offset
;
6542 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
6543 bool need_cfa_note_p
= (base_rtx
!= stack_pointer_rtx
);
6545 if (!aarch64_sve_mode_p (mode
)
6546 && (regno2
= aarch64_next_callee_save (regno
+ 1, limit
)) <= limit
6547 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
6548 && known_eq (GET_MODE_SIZE (mode
),
6549 cfun
->machine
->frame
.reg_offset
[regno2
]
6550 - cfun
->machine
->frame
.reg_offset
[regno
]))
6552 rtx reg2
= gen_rtx_REG (mode
, regno2
);
6555 offset
+= GET_MODE_SIZE (mode
);
6556 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
6557 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
6560 /* The first part of a frame-related parallel insn is
6561 always assumed to be relevant to the frame
6562 calculations; subsequent parts, are only
6563 frame-related if explicitly marked. */
6564 if (aarch64_emit_cfi_for_reg_p (regno2
))
6566 if (need_cfa_note_p
)
6567 aarch64_add_cfa_expression (insn
, reg2
, stack_pointer_rtx
,
6568 sp_offset
+ GET_MODE_SIZE (mode
));
6570 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
6575 else if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
6577 insn
= emit_insn (gen_aarch64_pred_mov (mode
, mem
, ptrue
, reg
));
6578 need_cfa_note_p
= true;
6580 else if (aarch64_sve_mode_p (mode
))
6581 insn
= emit_insn (gen_rtx_SET (mem
, reg
));
6583 insn
= emit_move_insn (mem
, reg
);
6585 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
6586 if (frame_related_p
&& need_cfa_note_p
)
6587 aarch64_add_cfa_expression (insn
, reg
, stack_pointer_rtx
, sp_offset
);
6591 /* Emit code to restore the callee registers from register number START
6592 up to and including LIMIT. Restore from the stack offset START_OFFSET,
6593 skipping any write-back candidates if SKIP_WB is true. Write the
6594 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
6597 aarch64_restore_callee_saves (poly_int64 start_offset
, unsigned start
,
6598 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
6603 rtx anchor_reg
= NULL_RTX
, ptrue
= NULL_RTX
;
6605 for (regno
= aarch64_next_callee_save (start
, limit
);
6607 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
6609 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
6610 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
6616 && (regno
== cfun
->machine
->frame
.wb_candidate1
6617 || regno
== cfun
->machine
->frame
.wb_candidate2
))
6620 machine_mode mode
= aarch64_reg_save_mode (regno
);
6621 reg
= gen_rtx_REG (mode
, regno
);
6622 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
6623 rtx base_rtx
= stack_pointer_rtx
;
6624 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
6625 aarch64_adjust_sve_callee_save_base (mode
, base_rtx
, anchor_reg
,
6627 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
6629 if (!aarch64_sve_mode_p (mode
)
6630 && (regno2
= aarch64_next_callee_save (regno
+ 1, limit
)) <= limit
6631 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
6632 && known_eq (GET_MODE_SIZE (mode
),
6633 cfun
->machine
->frame
.reg_offset
[regno2
]
6634 - cfun
->machine
->frame
.reg_offset
[regno
]))
6636 rtx reg2
= gen_rtx_REG (mode
, regno2
);
6639 offset
+= GET_MODE_SIZE (mode
);
6640 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
6641 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
6643 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
6646 else if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
6647 emit_insn (gen_aarch64_pred_mov (mode
, reg
, ptrue
, mem
));
6648 else if (aarch64_sve_mode_p (mode
))
6649 emit_insn (gen_rtx_SET (reg
, mem
));
6651 emit_move_insn (reg
, mem
);
6652 if (frame_related_p
)
6653 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
6657 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
6661 offset_4bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
6663 HOST_WIDE_INT multiple
;
6664 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
6665 && IN_RANGE (multiple
, -8, 7));
6668 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
6672 offset_6bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
6674 HOST_WIDE_INT multiple
;
6675 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
6676 && IN_RANGE (multiple
, 0, 63));
6679 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
6683 aarch64_offset_7bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
6685 HOST_WIDE_INT multiple
;
6686 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
6687 && IN_RANGE (multiple
, -64, 63));
6690 /* Return true if OFFSET is a signed 9-bit value. */
6693 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED
,
6696 HOST_WIDE_INT const_offset
;
6697 return (offset
.is_constant (&const_offset
)
6698 && IN_RANGE (const_offset
, -256, 255));
6701 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
6705 offset_9bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
6707 HOST_WIDE_INT multiple
;
6708 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
6709 && IN_RANGE (multiple
, -256, 255));
6712 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
6716 offset_12bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
6718 HOST_WIDE_INT multiple
;
6719 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
6720 && IN_RANGE (multiple
, 0, 4095));
6723 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
6726 aarch64_get_separate_components (void)
6728 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
6729 bitmap_clear (components
);
6731 /* The registers we need saved to the frame. */
6732 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
6733 if (aarch64_register_saved_on_entry (regno
))
6735 /* Punt on saves and restores that use ST1D and LD1D. We could
6736 try to be smarter, but it would involve making sure that the
6737 spare predicate register itself is safe to use at the save
6738 and restore points. Also, when a frame pointer is being used,
6739 the slots are often out of reach of ST1D and LD1D anyway. */
6740 machine_mode mode
= aarch64_reg_save_mode (regno
);
6741 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
6744 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
6746 /* If the register is saved in the first SVE save slot, we use
6747 it as a stack probe for -fstack-clash-protection. */
6748 if (flag_stack_clash_protection
6749 && maybe_ne (cfun
->machine
->frame
.below_hard_fp_saved_regs_size
, 0)
6750 && known_eq (offset
, 0))
6753 /* Get the offset relative to the register we'll use. */
6754 if (frame_pointer_needed
)
6755 offset
-= cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
6757 offset
+= crtl
->outgoing_args_size
;
6759 /* Check that we can access the stack slot of the register with one
6760 direct load with no adjustments needed. */
6761 if (aarch64_sve_mode_p (mode
)
6762 ? offset_9bit_signed_scaled_p (mode
, offset
)
6763 : offset_12bit_unsigned_scaled_p (mode
, offset
))
6764 bitmap_set_bit (components
, regno
);
6767 /* Don't mess with the hard frame pointer. */
6768 if (frame_pointer_needed
)
6769 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
6771 /* If the spare predicate register used by big-endian SVE code
6772 is call-preserved, it must be saved in the main prologue
6773 before any saves that use it. */
6774 if (cfun
->machine
->frame
.spare_pred_reg
!= INVALID_REGNUM
)
6775 bitmap_clear_bit (components
, cfun
->machine
->frame
.spare_pred_reg
);
6777 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
6778 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
6779 /* If registers have been chosen to be stored/restored with
6780 writeback don't interfere with them to avoid having to output explicit
6781 stack adjustment instructions. */
6782 if (reg2
!= INVALID_REGNUM
)
6783 bitmap_clear_bit (components
, reg2
);
6784 if (reg1
!= INVALID_REGNUM
)
6785 bitmap_clear_bit (components
, reg1
);
6787 bitmap_clear_bit (components
, LR_REGNUM
);
6788 bitmap_clear_bit (components
, SP_REGNUM
);
6793 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
6796 aarch64_components_for_bb (basic_block bb
)
6798 bitmap in
= DF_LIVE_IN (bb
);
6799 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
6800 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
6802 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
6803 bitmap_clear (components
);
6805 /* Clobbered registers don't generate values in any meaningful sense,
6806 since nothing after the clobber can rely on their value. And we can't
6807 say that partially-clobbered registers are unconditionally killed,
6808 because whether they're killed or not depends on the mode of the
6809 value they're holding. Thus partially call-clobbered registers
6810 appear in neither the kill set nor the gen set.
6812 Check manually for any calls that clobber more of a register than the
6813 current function can. */
6814 function_abi_aggregator callee_abis
;
6816 FOR_BB_INSNS (bb
, insn
)
6818 callee_abis
.note_callee_abi (insn_callee_abi (insn
));
6819 HARD_REG_SET extra_caller_saves
= callee_abis
.caller_save_regs (*crtl
->abi
);
6821 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
6822 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
6823 if (!fixed_regs
[regno
]
6824 && !crtl
->abi
->clobbers_full_reg_p (regno
)
6825 && (TEST_HARD_REG_BIT (extra_caller_saves
, regno
)
6826 || bitmap_bit_p (in
, regno
)
6827 || bitmap_bit_p (gen
, regno
)
6828 || bitmap_bit_p (kill
, regno
)))
6830 bitmap_set_bit (components
, regno
);
6832 /* If there is a callee-save at an adjacent offset, add it too
6833 to increase the use of LDP/STP. */
6834 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
6835 unsigned regno2
= multiple_p (offset
, 16) ? regno
+ 1 : regno
- 1;
6837 if (regno2
<= LAST_SAVED_REGNUM
)
6839 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
6841 ? known_eq (offset
+ 8, offset2
)
6842 : multiple_p (offset2
, 16) && known_eq (offset2
+ 8, offset
))
6843 bitmap_set_bit (components
, regno2
);
6850 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
6851 Nothing to do for aarch64. */
6854 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
6858 /* Return the next set bit in BMP from START onwards. Return the total number
6859 of bits in BMP if no set bit is found at or after START. */
6862 aarch64_get_next_set_bit (sbitmap bmp
, unsigned int start
)
6864 unsigned int nbits
= SBITMAP_SIZE (bmp
);
6868 gcc_assert (start
< nbits
);
6869 for (unsigned int i
= start
; i
< nbits
; i
++)
6870 if (bitmap_bit_p (bmp
, i
))
6876 /* Do the work for aarch64_emit_prologue_components and
6877 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
6878 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
6879 for these components or the epilogue sequence. That is, it determines
6880 whether we should emit stores or loads and what kind of CFA notes to attach
6881 to the insns. Otherwise the logic for the two sequences is very
6885 aarch64_process_components (sbitmap components
, bool prologue_p
)
6887 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
6888 ? HARD_FRAME_POINTER_REGNUM
6889 : STACK_POINTER_REGNUM
);
6891 unsigned last_regno
= SBITMAP_SIZE (components
);
6892 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
6893 rtx_insn
*insn
= NULL
;
6895 while (regno
!= last_regno
)
6897 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
6898 machine_mode mode
= aarch64_reg_save_mode (regno
);
6900 rtx reg
= gen_rtx_REG (mode
, regno
);
6901 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
6902 if (frame_pointer_needed
)
6903 offset
-= cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
6905 offset
+= crtl
->outgoing_args_size
;
6907 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
6908 rtx mem
= gen_frame_mem (mode
, addr
);
6910 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
6911 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
6912 /* No more registers to handle after REGNO.
6913 Emit a single save/restore and exit. */
6914 if (regno2
== last_regno
)
6916 insn
= emit_insn (set
);
6917 if (frame_related_p
)
6919 RTX_FRAME_RELATED_P (insn
) = 1;
6921 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
6923 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
6928 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
6929 /* The next register is not of the same class or its offset is not
6930 mergeable with the current one into a pair. */
6931 if (aarch64_sve_mode_p (mode
)
6932 || !satisfies_constraint_Ump (mem
)
6933 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
6934 || (crtl
->abi
->id () == ARM_PCS_SIMD
&& FP_REGNUM_P (regno
))
6935 || maybe_ne ((offset2
- cfun
->machine
->frame
.reg_offset
[regno
]),
6936 GET_MODE_SIZE (mode
)))
6938 insn
= emit_insn (set
);
6939 if (frame_related_p
)
6941 RTX_FRAME_RELATED_P (insn
) = 1;
6943 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
6945 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
6952 bool frame_related2_p
= aarch64_emit_cfi_for_reg_p (regno2
);
6954 /* REGNO2 can be saved/restored in a pair with REGNO. */
6955 rtx reg2
= gen_rtx_REG (mode
, regno2
);
6956 if (frame_pointer_needed
)
6957 offset2
-= cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
6959 offset2
+= crtl
->outgoing_args_size
;
6960 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
6961 rtx mem2
= gen_frame_mem (mode
, addr2
);
6962 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
6963 : gen_rtx_SET (reg2
, mem2
);
6966 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
, reg2
));
6968 insn
= emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
6970 if (frame_related_p
|| frame_related2_p
)
6972 RTX_FRAME_RELATED_P (insn
) = 1;
6975 if (frame_related_p
)
6976 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
6977 if (frame_related2_p
)
6978 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
6982 if (frame_related_p
)
6983 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
6984 if (frame_related2_p
)
6985 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
6989 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
6993 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6996 aarch64_emit_prologue_components (sbitmap components
)
6998 aarch64_process_components (components
, true);
7001 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
7004 aarch64_emit_epilogue_components (sbitmap components
)
7006 aarch64_process_components (components
, false);
7009 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
7012 aarch64_set_handled_components (sbitmap components
)
7014 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
7015 if (bitmap_bit_p (components
, regno
))
7016 cfun
->machine
->reg_is_wrapped_separately
[regno
] = true;
7019 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
7020 determining the probe offset for alloca. */
7022 static HOST_WIDE_INT
7023 aarch64_stack_clash_protection_alloca_probe_range (void)
7025 return STACK_CLASH_CALLER_GUARD
;
7029 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
7030 registers. If POLY_SIZE is not large enough to require a probe this function
7031 will only adjust the stack. When allocating the stack space
7032 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
7033 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
7034 arguments. If we are then we ensure that any allocation larger than the ABI
7035 defined buffer needs a probe so that the invariant of having a 1KB buffer is
7038 We emit barriers after each stack adjustment to prevent optimizations from
7039 breaking the invariant that we never drop the stack more than a page. This
7040 invariant is needed to make it easier to correctly handle asynchronous
7041 events, e.g. if we were to allow the stack to be dropped by more than a page
7042 and then have multiple probes up and we take a signal somewhere in between
7043 then the signal handler doesn't know the state of the stack and can make no
7044 assumptions about which pages have been probed. */
7047 aarch64_allocate_and_probe_stack_space (rtx temp1
, rtx temp2
,
7048 poly_int64 poly_size
,
7049 bool frame_related_p
,
7050 bool final_adjustment_p
)
7052 HOST_WIDE_INT guard_size
7053 = 1 << param_stack_clash_protection_guard_size
;
7054 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
7055 HOST_WIDE_INT min_probe_threshold
7056 = (final_adjustment_p
7057 ? guard_used_by_caller
7058 : guard_size
- guard_used_by_caller
);
7059 /* When doing the final adjustment for the outgoing arguments, take into
7060 account any unprobed space there is above the current SP. There are
7063 - When saving SVE registers below the hard frame pointer, we force
7064 the lowest save to take place in the prologue before doing the final
7065 adjustment (i.e. we don't allow the save to be shrink-wrapped).
7066 This acts as a probe at SP, so there is no unprobed space.
7068 - When there are no SVE register saves, we use the store of the link
7069 register as a probe. We can't assume that LR was saved at position 0
7070 though, so treat any space below it as unprobed. */
7071 if (final_adjustment_p
7072 && known_eq (cfun
->machine
->frame
.below_hard_fp_saved_regs_size
, 0))
7074 poly_int64 lr_offset
= cfun
->machine
->frame
.reg_offset
[LR_REGNUM
];
7075 if (known_ge (lr_offset
, 0))
7076 min_probe_threshold
-= lr_offset
.to_constant ();
7078 gcc_assert (!flag_stack_clash_protection
|| known_eq (poly_size
, 0));
7081 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
7083 /* We should always have a positive probe threshold. */
7084 gcc_assert (min_probe_threshold
> 0);
7086 if (flag_stack_clash_protection
&& !final_adjustment_p
)
7088 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
7089 poly_int64 sve_callee_adjust
= cfun
->machine
->frame
.sve_callee_adjust
;
7090 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
7092 if (known_eq (frame_size
, 0))
7094 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME
, false);
7096 else if (known_lt (initial_adjust
+ sve_callee_adjust
,
7097 guard_size
- guard_used_by_caller
)
7098 && known_lt (final_adjust
, guard_used_by_caller
))
7100 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME
, true);
7104 /* If SIZE is not large enough to require probing, just adjust the stack and
7106 if (known_lt (poly_size
, min_probe_threshold
)
7107 || !flag_stack_clash_protection
)
7109 aarch64_sub_sp (temp1
, temp2
, poly_size
, frame_related_p
);
7114 /* Handle the SVE non-constant case first. */
7115 if (!poly_size
.is_constant (&size
))
7119 fprintf (dump_file
, "Stack clash SVE prologue: ");
7120 print_dec (poly_size
, dump_file
);
7121 fprintf (dump_file
, " bytes, dynamic probing will be required.\n");
7124 /* First calculate the amount of bytes we're actually spilling. */
7125 aarch64_add_offset (Pmode
, temp1
, CONST0_RTX (Pmode
),
7126 poly_size
, temp1
, temp2
, false, true);
7128 rtx_insn
*insn
= get_last_insn ();
7130 if (frame_related_p
)
7132 /* This is done to provide unwinding information for the stack
7133 adjustments we're about to do, however to prevent the optimizers
7134 from removing the R11 move and leaving the CFA note (which would be
7135 very wrong) we tie the old and new stack pointer together.
7136 The tie will expand to nothing but the optimizers will not touch
7138 rtx stack_ptr_copy
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
7139 emit_move_insn (stack_ptr_copy
, stack_pointer_rtx
);
7140 emit_insn (gen_stack_tie (stack_ptr_copy
, stack_pointer_rtx
));
7142 /* We want the CFA independent of the stack pointer for the
7143 duration of the loop. */
7144 add_reg_note (insn
, REG_CFA_DEF_CFA
, stack_ptr_copy
);
7145 RTX_FRAME_RELATED_P (insn
) = 1;
7148 rtx probe_const
= gen_int_mode (min_probe_threshold
, Pmode
);
7149 rtx guard_const
= gen_int_mode (guard_size
, Pmode
);
7151 insn
= emit_insn (gen_probe_sve_stack_clash (Pmode
, stack_pointer_rtx
,
7152 stack_pointer_rtx
, temp1
,
7153 probe_const
, guard_const
));
7155 /* Now reset the CFA register if needed. */
7156 if (frame_related_p
)
7158 add_reg_note (insn
, REG_CFA_DEF_CFA
,
7159 gen_rtx_PLUS (Pmode
, stack_pointer_rtx
,
7160 gen_int_mode (poly_size
, Pmode
)));
7161 RTX_FRAME_RELATED_P (insn
) = 1;
7169 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7170 " bytes, probing will be required.\n", size
);
7172 /* Round size to the nearest multiple of guard_size, and calculate the
7173 residual as the difference between the original size and the rounded
7175 HOST_WIDE_INT rounded_size
= ROUND_DOWN (size
, guard_size
);
7176 HOST_WIDE_INT residual
= size
- rounded_size
;
7178 /* We can handle a small number of allocations/probes inline. Otherwise
7180 if (rounded_size
<= STACK_CLASH_MAX_UNROLL_PAGES
* guard_size
)
7182 for (HOST_WIDE_INT i
= 0; i
< rounded_size
; i
+= guard_size
)
7184 aarch64_sub_sp (NULL
, temp2
, guard_size
, true);
7185 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
7186 guard_used_by_caller
));
7187 emit_insn (gen_blockage ());
7189 dump_stack_clash_frame_info (PROBE_INLINE
, size
!= rounded_size
);
7193 /* Compute the ending address. */
7194 aarch64_add_offset (Pmode
, temp1
, stack_pointer_rtx
, -rounded_size
,
7195 temp1
, NULL
, false, true);
7196 rtx_insn
*insn
= get_last_insn ();
7198 /* For the initial allocation, we don't have a frame pointer
7199 set up, so we always need CFI notes. If we're doing the
7200 final allocation, then we may have a frame pointer, in which
7201 case it is the CFA, otherwise we need CFI notes.
7203 We can determine which allocation we are doing by looking at
7204 the value of FRAME_RELATED_P since the final allocations are not
7206 if (frame_related_p
)
7208 /* We want the CFA independent of the stack pointer for the
7209 duration of the loop. */
7210 add_reg_note (insn
, REG_CFA_DEF_CFA
,
7211 plus_constant (Pmode
, temp1
, rounded_size
));
7212 RTX_FRAME_RELATED_P (insn
) = 1;
7215 /* This allocates and probes the stack. Note that this re-uses some of
7216 the existing Ada stack protection code. However we are guaranteed not
7217 to enter the non loop or residual branches of that code.
7219 The non-loop part won't be entered because if our allocation amount
7220 doesn't require a loop, the case above would handle it.
7222 The residual amount won't be entered because TEMP1 is a mutliple of
7223 the allocation size. The residual will always be 0. As such, the only
7224 part we are actually using from that code is the loop setup. The
7225 actual probing is done in aarch64_output_probe_stack_range. */
7226 insn
= emit_insn (gen_probe_stack_range (stack_pointer_rtx
,
7227 stack_pointer_rtx
, temp1
));
7229 /* Now reset the CFA register if needed. */
7230 if (frame_related_p
)
7232 add_reg_note (insn
, REG_CFA_DEF_CFA
,
7233 plus_constant (Pmode
, stack_pointer_rtx
, rounded_size
));
7234 RTX_FRAME_RELATED_P (insn
) = 1;
7237 emit_insn (gen_blockage ());
7238 dump_stack_clash_frame_info (PROBE_LOOP
, size
!= rounded_size
);
7241 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
7242 be probed. This maintains the requirement that each page is probed at
7243 least once. For initial probing we probe only if the allocation is
7244 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
7245 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
7246 GUARD_SIZE. This works that for any allocation that is large enough to
7247 trigger a probe here, we'll have at least one, and if they're not large
7248 enough for this code to emit anything for them, The page would have been
7249 probed by the saving of FP/LR either by this function or any callees. If
7250 we don't have any callees then we won't have more stack adjustments and so
7254 HOST_WIDE_INT residual_probe_offset
= guard_used_by_caller
;
7255 /* If we're doing final adjustments, and we've done any full page
7256 allocations then any residual needs to be probed. */
7257 if (final_adjustment_p
&& rounded_size
!= 0)
7258 min_probe_threshold
= 0;
7259 /* If doing a small final adjustment, we always probe at offset 0.
7260 This is done to avoid issues when LR is not at position 0 or when
7261 the final adjustment is smaller than the probing offset. */
7262 else if (final_adjustment_p
&& rounded_size
== 0)
7263 residual_probe_offset
= 0;
7265 aarch64_sub_sp (temp1
, temp2
, residual
, frame_related_p
);
7266 if (residual
>= min_probe_threshold
)
7270 "Stack clash AArch64 prologue residuals: "
7271 HOST_WIDE_INT_PRINT_DEC
" bytes, probing will be required."
7274 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
7275 residual_probe_offset
));
7276 emit_insn (gen_blockage ());
7281 /* Return 1 if the register is used by the epilogue. We need to say the
7282 return register is used, but only after epilogue generation is complete.
7283 Note that in the case of sibcalls, the values "used by the epilogue" are
7284 considered live at the start of the called function.
7286 For SIMD functions we need to return 1 for FP registers that are saved and
7287 restored by a function but are not zero in call_used_regs. If we do not do
7288 this optimizations may remove the restore of the register. */
7291 aarch64_epilogue_uses (int regno
)
7293 if (epilogue_completed
)
7295 if (regno
== LR_REGNUM
)
7301 /* AArch64 stack frames generated by this compiler look like:
7303 +-------------------------------+
7305 | incoming stack arguments |
7307 +-------------------------------+
7308 | | <-- incoming stack pointer (aligned)
7309 | callee-allocated save area |
7310 | for register varargs |
7312 +-------------------------------+
7313 | local variables | <-- frame_pointer_rtx
7315 +-------------------------------+
7317 +-------------------------------+ |
7318 | callee-saved registers | | frame.saved_regs_size
7319 +-------------------------------+ |
7321 +-------------------------------+ |
7323 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
7324 | SVE vector registers | | \
7325 +-------------------------------+ | | below_hard_fp_saved_regs_size
7326 | SVE predicate registers | / /
7327 +-------------------------------+
7328 | dynamic allocation |
7329 +-------------------------------+
7331 +-------------------------------+
7332 | outgoing stack arguments | <-- arg_pointer
7334 +-------------------------------+
7335 | | <-- stack_pointer_rtx (aligned)
7337 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
7338 but leave frame_pointer_rtx and hard_frame_pointer_rtx
7341 By default for stack-clash we assume the guard is at least 64KB, but this
7342 value is configurable to either 4KB or 64KB. We also force the guard size to
7343 be the same as the probing interval and both values are kept in sync.
7345 With those assumptions the callee can allocate up to 63KB (or 3KB depending
7346 on the guard size) of stack space without probing.
7348 When probing is needed, we emit a probe at the start of the prologue
7349 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
7351 We have to track how much space has been allocated and the only stores
7352 to the stack we track as implicit probes are the FP/LR stores.
7354 For outgoing arguments we probe if the size is larger than 1KB, such that
7355 the ABI specified buffer is maintained for the next callee.
7357 The following registers are reserved during frame layout and should not be
7358 used for any other purpose:
7360 - r11: Used by stack clash protection when SVE is enabled, and also
7361 as an anchor register when saving and restoring registers
7362 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
7363 - r14 and r15: Used for speculation tracking.
7364 - r16(IP0), r17(IP1): Used by indirect tailcalls.
7365 - r30(LR), r29(FP): Used by standard frame layout.
7367 These registers must be avoided in frame layout related code unless the
7368 explicit intention is to interact with one of the features listed above. */
7370 /* Generate the prologue instructions for entry into a function.
7371 Establish the stack frame by decreasing the stack pointer with a
7372 properly calculated size and, if necessary, create a frame record
7373 filled with the values of LR and previous frame pointer. The
7374 current FP is also set up if it is in use. */
7377 aarch64_expand_prologue (void)
7379 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
7380 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
7381 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
7382 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
7383 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
7384 poly_int64 sve_callee_adjust
= cfun
->machine
->frame
.sve_callee_adjust
;
7385 poly_int64 below_hard_fp_saved_regs_size
7386 = cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
7387 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
7388 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
7389 bool emit_frame_chain
= cfun
->machine
->frame
.emit_frame_chain
;
7392 if (flag_stack_clash_protection
&& known_eq (callee_adjust
, 0))
7394 /* Fold the SVE allocation into the initial allocation.
7395 We don't do this in aarch64_layout_arg to avoid pessimizing
7396 the epilogue code. */
7397 initial_adjust
+= sve_callee_adjust
;
7398 sve_callee_adjust
= 0;
7401 /* Sign return address for functions. */
7402 if (aarch64_return_address_signing_enabled ())
7404 switch (aarch64_ra_sign_key
)
7407 insn
= emit_insn (gen_paciasp ());
7410 insn
= emit_insn (gen_pacibsp ());
7415 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
7416 RTX_FRAME_RELATED_P (insn
) = 1;
7419 if (flag_stack_usage_info
)
7420 current_function_static_stack_size
= constant_lower_bound (frame_size
);
7422 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
7424 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
7426 if (maybe_gt (frame_size
, PROBE_INTERVAL
)
7427 && maybe_gt (frame_size
, get_stack_check_protect ()))
7428 aarch64_emit_probe_stack_range (get_stack_check_protect (),
7430 - get_stack_check_protect ()));
7432 else if (maybe_gt (frame_size
, 0))
7433 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size
);
7436 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
7437 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
7439 /* In theory we should never have both an initial adjustment
7440 and a callee save adjustment. Verify that is the case since the
7441 code below does not handle it for -fstack-clash-protection. */
7442 gcc_assert (known_eq (initial_adjust
, 0) || callee_adjust
== 0);
7444 /* Will only probe if the initial adjustment is larger than the guard
7445 less the amount of the guard reserved for use by the caller's
7447 aarch64_allocate_and_probe_stack_space (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
7450 if (callee_adjust
!= 0)
7451 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
7453 /* The offset of the frame chain record (if any) from the current SP. */
7454 poly_int64 chain_offset
= (initial_adjust
+ callee_adjust
7455 - cfun
->machine
->frame
.hard_fp_offset
);
7456 gcc_assert (known_ge (chain_offset
, 0));
7458 /* The offset of the bottom of the save area from the current SP. */
7459 poly_int64 saved_regs_offset
= chain_offset
- below_hard_fp_saved_regs_size
;
7461 if (emit_frame_chain
)
7463 if (callee_adjust
== 0)
7467 aarch64_save_callee_saves (saved_regs_offset
, reg1
, reg2
,
7471 gcc_assert (known_eq (chain_offset
, 0));
7472 aarch64_add_offset (Pmode
, hard_frame_pointer_rtx
,
7473 stack_pointer_rtx
, chain_offset
,
7474 tmp1_rtx
, tmp0_rtx
, frame_pointer_needed
);
7475 if (frame_pointer_needed
&& !frame_size
.is_constant ())
7477 /* Variable-sized frames need to describe the save slot
7478 address using DW_CFA_expression rather than DW_CFA_offset.
7479 This means that, without taking further action, the
7480 locations of the registers that we've already saved would
7481 remain based on the stack pointer even after we redefine
7482 the CFA based on the frame pointer. We therefore need new
7483 DW_CFA_expressions to re-express the save slots with addresses
7484 based on the frame pointer. */
7485 rtx_insn
*insn
= get_last_insn ();
7486 gcc_assert (RTX_FRAME_RELATED_P (insn
));
7488 /* Add an explicit CFA definition if this was previously
7490 if (!find_reg_note (insn
, REG_CFA_ADJUST_CFA
, NULL_RTX
))
7492 rtx src
= plus_constant (Pmode
, stack_pointer_rtx
,
7494 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
7495 gen_rtx_SET (hard_frame_pointer_rtx
, src
));
7498 /* Change the save slot expressions for the registers that
7499 we've already saved. */
7500 aarch64_add_cfa_expression (insn
, regno_reg_rtx
[reg2
],
7501 hard_frame_pointer_rtx
, UNITS_PER_WORD
);
7502 aarch64_add_cfa_expression (insn
, regno_reg_rtx
[reg1
],
7503 hard_frame_pointer_rtx
, 0);
7505 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
7508 aarch64_save_callee_saves (saved_regs_offset
, R0_REGNUM
, R30_REGNUM
,
7509 callee_adjust
!= 0 || emit_frame_chain
,
7511 if (maybe_ne (sve_callee_adjust
, 0))
7513 gcc_assert (!flag_stack_clash_protection
7514 || known_eq (initial_adjust
, 0));
7515 aarch64_allocate_and_probe_stack_space (tmp1_rtx
, tmp0_rtx
,
7517 !frame_pointer_needed
, false);
7518 saved_regs_offset
+= sve_callee_adjust
;
7520 aarch64_save_callee_saves (saved_regs_offset
, P0_REGNUM
, P15_REGNUM
,
7521 false, emit_frame_chain
);
7522 aarch64_save_callee_saves (saved_regs_offset
, V0_REGNUM
, V31_REGNUM
,
7523 callee_adjust
!= 0 || emit_frame_chain
,
7526 /* We may need to probe the final adjustment if it is larger than the guard
7527 that is assumed by the called. */
7528 aarch64_allocate_and_probe_stack_space (tmp1_rtx
, tmp0_rtx
, final_adjust
,
7529 !frame_pointer_needed
, true);
7532 /* Return TRUE if we can use a simple_return insn.
7534 This function checks whether the callee saved stack is empty, which
7535 means no restore actions are need. The pro_and_epilogue will use
7536 this to check whether shrink-wrapping opt is feasible. */
7539 aarch64_use_return_insn_p (void)
7541 if (!reload_completed
)
7547 return known_eq (cfun
->machine
->frame
.frame_size
, 0);
7550 /* Generate the epilogue instructions for returning from a function.
7551 This is almost exactly the reverse of the prolog sequence, except
7552 that we need to insert barriers to avoid scheduling loads that read
7553 from a deallocated stack, and we optimize the unwind records by
7554 emitting them all together if possible. */
7556 aarch64_expand_epilogue (bool for_sibcall
)
7558 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
7559 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
7560 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
7561 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
7562 poly_int64 sve_callee_adjust
= cfun
->machine
->frame
.sve_callee_adjust
;
7563 poly_int64 below_hard_fp_saved_regs_size
7564 = cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
7565 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
7566 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
7569 /* A stack clash protection prologue may not have left EP0_REGNUM or
7570 EP1_REGNUM in a usable state. The same is true for allocations
7571 with an SVE component, since we then need both temporary registers
7572 for each allocation. For stack clash we are in a usable state if
7573 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
7574 HOST_WIDE_INT guard_size
7575 = 1 << param_stack_clash_protection_guard_size
;
7576 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
7578 /* We can re-use the registers when:
7580 (a) the deallocation amount is the same as the corresponding
7581 allocation amount (which is false if we combine the initial
7582 and SVE callee save allocations in the prologue); and
7584 (b) the allocation amount doesn't need a probe (which is false
7585 if the amount is guard_size - guard_used_by_caller or greater).
7587 In such situations the register should remain live with the correct
7589 bool can_inherit_p
= (initial_adjust
.is_constant ()
7590 && final_adjust
.is_constant ()
7591 && (!flag_stack_clash_protection
7592 || (known_lt (initial_adjust
,
7593 guard_size
- guard_used_by_caller
)
7594 && known_eq (sve_callee_adjust
, 0))));
7596 /* We need to add memory barrier to prevent read from deallocated stack. */
7598 = maybe_ne (get_frame_size ()
7599 + cfun
->machine
->frame
.saved_varargs_size
, 0);
7601 /* Emit a barrier to prevent loads from a deallocated stack. */
7602 if (maybe_gt (final_adjust
, crtl
->outgoing_args_size
)
7603 || cfun
->calls_alloca
7604 || crtl
->calls_eh_return
)
7606 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
7607 need_barrier_p
= false;
7610 /* Restore the stack pointer from the frame pointer if it may not
7611 be the same as the stack pointer. */
7612 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
7613 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
7614 if (frame_pointer_needed
7615 && (maybe_ne (final_adjust
, 0) || cfun
->calls_alloca
))
7616 /* If writeback is used when restoring callee-saves, the CFA
7617 is restored on the instruction doing the writeback. */
7618 aarch64_add_offset (Pmode
, stack_pointer_rtx
,
7619 hard_frame_pointer_rtx
,
7620 -callee_offset
- below_hard_fp_saved_regs_size
,
7621 tmp1_rtx
, tmp0_rtx
, callee_adjust
== 0);
7623 /* The case where we need to re-use the register here is very rare, so
7624 avoid the complicated condition and just always emit a move if the
7625 immediate doesn't fit. */
7626 aarch64_add_sp (tmp1_rtx
, tmp0_rtx
, final_adjust
, true);
7628 /* Restore the vector registers before the predicate registers,
7629 so that we can use P4 as a temporary for big-endian SVE frames. */
7630 aarch64_restore_callee_saves (callee_offset
, V0_REGNUM
, V31_REGNUM
,
7631 callee_adjust
!= 0, &cfi_ops
);
7632 aarch64_restore_callee_saves (callee_offset
, P0_REGNUM
, P15_REGNUM
,
7634 if (maybe_ne (sve_callee_adjust
, 0))
7635 aarch64_add_sp (NULL_RTX
, NULL_RTX
, sve_callee_adjust
, true);
7636 aarch64_restore_callee_saves (callee_offset
- sve_callee_adjust
,
7637 R0_REGNUM
, R30_REGNUM
,
7638 callee_adjust
!= 0, &cfi_ops
);
7641 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
7643 if (callee_adjust
!= 0)
7644 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
7646 if (callee_adjust
!= 0 || maybe_gt (initial_adjust
, 65536))
7648 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
7649 insn
= get_last_insn ();
7650 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
7651 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
7652 RTX_FRAME_RELATED_P (insn
) = 1;
7656 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
7657 add restriction on emit_move optimization to leaf functions. */
7658 aarch64_add_sp (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
7659 (!can_inherit_p
|| !crtl
->is_leaf
7660 || df_regs_ever_live_p (EP0_REGNUM
)));
7664 /* Emit delayed restores and reset the CFA to be SP. */
7665 insn
= get_last_insn ();
7666 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
7667 REG_NOTES (insn
) = cfi_ops
;
7668 RTX_FRAME_RELATED_P (insn
) = 1;
7671 /* We prefer to emit the combined return/authenticate instruction RETAA,
7672 however there are three cases in which we must instead emit an explicit
7673 authentication instruction.
7675 1) Sibcalls don't return in a normal way, so if we're about to call one
7676 we must authenticate.
7678 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
7679 generating code for !TARGET_ARMV8_3 we can't use it and must
7680 explicitly authenticate.
7682 3) On an eh_return path we make extra stack adjustments to update the
7683 canonical frame address to be the exception handler's CFA. We want
7684 to authenticate using the CFA of the function which calls eh_return.
7686 if (aarch64_return_address_signing_enabled ()
7687 && (for_sibcall
|| !TARGET_ARMV8_3
|| crtl
->calls_eh_return
))
7689 switch (aarch64_ra_sign_key
)
7692 insn
= emit_insn (gen_autiasp ());
7695 insn
= emit_insn (gen_autibsp ());
7700 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
7701 RTX_FRAME_RELATED_P (insn
) = 1;
7704 /* Stack adjustment for exception handler. */
7705 if (crtl
->calls_eh_return
&& !for_sibcall
)
7707 /* We need to unwind the stack by the offset computed by
7708 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
7709 to be SP; letting the CFA move during this adjustment
7710 is just as correct as retaining the CFA from the body
7711 of the function. Therefore, do nothing special. */
7712 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
7715 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
7717 emit_jump_insn (ret_rtx
);
7720 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
7721 normally or return to a previous frame after unwinding.
7723 An EH return uses a single shared return sequence. The epilogue is
7724 exactly like a normal epilogue except that it has an extra input
7725 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
7726 that must be applied after the frame has been destroyed. An extra label
7727 is inserted before the epilogue which initializes this register to zero,
7728 and this is the entry point for a normal return.
7730 An actual EH return updates the return address, initializes the stack
7731 adjustment and jumps directly into the epilogue (bypassing the zeroing
7732 of the adjustment). Since the return address is typically saved on the
7733 stack when a function makes a call, the saved LR must be updated outside
7736 This poses problems as the store is generated well before the epilogue,
7737 so the offset of LR is not known yet. Also optimizations will remove the
7738 store as it appears dead, even after the epilogue is generated (as the
7739 base or offset for loading LR is different in many cases).
7741 To avoid these problems this implementation forces the frame pointer
7742 in eh_return functions so that the location of LR is fixed and known early.
7743 It also marks the store volatile, so no optimization is permitted to
7744 remove the store. */
7746 aarch64_eh_return_handler_rtx (void)
7748 rtx tmp
= gen_frame_mem (Pmode
,
7749 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
7751 /* Mark the store volatile, so no optimization is permitted to remove it. */
7752 MEM_VOLATILE_P (tmp
) = true;
7756 /* Output code to add DELTA to the first argument, and then jump
7757 to FUNCTION. Used for C++ multiple inheritance. */
7759 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
7760 HOST_WIDE_INT delta
,
7761 HOST_WIDE_INT vcall_offset
,
7764 /* The this pointer is always in x0. Note that this differs from
7765 Arm where the this pointer maybe bumped to r1 if r0 is required
7766 to return a pointer to an aggregate. On AArch64 a result value
7767 pointer will be in x8. */
7768 int this_regno
= R0_REGNUM
;
7769 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
7771 const char *fnname
= IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk
));
7773 if (aarch64_bti_enabled ())
7774 emit_insn (gen_bti_c());
7776 reload_completed
= 1;
7777 emit_note (NOTE_INSN_PROLOGUE_END
);
7779 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
7780 temp0
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
7781 temp1
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
7783 if (vcall_offset
== 0)
7784 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
, temp1
, temp0
, false);
7787 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
7792 if (delta
>= -256 && delta
< 256)
7793 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
7794 plus_constant (Pmode
, this_rtx
, delta
));
7796 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
,
7797 temp1
, temp0
, false);
7800 if (Pmode
== ptr_mode
)
7801 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
7803 aarch64_emit_move (temp0
,
7804 gen_rtx_ZERO_EXTEND (Pmode
,
7805 gen_rtx_MEM (ptr_mode
, addr
)));
7807 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
7808 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
7811 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
7813 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
7816 if (Pmode
== ptr_mode
)
7817 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
7819 aarch64_emit_move (temp1
,
7820 gen_rtx_SIGN_EXTEND (Pmode
,
7821 gen_rtx_MEM (ptr_mode
, addr
)));
7823 emit_insn (gen_add2_insn (this_rtx
, temp1
));
7826 /* Generate a tail call to the target function. */
7827 if (!TREE_USED (function
))
7829 assemble_external (function
);
7830 TREE_USED (function
) = 1;
7832 funexp
= XEXP (DECL_RTL (function
), 0);
7833 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
7834 rtx callee_abi
= gen_int_mode (fndecl_abi (function
).id (), DImode
);
7835 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, callee_abi
));
7836 SIBLING_CALL_P (insn
) = 1;
7838 insn
= get_insns ();
7839 shorten_branches (insn
);
7841 assemble_start_function (thunk
, fnname
);
7842 final_start_function (insn
, file
, 1);
7843 final (insn
, file
, 1);
7844 final_end_function ();
7845 assemble_end_function (thunk
, fnname
);
7847 /* Stop pretending to be a post-reload pass. */
7848 reload_completed
= 0;
7852 aarch64_tls_referenced_p (rtx x
)
7854 if (!TARGET_HAVE_TLS
)
7856 subrtx_iterator::array_type array
;
7857 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
7859 const_rtx x
= *iter
;
7860 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
7862 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
7863 TLS offsets, not real symbol references. */
7864 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
7865 iter
.skip_subrtxes ();
7871 /* Return true if val can be encoded as a 12-bit unsigned immediate with
7872 a left shift of 0 or 12 bits. */
7874 aarch64_uimm12_shift (HOST_WIDE_INT val
)
7876 return ((val
& (((HOST_WIDE_INT
) 0xfff) << 0)) == val
7877 || (val
& (((HOST_WIDE_INT
) 0xfff) << 12)) == val
7881 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
7882 that can be created with a left shift of 0 or 12. */
7883 static HOST_WIDE_INT
7884 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
)
7886 /* Check to see if the value fits in 24 bits, as that is the maximum we can
7887 handle correctly. */
7888 gcc_assert ((val
& 0xffffff) == val
);
7890 if (((val
& 0xfff) << 0) == val
)
7893 return val
& (0xfff << 12);
7896 /* Return true if val is an immediate that can be loaded into a
7897 register by a MOVZ instruction. */
7899 aarch64_movw_imm (HOST_WIDE_INT val
, scalar_int_mode mode
)
7901 if (GET_MODE_SIZE (mode
) > 4)
7903 if ((val
& (((HOST_WIDE_INT
) 0xffff) << 32)) == val
7904 || (val
& (((HOST_WIDE_INT
) 0xffff) << 48)) == val
)
7909 /* Ignore sign extension. */
7910 val
&= (HOST_WIDE_INT
) 0xffffffff;
7912 return ((val
& (((HOST_WIDE_INT
) 0xffff) << 0)) == val
7913 || (val
& (((HOST_WIDE_INT
) 0xffff) << 16)) == val
);
7918 X = (X & AND_VAL) | IOR_VAL;
7920 can be implemented using:
7922 MOVK X, #(IOR_VAL >> shift), LSL #shift
7924 Return the shift if so, otherwise return -1. */
7926 aarch64_movk_shift (const wide_int_ref
&and_val
,
7927 const wide_int_ref
&ior_val
)
7929 unsigned int precision
= and_val
.get_precision ();
7930 unsigned HOST_WIDE_INT mask
= 0xffff;
7931 for (unsigned int shift
= 0; shift
< precision
; shift
+= 16)
7933 if (and_val
== ~mask
&& (ior_val
& mask
) == ior_val
)
7940 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
7941 64-bit (DImode) integer. */
7943 static unsigned HOST_WIDE_INT
7944 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val
, machine_mode mode
)
7946 unsigned int size
= GET_MODE_UNIT_PRECISION (mode
);
7949 val
&= (HOST_WIDE_INT_1U
<< size
) - 1;
7956 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
7958 static const unsigned HOST_WIDE_INT bitmask_imm_mul
[] =
7960 0x0000000100000001ull
,
7961 0x0001000100010001ull
,
7962 0x0101010101010101ull
,
7963 0x1111111111111111ull
,
7964 0x5555555555555555ull
,
7968 /* Return true if val is a valid bitmask immediate. */
7971 aarch64_bitmask_imm (HOST_WIDE_INT val_in
, machine_mode mode
)
7973 unsigned HOST_WIDE_INT val
, tmp
, mask
, first_one
, next_one
;
7976 /* Check for a single sequence of one bits and return quickly if so.
7977 The special cases of all ones and all zeroes returns false. */
7978 val
= aarch64_replicate_bitmask_imm (val_in
, mode
);
7979 tmp
= val
+ (val
& -val
);
7981 if (tmp
== (tmp
& -tmp
))
7982 return (val
+ 1) > 1;
7984 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7986 val
= (val
<< 32) | (val
& 0xffffffff);
7988 /* Invert if the immediate doesn't start with a zero bit - this means we
7989 only need to search for sequences of one bits. */
7993 /* Find the first set bit and set tmp to val with the first sequence of one
7994 bits removed. Return success if there is a single sequence of ones. */
7995 first_one
= val
& -val
;
7996 tmp
= val
& (val
+ first_one
);
8001 /* Find the next set bit and compute the difference in bit position. */
8002 next_one
= tmp
& -tmp
;
8003 bits
= clz_hwi (first_one
) - clz_hwi (next_one
);
8006 /* Check the bit position difference is a power of 2, and that the first
8007 sequence of one bits fits within 'bits' bits. */
8008 if ((mask
>> bits
) != 0 || bits
!= (bits
& -bits
))
8011 /* Check the sequence of one bits is repeated 64/bits times. */
8012 return val
== mask
* bitmask_imm_mul
[__builtin_clz (bits
) - 26];
8015 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
8016 Assumed precondition: VAL_IN Is not zero. */
8018 unsigned HOST_WIDE_INT
8019 aarch64_and_split_imm1 (HOST_WIDE_INT val_in
)
8021 int lowest_bit_set
= ctz_hwi (val_in
);
8022 int highest_bit_set
= floor_log2 (val_in
);
8023 gcc_assert (val_in
!= 0);
8025 return ((HOST_WIDE_INT_UC (2) << highest_bit_set
) -
8026 (HOST_WIDE_INT_1U
<< lowest_bit_set
));
8029 /* Create constant where bits outside of lowest bit set to highest bit set
8032 unsigned HOST_WIDE_INT
8033 aarch64_and_split_imm2 (HOST_WIDE_INT val_in
)
8035 return val_in
| ~aarch64_and_split_imm1 (val_in
);
8038 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
8041 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in
, machine_mode mode
)
8043 scalar_int_mode int_mode
;
8044 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
8047 if (aarch64_bitmask_imm (val_in
, int_mode
))
8050 if (aarch64_move_imm (val_in
, int_mode
))
8053 unsigned HOST_WIDE_INT imm2
= aarch64_and_split_imm2 (val_in
);
8055 return aarch64_bitmask_imm (imm2
, int_mode
);
8058 /* Return true if val is an immediate that can be loaded into a
8059 register in a single instruction. */
8061 aarch64_move_imm (HOST_WIDE_INT val
, machine_mode mode
)
8063 scalar_int_mode int_mode
;
8064 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
8067 if (aarch64_movw_imm (val
, int_mode
) || aarch64_movw_imm (~val
, int_mode
))
8069 return aarch64_bitmask_imm (val
, int_mode
);
8073 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
8077 if (GET_CODE (x
) == HIGH
)
8080 /* There's no way to calculate VL-based values using relocations. */
8081 subrtx_iterator::array_type array
;
8082 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
8083 if (GET_CODE (*iter
) == CONST_POLY_INT
)
8086 split_const (x
, &base
, &offset
);
8087 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
8089 if (aarch64_classify_symbol (base
, INTVAL (offset
))
8090 != SYMBOL_FORCE_TO_MEM
)
8093 /* Avoid generating a 64-bit relocation in ILP32; leave
8094 to aarch64_expand_mov_immediate to handle it properly. */
8095 return mode
!= ptr_mode
;
8098 return aarch64_tls_referenced_p (x
);
8101 /* Implement TARGET_CASE_VALUES_THRESHOLD.
8102 The expansion for a table switch is quite expensive due to the number
8103 of instructions, the table lookup and hard to predict indirect jump.
8104 When optimizing for speed, and -O3 enabled, use the per-core tuning if
8105 set, otherwise use tables for > 16 cases as a tradeoff between size and
8106 performance. When optimizing for size, use the default setting. */
8109 aarch64_case_values_threshold (void)
8111 /* Use the specified limit for the number of cases before using jump
8112 tables at higher optimization levels. */
8114 && selected_cpu
->tune
->max_case_values
!= 0)
8115 return selected_cpu
->tune
->max_case_values
;
8117 return optimize_size
? default_case_values_threshold () : 17;
8120 /* Return true if register REGNO is a valid index register.
8121 STRICT_P is true if REG_OK_STRICT is in effect. */
8124 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
8126 if (!HARD_REGISTER_NUM_P (regno
))
8134 regno
= reg_renumber
[regno
];
8136 return GP_REGNUM_P (regno
);
8139 /* Return true if register REGNO is a valid base register for mode MODE.
8140 STRICT_P is true if REG_OK_STRICT is in effect. */
8143 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
8145 if (!HARD_REGISTER_NUM_P (regno
))
8153 regno
= reg_renumber
[regno
];
8156 /* The fake registers will be eliminated to either the stack or
8157 hard frame pointer, both of which are usually valid base registers.
8158 Reload deals with the cases where the eliminated form isn't valid. */
8159 return (GP_REGNUM_P (regno
)
8160 || regno
== SP_REGNUM
8161 || regno
== FRAME_POINTER_REGNUM
8162 || regno
== ARG_POINTER_REGNUM
);
8165 /* Return true if X is a valid base register for mode MODE.
8166 STRICT_P is true if REG_OK_STRICT is in effect. */
8169 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
8172 && GET_CODE (x
) == SUBREG
8173 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (x
))])
8176 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
8179 /* Return true if address offset is a valid index. If it is, fill in INFO
8180 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
8183 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
8184 machine_mode mode
, bool strict_p
)
8186 enum aarch64_address_type type
;
8191 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
8192 && GET_MODE (x
) == Pmode
)
8194 type
= ADDRESS_REG_REG
;
8198 /* (sign_extend:DI (reg:SI)) */
8199 else if ((GET_CODE (x
) == SIGN_EXTEND
8200 || GET_CODE (x
) == ZERO_EXTEND
)
8201 && GET_MODE (x
) == DImode
8202 && GET_MODE (XEXP (x
, 0)) == SImode
)
8204 type
= (GET_CODE (x
) == SIGN_EXTEND
)
8205 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
8206 index
= XEXP (x
, 0);
8209 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8210 else if (GET_CODE (x
) == MULT
8211 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
8212 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
8213 && GET_MODE (XEXP (x
, 0)) == DImode
8214 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
8215 && CONST_INT_P (XEXP (x
, 1)))
8217 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
8218 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
8219 index
= XEXP (XEXP (x
, 0), 0);
8220 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
8222 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8223 else if (GET_CODE (x
) == ASHIFT
8224 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
8225 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
8226 && GET_MODE (XEXP (x
, 0)) == DImode
8227 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
8228 && CONST_INT_P (XEXP (x
, 1)))
8230 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
8231 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
8232 index
= XEXP (XEXP (x
, 0), 0);
8233 shift
= INTVAL (XEXP (x
, 1));
8235 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
8236 else if ((GET_CODE (x
) == SIGN_EXTRACT
8237 || GET_CODE (x
) == ZERO_EXTRACT
)
8238 && GET_MODE (x
) == DImode
8239 && GET_CODE (XEXP (x
, 0)) == MULT
8240 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
8241 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
8243 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
8244 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
8245 index
= XEXP (XEXP (x
, 0), 0);
8246 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
8247 if (INTVAL (XEXP (x
, 1)) != 32 + shift
8248 || INTVAL (XEXP (x
, 2)) != 0)
8251 /* (and:DI (mult:DI (reg:DI) (const_int scale))
8252 (const_int 0xffffffff<<shift)) */
8253 else if (GET_CODE (x
) == AND
8254 && GET_MODE (x
) == DImode
8255 && GET_CODE (XEXP (x
, 0)) == MULT
8256 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
8257 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
8258 && CONST_INT_P (XEXP (x
, 1)))
8260 type
= ADDRESS_REG_UXTW
;
8261 index
= XEXP (XEXP (x
, 0), 0);
8262 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
8263 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
8266 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
8267 else if ((GET_CODE (x
) == SIGN_EXTRACT
8268 || GET_CODE (x
) == ZERO_EXTRACT
)
8269 && GET_MODE (x
) == DImode
8270 && GET_CODE (XEXP (x
, 0)) == ASHIFT
8271 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
8272 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
8274 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
8275 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
8276 index
= XEXP (XEXP (x
, 0), 0);
8277 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
8278 if (INTVAL (XEXP (x
, 1)) != 32 + shift
8279 || INTVAL (XEXP (x
, 2)) != 0)
8282 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
8283 (const_int 0xffffffff<<shift)) */
8284 else if (GET_CODE (x
) == AND
8285 && GET_MODE (x
) == DImode
8286 && GET_CODE (XEXP (x
, 0)) == ASHIFT
8287 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
8288 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
8289 && CONST_INT_P (XEXP (x
, 1)))
8291 type
= ADDRESS_REG_UXTW
;
8292 index
= XEXP (XEXP (x
, 0), 0);
8293 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
8294 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
8297 /* (mult:P (reg:P) (const_int scale)) */
8298 else if (GET_CODE (x
) == MULT
8299 && GET_MODE (x
) == Pmode
8300 && GET_MODE (XEXP (x
, 0)) == Pmode
8301 && CONST_INT_P (XEXP (x
, 1)))
8303 type
= ADDRESS_REG_REG
;
8304 index
= XEXP (x
, 0);
8305 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
8307 /* (ashift:P (reg:P) (const_int shift)) */
8308 else if (GET_CODE (x
) == ASHIFT
8309 && GET_MODE (x
) == Pmode
8310 && GET_MODE (XEXP (x
, 0)) == Pmode
8311 && CONST_INT_P (XEXP (x
, 1)))
8313 type
= ADDRESS_REG_REG
;
8314 index
= XEXP (x
, 0);
8315 shift
= INTVAL (XEXP (x
, 1));
8321 && GET_CODE (index
) == SUBREG
8322 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
8323 index
= SUBREG_REG (index
);
8325 if (aarch64_sve_data_mode_p (mode
))
8327 if (type
!= ADDRESS_REG_REG
8328 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
8334 && !(IN_RANGE (shift
, 1, 3)
8335 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
8340 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
8343 info
->offset
= index
;
8344 info
->shift
= shift
;
8351 /* Return true if MODE is one of the modes for which we
8352 support LDP/STP operations. */
8355 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
8357 return mode
== SImode
|| mode
== DImode
8358 || mode
== SFmode
|| mode
== DFmode
8359 || (aarch64_vector_mode_supported_p (mode
)
8360 && (known_eq (GET_MODE_SIZE (mode
), 8)
8361 || (known_eq (GET_MODE_SIZE (mode
), 16)
8362 && (aarch64_tune_params
.extra_tuning_flags
8363 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
) == 0)));
8366 /* Return true if REGNO is a virtual pointer register, or an eliminable
8367 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
8368 include stack_pointer or hard_frame_pointer. */
8370 virt_or_elim_regno_p (unsigned regno
)
8372 return ((regno
>= FIRST_VIRTUAL_REGISTER
8373 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
8374 || regno
== FRAME_POINTER_REGNUM
8375 || regno
== ARG_POINTER_REGNUM
);
8378 /* Return true if X is a valid address of type TYPE for machine mode MODE.
8379 If it is, fill in INFO appropriately. STRICT_P is true if
8380 REG_OK_STRICT is in effect. */
8383 aarch64_classify_address (struct aarch64_address_info
*info
,
8384 rtx x
, machine_mode mode
, bool strict_p
,
8385 aarch64_addr_query_type type
)
8387 enum rtx_code code
= GET_CODE (x
);
8391 HOST_WIDE_INT const_size
;
8393 /* Whether a vector mode is partial doesn't affect address legitimacy.
8394 Partial vectors like VNx8QImode allow the same indexed addressing
8395 mode and MUL VL addressing mode as full vectors like VNx16QImode;
8396 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
8397 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
8398 vec_flags
&= ~VEC_PARTIAL
;
8400 /* On BE, we use load/store pair for all large int mode load/stores.
8401 TI/TFmode may also use a load/store pair. */
8402 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
8403 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
8404 || type
== ADDR_QUERY_LDP_STP_N
8407 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
8409 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
8410 corresponds to the actual size of the memory being loaded/stored and the
8411 mode of the corresponding addressing mode is half of that. */
8412 if (type
== ADDR_QUERY_LDP_STP_N
8413 && known_eq (GET_MODE_SIZE (mode
), 16))
8416 bool allow_reg_index_p
= (!load_store_pair_p
8417 && (known_lt (GET_MODE_SIZE (mode
), 16)
8418 || vec_flags
== VEC_ADVSIMD
8419 || vec_flags
& VEC_SVE_DATA
));
8421 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
8422 [Rn, #offset, MUL VL]. */
8423 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
8424 && (code
!= REG
&& code
!= PLUS
))
8427 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
8429 if (advsimd_struct_p
8430 && !BYTES_BIG_ENDIAN
8431 && (code
!= POST_INC
&& code
!= REG
))
8434 gcc_checking_assert (GET_MODE (x
) == VOIDmode
8435 || SCALAR_INT_MODE_P (GET_MODE (x
)));
8441 info
->type
= ADDRESS_REG_IMM
;
8443 info
->offset
= const0_rtx
;
8444 info
->const_offset
= 0;
8445 return aarch64_base_register_rtx_p (x
, strict_p
);
8453 && virt_or_elim_regno_p (REGNO (op0
))
8454 && poly_int_rtx_p (op1
, &offset
))
8456 info
->type
= ADDRESS_REG_IMM
;
8459 info
->const_offset
= offset
;
8464 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
8465 && aarch64_base_register_rtx_p (op0
, strict_p
)
8466 && poly_int_rtx_p (op1
, &offset
))
8468 info
->type
= ADDRESS_REG_IMM
;
8471 info
->const_offset
= offset
;
8473 /* TImode and TFmode values are allowed in both pairs of X
8474 registers and individual Q registers. The available
8476 X,X: 7-bit signed scaled offset
8477 Q: 9-bit signed offset
8478 We conservatively require an offset representable in either mode.
8479 When performing the check for pairs of X registers i.e. LDP/STP
8480 pass down DImode since that is the natural size of the LDP/STP
8481 instruction memory accesses. */
8482 if (mode
== TImode
|| mode
== TFmode
)
8483 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
8484 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
8485 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
8487 /* A 7bit offset check because OImode will emit a ldp/stp
8488 instruction (only big endian will get here).
8489 For ldp/stp instructions, the offset is scaled for the size of a
8490 single element of the pair. */
8492 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
8494 /* Three 9/12 bit offsets checks because CImode will emit three
8495 ldr/str instructions (only big endian will get here). */
8497 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
8498 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode
,
8500 || offset_12bit_unsigned_scaled_p (V16QImode
,
8503 /* Two 7bit offsets checks because XImode will emit two ldp/stp
8504 instructions (only big endian will get here). */
8506 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
8507 && aarch64_offset_7bit_signed_scaled_p (TImode
,
8510 /* Make "m" use the LD1 offset range for SVE data modes, so
8511 that pre-RTL optimizers like ivopts will work to that
8512 instead of the wider LDR/STR range. */
8513 if (vec_flags
== VEC_SVE_DATA
)
8514 return (type
== ADDR_QUERY_M
8515 ? offset_4bit_signed_scaled_p (mode
, offset
)
8516 : offset_9bit_signed_scaled_p (mode
, offset
));
8518 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
8520 poly_int64 end_offset
= (offset
8521 + GET_MODE_SIZE (mode
)
8522 - BYTES_PER_SVE_VECTOR
);
8523 return (type
== ADDR_QUERY_M
8524 ? offset_4bit_signed_scaled_p (mode
, offset
)
8525 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
8526 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
8530 if (vec_flags
== VEC_SVE_PRED
)
8531 return offset_9bit_signed_scaled_p (mode
, offset
);
8533 if (load_store_pair_p
)
8534 return ((known_eq (GET_MODE_SIZE (mode
), 4)
8535 || known_eq (GET_MODE_SIZE (mode
), 8)
8536 || known_eq (GET_MODE_SIZE (mode
), 16))
8537 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
8539 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
8540 || offset_12bit_unsigned_scaled_p (mode
, offset
));
8543 if (allow_reg_index_p
)
8545 /* Look for base + (scaled/extended) index register. */
8546 if (aarch64_base_register_rtx_p (op0
, strict_p
)
8547 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
8552 if (aarch64_base_register_rtx_p (op1
, strict_p
)
8553 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
8566 info
->type
= ADDRESS_REG_WB
;
8567 info
->base
= XEXP (x
, 0);
8568 info
->offset
= NULL_RTX
;
8569 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
8573 info
->type
= ADDRESS_REG_WB
;
8574 info
->base
= XEXP (x
, 0);
8575 if (GET_CODE (XEXP (x
, 1)) == PLUS
8576 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
8577 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
8578 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
8580 info
->offset
= XEXP (XEXP (x
, 1), 1);
8581 info
->const_offset
= offset
;
8583 /* TImode and TFmode values are allowed in both pairs of X
8584 registers and individual Q registers. The available
8586 X,X: 7-bit signed scaled offset
8587 Q: 9-bit signed offset
8588 We conservatively require an offset representable in either mode.
8590 if (mode
== TImode
|| mode
== TFmode
)
8591 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
8592 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
8594 if (load_store_pair_p
)
8595 return ((known_eq (GET_MODE_SIZE (mode
), 4)
8596 || known_eq (GET_MODE_SIZE (mode
), 8)
8597 || known_eq (GET_MODE_SIZE (mode
), 16))
8598 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
8600 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
8607 /* load literal: pc-relative constant pool entry. Only supported
8608 for SI mode or larger. */
8609 info
->type
= ADDRESS_SYMBOLIC
;
8611 if (!load_store_pair_p
8612 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
8617 split_const (x
, &sym
, &addend
);
8618 return ((GET_CODE (sym
) == LABEL_REF
8619 || (GET_CODE (sym
) == SYMBOL_REF
8620 && CONSTANT_POOL_ADDRESS_P (sym
)
8621 && aarch64_pcrelative_literal_loads
)));
8626 info
->type
= ADDRESS_LO_SUM
;
8627 info
->base
= XEXP (x
, 0);
8628 info
->offset
= XEXP (x
, 1);
8629 if (allow_reg_index_p
8630 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
8633 split_const (info
->offset
, &sym
, &offs
);
8634 if (GET_CODE (sym
) == SYMBOL_REF
8635 && (aarch64_classify_symbol (sym
, INTVAL (offs
))
8636 == SYMBOL_SMALL_ABSOLUTE
))
8638 /* The symbol and offset must be aligned to the access size. */
8641 if (CONSTANT_POOL_ADDRESS_P (sym
))
8642 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
8643 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
8645 tree exp
= SYMBOL_REF_DECL (sym
);
8646 align
= TYPE_ALIGN (TREE_TYPE (exp
));
8647 align
= aarch64_constant_alignment (exp
, align
);
8649 else if (SYMBOL_REF_DECL (sym
))
8650 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
8651 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
8652 && SYMBOL_REF_BLOCK (sym
) != NULL
)
8653 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
8655 align
= BITS_PER_UNIT
;
8657 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
8658 if (known_eq (ref_size
, 0))
8659 ref_size
= GET_MODE_SIZE (DImode
);
8661 return (multiple_p (INTVAL (offs
), ref_size
)
8662 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
8672 /* Return true if the address X is valid for a PRFM instruction.
8673 STRICT_P is true if we should do strict checking with
8674 aarch64_classify_address. */
8677 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
8679 struct aarch64_address_info addr
;
8681 /* PRFM accepts the same addresses as DImode... */
8682 bool res
= aarch64_classify_address (&addr
, x
, DImode
, strict_p
);
8686 /* ... except writeback forms. */
8687 return addr
.type
!= ADDRESS_REG_WB
;
8691 aarch64_symbolic_address_p (rtx x
)
8695 split_const (x
, &x
, &offset
);
8696 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
8699 /* Classify the base of symbolic expression X. */
8701 enum aarch64_symbol_type
8702 aarch64_classify_symbolic_expression (rtx x
)
8706 split_const (x
, &x
, &offset
);
8707 return aarch64_classify_symbol (x
, INTVAL (offset
));
8711 /* Return TRUE if X is a legitimate address for accessing memory in
8714 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
8716 struct aarch64_address_info addr
;
8718 return aarch64_classify_address (&addr
, x
, mode
, strict_p
);
8721 /* Return TRUE if X is a legitimate address of type TYPE for accessing
8722 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
8724 aarch64_legitimate_address_p (machine_mode mode
, rtx x
, bool strict_p
,
8725 aarch64_addr_query_type type
)
8727 struct aarch64_address_info addr
;
8729 return aarch64_classify_address (&addr
, x
, mode
, strict_p
, type
);
8732 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
8735 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
8736 poly_int64 orig_offset
,
8740 if (GET_MODE_SIZE (mode
).is_constant (&size
))
8742 HOST_WIDE_INT const_offset
, second_offset
;
8744 /* A general SVE offset is A * VQ + B. Remove the A component from
8745 coefficient 0 in order to get the constant B. */
8746 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
8748 /* Split an out-of-range address displacement into a base and
8749 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
8750 range otherwise to increase opportunities for sharing the base
8751 address of different sizes. Unaligned accesses use the signed
8752 9-bit range, TImode/TFmode use the intersection of signed
8753 scaled 7-bit and signed 9-bit offset. */
8754 if (mode
== TImode
|| mode
== TFmode
)
8755 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
8756 else if ((const_offset
& (size
- 1)) != 0)
8757 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
8759 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
8761 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
8764 /* Split the offset into second_offset and the rest. */
8765 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
8766 *offset2
= gen_int_mode (second_offset
, Pmode
);
8771 /* Get the mode we should use as the basis of the range. For structure
8772 modes this is the mode of one vector. */
8773 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
8774 machine_mode step_mode
8775 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
8777 /* Get the "mul vl" multiplier we'd like to use. */
8778 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
8779 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
8780 if (vec_flags
& VEC_SVE_DATA
)
8781 /* LDR supports a 9-bit range, but the move patterns for
8782 structure modes require all vectors to be in range of the
8783 same base. The simplest way of accomodating that while still
8784 promoting reuse of anchor points between different modes is
8785 to use an 8-bit range unconditionally. */
8786 vnum
= ((vnum
+ 128) & 255) - 128;
8788 /* Predicates are only handled singly, so we might as well use
8790 vnum
= ((vnum
+ 256) & 511) - 256;
8794 /* Convert the "mul vl" multiplier into a byte offset. */
8795 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
8796 if (known_eq (second_offset
, orig_offset
))
8799 /* Split the offset into second_offset and the rest. */
8800 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
8801 *offset2
= gen_int_mode (second_offset
, Pmode
);
8806 /* Return the binary representation of floating point constant VALUE in INTVAL.
8807 If the value cannot be converted, return false without setting INTVAL.
8808 The conversion is done in the given MODE. */
8810 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
8813 /* We make a general exception for 0. */
8814 if (aarch64_float_const_zero_rtx_p (value
))
8820 scalar_float_mode mode
;
8821 if (GET_CODE (value
) != CONST_DOUBLE
8822 || !is_a
<scalar_float_mode
> (GET_MODE (value
), &mode
)
8823 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
8824 /* Only support up to DF mode. */
8825 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
8828 unsigned HOST_WIDE_INT ival
= 0;
8831 real_to_target (res
,
8832 CONST_DOUBLE_REAL_VALUE (value
),
8833 REAL_MODE_FORMAT (mode
));
8837 int order
= BYTES_BIG_ENDIAN
? 1 : 0;
8838 ival
= zext_hwi (res
[order
], 32);
8839 ival
|= (zext_hwi (res
[1 - order
], 32) << 32);
8842 ival
= zext_hwi (res
[0], 32);
8848 /* Return TRUE if rtx X is an immediate constant that can be moved using a
8849 single MOV(+MOVK) followed by an FMOV. */
8851 aarch64_float_const_rtx_p (rtx x
)
8853 machine_mode mode
= GET_MODE (x
);
8854 if (mode
== VOIDmode
)
8857 /* Determine whether it's cheaper to write float constants as
8858 mov/movk pairs over ldr/adrp pairs. */
8859 unsigned HOST_WIDE_INT ival
;
8861 if (GET_CODE (x
) == CONST_DOUBLE
8862 && SCALAR_FLOAT_MODE_P (mode
)
8863 && aarch64_reinterpret_float_as_int (x
, &ival
))
8865 scalar_int_mode imode
= (mode
== HFmode
8867 : int_mode_for_mode (mode
).require ());
8868 int num_instr
= aarch64_internal_mov_immediate
8869 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
8870 return num_instr
< 3;
8876 /* Return TRUE if rtx X is immediate constant 0.0 */
8878 aarch64_float_const_zero_rtx_p (rtx x
)
8880 if (GET_MODE (x
) == VOIDmode
)
8883 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
8884 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
8885 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
8888 /* Return TRUE if rtx X is immediate constant that fits in a single
8889 MOVI immediate operation. */
8891 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
8897 scalar_int_mode imode
;
8898 unsigned HOST_WIDE_INT ival
;
8900 if (GET_CODE (x
) == CONST_DOUBLE
8901 && SCALAR_FLOAT_MODE_P (mode
))
8903 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
8906 /* We make a general exception for 0. */
8907 if (aarch64_float_const_zero_rtx_p (x
))
8910 imode
= int_mode_for_mode (mode
).require ();
8912 else if (GET_CODE (x
) == CONST_INT
8913 && is_a
<scalar_int_mode
> (mode
, &imode
))
8918 /* use a 64 bit mode for everything except for DI/DF mode, where we use
8919 a 128 bit vector mode. */
8920 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
8922 vmode
= aarch64_simd_container_mode (imode
, width
);
8923 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
8925 return aarch64_simd_valid_immediate (v_op
, NULL
);
8929 /* Return the fixed registers used for condition codes. */
8932 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
8935 *p2
= INVALID_REGNUM
;
8939 /* This function is used by the call expanders of the machine description.
8940 RESULT is the register in which the result is returned. It's NULL for
8941 "call" and "sibcall".
8942 MEM is the location of the function call.
8943 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
8944 SIBCALL indicates whether this function call is normal call or sibling call.
8945 It will generate different pattern accordingly. */
8948 aarch64_expand_call (rtx result
, rtx mem
, rtx callee_abi
, bool sibcall
)
8950 rtx call
, callee
, tmp
;
8954 gcc_assert (MEM_P (mem
));
8955 callee
= XEXP (mem
, 0);
8956 mode
= GET_MODE (callee
);
8957 gcc_assert (mode
== Pmode
);
8959 /* Decide if we should generate indirect calls by loading the
8960 address of the callee into a register before performing
8961 the branch-and-link. */
8962 if (SYMBOL_REF_P (callee
)
8963 ? (aarch64_is_long_call_p (callee
)
8964 || aarch64_is_noplt_call_p (callee
))
8966 XEXP (mem
, 0) = force_reg (mode
, callee
);
8968 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
8970 if (result
!= NULL_RTX
)
8971 call
= gen_rtx_SET (result
, call
);
8976 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
8978 gcc_assert (CONST_INT_P (callee_abi
));
8979 callee_abi
= gen_rtx_UNSPEC (DImode
, gen_rtvec (1, callee_abi
),
8982 vec
= gen_rtvec (3, call
, callee_abi
, tmp
);
8983 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
8985 aarch64_emit_call_insn (call
);
8988 /* Emit call insn with PAT and do aarch64-specific handling. */
8991 aarch64_emit_call_insn (rtx pat
)
8993 rtx insn
= emit_call_insn (pat
);
8995 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
8996 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
8997 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
9001 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
9003 machine_mode mode_x
= GET_MODE (x
);
9004 rtx_code code_x
= GET_CODE (x
);
9006 /* All floating point compares return CCFP if it is an equality
9007 comparison, and CCFPE otherwise. */
9008 if (GET_MODE_CLASS (mode_x
) == MODE_FLOAT
)
9035 /* Equality comparisons of short modes against zero can be performed
9036 using the TST instruction with the appropriate bitmask. */
9037 if (y
== const0_rtx
&& (REG_P (x
) || SUBREG_P (x
))
9038 && (code
== EQ
|| code
== NE
)
9039 && (mode_x
== HImode
|| mode_x
== QImode
))
9042 /* Similarly, comparisons of zero_extends from shorter modes can
9043 be performed using an ANDS with an immediate mask. */
9044 if (y
== const0_rtx
&& code_x
== ZERO_EXTEND
9045 && (mode_x
== SImode
|| mode_x
== DImode
)
9046 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
9047 && (code
== EQ
|| code
== NE
))
9050 if ((mode_x
== SImode
|| mode_x
== DImode
)
9052 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
9053 && (code_x
== PLUS
|| code_x
== MINUS
|| code_x
== AND
9055 || (code_x
== ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
9056 && CONST_INT_P (XEXP (x
, 2)))))
9059 /* A compare with a shifted operand. Because of canonicalization,
9060 the comparison will have to be swapped when we emit the assembly
9062 if ((mode_x
== SImode
|| mode_x
== DImode
)
9063 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
9064 && (code_x
== ASHIFT
|| code_x
== ASHIFTRT
9065 || code_x
== LSHIFTRT
9066 || code_x
== ZERO_EXTEND
|| code_x
== SIGN_EXTEND
))
9069 /* Similarly for a negated operand, but we can only do this for
9071 if ((mode_x
== SImode
|| mode_x
== DImode
)
9072 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
9073 && (code
== EQ
|| code
== NE
)
9077 /* A test for unsigned overflow from an addition. */
9078 if ((mode_x
== DImode
|| mode_x
== TImode
)
9079 && (code
== LTU
|| code
== GEU
)
9081 && rtx_equal_p (XEXP (x
, 0), y
))
9084 /* A test for unsigned overflow from an add with carry. */
9085 if ((mode_x
== DImode
|| mode_x
== TImode
)
9086 && (code
== LTU
|| code
== GEU
)
9088 && CONST_SCALAR_INT_P (y
)
9089 && (rtx_mode_t (y
, mode_x
)
9090 == (wi::shwi (1, mode_x
)
9091 << (GET_MODE_BITSIZE (mode_x
).to_constant () / 2))))
9094 /* A test for signed overflow. */
9095 if ((mode_x
== DImode
|| mode_x
== TImode
)
9098 && GET_CODE (y
) == SIGN_EXTEND
)
9101 /* For everything else, return CCmode. */
9106 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
9109 aarch64_get_condition_code (rtx x
)
9111 machine_mode mode
= GET_MODE (XEXP (x
, 0));
9112 enum rtx_code comp_code
= GET_CODE (x
);
9114 if (GET_MODE_CLASS (mode
) != MODE_CC
)
9115 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
9116 return aarch64_get_condition_code_1 (mode
, comp_code
);
9120 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
9128 case GE
: return AARCH64_GE
;
9129 case GT
: return AARCH64_GT
;
9130 case LE
: return AARCH64_LS
;
9131 case LT
: return AARCH64_MI
;
9132 case NE
: return AARCH64_NE
;
9133 case EQ
: return AARCH64_EQ
;
9134 case ORDERED
: return AARCH64_VC
;
9135 case UNORDERED
: return AARCH64_VS
;
9136 case UNLT
: return AARCH64_LT
;
9137 case UNLE
: return AARCH64_LE
;
9138 case UNGT
: return AARCH64_HI
;
9139 case UNGE
: return AARCH64_PL
;
9147 case NE
: return AARCH64_NE
;
9148 case EQ
: return AARCH64_EQ
;
9149 case GE
: return AARCH64_GE
;
9150 case GT
: return AARCH64_GT
;
9151 case LE
: return AARCH64_LE
;
9152 case LT
: return AARCH64_LT
;
9153 case GEU
: return AARCH64_CS
;
9154 case GTU
: return AARCH64_HI
;
9155 case LEU
: return AARCH64_LS
;
9156 case LTU
: return AARCH64_CC
;
9164 case NE
: return AARCH64_NE
;
9165 case EQ
: return AARCH64_EQ
;
9166 case GE
: return AARCH64_LE
;
9167 case GT
: return AARCH64_LT
;
9168 case LE
: return AARCH64_GE
;
9169 case LT
: return AARCH64_GT
;
9170 case GEU
: return AARCH64_LS
;
9171 case GTU
: return AARCH64_CC
;
9172 case LEU
: return AARCH64_CS
;
9173 case LTU
: return AARCH64_HI
;
9181 case NE
: return AARCH64_NE
; /* = any */
9182 case EQ
: return AARCH64_EQ
; /* = none */
9183 case GE
: return AARCH64_PL
; /* = nfrst */
9184 case LT
: return AARCH64_MI
; /* = first */
9185 case GEU
: return AARCH64_CS
; /* = nlast */
9186 case GTU
: return AARCH64_HI
; /* = pmore */
9187 case LEU
: return AARCH64_LS
; /* = plast */
9188 case LTU
: return AARCH64_CC
; /* = last */
9196 case NE
: return AARCH64_NE
;
9197 case EQ
: return AARCH64_EQ
;
9198 case GE
: return AARCH64_PL
;
9199 case LT
: return AARCH64_MI
;
9207 case NE
: return AARCH64_NE
;
9208 case EQ
: return AARCH64_EQ
;
9216 case LTU
: return AARCH64_CS
;
9217 case GEU
: return AARCH64_CC
;
9225 case GEU
: return AARCH64_CS
;
9226 case LTU
: return AARCH64_CC
;
9234 case NE
: return AARCH64_VS
;
9235 case EQ
: return AARCH64_VC
;
9248 aarch64_const_vec_all_same_in_range_p (rtx x
,
9249 HOST_WIDE_INT minval
,
9250 HOST_WIDE_INT maxval
)
9253 return (const_vec_duplicate_p (x
, &elt
)
9254 && CONST_INT_P (elt
)
9255 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
9259 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
9261 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
9264 /* Return true if VEC is a constant in which every element is in the range
9265 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
9268 aarch64_const_vec_all_in_range_p (rtx vec
,
9269 HOST_WIDE_INT minval
,
9270 HOST_WIDE_INT maxval
)
9272 if (GET_CODE (vec
) != CONST_VECTOR
9273 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
9277 if (!CONST_VECTOR_STEPPED_P (vec
))
9278 nunits
= const_vector_encoded_nelts (vec
);
9279 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
9282 for (int i
= 0; i
< nunits
; i
++)
9284 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
9285 if (!CONST_INT_P (vec_elem
)
9286 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
/* N Z C V.  */
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N, /* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V, /* VC, V == 0.  */
  0,		/* HI, C ==1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
  0,		/* LE, !(Z == 0 && N == V).  */
  0,		/* AL, Any.  */
  0		/* NV, Any.  */
};
9319 /* Print floating-point vector immediate operand X to F, negating it
9320 first if NEGATE is true. Return true on success, false if it isn't
9321 a constant we can handle. */
9324 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
9328 if (!const_vec_duplicate_p (x
, &elt
))
9331 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
9333 r
= real_value_negate (&r
);
9335 /* Handle the SVE single-bit immediates specially, since they have a
9336 fixed form in the assembly syntax. */
9337 if (real_equal (&r
, &dconst0
))
9338 asm_fprintf (f
, "0.0");
9339 else if (real_equal (&r
, &dconst2
))
9340 asm_fprintf (f
, "2.0");
9341 else if (real_equal (&r
, &dconst1
))
9342 asm_fprintf (f
, "1.0");
9343 else if (real_equal (&r
, &dconsthalf
))
9344 asm_fprintf (f
, "0.5");
9347 const int buf_size
= 20;
9348 char float_buf
[buf_size
] = {'\0'};
9349 real_to_decimal_for_mode (float_buf
, &r
, buf_size
, buf_size
,
9351 asm_fprintf (f
, "%s", float_buf
);
9357 /* Return the equivalent letter for size. */
9359 sizetochar (int size
)
9363 case 64: return 'd';
9364 case 32: return 's';
9365 case 16: return 'h';
9366 case 8 : return 'b';
9367 default: gcc_unreachable ();
9371 /* Print operand X to file F in a target specific manner according to CODE.
9372 The acceptable formatting commands given by CODE are:
9373 'c': An integer or symbol address without a preceding #
9375 'C': Take the duplicated element in a vector constant
9376 and print it in hex.
9377 'D': Take the duplicated element in a vector constant
9378 and print it as an unsigned integer, in decimal.
9379 'e': Print the sign/zero-extend size as a character 8->b,
9380 16->h, 32->w. Can also be used for masks:
9381 0xff->b, 0xffff->h, 0xffffffff->w.
9382 'I': If the operand is a duplicated vector constant,
9383 replace it with the duplicated scalar. If the
9384 operand is then a floating-point constant, replace
9385 it with the integer bit representation. Print the
9386 transformed constant as a signed decimal number.
9387 'p': Prints N such that 2^N == X (X must be power of 2 and
9389 'P': Print the number of non-zero bits in X (a const_int).
9390 'H': Print the higher numbered register of a pair (TImode)
9392 'm': Print a condition (eq, ne, etc).
9393 'M': Same as 'm', but invert condition.
9394 'N': Take the duplicated element in a vector constant
9395 and print the negative of it in decimal.
9396 'b/h/s/d/q': Print a scalar FP/SIMD register name.
9397 'S/T/U/V': Print a FP/SIMD register name for a register list.
9398 The register printed is the FP/SIMD register name
9399 of X + 0/1/2/3 for S/T/U/V.
9400 'R': Print a scalar Integer/FP/SIMD register name + 1.
9401 'X': Print bottom 16 bits of integer constant in hex.
9402 'w/x': Print a general register name or the zero register
9404 '0': Print a normal operand, if it's a general register,
9405 then we assume DImode.
9406 'k': Print NZCV for conditional compare instructions.
9407 'A': Output address constant representing the first
9408 argument of X, specifying a relocation offset
9410 'L': Output constant address specified by X
9411 with a relocation offset if appropriate.
9412 'G': Prints address of X, specifying a PC relative
9413 relocation mode if appropriate.
9414 'y': Output address of LDP or STP - this is used for
9415 some LDP/STPs which don't use a PARALLEL in their
9416 pattern (so the mode needs to be adjusted).
9417 'z': Output address of a typical LDP or STP. */
9420 aarch64_print_operand (FILE *f
, rtx x
, int code
)
9426 switch (GET_CODE (x
))
9429 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
9433 output_addr_const (f
, x
);
9437 if (GET_CODE (XEXP (x
, 0)) == PLUS
9438 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
9440 output_addr_const (f
, x
);
9446 output_operand_lossage ("unsupported operand for code '%c'", code
);
9452 x
= unwrap_const_vec_duplicate (x
);
9453 if (!CONST_INT_P (x
))
9455 output_operand_lossage ("invalid operand for '%%%c'", code
);
9459 HOST_WIDE_INT val
= INTVAL (x
);
9460 if ((val
& ~7) == 8 || val
== 0xff)
9462 else if ((val
& ~7) == 16 || val
== 0xffff)
9464 else if ((val
& ~7) == 32 || val
== 0xffffffff)
9468 output_operand_lossage ("invalid operand for '%%%c'", code
);
9478 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
9480 output_operand_lossage ("invalid operand for '%%%c'", code
);
9484 asm_fprintf (f
, "%d", n
);
9489 if (!CONST_INT_P (x
))
9491 output_operand_lossage ("invalid operand for '%%%c'", code
);
9495 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
9499 if (x
== const0_rtx
)
9501 asm_fprintf (f
, "xzr");
9505 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
9507 output_operand_lossage ("invalid operand for '%%%c'", code
);
9511 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
9516 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
9517 if (CONST_INT_P (x
))
9518 asm_fprintf (f
, "%wd", INTVAL (x
));
9521 output_operand_lossage ("invalid operand for '%%%c'", code
);
9531 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
9532 if (x
== const_true_rtx
)
9539 if (!COMPARISON_P (x
))
9541 output_operand_lossage ("invalid operand for '%%%c'", code
);
9545 cond_code
= aarch64_get_condition_code (x
);
9546 gcc_assert (cond_code
>= 0);
9548 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
9549 if (GET_MODE (XEXP (x
, 0)) == CC_NZCmode
)
9550 fputs (aarch64_sve_condition_codes
[cond_code
], f
);
9552 fputs (aarch64_condition_codes
[cond_code
], f
);
9557 if (!const_vec_duplicate_p (x
, &elt
))
9559 output_operand_lossage ("invalid vector constant");
9563 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
9564 asm_fprintf (f
, "%wd", -INTVAL (elt
));
9565 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
9566 && aarch64_print_vector_float_operand (f
, x
, true))
9570 output_operand_lossage ("invalid vector constant");
9580 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
9582 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
9585 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
9592 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
9594 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
9597 asm_fprintf (f
, "%c%d",
9598 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
9599 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
9603 if (REG_P (x
) && FP_REGNUM_P (REGNO (x
)))
9604 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
9605 else if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
9606 asm_fprintf (f
, "x%d", REGNO (x
) - R0_REGNUM
+ 1);
9608 output_operand_lossage ("incompatible register operand for '%%%c'",
9613 if (!CONST_INT_P (x
))
9615 output_operand_lossage ("invalid operand for '%%%c'", code
);
9618 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
9623 /* Print a replicated constant in hex. */
9624 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
9626 output_operand_lossage ("invalid operand for '%%%c'", code
);
9629 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
9630 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
9636 /* Print a replicated constant in decimal, treating it as
9638 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
9640 output_operand_lossage ("invalid operand for '%%%c'", code
);
9643 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
9644 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
9651 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
9653 asm_fprintf (f
, "%czr", code
);
9657 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
9659 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
9663 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
9665 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
9674 output_operand_lossage ("missing operand");
9678 switch (GET_CODE (x
))
9681 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
9683 if (REG_NREGS (x
) == 1)
9684 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
9688 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
9689 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
9690 REGNO (x
) - V0_REGNUM
, suffix
,
9691 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
9695 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
9699 output_address (GET_MODE (x
), XEXP (x
, 0));
9704 output_addr_const (asm_out_file
, x
);
9708 asm_fprintf (f
, "%wd", INTVAL (x
));
9712 if (!VECTOR_MODE_P (GET_MODE (x
)))
9714 output_addr_const (asm_out_file
, x
);
9720 if (!const_vec_duplicate_p (x
, &elt
))
9722 output_operand_lossage ("invalid vector constant");
9726 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
9727 asm_fprintf (f
, "%wd", INTVAL (elt
));
9728 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
9729 && aarch64_print_vector_float_operand (f
, x
, false))
9733 output_operand_lossage ("invalid vector constant");
9739 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
9740 be getting CONST_DOUBLEs holding integers. */
9741 gcc_assert (GET_MODE (x
) != VOIDmode
);
9742 if (aarch64_float_const_zero_rtx_p (x
))
9747 else if (aarch64_float_const_representable_p (x
))
9750 char float_buf
[buf_size
] = {'\0'};
9751 real_to_decimal_for_mode (float_buf
,
9752 CONST_DOUBLE_REAL_VALUE (x
),
9755 asm_fprintf (asm_out_file
, "%s", float_buf
);
9759 output_operand_lossage ("invalid constant");
9762 output_operand_lossage ("invalid operand");
9768 if (GET_CODE (x
) == HIGH
)
9771 switch (aarch64_classify_symbolic_expression (x
))
9773 case SYMBOL_SMALL_GOT_4G
:
9774 asm_fprintf (asm_out_file
, ":got:");
9777 case SYMBOL_SMALL_TLSGD
:
9778 asm_fprintf (asm_out_file
, ":tlsgd:");
9781 case SYMBOL_SMALL_TLSDESC
:
9782 asm_fprintf (asm_out_file
, ":tlsdesc:");
9785 case SYMBOL_SMALL_TLSIE
:
9786 asm_fprintf (asm_out_file
, ":gottprel:");
9789 case SYMBOL_TLSLE24
:
9790 asm_fprintf (asm_out_file
, ":tprel:");
9793 case SYMBOL_TINY_GOT
:
9800 output_addr_const (asm_out_file
, x
);
9804 switch (aarch64_classify_symbolic_expression (x
))
9806 case SYMBOL_SMALL_GOT_4G
:
9807 asm_fprintf (asm_out_file
, ":lo12:");
9810 case SYMBOL_SMALL_TLSGD
:
9811 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
9814 case SYMBOL_SMALL_TLSDESC
:
9815 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
9818 case SYMBOL_SMALL_TLSIE
:
9819 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
9822 case SYMBOL_TLSLE12
:
9823 asm_fprintf (asm_out_file
, ":tprel_lo12:");
9826 case SYMBOL_TLSLE24
:
9827 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
9830 case SYMBOL_TINY_GOT
:
9831 asm_fprintf (asm_out_file
, ":got:");
9834 case SYMBOL_TINY_TLSIE
:
9835 asm_fprintf (asm_out_file
, ":gottprel:");
9841 output_addr_const (asm_out_file
, x
);
9845 switch (aarch64_classify_symbolic_expression (x
))
9847 case SYMBOL_TLSLE24
:
9848 asm_fprintf (asm_out_file
, ":tprel_hi12:");
9853 output_addr_const (asm_out_file
, x
);
9858 HOST_WIDE_INT cond_code
;
9860 if (!CONST_INT_P (x
))
9862 output_operand_lossage ("invalid operand for '%%%c'", code
);
9866 cond_code
= INTVAL (x
);
9867 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
9868 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
9875 machine_mode mode
= GET_MODE (x
);
9877 if (GET_CODE (x
) != MEM
9878 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
9880 output_operand_lossage ("invalid operand for '%%%c'", code
);
9884 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
9886 ? ADDR_QUERY_LDP_STP_N
9887 : ADDR_QUERY_LDP_STP
))
9888 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
9893 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
9898 /* Print address 'x' of a memory access with mode 'mode'.
9899 'op' is the context required by aarch64_classify_address. It can either be
9900 MEM for a normal memory access or PARALLEL for LDP/STP. */
9902 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
9903 aarch64_addr_query_type type
)
9905 struct aarch64_address_info addr
;
9906 unsigned int size
, vec_flags
;
9908 /* Check all addresses are Pmode - including ILP32. */
9909 if (GET_MODE (x
) != Pmode
9910 && (!CONST_INT_P (x
)
9911 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
9913 output_operand_lossage ("invalid address mode");
9917 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
9920 case ADDRESS_REG_IMM
:
9921 if (known_eq (addr
.const_offset
, 0))
9923 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
9927 vec_flags
= aarch64_classify_vector_mode (mode
);
9928 if (vec_flags
& VEC_ANY_SVE
)
9931 = exact_div (addr
.const_offset
,
9932 aarch64_vl_bytes (mode
, vec_flags
)).to_constant ();
9933 asm_fprintf (f
, "[%s, #%wd, mul vl]",
9934 reg_names
[REGNO (addr
.base
)], vnum
);
9938 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
9939 INTVAL (addr
.offset
));
9942 case ADDRESS_REG_REG
:
9943 if (addr
.shift
== 0)
9944 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
9945 reg_names
[REGNO (addr
.offset
)]);
9947 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
9948 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
9951 case ADDRESS_REG_UXTW
:
9952 if (addr
.shift
== 0)
9953 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
9954 REGNO (addr
.offset
) - R0_REGNUM
);
9956 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
9957 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
9960 case ADDRESS_REG_SXTW
:
9961 if (addr
.shift
== 0)
9962 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
9963 REGNO (addr
.offset
) - R0_REGNUM
);
9965 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
9966 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
9969 case ADDRESS_REG_WB
:
9970 /* Writeback is only supported for fixed-width modes. */
9971 size
= GET_MODE_SIZE (mode
).to_constant ();
9972 switch (GET_CODE (x
))
9975 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
9978 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
9981 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
9984 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
9987 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
9988 INTVAL (addr
.offset
));
9991 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
9992 INTVAL (addr
.offset
));
9999 case ADDRESS_LO_SUM
:
10000 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
10001 output_addr_const (f
, addr
.offset
);
10002 asm_fprintf (f
, "]");
10005 case ADDRESS_SYMBOLIC
:
10006 output_addr_const (f
, x
);
10013 /* Print address 'x' of a memory access with mode 'mode'. */
10015 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
10017 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
10018 output_addr_const (f
, x
);
10022 aarch64_label_mentioned_p (rtx x
)
10027 if (GET_CODE (x
) == LABEL_REF
)
10030 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
10031 referencing instruction, but they are constant offsets, not
10033 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
10036 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
10037 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
10043 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
10044 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
10047 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
10054 /* Implement REGNO_REG_CLASS. */
10057 aarch64_regno_regclass (unsigned regno
)
10059 if (GP_REGNUM_P (regno
))
10060 return GENERAL_REGS
;
10062 if (regno
== SP_REGNUM
)
10065 if (regno
== FRAME_POINTER_REGNUM
10066 || regno
== ARG_POINTER_REGNUM
)
10067 return POINTER_REGS
;
10069 if (FP_REGNUM_P (regno
))
10070 return (FP_LO8_REGNUM_P (regno
) ? FP_LO8_REGS
10071 : FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
);
10073 if (PR_REGNUM_P (regno
))
10074 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
10076 if (regno
== FFR_REGNUM
|| regno
== FFRT_REGNUM
)
10082 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
10083 If OFFSET is out of range, return an offset of an anchor point
10084 that is in range. Return 0 otherwise. */
10086 static HOST_WIDE_INT
10087 aarch64_anchor_offset (HOST_WIDE_INT offset
, HOST_WIDE_INT size
,
10090 /* Does it look like we'll need a 16-byte load/store-pair operation? */
10092 return (offset
+ 0x400) & ~0x7f0;
10094 /* For offsets that aren't a multiple of the access size, the limit is
10096 if (offset
& (size
- 1))
10098 /* BLKmode typically uses LDP of X-registers. */
10099 if (mode
== BLKmode
)
10100 return (offset
+ 512) & ~0x3ff;
10101 return (offset
+ 0x100) & ~0x1ff;
10104 /* Small negative offsets are supported. */
10105 if (IN_RANGE (offset
, -256, 0))
10108 if (mode
== TImode
|| mode
== TFmode
)
10109 return (offset
+ 0x100) & ~0x1ff;
10111 /* Use 12-bit offset by access size. */
10112 return offset
& (~0xfff * size
);
10116 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
10118 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
10119 where mask is selected by alignment and size of the offset.
10120 We try to pick as large a range for the offset as possible to
10121 maximize the chance of a CSE. However, for aligned addresses
10122 we limit the range to 4k so that structures with different sized
10123 elements are likely to use the same base. We need to be careful
10124 not to split a CONST for some forms of address expression, otherwise
10125 it will generate sub-optimal code. */
10127 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
10129 rtx base
= XEXP (x
, 0);
10130 rtx offset_rtx
= XEXP (x
, 1);
10131 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
10133 if (GET_CODE (base
) == PLUS
)
10135 rtx op0
= XEXP (base
, 0);
10136 rtx op1
= XEXP (base
, 1);
10138 /* Force any scaling into a temp for CSE. */
10139 op0
= force_reg (Pmode
, op0
);
10140 op1
= force_reg (Pmode
, op1
);
10142 /* Let the pointer register be in op0. */
10143 if (REG_POINTER (op1
))
10144 std::swap (op0
, op1
);
10146 /* If the pointer is virtual or frame related, then we know that
10147 virtual register instantiation or register elimination is going
10148 to apply a second constant. We want the two constants folded
10149 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
10150 if (virt_or_elim_regno_p (REGNO (op0
)))
10152 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
10153 NULL_RTX
, true, OPTAB_DIRECT
);
10154 return gen_rtx_PLUS (Pmode
, base
, op1
);
10157 /* Otherwise, in order to encourage CSE (and thence loop strength
10158 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
10159 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
10160 NULL_RTX
, true, OPTAB_DIRECT
);
10161 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
10164 HOST_WIDE_INT size
;
10165 if (GET_MODE_SIZE (mode
).is_constant (&size
))
10167 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
10169 if (base_offset
!= 0)
10171 base
= plus_constant (Pmode
, base
, base_offset
);
10172 base
= force_operand (base
, NULL_RTX
);
10173 return plus_constant (Pmode
, base
, offset
- base_offset
);
10182 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
10183 reg_class_t rclass
,
10185 secondary_reload_info
*sri
)
10187 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10188 LDR and STR. See the comment at the head of aarch64-sve.md for
10189 more details about the big-endian handling. */
10190 if (reg_class_subset_p (rclass
, FP_REGS
)
10191 && !((REG_P (x
) && HARD_REGISTER_P (x
))
10192 || aarch64_simd_valid_immediate (x
, NULL
))
10193 && mode
!= VNx16QImode
)
10195 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
10196 if ((vec_flags
& VEC_SVE_DATA
)
10197 && ((vec_flags
& VEC_PARTIAL
) || BYTES_BIG_ENDIAN
))
10199 sri
->icode
= CODE_FOR_aarch64_sve_reload_mem
;
10204 /* If we have to disable direct literal pool loads and stores because the
10205 function is too big, then we need a scratch register. */
10206 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
10207 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
10208 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
10209 && !aarch64_pcrelative_literal_loads
)
10211 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
10215 /* Without the TARGET_SIMD instructions we cannot move a Q register
10216 to a Q register directly. We need a scratch. */
10217 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
10218 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
10219 && reg_class_subset_p (rclass
, FP_REGS
))
10221 sri
->icode
= code_for_aarch64_reload_mov (mode
);
10225 /* A TFmode or TImode memory access should be handled via an FP_REGS
10226 because AArch64 has richer addressing modes for LDR/STR instructions
10227 than LDP/STP instructions. */
10228 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
10229 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
10232 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
10233 return GENERAL_REGS
;
10239 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED
, const int to
)
10241 gcc_assert (from
== ARG_POINTER_REGNUM
|| from
== FRAME_POINTER_REGNUM
);
10243 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10244 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
10245 if (frame_pointer_needed
)
10246 return to
== HARD_FRAME_POINTER_REGNUM
;
10251 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
10253 if (to
== HARD_FRAME_POINTER_REGNUM
)
10255 if (from
== ARG_POINTER_REGNUM
)
10256 return cfun
->machine
->frame
.hard_fp_offset
;
10258 if (from
== FRAME_POINTER_REGNUM
)
10259 return cfun
->machine
->frame
.hard_fp_offset
10260 - cfun
->machine
->frame
.locals_offset
;
10263 if (to
== STACK_POINTER_REGNUM
)
10265 if (from
== FRAME_POINTER_REGNUM
)
10266 return cfun
->machine
->frame
.frame_size
10267 - cfun
->machine
->frame
.locals_offset
;
10270 return cfun
->machine
->frame
.frame_size
;
10273 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
10277 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
10281 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
10286 aarch64_asm_trampoline_template (FILE *f
)
10291 if (aarch64_bti_enabled ())
10293 asm_fprintf (f
, "\thint\t34 // bti c\n");
10300 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", IP1_REGNUM
- R0_REGNUM
, offset1
);
10301 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
,
10306 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[IP1_REGNUM
], offset1
);
10307 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[STATIC_CHAIN_REGNUM
],
10310 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
10312 /* The trampoline needs an extra padding instruction. In case if BTI is
10313 enabled the padding instruction is replaced by the BTI instruction at
10315 if (!aarch64_bti_enabled ())
10316 assemble_aligned_integer (4, const0_rtx
);
10318 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
10319 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
10323 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
10325 rtx fnaddr
, mem
, a_tramp
;
10326 const int tramp_code_sz
= 16;
10328 /* Don't need to copy the trailing D-words, we fill those in below. */
10329 emit_block_move (m_tramp
, assemble_trampoline_template (),
10330 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
10331 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
10332 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
10333 if (GET_MODE (fnaddr
) != ptr_mode
)
10334 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
10335 emit_move_insn (mem
, fnaddr
);
10337 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
10338 emit_move_insn (mem
, chain_value
);
10340 /* XXX We should really define a "clear_cache" pattern and use
10341 gen_clear_cache(). */
10342 a_tramp
= XEXP (m_tramp
, 0);
10343 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
10344 LCT_NORMAL
, VOIDmode
, a_tramp
, ptr_mode
,
10345 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
10349 static unsigned char
10350 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
10352 /* ??? Logically we should only need to provide a value when
10353 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10354 can hold MODE, but at the moment we need to handle all modes.
10355 Just ignore any runtime parts for registers that can't store them. */
10356 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
10357 unsigned int nregs
, vec_flags
;
10360 case TAILCALL_ADDR_REGS
:
10364 case POINTER_AND_FP_REGS
:
10368 vec_flags
= aarch64_classify_vector_mode (mode
);
10369 if ((vec_flags
& VEC_SVE_DATA
)
10370 && constant_multiple_p (GET_MODE_SIZE (mode
),
10371 aarch64_vl_bytes (mode
, vec_flags
), &nregs
))
10373 return (vec_flags
& VEC_ADVSIMD
10374 ? CEIL (lowest_size
, UNITS_PER_VREG
)
10375 : CEIL (lowest_size
, UNITS_PER_WORD
));
10381 case PR_AND_FFR_REGS
:
10390 gcc_unreachable ();
10394 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
10396 if (regclass
== POINTER_REGS
)
10397 return GENERAL_REGS
;
10399 if (regclass
== STACK_REG
)
10402 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
10408 /* Register eliminiation can result in a request for
10409 SP+constant->FP_REGS. We cannot support such operations which
10410 use SP as source and an FP_REG as destination, so reject out
10412 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
10414 rtx lhs
= XEXP (x
, 0);
10416 /* Look through a possible SUBREG introduced by ILP32. */
10417 if (GET_CODE (lhs
) == SUBREG
)
10418 lhs
= SUBREG_REG (lhs
);
10420 gcc_assert (REG_P (lhs
));
10421 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
10430 aarch64_asm_output_labelref (FILE* f
, const char *name
)
10432 asm_fprintf (f
, "%U%s", name
);
10436 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
10438 if (priority
== DEFAULT_INIT_PRIORITY
)
10439 default_ctor_section_asm_out_constructor (symbol
, priority
);
10443 /* While priority is known to be in range [0, 65535], so 18 bytes
10444 would be enough, the compiler might not know that. To avoid
10445 -Wformat-truncation false positive, use a larger size. */
10447 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
10448 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
10449 switch_to_section (s
);
10450 assemble_align (POINTER_SIZE
);
10451 assemble_aligned_integer (POINTER_BYTES
, symbol
);
10456 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
10458 if (priority
== DEFAULT_INIT_PRIORITY
)
10459 default_dtor_section_asm_out_destructor (symbol
, priority
);
10463 /* While priority is known to be in range [0, 65535], so 18 bytes
10464 would be enough, the compiler might not know that. To avoid
10465 -Wformat-truncation false positive, use a larger size. */
10467 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
10468 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
10469 switch_to_section (s
);
10470 assemble_align (POINTER_SIZE
);
10471 assemble_aligned_integer (POINTER_BYTES
, symbol
);
10476 aarch64_output_casesi (rtx
*operands
)
10480 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
10482 static const char *const patterns
[4][2] =
10485 "ldrb\t%w3, [%0,%w1,uxtw]",
10486 "add\t%3, %4, %w3, sxtb #2"
10489 "ldrh\t%w3, [%0,%w1,uxtw #1]",
10490 "add\t%3, %4, %w3, sxth #2"
10493 "ldr\t%w3, [%0,%w1,uxtw #2]",
10494 "add\t%3, %4, %w3, sxtw #2"
10496 /* We assume that DImode is only generated when not optimizing and
10497 that we don't really need 64-bit address offsets. That would
10498 imply an object file with 8GB of code in a single function! */
10500 "ldr\t%w3, [%0,%w1,uxtw #2]",
10501 "add\t%3, %4, %w3, sxtw #2"
10505 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
10507 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
10508 index
= exact_log2 (GET_MODE_SIZE (mode
));
10510 gcc_assert (index
>= 0 && index
<= 3);
10512 /* Need to implement table size reduction, by chaning the code below. */
10513 output_asm_insn (patterns
[index
][0], operands
);
10514 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
10515 snprintf (buf
, sizeof (buf
),
10516 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
10517 output_asm_insn (buf
, operands
);
10518 output_asm_insn (patterns
[index
][1], operands
);
10519 output_asm_insn ("br\t%3", operands
);
10520 assemble_label (asm_out_file
, label
);
10525 /* Return size in bits of an arithmetic operand which is shifted/scaled and
10526 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
10530 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
10532 if (shift
>= 0 && shift
<= 3)
10535 for (size
= 8; size
<= 32; size
*= 2)
10537 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
10538 if (mask
== bits
<< shift
)
10545 /* Constant pools are per function only when PC relative
10546 literal loads are true or we are in the large memory
10550 aarch64_can_use_per_function_literal_pools_p (void)
10552 return (aarch64_pcrelative_literal_loads
10553 || aarch64_cmodel
== AARCH64_CMODEL_LARGE
);
10557 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
10559 /* We can't use blocks for constants when we're using a per-function
10561 return !aarch64_can_use_per_function_literal_pools_p ();
10564 /* Select appropriate section for constants depending
10565 on where we place literal pools. */
10568 aarch64_select_rtx_section (machine_mode mode
,
10570 unsigned HOST_WIDE_INT align
)
10572 if (aarch64_can_use_per_function_literal_pools_p ())
10573 return function_section (current_function_decl
);
10575 return default_elf_select_rtx_section (mode
, x
, align
);
10578 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
10580 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
10581 HOST_WIDE_INT offset
)
10583 /* When using per-function literal pools, we must ensure that any code
10584 section is aligned to the minimal instruction length, lest we get
10585 errors from the assembler re "unaligned instructions". */
10586 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
10587 ASM_OUTPUT_ALIGN (f
, 2);
10592 /* Helper function for rtx cost calculation. Strip a shift expression
10593 from X. Returns the inner operand if successful, or the original
10594 expression on failure. */
10596 aarch64_strip_shift (rtx x
)
10600 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
10601 we can convert both to ROR during final output. */
10602 if ((GET_CODE (op
) == ASHIFT
10603 || GET_CODE (op
) == ASHIFTRT
10604 || GET_CODE (op
) == LSHIFTRT
10605 || GET_CODE (op
) == ROTATERT
10606 || GET_CODE (op
) == ROTATE
)
10607 && CONST_INT_P (XEXP (op
, 1)))
10608 return XEXP (op
, 0);
10610 if (GET_CODE (op
) == MULT
10611 && CONST_INT_P (XEXP (op
, 1))
10612 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
10613 return XEXP (op
, 0);
10618 /* Helper function for rtx cost calculation. Strip an extend
10619 expression from X. Returns the inner operand if successful, or the
10620 original expression on failure. We deal with a number of possible
10621 canonicalization variations here. If STRIP_SHIFT is true, then
10622 we can strip off a shift also. */
10624 aarch64_strip_extend (rtx x
, bool strip_shift
)
10626 scalar_int_mode mode
;
10629 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
10632 /* Zero and sign extraction of a widened value. */
10633 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
10634 && XEXP (op
, 2) == const0_rtx
10635 && GET_CODE (XEXP (op
, 0)) == MULT
10636 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
10638 return XEXP (XEXP (op
, 0), 0);
10640 /* It can also be represented (for zero-extend) as an AND with an
10642 if (GET_CODE (op
) == AND
10643 && GET_CODE (XEXP (op
, 0)) == MULT
10644 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
10645 && CONST_INT_P (XEXP (op
, 1))
10646 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
10647 INTVAL (XEXP (op
, 1))) != 0)
10648 return XEXP (XEXP (op
, 0), 0);
10650 /* Now handle extended register, as this may also have an optional
10651 left shift by 1..4. */
10653 && GET_CODE (op
) == ASHIFT
10654 && CONST_INT_P (XEXP (op
, 1))
10655 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
10658 if (GET_CODE (op
) == ZERO_EXTEND
10659 || GET_CODE (op
) == SIGN_EXTEND
)
10668 /* Return true iff CODE is a shift supported in combination
10669 with arithmetic instructions. */
10672 aarch64_shift_p (enum rtx_code code
)
10674 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
10678 /* Return true iff X is a cheap shift without a sign extend. */
10681 aarch64_cheap_mult_shift_p (rtx x
)
10688 if (!(aarch64_tune_params
.extra_tuning_flags
10689 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
10692 if (GET_CODE (op0
) == SIGN_EXTEND
)
10695 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
10696 && UINTVAL (op1
) <= 4)
10699 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
10702 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
10704 if (l2
> 0 && l2
<= 4)
10710 /* Helper function for rtx cost calculation. Calculate the cost of
10711 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
10712 Return the calculated cost of the expression, recursing manually in to
10713 operands where needed. */
10716 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
10719 const struct cpu_cost_table
*extra_cost
10720 = aarch64_tune_params
.insn_extra_cost
;
10722 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
10723 machine_mode mode
= GET_MODE (x
);
10725 gcc_checking_assert (code
== MULT
);
10730 if (VECTOR_MODE_P (mode
))
10731 mode
= GET_MODE_INNER (mode
);
10733 /* Integer multiply/fma. */
10734 if (GET_MODE_CLASS (mode
) == MODE_INT
)
10736 /* The multiply will be canonicalized as a shift, cost it as such. */
10737 if (aarch64_shift_p (GET_CODE (x
))
10738 || (CONST_INT_P (op1
)
10739 && exact_log2 (INTVAL (op1
)) > 0))
10741 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
10742 || GET_CODE (op0
) == SIGN_EXTEND
;
10747 /* If the shift is considered cheap,
10748 then don't add any cost. */
10749 if (aarch64_cheap_mult_shift_p (x
))
10751 else if (REG_P (op1
))
10752 /* ARITH + shift-by-register. */
10753 cost
+= extra_cost
->alu
.arith_shift_reg
;
10754 else if (is_extend
)
10755 /* ARITH + extended register. We don't have a cost field
10756 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
10757 cost
+= extra_cost
->alu
.extend_arith
;
10759 /* ARITH + shift-by-immediate. */
10760 cost
+= extra_cost
->alu
.arith_shift
;
10763 /* LSL (immediate). */
10764 cost
+= extra_cost
->alu
.shift
;
10767 /* Strip extends as we will have costed them in the case above. */
10769 op0
= aarch64_strip_extend (op0
, true);
10771 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
10776 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
10777 compound and let the below cases handle it. After all, MNEG is a
10778 special-case alias of MSUB. */
10779 if (GET_CODE (op0
) == NEG
)
10781 op0
= XEXP (op0
, 0);
10785 /* Integer multiplies or FMAs have zero/sign extending variants. */
10786 if ((GET_CODE (op0
) == ZERO_EXTEND
10787 && GET_CODE (op1
) == ZERO_EXTEND
)
10788 || (GET_CODE (op0
) == SIGN_EXTEND
10789 && GET_CODE (op1
) == SIGN_EXTEND
))
10791 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
10792 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
10797 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
10798 cost
+= extra_cost
->mult
[0].extend_add
;
10800 /* MUL/SMULL/UMULL. */
10801 cost
+= extra_cost
->mult
[0].extend
;
10807 /* This is either an integer multiply or a MADD. In both cases
10808 we want to recurse and cost the operands. */
10809 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
10810 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
10816 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
10819 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
10828 /* Floating-point FMA/FMUL can also support negations of the
10829 operands, unless the rounding mode is upward or downward in
10830 which case FNMUL is different than FMUL with operand negation. */
10831 bool neg0
= GET_CODE (op0
) == NEG
;
10832 bool neg1
= GET_CODE (op1
) == NEG
;
10833 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
10836 op0
= XEXP (op0
, 0);
10838 op1
= XEXP (op1
, 0);
10842 /* FMADD/FNMADD/FNMSUB/FMSUB. */
10843 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
10846 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
10849 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
10850 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
10856 aarch64_address_cost (rtx x
,
10858 addr_space_t as ATTRIBUTE_UNUSED
,
10861 enum rtx_code c
= GET_CODE (x
);
10862 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
10863 struct aarch64_address_info info
;
10867 if (!aarch64_classify_address (&info
, x
, mode
, false))
10869 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
10871 /* This is a CONST or SYMBOL ref which will be split
10872 in a different way depending on the code model in use.
10873 Cost it through the generic infrastructure. */
10874 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
10875 /* Divide through by the cost of one instruction to
10876 bring it to the same units as the address costs. */
10877 cost_symbol_ref
/= COSTS_N_INSNS (1);
10878 /* The cost is then the cost of preparing the address,
10879 followed by an immediate (possibly 0) offset. */
10880 return cost_symbol_ref
+ addr_cost
->imm_offset
;
10884 /* This is most likely a jump table from a case
10886 return addr_cost
->register_offset
;
10892 case ADDRESS_LO_SUM
:
10893 case ADDRESS_SYMBOLIC
:
10894 case ADDRESS_REG_IMM
:
10895 cost
+= addr_cost
->imm_offset
;
10898 case ADDRESS_REG_WB
:
10899 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
10900 cost
+= addr_cost
->pre_modify
;
10901 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
10902 cost
+= addr_cost
->post_modify
;
10904 gcc_unreachable ();
10908 case ADDRESS_REG_REG
:
10909 cost
+= addr_cost
->register_offset
;
10912 case ADDRESS_REG_SXTW
:
10913 cost
+= addr_cost
->register_sextend
;
10916 case ADDRESS_REG_UXTW
:
10917 cost
+= addr_cost
->register_zextend
;
10921 gcc_unreachable ();
10925 if (info
.shift
> 0)
10927 /* For the sake of calculating the cost of the shifted register
10928 component, we can treat same sized modes in the same way. */
10929 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
10930 cost
+= addr_cost
->addr_scale_costs
.hi
;
10931 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
10932 cost
+= addr_cost
->addr_scale_costs
.si
;
10933 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
10934 cost
+= addr_cost
->addr_scale_costs
.di
;
10936 /* We can't tell, or this is a 128-bit vector. */
10937 cost
+= addr_cost
->addr_scale_costs
.ti
;
10943 /* Return the cost of a branch. If SPEED_P is true then the compiler is
10944 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
10948 aarch64_branch_cost (bool speed_p
, bool predictable_p
)
10950 /* When optimizing for speed, use the cost of unpredictable branches. */
10951 const struct cpu_branch_cost
*branch_costs
=
10952 aarch64_tune_params
.branch_costs
;
10954 if (!speed_p
|| predictable_p
)
10955 return branch_costs
->predictable
;
10957 return branch_costs
->unpredictable
;
10960 /* Return true if the RTX X in mode MODE is a zero or sign extract
10961 usable in an ADD or SUB (extended register) instruction. */
10963 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
10965 /* Catch add with a sign extract.
10966 This is add_<optab><mode>_multp2. */
10967 if (GET_CODE (x
) == SIGN_EXTRACT
10968 || GET_CODE (x
) == ZERO_EXTRACT
)
10970 rtx op0
= XEXP (x
, 0);
10971 rtx op1
= XEXP (x
, 1);
10972 rtx op2
= XEXP (x
, 2);
10974 if (GET_CODE (op0
) == MULT
10975 && CONST_INT_P (op1
)
10976 && op2
== const0_rtx
10977 && CONST_INT_P (XEXP (op0
, 1))
10978 && aarch64_is_extend_from_extract (mode
,
10985 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10987 else if (GET_CODE (x
) == SIGN_EXTEND
10988 || GET_CODE (x
) == ZERO_EXTEND
)
10989 return REG_P (XEXP (x
, 0));
10995 aarch64_frint_unspec_p (unsigned int u
)
10999 case UNSPEC_FRINTZ
:
11000 case UNSPEC_FRINTP
:
11001 case UNSPEC_FRINTM
:
11002 case UNSPEC_FRINTA
:
11003 case UNSPEC_FRINTN
:
11004 case UNSPEC_FRINTX
:
11005 case UNSPEC_FRINTI
:
11013 /* Return true iff X is an rtx that will match an extr instruction
11014 i.e. as described in the *extr<mode>5_insn family of patterns.
11015 OP0 and OP1 will be set to the operands of the shifts involved
11016 on success and will be NULL_RTX otherwise. */
11019 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
11022 scalar_int_mode mode
;
11023 if (!is_a
<scalar_int_mode
> (GET_MODE (x
), &mode
))
11026 *res_op0
= NULL_RTX
;
11027 *res_op1
= NULL_RTX
;
11029 if (GET_CODE (x
) != IOR
)
11035 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
11036 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
11038 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
11039 if (GET_CODE (op1
) == ASHIFT
)
11040 std::swap (op0
, op1
);
11042 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
11045 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
11046 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
11048 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
11049 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
11051 *res_op0
= XEXP (op0
, 0);
11052 *res_op1
= XEXP (op1
, 0);
11060 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11061 storing it in *COST. Result is true if the total cost of the operation
11062 has now been calculated. */
11064 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
11068 enum rtx_code cmpcode
;
11069 const struct cpu_cost_table
*extra_cost
11070 = aarch64_tune_params
.insn_extra_cost
;
11072 if (COMPARISON_P (op0
))
11074 inner
= XEXP (op0
, 0);
11075 comparator
= XEXP (op0
, 1);
11076 cmpcode
= GET_CODE (op0
);
11081 comparator
= const0_rtx
;
11085 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
11087 /* Conditional branch. */
11088 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
11092 if (cmpcode
== NE
|| cmpcode
== EQ
)
11094 if (comparator
== const0_rtx
)
11096 /* TBZ/TBNZ/CBZ/CBNZ. */
11097 if (GET_CODE (inner
) == ZERO_EXTRACT
)
11099 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
11100 ZERO_EXTRACT
, 0, speed
);
11103 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
11107 if (register_operand (inner
, VOIDmode
)
11108 && aarch64_imm24 (comparator
, VOIDmode
))
11110 /* SUB and SUBS. */
11111 *cost
+= COSTS_N_INSNS (2);
11113 *cost
+= extra_cost
->alu
.arith
* 2;
11117 else if (cmpcode
== LT
|| cmpcode
== GE
)
11120 if (comparator
== const0_rtx
)
11125 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
11128 if (GET_CODE (op1
) == COMPARE
)
11130 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
11131 if (XEXP (op1
, 1) == const0_rtx
)
11135 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
11136 const struct cpu_cost_table
*extra_cost
11137 = aarch64_tune_params
.insn_extra_cost
;
11139 if (GET_MODE_CLASS (mode
) == MODE_INT
)
11140 *cost
+= extra_cost
->alu
.arith
;
11142 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
11147 /* It's a conditional operation based on the status flags,
11148 so it must be some flavor of CSEL. */
11150 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
11151 if (GET_CODE (op1
) == NEG
11152 || GET_CODE (op1
) == NOT
11153 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
11154 op1
= XEXP (op1
, 0);
11155 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
11157 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
11158 op1
= XEXP (op1
, 0);
11159 op2
= XEXP (op2
, 0);
11162 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
11163 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
11167 /* We don't know what this is, cost all operands. */
11171 /* Check whether X is a bitfield operation of the form shift + extend that
11172 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
11173 operand to which the bitfield operation is applied. Otherwise return
11177 aarch64_extend_bitfield_pattern_p (rtx x
)
11179 rtx_code outer_code
= GET_CODE (x
);
11180 machine_mode outer_mode
= GET_MODE (x
);
11182 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
11183 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
11186 rtx inner
= XEXP (x
, 0);
11187 rtx_code inner_code
= GET_CODE (inner
);
11188 machine_mode inner_mode
= GET_MODE (inner
);
11191 switch (inner_code
)
11194 if (CONST_INT_P (XEXP (inner
, 1))
11195 && (inner_mode
== QImode
|| inner_mode
== HImode
))
11196 op
= XEXP (inner
, 0);
11199 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
11200 && (inner_mode
== QImode
|| inner_mode
== HImode
))
11201 op
= XEXP (inner
, 0);
11204 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
11205 && (inner_mode
== QImode
|| inner_mode
== HImode
))
11206 op
= XEXP (inner
, 0);
11215 /* Return true if the mask and a shift amount from an RTX of the form
11216 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
11217 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
11220 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode
, rtx mask
,
11223 return CONST_INT_P (mask
) && CONST_INT_P (shft_amnt
)
11224 && INTVAL (shft_amnt
) < GET_MODE_BITSIZE (mode
)
11225 && exact_log2 ((INTVAL (mask
) >> INTVAL (shft_amnt
)) + 1) >= 0
11227 & ((HOST_WIDE_INT_1U
<< INTVAL (shft_amnt
)) - 1)) == 0;
11230 /* Return true if the masks and a shift amount from an RTX of the form
11231 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
11232 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
11235 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode
,
11236 unsigned HOST_WIDE_INT mask1
,
11237 unsigned HOST_WIDE_INT shft_amnt
,
11238 unsigned HOST_WIDE_INT mask2
)
11240 unsigned HOST_WIDE_INT t
;
11242 /* Verify that there is no overlap in what bits are set in the two masks. */
11243 if (mask1
!= ~mask2
)
11246 /* Verify that mask2 is not all zeros or ones. */
11247 if (mask2
== 0 || mask2
== HOST_WIDE_INT_M1U
)
11250 /* The shift amount should always be less than the mode size. */
11251 gcc_assert (shft_amnt
< GET_MODE_BITSIZE (mode
));
11253 /* Verify that the mask being shifted is contiguous and would be in the
11254 least significant bits after shifting by shft_amnt. */
11255 t
= mask2
+ (HOST_WIDE_INT_1U
<< shft_amnt
);
11256 return (t
== (t
& -t
));
11259 /* Calculate the cost of calculating X, storing it in *COST. Result
11260 is true if the total cost of the operation has now been calculated. */
11262 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
11263 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
11266 const struct cpu_cost_table
*extra_cost
11267 = aarch64_tune_params
.insn_extra_cost
;
11268 int code
= GET_CODE (x
);
11269 scalar_int_mode int_mode
;
11271 /* By default, assume that everything has equivalent cost to the
11272 cheapest instruction. Any additional costs are applied as a delta
11273 above this default. */
11274 *cost
= COSTS_N_INSNS (1);
11279 /* The cost depends entirely on the operands to SET. */
11281 op0
= SET_DEST (x
);
11284 switch (GET_CODE (op0
))
11289 rtx address
= XEXP (op0
, 0);
11290 if (VECTOR_MODE_P (mode
))
11291 *cost
+= extra_cost
->ldst
.storev
;
11292 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11293 *cost
+= extra_cost
->ldst
.store
;
11294 else if (mode
== SFmode
)
11295 *cost
+= extra_cost
->ldst
.storef
;
11296 else if (mode
== DFmode
)
11297 *cost
+= extra_cost
->ldst
.stored
;
11300 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
11304 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
11308 if (! REG_P (SUBREG_REG (op0
)))
11309 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
11311 /* Fall through. */
11313 /* The cost is one per vector-register copied. */
11314 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
11316 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
11317 *cost
= COSTS_N_INSNS (nregs
);
11319 /* const0_rtx is in general free, but we will use an
11320 instruction to set a register to 0. */
11321 else if (REG_P (op1
) || op1
== const0_rtx
)
11323 /* The cost is 1 per register copied. */
11324 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
11325 *cost
= COSTS_N_INSNS (nregs
);
11328 /* Cost is just the cost of the RHS of the set. */
11329 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
11334 /* Bit-field insertion. Strip any redundant widening of
11335 the RHS to meet the width of the target. */
11336 if (GET_CODE (op1
) == SUBREG
)
11337 op1
= SUBREG_REG (op1
);
11338 if ((GET_CODE (op1
) == ZERO_EXTEND
11339 || GET_CODE (op1
) == SIGN_EXTEND
)
11340 && CONST_INT_P (XEXP (op0
, 1))
11341 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
11342 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
11343 op1
= XEXP (op1
, 0);
11345 if (CONST_INT_P (op1
))
11347 /* MOV immediate is assumed to always be cheap. */
11348 *cost
= COSTS_N_INSNS (1);
11354 *cost
+= extra_cost
->alu
.bfi
;
11355 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
11361 /* We can't make sense of this, assume default cost. */
11362 *cost
= COSTS_N_INSNS (1);
11368 /* If an instruction can incorporate a constant within the
11369 instruction, the instruction's expression avoids calling
11370 rtx_cost() on the constant. If rtx_cost() is called on a
11371 constant, then it is usually because the constant must be
11372 moved into a register by one or more instructions.
11374 The exception is constant 0, which can be expressed
11375 as XZR/WZR and is therefore free. The exception to this is
11376 if we have (set (reg) (const0_rtx)) in which case we must cost
11377 the move. However, we can catch that when we cost the SET, so
11378 we don't need to consider that here. */
11379 if (x
== const0_rtx
)
11383 /* To an approximation, building any other constant is
11384 proportionally expensive to the number of instructions
11385 required to build that constant. This is true whether we
11386 are compiling for SPEED or otherwise. */
11387 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
11388 int_mode
= word_mode
;
11389 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
11390 (NULL_RTX
, x
, false, int_mode
));
11396 /* First determine number of instructions to do the move
11397 as an integer constant. */
11398 if (!aarch64_float_const_representable_p (x
)
11399 && !aarch64_can_const_movi_rtx_p (x
, mode
)
11400 && aarch64_float_const_rtx_p (x
))
11402 unsigned HOST_WIDE_INT ival
;
11403 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
11404 gcc_assert (succeed
);
11406 scalar_int_mode imode
= (mode
== HFmode
11408 : int_mode_for_mode (mode
).require ());
11409 int ncost
= aarch64_internal_mov_immediate
11410 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
11411 *cost
+= COSTS_N_INSNS (ncost
);
11417 /* mov[df,sf]_aarch64. */
11418 if (aarch64_float_const_representable_p (x
))
11419 /* FMOV (scalar immediate). */
11420 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
11421 else if (!aarch64_float_const_zero_rtx_p (x
))
11423 /* This will be a load from memory. */
11424 if (mode
== DFmode
)
11425 *cost
+= extra_cost
->ldst
.loadd
;
11427 *cost
+= extra_cost
->ldst
.loadf
;
11430 /* Otherwise this is +0.0. We get this using MOVI d0, #0
11431 or MOV v0.s[0], wzr - neither of which are modeled by the
11432 cost tables. Just use the default cost. */
11442 /* For loads we want the base cost of a load, plus an
11443 approximation for the additional cost of the addressing
11445 rtx address
= XEXP (x
, 0);
11446 if (VECTOR_MODE_P (mode
))
11447 *cost
+= extra_cost
->ldst
.loadv
;
11448 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11449 *cost
+= extra_cost
->ldst
.load
;
11450 else if (mode
== SFmode
)
11451 *cost
+= extra_cost
->ldst
.loadf
;
11452 else if (mode
== DFmode
)
11453 *cost
+= extra_cost
->ldst
.loadd
;
11456 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
11465 if (VECTOR_MODE_P (mode
))
11470 *cost
+= extra_cost
->vect
.alu
;
11475 if (GET_MODE_CLASS (mode
) == MODE_INT
)
11477 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
11478 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
11481 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
11485 /* Cost this as SUB wzr, X. */
11486 op0
= CONST0_RTX (mode
);
11491 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11493 /* Support (neg(fma...)) as a single instruction only if
11494 sign of zeros is unimportant. This matches the decision
11495 making in aarch64.md. */
11496 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
11499 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
11502 if (GET_CODE (op0
) == MULT
)
11505 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
11510 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
11520 if (VECTOR_MODE_P (mode
))
11521 *cost
+= extra_cost
->vect
.alu
;
11523 *cost
+= extra_cost
->alu
.clz
;
11529 *cost
= COSTS_N_INSNS (2);
11532 *cost
+= extra_cost
->alu
.clz
+ extra_cost
->alu
.rev
;
11539 if (op1
== const0_rtx
11540 && GET_CODE (op0
) == AND
)
11543 mode
= GET_MODE (op0
);
11547 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
11549 /* TODO: A write to the CC flags possibly costs extra, this
11550 needs encoding in the cost tables. */
11552 mode
= GET_MODE (op0
);
11554 if (GET_CODE (op0
) == AND
)
11560 if (GET_CODE (op0
) == PLUS
)
11562 /* ADDS (and CMN alias). */
11567 if (GET_CODE (op0
) == MINUS
)
11574 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
11575 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
11576 && CONST_INT_P (XEXP (op0
, 2)))
11578 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
11579 Handle it here directly rather than going to cost_logic
11580 since we know the immediate generated for the TST is valid
11581 so we can avoid creating an intermediate rtx for it only
11582 for costing purposes. */
11584 *cost
+= extra_cost
->alu
.logical
;
11586 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
11587 ZERO_EXTRACT
, 0, speed
);
11591 if (GET_CODE (op1
) == NEG
)
11595 *cost
+= extra_cost
->alu
.arith
;
11597 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
11598 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
11604 Compare can freely swap the order of operands, and
11605 canonicalization puts the more complex operation first.
11606 But the integer MINUS logic expects the shift/extend
11607 operation in op1. */
11609 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
11617 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
11621 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
11623 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
11625 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
11626 /* FCMP supports constant 0.0 for no extra cost. */
11632 if (VECTOR_MODE_P (mode
))
11634 /* Vector compare. */
11636 *cost
+= extra_cost
->vect
.alu
;
11638 if (aarch64_float_const_zero_rtx_p (op1
))
11640 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
11654 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
11656 /* Detect valid immediates. */
11657 if ((GET_MODE_CLASS (mode
) == MODE_INT
11658 || (GET_MODE_CLASS (mode
) == MODE_CC
11659 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
11660 && CONST_INT_P (op1
)
11661 && aarch64_uimm12_shift (INTVAL (op1
)))
11664 /* SUB(S) (immediate). */
11665 *cost
+= extra_cost
->alu
.arith
;
11669 /* Look for SUB (extended register). */
11670 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
11671 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
11674 *cost
+= extra_cost
->alu
.extend_arith
;
11676 op1
= aarch64_strip_extend (op1
, true);
11677 *cost
+= rtx_cost (op1
, VOIDmode
,
11678 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
11682 rtx new_op1
= aarch64_strip_extend (op1
, false);
11684 /* Cost this as an FMA-alike operation. */
11685 if ((GET_CODE (new_op1
) == MULT
11686 || aarch64_shift_p (GET_CODE (new_op1
)))
11687 && code
!= COMPARE
)
11689 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
11690 (enum rtx_code
) code
,
11695 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
11699 if (VECTOR_MODE_P (mode
))
11702 *cost
+= extra_cost
->vect
.alu
;
11704 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11707 *cost
+= extra_cost
->alu
.arith
;
11709 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11712 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11726 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
11727 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
11730 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
11731 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
11735 if (GET_MODE_CLASS (mode
) == MODE_INT
11736 && (aarch64_plus_immediate (op1
, mode
)
11737 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
11739 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
11742 /* ADD (immediate). */
11743 *cost
+= extra_cost
->alu
.arith
;
11747 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
11749 /* Look for ADD (extended register). */
11750 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
11751 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
11754 *cost
+= extra_cost
->alu
.extend_arith
;
11756 op0
= aarch64_strip_extend (op0
, true);
11757 *cost
+= rtx_cost (op0
, VOIDmode
,
11758 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
11762 /* Strip any extend, leave shifts behind as we will
11763 cost them through mult_cost. */
11764 new_op0
= aarch64_strip_extend (op0
, false);
11766 if (GET_CODE (new_op0
) == MULT
11767 || aarch64_shift_p (GET_CODE (new_op0
)))
11769 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
11774 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
11778 if (VECTOR_MODE_P (mode
))
11781 *cost
+= extra_cost
->vect
.alu
;
11783 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11786 *cost
+= extra_cost
->alu
.arith
;
11788 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11791 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11798 *cost
= COSTS_N_INSNS (1);
11802 if (VECTOR_MODE_P (mode
))
11803 *cost
+= extra_cost
->vect
.alu
;
11805 *cost
+= extra_cost
->alu
.rev
;
11810 if (aarch_rev16_p (x
))
11812 *cost
= COSTS_N_INSNS (1);
11816 if (VECTOR_MODE_P (mode
))
11817 *cost
+= extra_cost
->vect
.alu
;
11819 *cost
+= extra_cost
->alu
.rev
;
11824 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
11826 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
11827 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
11829 *cost
+= extra_cost
->alu
.shift
;
11833 /* Fall through. */
11840 if (VECTOR_MODE_P (mode
))
11843 *cost
+= extra_cost
->vect
.alu
;
11848 && GET_CODE (op0
) == MULT
11849 && CONST_INT_P (XEXP (op0
, 1))
11850 && CONST_INT_P (op1
)
11851 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
11852 INTVAL (op1
)) != 0)
11854 /* This is a UBFM/SBFM. */
11855 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
11857 *cost
+= extra_cost
->alu
.bfx
;
11861 if (is_int_mode (mode
, &int_mode
))
11863 if (CONST_INT_P (op1
))
11865 /* We have a mask + shift version of a UBFIZ
11866 i.e. the *andim_ashift<mode>_bfiz pattern. */
11867 if (GET_CODE (op0
) == ASHIFT
11868 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
11871 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
11872 (enum rtx_code
) code
, 0, speed
);
11874 *cost
+= extra_cost
->alu
.bfx
;
11878 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
11880 /* We possibly get the immediate for free, this is not
11882 *cost
+= rtx_cost (op0
, int_mode
,
11883 (enum rtx_code
) code
, 0, speed
);
11885 *cost
+= extra_cost
->alu
.logical
;
11894 /* Handle ORN, EON, or BIC. */
11895 if (GET_CODE (op0
) == NOT
)
11896 op0
= XEXP (op0
, 0);
11898 new_op0
= aarch64_strip_shift (op0
);
11900 /* If we had a shift on op0 then this is a logical-shift-
11901 by-register/immediate operation. Otherwise, this is just
11902 a logical operation. */
11905 if (new_op0
!= op0
)
11907 /* Shift by immediate. */
11908 if (CONST_INT_P (XEXP (op0
, 1)))
11909 *cost
+= extra_cost
->alu
.log_shift
;
11911 *cost
+= extra_cost
->alu
.log_shift_reg
;
11914 *cost
+= extra_cost
->alu
.logical
;
11917 /* In both cases we want to cost both operands. */
11918 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
11920 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
11930 op0
= aarch64_strip_shift (x
);
11932 if (VECTOR_MODE_P (mode
))
11935 *cost
+= extra_cost
->vect
.alu
;
11939 /* MVN-shifted-reg. */
11942 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
11945 *cost
+= extra_cost
->alu
.log_shift
;
11949 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
11950 Handle the second form here taking care that 'a' in the above can
11952 else if (GET_CODE (op0
) == XOR
)
11954 rtx newop0
= XEXP (op0
, 0);
11955 rtx newop1
= XEXP (op0
, 1);
11956 rtx op0_stripped
= aarch64_strip_shift (newop0
);
11958 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
11959 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
11963 if (op0_stripped
!= newop0
)
11964 *cost
+= extra_cost
->alu
.log_shift
;
11966 *cost
+= extra_cost
->alu
.logical
;
11973 *cost
+= extra_cost
->alu
.logical
;
11980 /* If a value is written in SI mode, then zero extended to DI
11981 mode, the operation will in general be free as a write to
11982 a 'w' register implicitly zeroes the upper bits of an 'x'
11983 register. However, if this is
11985 (set (reg) (zero_extend (reg)))
11987 we must cost the explicit register move. */
11989 && GET_MODE (op0
) == SImode
11992 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
11994 /* If OP_COST is non-zero, then the cost of the zero extend
11995 is effectively the cost of the inner operation. Otherwise
11996 we have a MOV instruction and we take the cost from the MOV
11997 itself. This is true independently of whether we are
11998 optimizing for space or time. */
12004 else if (MEM_P (op0
))
12006 /* All loads can zero extend to any size for free. */
12007 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
12011 op0
= aarch64_extend_bitfield_pattern_p (x
);
12014 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
12016 *cost
+= extra_cost
->alu
.bfx
;
12022 if (VECTOR_MODE_P (mode
))
12025 *cost
+= extra_cost
->vect
.alu
;
12029 /* We generate an AND instead of UXTB/UXTH. */
12030 *cost
+= extra_cost
->alu
.logical
;
12036 if (MEM_P (XEXP (x
, 0)))
12041 rtx address
= XEXP (XEXP (x
, 0), 0);
12042 *cost
+= extra_cost
->ldst
.load_sign_extend
;
12045 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
12051 op0
= aarch64_extend_bitfield_pattern_p (x
);
12054 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
12056 *cost
+= extra_cost
->alu
.bfx
;
12062 if (VECTOR_MODE_P (mode
))
12063 *cost
+= extra_cost
->vect
.alu
;
12065 *cost
+= extra_cost
->alu
.extend
;
12073 if (CONST_INT_P (op1
))
12077 if (VECTOR_MODE_P (mode
))
12079 /* Vector shift (immediate). */
12080 *cost
+= extra_cost
->vect
.alu
;
12084 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
12086 *cost
+= extra_cost
->alu
.shift
;
12090 /* We can incorporate zero/sign extend for free. */
12091 if (GET_CODE (op0
) == ZERO_EXTEND
12092 || GET_CODE (op0
) == SIGN_EXTEND
)
12093 op0
= XEXP (op0
, 0);
12095 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
12100 if (VECTOR_MODE_P (mode
))
12103 /* Vector shift (register). */
12104 *cost
+= extra_cost
->vect
.alu
;
12110 *cost
+= extra_cost
->alu
.shift_reg
;
12112 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
12113 && CONST_INT_P (XEXP (op1
, 1))
12114 && known_eq (INTVAL (XEXP (op1
, 1)),
12115 GET_MODE_BITSIZE (mode
) - 1))
12117 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
12118 /* We already demanded XEXP (op1, 0) to be REG_P, so
12119 don't recurse into it. */
12123 return false; /* All arguments need to be in registers. */
12133 if (CONST_INT_P (op1
))
12135 /* ASR (immediate) and friends. */
12138 if (VECTOR_MODE_P (mode
))
12139 *cost
+= extra_cost
->vect
.alu
;
12141 *cost
+= extra_cost
->alu
.shift
;
12144 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
12149 if (VECTOR_MODE_P (mode
))
12152 /* Vector shift (register). */
12153 *cost
+= extra_cost
->vect
.alu
;
12158 /* ASR (register) and friends. */
12159 *cost
+= extra_cost
->alu
.shift_reg
;
12161 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
12162 && CONST_INT_P (XEXP (op1
, 1))
12163 && known_eq (INTVAL (XEXP (op1
, 1)),
12164 GET_MODE_BITSIZE (mode
) - 1))
12166 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
12167 /* We already demanded XEXP (op1, 0) to be REG_P, so
12168 don't recurse into it. */
12172 return false; /* All arguments need to be in registers. */
12177 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
12178 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
12182 *cost
+= extra_cost
->ldst
.load
;
12184 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
12185 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
12187 /* ADRP, followed by ADD. */
12188 *cost
+= COSTS_N_INSNS (1);
12190 *cost
+= 2 * extra_cost
->alu
.arith
;
12192 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
12193 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
12197 *cost
+= extra_cost
->alu
.arith
;
12202 /* One extra load instruction, after accessing the GOT. */
12203 *cost
+= COSTS_N_INSNS (1);
12205 *cost
+= extra_cost
->ldst
.load
;
12211 /* ADRP/ADD (immediate). */
12213 *cost
+= extra_cost
->alu
.arith
;
12221 if (VECTOR_MODE_P (mode
))
12222 *cost
+= extra_cost
->vect
.alu
;
12224 *cost
+= extra_cost
->alu
.bfx
;
12227 /* We can trust that the immediates used will be correct (there
12228 are no by-register forms), so we need only cost op0. */
12229 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
12233 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
12234 /* aarch64_rtx_mult_cost always handles recursion to its
12239 /* We can expand signed mod by power of 2 using a NEGS, two parallel
12240 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
12241 an unconditional negate. This case should only ever be reached through
12242 the set_smod_pow2_cheap check in expmed.c. */
12243 if (CONST_INT_P (XEXP (x
, 1))
12244 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
12245 && (mode
== SImode
|| mode
== DImode
))
12247 /* We expand to 4 instructions. Reset the baseline. */
12248 *cost
= COSTS_N_INSNS (4);
12251 *cost
+= 2 * extra_cost
->alu
.logical
12252 + 2 * extra_cost
->alu
.arith
;
12257 /* Fall-through. */
12261 /* Slighly prefer UMOD over SMOD. */
12262 if (VECTOR_MODE_P (mode
))
12263 *cost
+= extra_cost
->vect
.alu
;
12264 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
12265 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
12266 + extra_cost
->mult
[mode
== DImode
].idiv
12267 + (code
== MOD
? 1 : 0));
12269 return false; /* All arguments need to be in registers. */
12276 if (VECTOR_MODE_P (mode
))
12277 *cost
+= extra_cost
->vect
.alu
;
12278 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
12279 /* There is no integer SQRT, so only DIV and UDIV can get
12281 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
12282 /* Slighly prefer UDIV over SDIV. */
12283 + (code
== DIV
? 1 : 0));
12285 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
12287 return false; /* All arguments need to be in registers. */
12290 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
12291 XEXP (x
, 2), cost
, speed
);
12304 return false; /* All arguments must be in registers. */
12313 if (VECTOR_MODE_P (mode
))
12314 *cost
+= extra_cost
->vect
.alu
;
12316 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
12319 /* FMSUB, FNMADD, and FNMSUB are free. */
12320 if (GET_CODE (op0
) == NEG
)
12321 op0
= XEXP (op0
, 0);
12323 if (GET_CODE (op2
) == NEG
)
12324 op2
= XEXP (op2
, 0);
12326 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
12327 and the by-element operand as operand 0. */
12328 if (GET_CODE (op1
) == NEG
)
12329 op1
= XEXP (op1
, 0);
12331 /* Catch vector-by-element operations. The by-element operand can
12332 either be (vec_duplicate (vec_select (x))) or just
12333 (vec_select (x)), depending on whether we are multiplying by
12334 a vector or a scalar.
12336 Canonicalization is not very good in these cases, FMA4 will put the
12337 by-element operand as operand 0, FNMA4 will have it as operand 1. */
12338 if (GET_CODE (op0
) == VEC_DUPLICATE
)
12339 op0
= XEXP (op0
, 0);
12340 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
12341 op1
= XEXP (op1
, 0);
12343 if (GET_CODE (op0
) == VEC_SELECT
)
12344 op0
= XEXP (op0
, 0);
12345 else if (GET_CODE (op1
) == VEC_SELECT
)
12346 op1
= XEXP (op1
, 0);
12348 /* If the remaining parameters are not registers,
12349 get the cost to put them into registers. */
12350 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
12351 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
12352 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
12356 case UNSIGNED_FLOAT
:
12358 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
12364 if (VECTOR_MODE_P (mode
))
12366 /*Vector truncate. */
12367 *cost
+= extra_cost
->vect
.alu
;
12370 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
12374 case FLOAT_TRUNCATE
:
12377 if (VECTOR_MODE_P (mode
))
12379 /*Vector conversion. */
12380 *cost
+= extra_cost
->vect
.alu
;
12383 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
12390 /* Strip the rounding part. They will all be implemented
12391 by the fcvt* family of instructions anyway. */
12392 if (GET_CODE (x
) == UNSPEC
)
12394 unsigned int uns_code
= XINT (x
, 1);
12396 if (uns_code
== UNSPEC_FRINTA
12397 || uns_code
== UNSPEC_FRINTM
12398 || uns_code
== UNSPEC_FRINTN
12399 || uns_code
== UNSPEC_FRINTP
12400 || uns_code
== UNSPEC_FRINTZ
)
12401 x
= XVECEXP (x
, 0, 0);
12406 if (VECTOR_MODE_P (mode
))
12407 *cost
+= extra_cost
->vect
.alu
;
12409 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
12412 /* We can combine fmul by a power of 2 followed by a fcvt into a single
12413 fixed-point fcvt. */
12414 if (GET_CODE (x
) == MULT
12415 && ((VECTOR_MODE_P (mode
)
12416 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
12417 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
12419 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
12424 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
12428 if (VECTOR_MODE_P (mode
))
12430 /* ABS (vector). */
12432 *cost
+= extra_cost
->vect
.alu
;
12434 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12438 /* FABD, which is analogous to FADD. */
12439 if (GET_CODE (op0
) == MINUS
)
12441 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
12442 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
12444 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
12448 /* Simple FABS is analogous to FNEG. */
12450 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
12454 /* Integer ABS will either be split to
12455 two arithmetic instructions, or will be an ABS
12456 (scalar), which we don't model. */
12457 *cost
= COSTS_N_INSNS (2);
12459 *cost
+= 2 * extra_cost
->alu
.arith
;
12467 if (VECTOR_MODE_P (mode
))
12468 *cost
+= extra_cost
->vect
.alu
;
12471 /* FMAXNM/FMINNM/FMAX/FMIN.
12472 TODO: This may not be accurate for all implementations, but
12473 we do not model this in the cost tables. */
12474 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
12480 /* The floating point round to integer frint* instructions. */
12481 if (aarch64_frint_unspec_p (XINT (x
, 1)))
12484 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
12489 if (XINT (x
, 1) == UNSPEC_RBIT
)
12492 *cost
+= extra_cost
->alu
.rev
;
12500 /* Decompose <su>muldi3_highpart. */
12501 if (/* (truncate:DI */
12504 && GET_MODE (XEXP (x
, 0)) == TImode
12505 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
12507 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
12508 /* (ANY_EXTEND:TI (reg:DI))
12509 (ANY_EXTEND:TI (reg:DI))) */
12510 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
12511 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
12512 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
12513 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
12514 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
12515 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
12516 /* (const_int 64) */
12517 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
12518 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
12522 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
12523 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
12524 mode
, MULT
, 0, speed
);
12525 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
12526 mode
, MULT
, 1, speed
);
12530 /* Fall through. */
12536 && flag_aarch64_verbose_cost
)
12537 fprintf (dump_file
,
12538 "\nFailed to cost RTX. Assuming default cost.\n");
12543 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
12544 calculated for X. This cost is stored in *COST. Returns true
12545 if the total cost of X was calculated. */
12547 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
12548 int param
, int *cost
, bool speed
)
12550 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
12553 && flag_aarch64_verbose_cost
)
12555 print_rtl_single (dump_file
, x
);
12556 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
12557 speed
? "Hot" : "Cold",
12558 *cost
, result
? "final" : "partial");
12565 aarch64_register_move_cost (machine_mode mode
,
12566 reg_class_t from_i
, reg_class_t to_i
)
12568 enum reg_class from
= (enum reg_class
) from_i
;
12569 enum reg_class to
= (enum reg_class
) to_i
;
12570 const struct cpu_regmove_cost
*regmove_cost
12571 = aarch64_tune_params
.regmove_cost
;
12573 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
12574 if (to
== TAILCALL_ADDR_REGS
|| to
== POINTER_REGS
)
12577 if (from
== TAILCALL_ADDR_REGS
|| from
== POINTER_REGS
)
12578 from
= GENERAL_REGS
;
12580 /* Make RDFFR very expensive. In particular, if we know that the FFR
12581 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
12582 as a way of obtaining a PTRUE. */
12583 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
12584 && hard_reg_set_subset_p (reg_class_contents
[from_i
],
12585 reg_class_contents
[FFR_REGS
]))
12588 /* Moving between GPR and stack cost is the same as GP2GP. */
12589 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
12590 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
12591 return regmove_cost
->GP2GP
;
12593 /* To/From the stack register, we move via the gprs. */
12594 if (to
== STACK_REG
|| from
== STACK_REG
)
12595 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
12596 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
12598 if (known_eq (GET_MODE_SIZE (mode
), 16))
12600 /* 128-bit operations on general registers require 2 instructions. */
12601 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
12602 return regmove_cost
->GP2GP
* 2;
12603 else if (from
== GENERAL_REGS
)
12604 return regmove_cost
->GP2FP
* 2;
12605 else if (to
== GENERAL_REGS
)
12606 return regmove_cost
->FP2GP
* 2;
12608 /* When AdvSIMD instructions are disabled it is not possible to move
12609 a 128-bit value directly between Q registers. This is handled in
12610 secondary reload. A general register is used as a scratch to move
12611 the upper DI value and the lower DI value is moved directly,
12612 hence the cost is the sum of three moves. */
12614 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
12616 return regmove_cost
->FP2FP
;
12619 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
12620 return regmove_cost
->GP2GP
;
12621 else if (from
== GENERAL_REGS
)
12622 return regmove_cost
->GP2FP
;
12623 else if (to
== GENERAL_REGS
)
12624 return regmove_cost
->FP2GP
;
12626 return regmove_cost
->FP2FP
;
12630 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
12631 reg_class_t rclass ATTRIBUTE_UNUSED
,
12632 bool in ATTRIBUTE_UNUSED
)
12634 return aarch64_tune_params
.memmov_cost
;
12637 /* Implement TARGET_INIT_BUILTINS. */
12639 aarch64_init_builtins ()
12641 aarch64_general_init_builtins ();
12642 aarch64_sve::init_builtins ();
12645 /* Implement TARGET_FOLD_BUILTIN. */
12647 aarch64_fold_builtin (tree fndecl
, int nargs
, tree
*args
, bool)
12649 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
12650 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
12651 tree type
= TREE_TYPE (TREE_TYPE (fndecl
));
12652 switch (code
& AARCH64_BUILTIN_CLASS
)
12654 case AARCH64_BUILTIN_GENERAL
:
12655 return aarch64_general_fold_builtin (subcode
, type
, nargs
, args
);
12657 case AARCH64_BUILTIN_SVE
:
12660 gcc_unreachable ();
12663 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
12665 aarch64_gimple_fold_builtin (gimple_stmt_iterator
*gsi
)
12667 gcall
*stmt
= as_a
<gcall
*> (gsi_stmt (*gsi
));
12668 tree fndecl
= gimple_call_fndecl (stmt
);
12669 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
12670 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
12671 gimple
*new_stmt
= NULL
;
12672 switch (code
& AARCH64_BUILTIN_CLASS
)
12674 case AARCH64_BUILTIN_GENERAL
:
12675 new_stmt
= aarch64_general_gimple_fold_builtin (subcode
, stmt
);
12678 case AARCH64_BUILTIN_SVE
:
12679 new_stmt
= aarch64_sve::gimple_fold_builtin (subcode
, gsi
, stmt
);
12686 gsi_replace (gsi
, new_stmt
, true);
12690 /* Implement TARGET_EXPAND_BUILTIN. */
12692 aarch64_expand_builtin (tree exp
, rtx target
, rtx
, machine_mode
, int ignore
)
12694 tree fndecl
= TREE_OPERAND (CALL_EXPR_FN (exp
), 0);
12695 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
12696 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
12697 switch (code
& AARCH64_BUILTIN_CLASS
)
12699 case AARCH64_BUILTIN_GENERAL
:
12700 return aarch64_general_expand_builtin (subcode
, exp
, target
, ignore
);
12702 case AARCH64_BUILTIN_SVE
:
12703 return aarch64_sve::expand_builtin (subcode
, exp
, target
);
12705 gcc_unreachable ();
12708 /* Implement TARGET_BUILTIN_DECL. */
12710 aarch64_builtin_decl (unsigned int code
, bool initialize_p
)
12712 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
12713 switch (code
& AARCH64_BUILTIN_CLASS
)
12715 case AARCH64_BUILTIN_GENERAL
:
12716 return aarch64_general_builtin_decl (subcode
, initialize_p
);
12718 case AARCH64_BUILTIN_SVE
:
12719 return aarch64_sve::builtin_decl (subcode
, initialize_p
);
12721 gcc_unreachable ();
12724 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
12725 to optimize 1.0/sqrt. */
12728 use_rsqrt_p (machine_mode mode
)
12730 return (!flag_trapping_math
12731 && flag_unsafe_math_optimizations
12732 && ((aarch64_tune_params
.approx_modes
->recip_sqrt
12733 & AARCH64_APPROX_MODE (mode
))
12734 || flag_mrecip_low_precision_sqrt
));
12737 /* Function to decide when to use the approximate reciprocal square root
12741 aarch64_builtin_reciprocal (tree fndecl
)
12743 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
12745 if (!use_rsqrt_p (mode
))
12747 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
12748 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
12749 switch (code
& AARCH64_BUILTIN_CLASS
)
12751 case AARCH64_BUILTIN_GENERAL
:
12752 return aarch64_general_builtin_rsqrt (subcode
);
12754 case AARCH64_BUILTIN_SVE
:
12757 gcc_unreachable ();
12760 /* Emit code to perform the floating-point operation:
12764 where all three operands are already known to be registers.
12765 If the operation is an SVE one, PTRUE is a suitable all-true
12769 aarch64_emit_mult (rtx dst
, rtx ptrue
, rtx src1
, rtx src2
)
12772 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL
, GET_MODE (dst
),
12773 dst
, ptrue
, src1
, src2
,
12774 gen_int_mode (SVE_RELAXED_GP
, SImode
)));
12776 emit_set_insn (dst
, gen_rtx_MULT (GET_MODE (dst
), src1
, src2
));
12779 /* Emit instruction sequence to compute either the approximate square root
12780 or its approximate reciprocal, depending on the flag RECP, and return
12781 whether the sequence was emitted or not. */
12784 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
12786 machine_mode mode
= GET_MODE (dst
);
12788 if (GET_MODE_INNER (mode
) == HFmode
)
12790 gcc_assert (!recp
);
12796 if (!(flag_mlow_precision_sqrt
12797 || (aarch64_tune_params
.approx_modes
->sqrt
12798 & AARCH64_APPROX_MODE (mode
))))
12801 if (!flag_finite_math_only
12802 || flag_trapping_math
12803 || !flag_unsafe_math_optimizations
12804 || optimize_function_for_size_p (cfun
))
12808 /* Caller assumes we cannot fail. */
12809 gcc_assert (use_rsqrt_p (mode
));
12812 if (aarch64_sve_mode_p (mode
))
12813 pg
= aarch64_ptrue_reg (aarch64_sve_pred_mode (mode
));
12814 machine_mode mmsk
= (VECTOR_MODE_P (mode
)
12815 ? related_int_vector_mode (mode
).require ()
12816 : int_mode_for_mode (mode
).require ());
12817 rtx xmsk
= NULL_RTX
;
12820 /* When calculating the approximate square root, compare the
12821 argument with 0.0 and create a mask. */
12822 rtx zero
= CONST0_RTX (mode
);
12825 xmsk
= gen_reg_rtx (GET_MODE (pg
));
12826 rtx hint
= gen_int_mode (SVE_KNOWN_PTRUE
, SImode
);
12827 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE
, mode
,
12828 xmsk
, pg
, hint
, src
, zero
));
12832 xmsk
= gen_reg_rtx (mmsk
);
12833 emit_insn (gen_rtx_SET (xmsk
,
12835 gen_rtx_EQ (mmsk
, src
, zero
))));
12839 /* Estimate the approximate reciprocal square root. */
12840 rtx xdst
= gen_reg_rtx (mode
);
12841 emit_insn (gen_aarch64_rsqrte (mode
, xdst
, src
));
12843 /* Iterate over the series twice for SF and thrice for DF. */
12844 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
12846 /* Optionally iterate over the series once less for faster performance
12847 while sacrificing the accuracy. */
12848 if ((recp
&& flag_mrecip_low_precision_sqrt
)
12849 || (!recp
&& flag_mlow_precision_sqrt
))
12852 /* Iterate over the series to calculate the approximate reciprocal square
12854 rtx x1
= gen_reg_rtx (mode
);
12855 while (iterations
--)
12857 rtx x2
= gen_reg_rtx (mode
);
12858 aarch64_emit_mult (x2
, pg
, xdst
, xdst
);
12860 emit_insn (gen_aarch64_rsqrts (mode
, x1
, src
, x2
));
12862 if (iterations
> 0)
12863 aarch64_emit_mult (xdst
, pg
, xdst
, x1
);
12869 /* Multiply nonzero source values by the corresponding intermediate
12870 result elements, so that the final calculation is the approximate
12871 square root rather than its reciprocal. Select a zero result for
12872 zero source values, to avoid the Inf * 0 -> NaN that we'd get
12874 emit_insn (gen_cond (UNSPEC_COND_FMUL
, mode
,
12875 xdst
, xmsk
, xdst
, src
, CONST0_RTX (mode
)));
12878 /* Qualify the approximate reciprocal square root when the
12879 argument is 0.0 by squashing the intermediary result to 0.0. */
12880 rtx xtmp
= gen_reg_rtx (mmsk
);
12881 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
12882 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
12883 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
12885 /* Calculate the approximate square root. */
12886 aarch64_emit_mult (xdst
, pg
, xdst
, src
);
12890 /* Finalize the approximation. */
12891 aarch64_emit_mult (dst
, pg
, xdst
, x1
);
12896 /* Emit the instruction sequence to compute the approximation for the division
12897 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
12900 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
12902 machine_mode mode
= GET_MODE (quo
);
12904 if (GET_MODE_INNER (mode
) == HFmode
)
12907 bool use_approx_division_p
= (flag_mlow_precision_div
12908 || (aarch64_tune_params
.approx_modes
->division
12909 & AARCH64_APPROX_MODE (mode
)));
12911 if (!flag_finite_math_only
12912 || flag_trapping_math
12913 || !flag_unsafe_math_optimizations
12914 || optimize_function_for_size_p (cfun
)
12915 || !use_approx_division_p
)
12918 if (!TARGET_SIMD
&& VECTOR_MODE_P (mode
))
12922 if (aarch64_sve_mode_p (mode
))
12923 pg
= aarch64_ptrue_reg (aarch64_sve_pred_mode (mode
));
12925 /* Estimate the approximate reciprocal. */
12926 rtx xrcp
= gen_reg_rtx (mode
);
12927 emit_insn (gen_aarch64_frecpe (mode
, xrcp
, den
));
12929 /* Iterate over the series twice for SF and thrice for DF. */
12930 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
12932 /* Optionally iterate over the series less for faster performance,
12933 while sacrificing the accuracy. The default is 2 for DF and 1 for SF. */
12934 if (flag_mlow_precision_div
)
12935 iterations
= (GET_MODE_INNER (mode
) == DFmode
12936 ? aarch64_double_recp_precision
12937 : aarch64_float_recp_precision
);
12939 /* Iterate over the series to calculate the approximate reciprocal. */
12940 rtx xtmp
= gen_reg_rtx (mode
);
12941 while (iterations
--)
12943 emit_insn (gen_aarch64_frecps (mode
, xtmp
, xrcp
, den
));
12945 if (iterations
> 0)
12946 aarch64_emit_mult (xrcp
, pg
, xrcp
, xtmp
);
12949 if (num
!= CONST1_RTX (mode
))
12951 /* As the approximate reciprocal of DEN is already calculated, only
12952 calculate the approximate division when NUM is not 1.0. */
12953 rtx xnum
= force_reg (mode
, num
);
12954 aarch64_emit_mult (xrcp
, pg
, xrcp
, xnum
);
12957 /* Finalize the approximation. */
12958 aarch64_emit_mult (quo
, pg
, xrcp
, xtmp
);
12962 /* Return the number of instructions that can be issued per cycle. */
12964 aarch64_sched_issue_rate (void)
12966 return aarch64_tune_params
.issue_rate
;
12969 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
12971 aarch64_sched_variable_issue (FILE *, int, rtx_insn
*insn
, int more
)
12973 if (DEBUG_INSN_P (insn
))
12976 rtx_code code
= GET_CODE (PATTERN (insn
));
12977 if (code
== USE
|| code
== CLOBBER
)
12980 if (get_attr_type (insn
) == TYPE_NO_INSN
)
12987 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
12989 int issue_rate
= aarch64_sched_issue_rate ();
12991 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
12995 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
12996 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
12997 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
13000 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
13003 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
13007 /* Vectorizer cost model target hooks. */
13009 /* Implement targetm.vectorize.builtin_vectorization_cost. */
13011 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
13013 int misalign ATTRIBUTE_UNUSED
)
13016 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
13019 if (vectype
!= NULL
)
13020 fp
= FLOAT_TYPE_P (vectype
);
13022 switch (type_of_cost
)
13025 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
13028 return costs
->scalar_load_cost
;
13031 return costs
->scalar_store_cost
;
13034 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
13037 return costs
->vec_align_load_cost
;
13040 return costs
->vec_store_cost
;
13042 case vec_to_scalar
:
13043 return costs
->vec_to_scalar_cost
;
13045 case scalar_to_vec
:
13046 return costs
->scalar_to_vec_cost
;
13048 case unaligned_load
:
13049 case vector_gather_load
:
13050 return costs
->vec_unalign_load_cost
;
13052 case unaligned_store
:
13053 case vector_scatter_store
:
13054 return costs
->vec_unalign_store_cost
;
13056 case cond_branch_taken
:
13057 return costs
->cond_taken_branch_cost
;
13059 case cond_branch_not_taken
:
13060 return costs
->cond_not_taken_branch_cost
;
13063 return costs
->vec_permute_cost
;
13065 case vec_promote_demote
:
13066 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
13068 case vec_construct
:
13069 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
13070 return elements
/ 2 + 1;
13073 gcc_unreachable ();
13077 /* Return true if STMT_INFO extends the result of a load. */
13079 aarch64_extending_load_p (stmt_vec_info stmt_info
)
13081 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
13082 if (!assign
|| !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign
)))
13085 tree rhs
= gimple_assign_rhs1 (stmt_info
->stmt
);
13086 tree lhs_type
= TREE_TYPE (gimple_assign_lhs (assign
));
13087 tree rhs_type
= TREE_TYPE (rhs
);
13088 if (!INTEGRAL_TYPE_P (lhs_type
)
13089 || !INTEGRAL_TYPE_P (rhs_type
)
13090 || TYPE_PRECISION (lhs_type
) <= TYPE_PRECISION (rhs_type
))
13093 stmt_vec_info def_stmt_info
= stmt_info
->vinfo
->lookup_def (rhs
);
13094 return (def_stmt_info
13095 && STMT_VINFO_DATA_REF (def_stmt_info
)
13096 && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info
)));
13099 /* Return true if STMT_INFO is an integer truncation. */
13101 aarch64_integer_truncation_p (stmt_vec_info stmt_info
)
13103 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
13104 if (!assign
|| !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign
)))
13107 tree lhs_type
= TREE_TYPE (gimple_assign_lhs (assign
));
13108 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (assign
));
13109 return (INTEGRAL_TYPE_P (lhs_type
)
13110 && INTEGRAL_TYPE_P (rhs_type
)
13111 && TYPE_PRECISION (lhs_type
) < TYPE_PRECISION (rhs_type
));
13114 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
13115 for STMT_INFO, which has cost kind KIND. Adjust the cost as necessary
13116 for SVE targets. */
13117 static unsigned int
13118 aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind
, stmt_vec_info stmt_info
,
13119 unsigned int stmt_cost
)
13121 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
13122 vector register size or number of units. Integer promotions of this
13123 type therefore map to SXT[BHW] or UXT[BHW].
13125 Most loads have extending forms that can do the sign or zero extension
13126 on the fly. Optimistically assume that a load followed by an extension
13127 will fold to this form during combine, and that the extension therefore
13129 if (kind
== vector_stmt
&& aarch64_extending_load_p (stmt_info
))
13132 /* For similar reasons, vector_stmt integer truncations are a no-op,
13133 because we can just ignore the unused upper bits of the source. */
13134 if (kind
== vector_stmt
&& aarch64_integer_truncation_p (stmt_info
))
13140 /* Implement targetm.vectorize.add_stmt_cost. */
13142 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
13143 struct _stmt_vec_info
*stmt_info
, int misalign
,
13144 enum vect_cost_model_location where
)
13146 unsigned *cost
= (unsigned *) data
;
13147 unsigned retval
= 0;
13149 if (flag_vect_cost_model
)
13151 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
13153 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
13155 if (stmt_info
&& vectype
&& aarch64_sve_mode_p (TYPE_MODE (vectype
)))
13156 stmt_cost
= aarch64_sve_adjust_stmt_cost (kind
, stmt_info
, stmt_cost
);
13158 /* Statements in an inner loop relative to the loop being
13159 vectorized are weighted more heavily. The value here is
13160 arbitrary and could potentially be improved with analysis. */
13161 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
13162 count
*= 50; /* FIXME */
13164 retval
= (unsigned) (count
* stmt_cost
);
13165 cost
[where
] += retval
;
13171 static void initialize_aarch64_code_model (struct gcc_options
*);
13173 /* Parse the TO_PARSE string and put the architecture struct that it
13174 selects into RES and the architectural features into ISA_FLAGS.
13175 Return an aarch64_parse_opt_result describing the parse result.
13176 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
13177 When the TO_PARSE string contains an invalid extension,
13178 a copy of the string is created and stored to INVALID_EXTENSION. */
13180 static enum aarch64_parse_opt_result
13181 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
13182 uint64_t *isa_flags
, std::string
*invalid_extension
)
13185 const struct processor
*arch
;
13188 ext
= strchr (to_parse
, '+');
13191 len
= ext
- to_parse
;
13193 len
= strlen (to_parse
);
13196 return AARCH64_PARSE_MISSING_ARG
;
13199 /* Loop through the list of supported ARCHes to find a match. */
13200 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
13202 if (strlen (arch
->name
) == len
13203 && strncmp (arch
->name
, to_parse
, len
) == 0)
13205 uint64_t isa_temp
= arch
->flags
;
13209 /* TO_PARSE string contains at least one extension. */
13210 enum aarch64_parse_opt_result ext_res
13211 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
13213 if (ext_res
!= AARCH64_PARSE_OK
)
13216 /* Extension parsing was successful. Confirm the result
13217 arch and ISA flags. */
13219 *isa_flags
= isa_temp
;
13220 return AARCH64_PARSE_OK
;
13224 /* ARCH name not found in list. */
13225 return AARCH64_PARSE_INVALID_ARG
;
13228 /* Parse the TO_PARSE string and put the result tuning in RES and the
13229 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
13230 describing the parse result. If there is an error parsing, RES and
13231 ISA_FLAGS are left unchanged.
13232 When the TO_PARSE string contains an invalid extension,
13233 a copy of the string is created and stored to INVALID_EXTENSION. */
13235 static enum aarch64_parse_opt_result
13236 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
13237 uint64_t *isa_flags
, std::string
*invalid_extension
)
13240 const struct processor
*cpu
;
13243 ext
= strchr (to_parse
, '+');
13246 len
= ext
- to_parse
;
13248 len
= strlen (to_parse
);
13251 return AARCH64_PARSE_MISSING_ARG
;
13254 /* Loop through the list of supported CPUs to find a match. */
13255 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
13257 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
13259 uint64_t isa_temp
= cpu
->flags
;
13264 /* TO_PARSE string contains at least one extension. */
13265 enum aarch64_parse_opt_result ext_res
13266 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
13268 if (ext_res
!= AARCH64_PARSE_OK
)
13271 /* Extension parsing was successfull. Confirm the result
13272 cpu and ISA flags. */
13274 *isa_flags
= isa_temp
;
13275 return AARCH64_PARSE_OK
;
13279 /* CPU name not found in list. */
13280 return AARCH64_PARSE_INVALID_ARG
;
13283 /* Parse the TO_PARSE string and put the cpu it selects into RES.
13284 Return an aarch64_parse_opt_result describing the parse result.
13285 If the parsing fails the RES does not change. */
13287 static enum aarch64_parse_opt_result
13288 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
13290 const struct processor
*cpu
;
13292 /* Loop through the list of supported CPUs to find a match. */
13293 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
13295 if (strcmp (cpu
->name
, to_parse
) == 0)
13298 return AARCH64_PARSE_OK
;
13302 /* CPU name not found in list. */
13303 return AARCH64_PARSE_INVALID_ARG
;
13306 /* Parse TOKEN, which has length LENGTH to see if it is an option
13307 described in FLAG. If it is, return the index bit for that fusion type.
13308 If not, error (printing OPTION_NAME) and return zero. */
13310 static unsigned int
13311 aarch64_parse_one_option_token (const char *token
,
13313 const struct aarch64_flag_desc
*flag
,
13314 const char *option_name
)
13316 for (; flag
->name
!= NULL
; flag
++)
13318 if (length
== strlen (flag
->name
)
13319 && !strncmp (flag
->name
, token
, length
))
13323 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name
, token
);
13327 /* Parse OPTION which is a comma-separated list of flags to enable.
13328 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
13329 default state we inherit from the CPU tuning structures. OPTION_NAME
13330 gives the top-level option we are parsing in the -moverride string,
13331 for use in error messages. */
13333 static unsigned int
13334 aarch64_parse_boolean_options (const char *option
,
13335 const struct aarch64_flag_desc
*flags
,
13336 unsigned int initial_state
,
13337 const char *option_name
)
13339 const char separator
= '.';
13340 const char* specs
= option
;
13341 const char* ntoken
= option
;
13342 unsigned int found_flags
= initial_state
;
13344 while ((ntoken
= strchr (specs
, separator
)))
13346 size_t token_length
= ntoken
- specs
;
13347 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
13351 /* If we find "none" (or, for simplicity's sake, an error) anywhere
13352 in the token stream, reset the supported operations. So:
13354 adrp+add.cmp+branch.none.adrp+add
13356 would have the result of turning on only adrp+add fusion. */
13360 found_flags
|= token_ops
;
13364 /* We ended with a comma, print something. */
13367 error ("%s string ill-formed\n", option_name
);
13371 /* We still have one more token to parse. */
13372 size_t token_length
= strlen (specs
);
13373 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
13380 found_flags
|= token_ops
;
13381 return found_flags
;
13384 /* Support for overriding instruction fusion. */
13387 aarch64_parse_fuse_string (const char *fuse_string
,
13388 struct tune_params
*tune
)
13390 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
13391 aarch64_fusible_pairs
,
13396 /* Support for overriding other tuning flags. */
13399 aarch64_parse_tune_string (const char *tune_string
,
13400 struct tune_params
*tune
)
13402 tune
->extra_tuning_flags
13403 = aarch64_parse_boolean_options (tune_string
,
13404 aarch64_tuning_flags
,
13405 tune
->extra_tuning_flags
,
13409 /* Parse the sve_width tuning moverride string in TUNE_STRING.
13410 Accept the valid SVE vector widths allowed by
13411 aarch64_sve_vector_bits_enum and use it to override sve_width
13415 aarch64_parse_sve_width_string (const char *tune_string
,
13416 struct tune_params
*tune
)
13420 int n
= sscanf (tune_string
, "%d", &width
);
13423 error ("invalid format for sve_width");
13435 error ("invalid sve_width value: %d", width
);
13437 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
13440 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
13441 we understand. If it is, extract the option string and handoff to
13442 the appropriate function. */
13445 aarch64_parse_one_override_token (const char* token
,
13447 struct tune_params
*tune
)
13449 const struct aarch64_tuning_override_function
*fn
13450 = aarch64_tuning_override_functions
;
13452 const char *option_part
= strchr (token
, '=');
13455 error ("tuning string missing in option (%s)", token
);
13459 /* Get the length of the option name. */
13460 length
= option_part
- token
;
13461 /* Skip the '=' to get to the option string. */
13464 for (; fn
->name
!= NULL
; fn
++)
13466 if (!strncmp (fn
->name
, token
, length
))
13468 fn
->parse_override (option_part
, tune
);
13473 error ("unknown tuning option (%s)",token
);
13477 /* A checking mechanism for the implementation of the tls size. */
13480 initialize_aarch64_tls_size (struct gcc_options
*opts
)
13482 if (aarch64_tls_size
== 0)
13483 aarch64_tls_size
= 24;
13485 switch (opts
->x_aarch64_cmodel_var
)
13487 case AARCH64_CMODEL_TINY
:
13488 /* Both the default and maximum TLS size allowed under tiny is 1M which
13489 needs two instructions to address, so we clamp the size to 24. */
13490 if (aarch64_tls_size
> 24)
13491 aarch64_tls_size
= 24;
13493 case AARCH64_CMODEL_SMALL
:
13494 /* The maximum TLS size allowed under small is 4G. */
13495 if (aarch64_tls_size
> 32)
13496 aarch64_tls_size
= 32;
13498 case AARCH64_CMODEL_LARGE
:
13499 /* The maximum TLS size allowed under large is 16E.
13500 FIXME: 16E should be 64bit, we only support 48bit offset now. */
13501 if (aarch64_tls_size
> 48)
13502 aarch64_tls_size
= 48;
13505 gcc_unreachable ();
13511 /* Parse STRING looking for options in the format:
13512 string :: option:string
13513 option :: name=substring
13515 substring :: defined by option. */
13518 aarch64_parse_override_string (const char* input_string
,
13519 struct tune_params
* tune
)
13521 const char separator
= ':';
13522 size_t string_length
= strlen (input_string
) + 1;
13523 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
13524 char *string
= string_root
;
13525 strncpy (string
, input_string
, string_length
);
13526 string
[string_length
- 1] = '\0';
13528 char* ntoken
= string
;
13530 while ((ntoken
= strchr (string
, separator
)))
13532 size_t token_length
= ntoken
- string
;
13533 /* Make this substring look like a string. */
13535 aarch64_parse_one_override_token (string
, token_length
, tune
);
13539 /* One last option to parse. */
13540 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
13541 free (string_root
);
13546 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
13548 if (accepted_branch_protection_string
)
13550 opts
->x_aarch64_branch_protection_string
13551 = xstrdup (accepted_branch_protection_string
);
13554 /* PR 70044: We have to be careful about being called multiple times for the
13555 same function. This means all changes should be repeatable. */
13557 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
13558 Disable the frame pointer flag so the mid-end will not use a frame
13559 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
13560 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
13561 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
13562 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
13563 if (opts
->x_flag_omit_frame_pointer
== 0)
13564 opts
->x_flag_omit_frame_pointer
= 2;
13566 /* If not optimizing for size, set the default
13567 alignment to what the target wants. */
13568 if (!opts
->x_optimize_size
)
13570 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
13571 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
13572 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
13573 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
13574 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
13575 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
13578 /* We default to no pc-relative literal loads. */
13580 aarch64_pcrelative_literal_loads
= false;
13582 /* If -mpc-relative-literal-loads is set on the command line, this
13583 implies that the user asked for PC relative literal loads. */
13584 if (opts
->x_pcrelative_literal_loads
== 1)
13585 aarch64_pcrelative_literal_loads
= true;
13587 /* In the tiny memory model it makes no sense to disallow PC relative
13588 literal pool loads. */
13589 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
13590 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
13591 aarch64_pcrelative_literal_loads
= true;
13593 /* When enabling the lower precision Newton series for the square root, also
13594 enable it for the reciprocal square root, since the latter is an
13595 intermediary step for the former. */
13596 if (flag_mlow_precision_sqrt
)
13597 flag_mrecip_low_precision_sqrt
= true;
13600 /* 'Unpack' up the internal tuning structs and update the options
13601 in OPTS. The caller must have set up selected_tune and selected_arch
13602 as all the other target-specific codegen decisions are
13603 derived from them. */
13606 aarch64_override_options_internal (struct gcc_options
*opts
)
13608 aarch64_tune_flags
= selected_tune
->flags
;
13609 aarch64_tune
= selected_tune
->sched_core
;
13610 /* Make a copy of the tuning parameters attached to the core, which
13611 we may later overwrite. */
13612 aarch64_tune_params
= *(selected_tune
->tune
);
13613 aarch64_architecture_version
= selected_arch
->architecture_version
;
13615 if (opts
->x_aarch64_override_tune_string
)
13616 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
13617 &aarch64_tune_params
);
13619 /* This target defaults to strict volatile bitfields. */
13620 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
13621 opts
->x_flag_strict_volatile_bitfields
= 1;
13623 if (aarch64_stack_protector_guard
== SSP_GLOBAL
13624 && opts
->x_aarch64_stack_protector_guard_offset_str
)
13626 error ("incompatible options %<-mstack-protector-guard=global%> and "
13627 "%<-mstack-protector-guard-offset=%s%>",
13628 aarch64_stack_protector_guard_offset_str
);
13631 if (aarch64_stack_protector_guard
== SSP_SYSREG
13632 && !(opts
->x_aarch64_stack_protector_guard_offset_str
13633 && opts
->x_aarch64_stack_protector_guard_reg_str
))
13635 error ("both %<-mstack-protector-guard-offset%> and "
13636 "%<-mstack-protector-guard-reg%> must be used "
13637 "with %<-mstack-protector-guard=sysreg%>");
13640 if (opts
->x_aarch64_stack_protector_guard_reg_str
)
13642 if (strlen (opts
->x_aarch64_stack_protector_guard_reg_str
) > 100)
13643 error ("specify a system register with a small string length.");
13646 if (opts
->x_aarch64_stack_protector_guard_offset_str
)
13649 const char *str
= aarch64_stack_protector_guard_offset_str
;
13651 long offs
= strtol (aarch64_stack_protector_guard_offset_str
, &end
, 0);
13652 if (!*str
|| *end
|| errno
)
13653 error ("%qs is not a valid offset in %qs", str
,
13654 "-mstack-protector-guard-offset=");
13655 aarch64_stack_protector_guard_offset
= offs
;
13658 initialize_aarch64_code_model (opts
);
13659 initialize_aarch64_tls_size (opts
);
13661 int queue_depth
= 0;
13662 switch (aarch64_tune_params
.autoprefetcher_model
)
13664 case tune_params::AUTOPREFETCHER_OFF
:
13667 case tune_params::AUTOPREFETCHER_WEAK
:
13670 case tune_params::AUTOPREFETCHER_STRONG
:
13671 queue_depth
= max_insn_queue_index
+ 1;
13674 gcc_unreachable ();
13677 /* We don't mind passing in global_options_set here as we don't use
13678 the *options_set structs anyway. */
13679 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13680 param_sched_autopref_queue_depth
, queue_depth
);
13682 /* Set up parameters to be used in prefetching algorithm. Do not
13683 override the defaults unless we are tuning for a core we have
13684 researched values for. */
13685 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
13686 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13687 param_simultaneous_prefetches
,
13688 aarch64_tune_params
.prefetch
->num_slots
);
13689 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
13690 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13691 param_l1_cache_size
,
13692 aarch64_tune_params
.prefetch
->l1_cache_size
);
13693 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
13694 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13695 param_l1_cache_line_size
,
13696 aarch64_tune_params
.prefetch
->l1_cache_line_size
);
13697 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
13698 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13699 param_l2_cache_size
,
13700 aarch64_tune_params
.prefetch
->l2_cache_size
);
13701 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
13702 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13703 param_prefetch_dynamic_strides
, 0);
13704 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
13705 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13706 param_prefetch_minimum_stride
,
13707 aarch64_tune_params
.prefetch
->minimum_stride
);
13709 /* Use the alternative scheduling-pressure algorithm by default. */
13710 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13711 param_sched_pressure_algorithm
,
13712 SCHED_PRESSURE_MODEL
);
13714 /* Validate the guard size. */
13715 int guard_size
= param_stack_clash_protection_guard_size
;
13717 if (guard_size
!= 12 && guard_size
!= 16)
13718 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
13719 "size. Given value %d (%llu KB) is out of range",
13720 guard_size
, (1ULL << guard_size
) / 1024ULL);
13722 /* Enforce that interval is the same size as size so the mid-end does the
13724 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
13725 param_stack_clash_protection_probe_interval
,
13728 /* The maybe_set calls won't update the value if the user has explicitly set
13729 one. Which means we need to validate that probing interval and guard size
13732 = param_stack_clash_protection_probe_interval
;
13733 if (guard_size
!= probe_interval
)
13734 error ("stack clash guard size %<%d%> must be equal to probing interval "
13735 "%<%d%>", guard_size
, probe_interval
);
13737 /* Enable sw prefetching at specified optimization level for
13738 CPUS that have prefetch. Lower optimization level threshold by 1
13739 when profiling is enabled. */
13740 if (opts
->x_flag_prefetch_loop_arrays
< 0
13741 && !opts
->x_optimize_size
13742 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
13743 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
13744 opts
->x_flag_prefetch_loop_arrays
= 1;
13746 if (opts
->x_aarch64_arch_string
== NULL
)
13747 opts
->x_aarch64_arch_string
= selected_arch
->name
;
13748 if (opts
->x_aarch64_cpu_string
== NULL
)
13749 opts
->x_aarch64_cpu_string
= selected_cpu
->name
;
13750 if (opts
->x_aarch64_tune_string
== NULL
)
13751 opts
->x_aarch64_tune_string
= selected_tune
->name
;
13753 aarch64_override_options_after_change_1 (opts
);
13756 /* Print a hint with a suggestion for a core or architecture name that
13757 most closely resembles what the user passed in STR. ARCH is true if
13758 the user is asking for an architecture name. ARCH is false if the user
13759 is asking for a core name. */
13762 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
13764 auto_vec
<const char *> candidates
;
13765 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
13766 for (; entry
->name
!= NULL
; entry
++)
13767 candidates
.safe_push (entry
->name
);
13769 #ifdef HAVE_LOCAL_CPU_DETECT
13770 /* Add also "native" as possible value. */
13772 candidates
.safe_push ("native");
13776 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
13778 inform (input_location
, "valid arguments are: %s;"
13779 " did you mean %qs?", s
, hint
);
13781 inform (input_location
, "valid arguments are: %s", s
);
13786 /* Print a hint with a suggestion for a core name that most closely resembles
13787 what the user passed in STR. */
13790 aarch64_print_hint_for_core (const char *str
)
13792 aarch64_print_hint_for_core_or_arch (str
, false);
13795 /* Print a hint with a suggestion for an architecture name that most closely
13796 resembles what the user passed in STR. */
13799 aarch64_print_hint_for_arch (const char *str
)
13801 aarch64_print_hint_for_core_or_arch (str
, true);
13805 /* Print a hint with a suggestion for an extension name
13806 that most closely resembles what the user passed in STR. */
13809 aarch64_print_hint_for_extensions (const std::string
&str
)
13811 auto_vec
<const char *> candidates
;
13812 aarch64_get_all_extension_candidates (&candidates
);
13814 const char *hint
= candidates_list_and_hint (str
.c_str (), s
, candidates
);
13816 inform (input_location
, "valid arguments are: %s;"
13817 " did you mean %qs?", s
, hint
);
13819 inform (input_location
, "valid arguments are: %s;", s
);
13824 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
13825 specified in STR and throw errors if appropriate. Put the results if
13826 they are valid in RES and ISA_FLAGS. Return whether the option is
13830 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
13831 uint64_t *isa_flags
)
13833 std::string invalid_extension
;
13834 enum aarch64_parse_opt_result parse_res
13835 = aarch64_parse_cpu (str
, res
, isa_flags
, &invalid_extension
);
13837 if (parse_res
== AARCH64_PARSE_OK
)
13842 case AARCH64_PARSE_MISSING_ARG
:
13843 error ("missing cpu name in %<-mcpu=%s%>", str
);
13845 case AARCH64_PARSE_INVALID_ARG
:
13846 error ("unknown value %qs for %<-mcpu%>", str
);
13847 aarch64_print_hint_for_core (str
);
13849 case AARCH64_PARSE_INVALID_FEATURE
:
13850 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
13851 invalid_extension
.c_str (), str
);
13852 aarch64_print_hint_for_extensions (invalid_extension
);
13855 gcc_unreachable ();
13861 /* Parses CONST_STR for branch protection features specified in
13862 aarch64_branch_protect_types, and set any global variables required. Returns
13863 the parsing result and assigns LAST_STR to the last processed token from
13864 CONST_STR so that it can be used for error reporting. */
13867 aarch64_parse_opt_result
aarch64_parse_branch_protection (const char *const_str
,
13870 char *str_root
= xstrdup (const_str
);
13871 char* token_save
= NULL
;
13872 char *str
= strtok_r (str_root
, "+", &token_save
);
13873 enum aarch64_parse_opt_result res
= AARCH64_PARSE_OK
;
13875 res
= AARCH64_PARSE_MISSING_ARG
;
13878 char *next_str
= strtok_r (NULL
, "+", &token_save
);
13879 /* Reset the branch protection features to their defaults. */
13880 aarch64_handle_no_branch_protection (NULL
, NULL
);
13882 while (str
&& res
== AARCH64_PARSE_OK
)
13884 const aarch64_branch_protect_type
* type
= aarch64_branch_protect_types
;
13885 bool found
= false;
13886 /* Search for this type. */
13887 while (type
&& type
->name
&& !found
&& res
== AARCH64_PARSE_OK
)
13889 if (strcmp (str
, type
->name
) == 0)
13892 res
= type
->handler (str
, next_str
);
13894 next_str
= strtok_r (NULL
, "+", &token_save
);
13899 if (found
&& res
== AARCH64_PARSE_OK
)
13901 bool found_subtype
= true;
13902 /* Loop through each token until we find one that isn't a
13904 while (found_subtype
)
13906 found_subtype
= false;
13907 const aarch64_branch_protect_type
*subtype
= type
->subtypes
;
13908 /* Search for the subtype. */
13909 while (str
&& subtype
&& subtype
->name
&& !found_subtype
13910 && res
== AARCH64_PARSE_OK
)
13912 if (strcmp (str
, subtype
->name
) == 0)
13914 found_subtype
= true;
13915 res
= subtype
->handler (str
, next_str
);
13917 next_str
= strtok_r (NULL
, "+", &token_save
);
13925 res
= AARCH64_PARSE_INVALID_ARG
;
13928 /* Copy the last processed token into the argument to pass it back.
13929 Used by option and attribute validation to print the offending token. */
13932 if (str
) strcpy (*last_str
, str
);
13933 else *last_str
= NULL
;
13935 if (res
== AARCH64_PARSE_OK
)
13937 /* If needed, alloc the accepted string then copy in const_str.
13938 Used by override_option_after_change_1. */
13939 if (!accepted_branch_protection_string
)
13940 accepted_branch_protection_string
= (char *) xmalloc (
13941 BRANCH_PROTECT_STR_MAX
13943 strncpy (accepted_branch_protection_string
, const_str
,
13944 BRANCH_PROTECT_STR_MAX
+ 1);
13945 /* Forcibly null-terminate. */
13946 accepted_branch_protection_string
[BRANCH_PROTECT_STR_MAX
] = '\0';
13952 aarch64_validate_mbranch_protection (const char *const_str
)
13954 char *str
= (char *) xmalloc (strlen (const_str
));
13955 enum aarch64_parse_opt_result res
=
13956 aarch64_parse_branch_protection (const_str
, &str
);
13957 if (res
== AARCH64_PARSE_INVALID_ARG
)
13958 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str
);
13959 else if (res
== AARCH64_PARSE_MISSING_ARG
)
13960 error ("missing argument for %<-mbranch-protection=%>");
13962 return res
== AARCH64_PARSE_OK
;
13965 /* Validate a command-line -march option. Parse the arch and extensions
13966 (if any) specified in STR and throw errors if appropriate. Put the
13967 results, if they are valid, in RES and ISA_FLAGS. Return whether the
13968 option is valid. */
13971 aarch64_validate_march (const char *str
, const struct processor
**res
,
13972 uint64_t *isa_flags
)
13974 std::string invalid_extension
;
13975 enum aarch64_parse_opt_result parse_res
13976 = aarch64_parse_arch (str
, res
, isa_flags
, &invalid_extension
);
13978 if (parse_res
== AARCH64_PARSE_OK
)
13983 case AARCH64_PARSE_MISSING_ARG
:
13984 error ("missing arch name in %<-march=%s%>", str
);
13986 case AARCH64_PARSE_INVALID_ARG
:
13987 error ("unknown value %qs for %<-march%>", str
);
13988 aarch64_print_hint_for_arch (str
);
13990 case AARCH64_PARSE_INVALID_FEATURE
:
13991 error ("invalid feature modifier %qs in %<-march=%s%>",
13992 invalid_extension
.c_str (), str
);
13993 aarch64_print_hint_for_extensions (invalid_extension
);
13996 gcc_unreachable ();
14002 /* Validate a command-line -mtune option. Parse the cpu
14003 specified in STR and throw errors if appropriate. Put the
14004 result, if it is valid, in RES. Return whether the option is
14008 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
14010 enum aarch64_parse_opt_result parse_res
14011 = aarch64_parse_tune (str
, res
);
14013 if (parse_res
== AARCH64_PARSE_OK
)
14018 case AARCH64_PARSE_MISSING_ARG
:
14019 error ("missing cpu name in %<-mtune=%s%>", str
);
14021 case AARCH64_PARSE_INVALID_ARG
:
14022 error ("unknown value %qs for %<-mtune%>", str
);
14023 aarch64_print_hint_for_core (str
);
14026 gcc_unreachable ();
14031 /* Return the CPU corresponding to the enum CPU.
14032 If it doesn't specify a cpu, return the default. */
14034 static const struct processor
*
14035 aarch64_get_tune_cpu (enum aarch64_processor cpu
)
14037 if (cpu
!= aarch64_none
)
14038 return &all_cores
[cpu
];
14040 /* The & 0x3f is to extract the bottom 6 bits that encode the
14041 default cpu as selected by the --with-cpu GCC configure option
14043 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
14044 flags mechanism should be reworked to make it more sane. */
14045 return &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
14048 /* Return the architecture corresponding to the enum ARCH.
14049 If it doesn't specify a valid architecture, return the default. */
14051 static const struct processor
*
14052 aarch64_get_arch (enum aarch64_arch arch
)
14054 if (arch
!= aarch64_no_arch
)
14055 return &all_architectures
[arch
];
14057 const struct processor
*cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
14059 return &all_architectures
[cpu
->arch
];
14062 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
14065 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value
)
14067 /* 128-bit SVE and Advanced SIMD modes use different register layouts
14068 on big-endian targets, so we would need to forbid subregs that convert
14069 from one to the other. By default a reinterpret sequence would then
14070 involve a store to memory in one mode and a load back in the other.
14071 Even if we optimize that sequence using reverse instructions,
14072 it would still be a significant potential overhead.
14074 For now, it seems better to generate length-agnostic code for that
14076 if (value
== SVE_SCALABLE
14077 || (value
== SVE_128
&& BYTES_BIG_ENDIAN
))
14078 return poly_uint16 (2, 2);
14080 return (int) value
/ 64;
14083 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
14084 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
14085 tuning structs. In particular it must set selected_tune and
14086 aarch64_isa_flags that define the available ISA features and tuning
14087 decisions. It must also set selected_arch as this will be used to
14088 output the .arch asm tags for each function. */
14091 aarch64_override_options (void)
14093 uint64_t cpu_isa
= 0;
14094 uint64_t arch_isa
= 0;
14095 aarch64_isa_flags
= 0;
14097 bool valid_cpu
= true;
14098 bool valid_tune
= true;
14099 bool valid_arch
= true;
14101 selected_cpu
= NULL
;
14102 selected_arch
= NULL
;
14103 selected_tune
= NULL
;
14105 if (aarch64_branch_protection_string
)
14106 aarch64_validate_mbranch_protection (aarch64_branch_protection_string
);
14108 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
14109 If either of -march or -mtune is given, they override their
14110 respective component of -mcpu. */
14111 if (aarch64_cpu_string
)
14112 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
14115 if (aarch64_arch_string
)
14116 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
14119 if (aarch64_tune_string
)
14120 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
14122 #ifdef SUBTARGET_OVERRIDE_OPTIONS
14123 SUBTARGET_OVERRIDE_OPTIONS
;
14126 /* If the user did not specify a processor, choose the default
14127 one for them. This will be the CPU set during configuration using
14128 --with-cpu, otherwise it is "generic". */
14133 selected_cpu
= &all_cores
[selected_arch
->ident
];
14134 aarch64_isa_flags
= arch_isa
;
14135 explicit_arch
= selected_arch
->arch
;
14139 /* Get default configure-time CPU. */
14140 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
14141 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
14145 explicit_tune_core
= selected_tune
->ident
;
14147 /* If both -mcpu and -march are specified check that they are architecturally
14148 compatible, warn if they're not and prefer the -march ISA flags. */
14149 else if (selected_arch
)
14151 if (selected_arch
->arch
!= selected_cpu
->arch
)
14153 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
14154 aarch64_cpu_string
,
14155 aarch64_arch_string
);
14157 aarch64_isa_flags
= arch_isa
;
14158 explicit_arch
= selected_arch
->arch
;
14159 explicit_tune_core
= selected_tune
? selected_tune
->ident
14160 : selected_cpu
->ident
;
14164 /* -mcpu but no -march. */
14165 aarch64_isa_flags
= cpu_isa
;
14166 explicit_tune_core
= selected_tune
? selected_tune
->ident
14167 : selected_cpu
->ident
;
14168 gcc_assert (selected_cpu
);
14169 selected_arch
= &all_architectures
[selected_cpu
->arch
];
14170 explicit_arch
= selected_arch
->arch
;
14173 /* Set the arch as well as we will need it when outputing
14174 the .arch directive in assembly. */
14175 if (!selected_arch
)
14177 gcc_assert (selected_cpu
);
14178 selected_arch
= &all_architectures
[selected_cpu
->arch
];
14181 if (!selected_tune
)
14182 selected_tune
= selected_cpu
;
14184 if (aarch64_enable_bti
== 2)
14186 #ifdef TARGET_ENABLE_BTI
14187 aarch64_enable_bti
= 1;
14189 aarch64_enable_bti
= 0;
14193 /* Return address signing is currently not supported for ILP32 targets. For
14194 LP64 targets use the configured option in the absence of a command-line
14195 option for -mbranch-protection. */
14196 if (!TARGET_ILP32
&& accepted_branch_protection_string
== NULL
)
14198 #ifdef TARGET_ENABLE_PAC_RET
14199 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
14201 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
14205 #ifndef HAVE_AS_MABI_OPTION
14206 /* The compiler may have been configured with 2.23.* binutils, which does
14207 not have support for ILP32. */
14209 error ("assembler does not support %<-mabi=ilp32%>");
14212 /* Convert -msve-vector-bits to a VG count. */
14213 aarch64_sve_vg
= aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits
);
14215 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
14216 sorry ("return address signing is only supported for %<-mabi=lp64%>");
14218 /* Make sure we properly set up the explicit options. */
14219 if ((aarch64_cpu_string
&& valid_cpu
)
14220 || (aarch64_tune_string
&& valid_tune
))
14221 gcc_assert (explicit_tune_core
!= aarch64_none
);
14223 if ((aarch64_cpu_string
&& valid_cpu
)
14224 || (aarch64_arch_string
&& valid_arch
))
14225 gcc_assert (explicit_arch
!= aarch64_no_arch
);
14227 /* The pass to insert speculation tracking runs before
14228 shrink-wrapping and the latter does not know how to update the
14229 tracking status. So disable it in this case. */
14230 if (aarch64_track_speculation
)
14231 flag_shrink_wrap
= 0;
14233 aarch64_override_options_internal (&global_options
);
14235 /* Save these options as the default ones in case we push and pop them later
14236 while processing functions with potential target attributes. */
14237 target_option_default_node
= target_option_current_node
14238 = build_target_option_node (&global_options
);
14241 /* Implement targetm.override_options_after_change. */
14244 aarch64_override_options_after_change (void)
14246 aarch64_override_options_after_change_1 (&global_options
);
14249 static struct machine_function
*
14250 aarch64_init_machine_status (void)
14252 struct machine_function
*machine
;
14253 machine
= ggc_cleared_alloc
<machine_function
> ();
14258 aarch64_init_expanders (void)
14260 init_machine_status
= aarch64_init_machine_status
;
14263 /* A checking mechanism for the implementation of the various code models. */
14265 initialize_aarch64_code_model (struct gcc_options
*opts
)
14267 if (opts
->x_flag_pic
)
14269 switch (opts
->x_aarch64_cmodel_var
)
14271 case AARCH64_CMODEL_TINY
:
14272 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
14274 case AARCH64_CMODEL_SMALL
:
14275 #ifdef HAVE_AS_SMALL_PIC_RELOCS
14276 aarch64_cmodel
= (flag_pic
== 2
14277 ? AARCH64_CMODEL_SMALL_PIC
14278 : AARCH64_CMODEL_SMALL_SPIC
);
14280 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
14283 case AARCH64_CMODEL_LARGE
:
14284 sorry ("code model %qs with %<-f%s%>", "large",
14285 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
14288 gcc_unreachable ();
14292 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
14295 /* Implement TARGET_OPTION_SAVE. */
14298 aarch64_option_save (struct cl_target_option
*ptr
, struct gcc_options
*opts
)
14300 ptr
->x_aarch64_override_tune_string
= opts
->x_aarch64_override_tune_string
;
14301 ptr
->x_aarch64_branch_protection_string
14302 = opts
->x_aarch64_branch_protection_string
;
14305 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
14306 using the information saved in PTR. */
14309 aarch64_option_restore (struct gcc_options
*opts
, struct cl_target_option
*ptr
)
14311 opts
->x_explicit_tune_core
= ptr
->x_explicit_tune_core
;
14312 selected_tune
= aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
14313 opts
->x_explicit_arch
= ptr
->x_explicit_arch
;
14314 selected_arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
14315 opts
->x_aarch64_override_tune_string
= ptr
->x_aarch64_override_tune_string
;
14316 opts
->x_aarch64_branch_protection_string
14317 = ptr
->x_aarch64_branch_protection_string
;
14318 if (opts
->x_aarch64_branch_protection_string
)
14320 aarch64_parse_branch_protection (opts
->x_aarch64_branch_protection_string
,
14324 aarch64_override_options_internal (opts
);
14327 /* Implement TARGET_OPTION_PRINT. */
14330 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
14332 const struct processor
*cpu
14333 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
14334 uint64_t isa_flags
= ptr
->x_aarch64_isa_flags
;
14335 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
14336 std::string extension
14337 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
14339 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
14340 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
14341 arch
->name
, extension
.c_str ());
14344 static GTY(()) tree aarch64_previous_fndecl
;
14347 aarch64_reset_previous_fndecl (void)
14349 aarch64_previous_fndecl
= NULL
;
14352 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
14353 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
14354 make sure optab availability predicates are recomputed when necessary. */
14357 aarch64_save_restore_target_globals (tree new_tree
)
14359 if (TREE_TARGET_GLOBALS (new_tree
))
14360 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
14361 else if (new_tree
== target_option_default_node
)
14362 restore_target_globals (&default_target_globals
);
14364 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
14367 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
14368 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
14369 of the function, if such exists. This function may be called multiple
14370 times on a single function so use aarch64_previous_fndecl to avoid
14371 setting up identical state. */
14374 aarch64_set_current_function (tree fndecl
)
14376 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
14379 tree old_tree
= (aarch64_previous_fndecl
14380 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
14383 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
14385 /* If current function has no attributes but the previous one did,
14386 use the default node. */
14387 if (!new_tree
&& old_tree
)
14388 new_tree
= target_option_default_node
;
14390 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
14391 the default have been handled by aarch64_save_restore_target_globals from
14392 aarch64_pragma_target_parse. */
14393 if (old_tree
== new_tree
)
14396 aarch64_previous_fndecl
= fndecl
;
14398 /* First set the target options. */
14399 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
14401 aarch64_save_restore_target_globals (new_tree
);
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};
14415 /* All the information needed to handle a target attribute.
14416 NAME is the name of the attribute.
14417 ATTR_TYPE specifies the type of behavior of the attribute as described
14418 in the definition of enum aarch64_attr_opt_type.
14419 ALLOW_NEG is true if the attribute supports a "no-" form.
14420 HANDLER is the function that takes the attribute string as an argument
14421 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
14422 OPT_NUM is the enum specifying the option that the attribute modifies.
14423 This is needed for attributes that mirror the behavior of a command-line
14424 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
14425 aarch64_attr_enum. */
14427 struct aarch64_attribute_info
14430 enum aarch64_attr_opt_type attr_type
;
14432 bool (*handler
) (const char *);
14433 enum opt_code opt_num
;
14436 /* Handle the ARCH_STR argument to the arch= target attribute. */
14439 aarch64_handle_attr_arch (const char *str
)
14441 const struct processor
*tmp_arch
= NULL
;
14442 std::string invalid_extension
;
14443 enum aarch64_parse_opt_result parse_res
14444 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
, &invalid_extension
);
14446 if (parse_res
== AARCH64_PARSE_OK
)
14448 gcc_assert (tmp_arch
);
14449 selected_arch
= tmp_arch
;
14450 explicit_arch
= selected_arch
->arch
;
14456 case AARCH64_PARSE_MISSING_ARG
:
14457 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
14459 case AARCH64_PARSE_INVALID_ARG
:
14460 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str
);
14461 aarch64_print_hint_for_arch (str
);
14463 case AARCH64_PARSE_INVALID_FEATURE
:
14464 error ("invalid feature modifier %s of value (\"%s\") in "
14465 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
14466 aarch64_print_hint_for_extensions (invalid_extension
);
14469 gcc_unreachable ();
14475 /* Handle the argument CPU_STR to the cpu= target attribute. */
14478 aarch64_handle_attr_cpu (const char *str
)
14480 const struct processor
*tmp_cpu
= NULL
;
14481 std::string invalid_extension
;
14482 enum aarch64_parse_opt_result parse_res
14483 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
, &invalid_extension
);
14485 if (parse_res
== AARCH64_PARSE_OK
)
14487 gcc_assert (tmp_cpu
);
14488 selected_tune
= tmp_cpu
;
14489 explicit_tune_core
= selected_tune
->ident
;
14491 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
14492 explicit_arch
= selected_arch
->arch
;
14498 case AARCH64_PARSE_MISSING_ARG
:
14499 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
14501 case AARCH64_PARSE_INVALID_ARG
:
14502 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str
);
14503 aarch64_print_hint_for_core (str
);
14505 case AARCH64_PARSE_INVALID_FEATURE
:
14506 error ("invalid feature modifier %s of value (\"%s\") in "
14507 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
14508 aarch64_print_hint_for_extensions (invalid_extension
);
14511 gcc_unreachable ();
14517 /* Handle the argument STR to the branch-protection= attribute. */
14520 aarch64_handle_attr_branch_protection (const char* str
)
14522 char *err_str
= (char *) xmalloc (strlen (str
) + 1);
14523 enum aarch64_parse_opt_result res
= aarch64_parse_branch_protection (str
,
14525 bool success
= false;
14528 case AARCH64_PARSE_MISSING_ARG
:
14529 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
14532 case AARCH64_PARSE_INVALID_ARG
:
14533 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
14534 "=\")%> pragma or attribute", err_str
);
14536 case AARCH64_PARSE_OK
:
14538 /* Fall through. */
14539 case AARCH64_PARSE_INVALID_FEATURE
:
14542 gcc_unreachable ();
14548 /* Handle the argument STR to the tune= target attribute. */
14551 aarch64_handle_attr_tune (const char *str
)
14553 const struct processor
*tmp_tune
= NULL
;
14554 enum aarch64_parse_opt_result parse_res
14555 = aarch64_parse_tune (str
, &tmp_tune
);
14557 if (parse_res
== AARCH64_PARSE_OK
)
14559 gcc_assert (tmp_tune
);
14560 selected_tune
= tmp_tune
;
14561 explicit_tune_core
= selected_tune
->ident
;
14567 case AARCH64_PARSE_INVALID_ARG
:
14568 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str
);
14569 aarch64_print_hint_for_core (str
);
14572 gcc_unreachable ();
14578 /* Parse an architecture extensions target attribute string specified in STR.
14579 For example "+fp+nosimd". Show any errors if needed. Return TRUE
14580 if successful. Update aarch64_isa_flags to reflect the ISA features
14584 aarch64_handle_attr_isa_flags (char *str
)
14586 enum aarch64_parse_opt_result parse_res
;
14587 uint64_t isa_flags
= aarch64_isa_flags
;
14589 /* We allow "+nothing" in the beginning to clear out all architectural
14590 features if the user wants to handpick specific features. */
14591 if (strncmp ("+nothing", str
, 8) == 0)
14597 std::string invalid_extension
;
14598 parse_res
= aarch64_parse_extension (str
, &isa_flags
, &invalid_extension
);
14600 if (parse_res
== AARCH64_PARSE_OK
)
14602 aarch64_isa_flags
= isa_flags
;
14608 case AARCH64_PARSE_MISSING_ARG
:
14609 error ("missing value in %<target()%> pragma or attribute");
14612 case AARCH64_PARSE_INVALID_FEATURE
:
14613 error ("invalid feature modifier %s of value (\"%s\") in "
14614 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
14618 gcc_unreachable ();
14624 /* The target attributes that we support. On top of these we also support just
14625 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
14626 handled explicitly in aarch64_process_one_target_attr. */
14628 static const struct aarch64_attribute_info aarch64_attributes
[] =
14630 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
14631 OPT_mgeneral_regs_only
},
14632 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
14633 OPT_mfix_cortex_a53_835769
},
14634 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
14635 OPT_mfix_cortex_a53_843419
},
14636 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
14637 { "strict-align", aarch64_attr_mask
, true, NULL
, OPT_mstrict_align
},
14638 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
14639 OPT_momit_leaf_frame_pointer
},
14640 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
14641 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
14643 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
14644 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
14646 { "branch-protection", aarch64_attr_custom
, false,
14647 aarch64_handle_attr_branch_protection
, OPT_mbranch_protection_
},
14648 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
14649 OPT_msign_return_address_
},
14650 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
14653 /* Parse ARG_STR which contains the definition of one target attribute.
14654 Show appropriate errors if any or return true if the attribute is valid. */
14657 aarch64_process_one_target_attr (char *arg_str
)
14659 bool invert
= false;
14661 size_t len
= strlen (arg_str
);
14665 error ("malformed %<target()%> pragma or attribute");
14669 char *str_to_check
= (char *) alloca (len
+ 1);
14670 strcpy (str_to_check
, arg_str
);
14672 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
14673 It is easier to detect and handle it explicitly here rather than going
14674 through the machinery for the rest of the target attributes in this
14676 if (*str_to_check
== '+')
14677 return aarch64_handle_attr_isa_flags (str_to_check
);
14679 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
14684 char *arg
= strchr (str_to_check
, '=');
14686 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
14687 and point ARG to "foo". */
14693 const struct aarch64_attribute_info
*p_attr
;
14694 bool found
= false;
14695 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
14697 /* If the names don't match up, or the user has given an argument
14698 to an attribute that doesn't accept one, or didn't give an argument
14699 to an attribute that expects one, fail to match. */
14700 if (strcmp (str_to_check
, p_attr
->name
) != 0)
14704 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
14705 || p_attr
->attr_type
== aarch64_attr_enum
;
14707 if (attr_need_arg_p
^ (arg
!= NULL
))
14709 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
14713 /* If the name matches but the attribute does not allow "no-" versions
14714 then we can't match. */
14715 if (invert
&& !p_attr
->allow_neg
)
14717 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
14721 switch (p_attr
->attr_type
)
14723 /* Has a custom handler registered.
14724 For example, cpu=, arch=, tune=. */
14725 case aarch64_attr_custom
:
14726 gcc_assert (p_attr
->handler
);
14727 if (!p_attr
->handler (arg
))
14731 /* Either set or unset a boolean option. */
14732 case aarch64_attr_bool
:
14734 struct cl_decoded_option decoded
;
14736 generate_option (p_attr
->opt_num
, NULL
, !invert
,
14737 CL_TARGET
, &decoded
);
14738 aarch64_handle_option (&global_options
, &global_options_set
,
14739 &decoded
, input_location
);
14742 /* Set or unset a bit in the target_flags. aarch64_handle_option
14743 should know what mask to apply given the option number. */
14744 case aarch64_attr_mask
:
14746 struct cl_decoded_option decoded
;
14747 /* We only need to specify the option number.
14748 aarch64_handle_option will know which mask to apply. */
14749 decoded
.opt_index
= p_attr
->opt_num
;
14750 decoded
.value
= !invert
;
14751 aarch64_handle_option (&global_options
, &global_options_set
,
14752 &decoded
, input_location
);
14755 /* Use the option setting machinery to set an option to an enum. */
14756 case aarch64_attr_enum
:
14761 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
14762 &value
, CL_TARGET
);
14765 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
14766 NULL
, DK_UNSPECIFIED
, input_location
,
14771 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
14776 gcc_unreachable ();
14780 /* If we reached here we either have found an attribute and validated
14781 it or didn't match any. If we matched an attribute but its arguments
14782 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
14804 /* Parse the tree in ARGS that contains the target attribute information
14805 and update the global target options space. */
14808 aarch64_process_target_attr (tree args
)
14810 if (TREE_CODE (args
) == TREE_LIST
)
14814 tree head
= TREE_VALUE (args
);
14817 if (!aarch64_process_target_attr (head
))
14820 args
= TREE_CHAIN (args
);
14826 if (TREE_CODE (args
) != STRING_CST
)
14828 error ("attribute %<target%> argument not a string");
14832 size_t len
= strlen (TREE_STRING_POINTER (args
));
14833 char *str_to_check
= (char *) alloca (len
+ 1);
14834 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
14838 error ("malformed %<target()%> pragma or attribute");
14842 /* Used to catch empty spaces between commas i.e.
14843 attribute ((target ("attr1,,attr2"))). */
14844 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
14846 /* Handle multiple target attributes separated by ','. */
14847 char *token
= strtok_r (str_to_check
, ",", &str_to_check
);
14849 unsigned int num_attrs
= 0;
14853 if (!aarch64_process_one_target_attr (token
))
14855 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
14859 token
= strtok_r (NULL
, ",", &str_to_check
);
14862 if (num_attrs
!= num_commas
+ 1)
14864 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
14871 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
14872 process attribute ((target ("..."))). */
14875 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
14877 struct cl_target_option cur_target
;
14880 tree new_target
, new_optimize
;
14881 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
14883 /* If what we're processing is the current pragma string then the
14884 target option node is already stored in target_option_current_node
14885 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
14886 having to re-parse the string. This is especially useful to keep
14887 arm_neon.h compile times down since that header contains a lot
14888 of intrinsics enclosed in pragmas. */
14889 if (!existing_target
&& args
== current_target_pragma
)
14891 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
14894 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
14896 old_optimize
= build_optimization_node (&global_options
);
14897 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
14899 /* If the function changed the optimization levels as well as setting
14900 target options, start with the optimizations specified. */
14901 if (func_optimize
&& func_optimize
!= old_optimize
)
14902 cl_optimization_restore (&global_options
,
14903 TREE_OPTIMIZATION (func_optimize
));
14905 /* Save the current target options to restore at the end. */
14906 cl_target_option_save (&cur_target
, &global_options
);
14908 /* If fndecl already has some target attributes applied to it, unpack
14909 them so that we add this attribute on top of them, rather than
14910 overwriting them. */
14911 if (existing_target
)
14913 struct cl_target_option
*existing_options
14914 = TREE_TARGET_OPTION (existing_target
);
14916 if (existing_options
)
14917 cl_target_option_restore (&global_options
, existing_options
);
14920 cl_target_option_restore (&global_options
,
14921 TREE_TARGET_OPTION (target_option_current_node
));
14923 ret
= aarch64_process_target_attr (args
);
14925 /* Set up any additional state. */
14928 aarch64_override_options_internal (&global_options
);
14929 /* Initialize SIMD builtins if we haven't already.
14930 Set current_target_pragma to NULL for the duration so that
14931 the builtin initialization code doesn't try to tag the functions
14932 being built with the attributes specified by any current pragma, thus
14933 going into an infinite recursion. */
14936 tree saved_current_target_pragma
= current_target_pragma
;
14937 current_target_pragma
= NULL
;
14938 aarch64_init_simd_builtins ();
14939 current_target_pragma
= saved_current_target_pragma
;
14941 new_target
= build_target_option_node (&global_options
);
14946 new_optimize
= build_optimization_node (&global_options
);
14950 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
14952 if (old_optimize
!= new_optimize
)
14953 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
14956 cl_target_option_restore (&global_options
, &cur_target
);
14958 if (old_optimize
!= new_optimize
)
14959 cl_optimization_restore (&global_options
,
14960 TREE_OPTIMIZATION (old_optimize
));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
14985 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
14986 to inline CALLEE into CALLER based on target-specific info.
14987 Make sure that the caller and callee have compatible architectural
14988 features. Then go through the other possible target attributes
14989 and see if they can block inlining. Try not to reject always_inline
14990 callees unless they are incompatible architecturally. */
14993 aarch64_can_inline_p (tree caller
, tree callee
)
14995 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
14996 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
14998 struct cl_target_option
*caller_opts
14999 = TREE_TARGET_OPTION (caller_tree
? caller_tree
15000 : target_option_default_node
);
15002 struct cl_target_option
*callee_opts
15003 = TREE_TARGET_OPTION (callee_tree
? callee_tree
15004 : target_option_default_node
);
15006 /* Callee's ISA flags should be a subset of the caller's. */
15007 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
15008 != callee_opts
->x_aarch64_isa_flags
)
15011 /* Allow non-strict aligned functions inlining into strict
15013 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
15014 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
15015 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
15016 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
15019 bool always_inline
= lookup_attribute ("always_inline",
15020 DECL_ATTRIBUTES (callee
));
15022 /* If the architectural features match up and the callee is always_inline
15023 then the other attributes don't matter. */
15027 if (caller_opts
->x_aarch64_cmodel_var
15028 != callee_opts
->x_aarch64_cmodel_var
)
15031 if (caller_opts
->x_aarch64_tls_dialect
15032 != callee_opts
->x_aarch64_tls_dialect
)
15035 /* Honour explicit requests to workaround errata. */
15036 if (!aarch64_tribools_ok_for_inlining_p (
15037 caller_opts
->x_aarch64_fix_a53_err835769
,
15038 callee_opts
->x_aarch64_fix_a53_err835769
,
15039 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
15042 if (!aarch64_tribools_ok_for_inlining_p (
15043 caller_opts
->x_aarch64_fix_a53_err843419
,
15044 callee_opts
->x_aarch64_fix_a53_err843419
,
15045 2, TARGET_FIX_ERR_A53_843419
))
15048 /* If the user explicitly specified -momit-leaf-frame-pointer for the
15049 caller and calle and they don't match up, reject inlining. */
15050 if (!aarch64_tribools_ok_for_inlining_p (
15051 caller_opts
->x_flag_omit_leaf_frame_pointer
,
15052 callee_opts
->x_flag_omit_leaf_frame_pointer
,
15056 /* If the callee has specific tuning overrides, respect them. */
15057 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
15058 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
15061 /* If the user specified tuning override strings for the
15062 caller and callee and they don't match up, reject inlining.
15063 We just do a string compare here, we don't analyze the meaning
15064 of the string, as it would be too costly for little gain. */
15065 if (callee_opts
->x_aarch64_override_tune_string
15066 && caller_opts
->x_aarch64_override_tune_string
15067 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
15068 caller_opts
->x_aarch64_override_tune_string
) != 0))
15074 /* Return the ID of the TLDESC ABI, initializing the descriptor if hasn't
15078 aarch64_tlsdesc_abi_id ()
15080 predefined_function_abi
&tlsdesc_abi
= function_abis
[ARM_PCS_TLSDESC
];
15081 if (!tlsdesc_abi
.initialized_p ())
15083 HARD_REG_SET full_reg_clobbers
;
15084 CLEAR_HARD_REG_SET (full_reg_clobbers
);
15085 SET_HARD_REG_BIT (full_reg_clobbers
, R0_REGNUM
);
15086 SET_HARD_REG_BIT (full_reg_clobbers
, CC_REGNUM
);
15087 for (int regno
= P0_REGNUM
; regno
<= P15_REGNUM
; ++regno
)
15088 SET_HARD_REG_BIT (full_reg_clobbers
, regno
);
15089 tlsdesc_abi
.initialize (ARM_PCS_TLSDESC
, full_reg_clobbers
);
15091 return tlsdesc_abi
.id ();
15094 /* Return true if SYMBOL_REF X binds locally. */
15097 aarch64_symbol_binds_local_p (const_rtx x
)
15099 return (SYMBOL_REF_DECL (x
)
15100 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
15101 : SYMBOL_REF_LOCAL_P (x
));
15104 /* Return true if SYMBOL_REF X is thread local */
15106 aarch64_tls_symbol_p (rtx x
)
15108 if (! TARGET_HAVE_TLS
)
15111 if (GET_CODE (x
) != SYMBOL_REF
)
15114 return SYMBOL_REF_TLS_MODEL (x
) != 0;
15117 /* Classify a TLS symbol into one of the TLS kinds. */
15118 enum aarch64_symbol_type
15119 aarch64_classify_tls_symbol (rtx x
)
15121 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
15125 case TLS_MODEL_GLOBAL_DYNAMIC
:
15126 case TLS_MODEL_LOCAL_DYNAMIC
:
15127 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
15129 case TLS_MODEL_INITIAL_EXEC
:
15130 switch (aarch64_cmodel
)
15132 case AARCH64_CMODEL_TINY
:
15133 case AARCH64_CMODEL_TINY_PIC
:
15134 return SYMBOL_TINY_TLSIE
;
15136 return SYMBOL_SMALL_TLSIE
;
15139 case TLS_MODEL_LOCAL_EXEC
:
15140 if (aarch64_tls_size
== 12)
15141 return SYMBOL_TLSLE12
;
15142 else if (aarch64_tls_size
== 24)
15143 return SYMBOL_TLSLE24
;
15144 else if (aarch64_tls_size
== 32)
15145 return SYMBOL_TLSLE32
;
15146 else if (aarch64_tls_size
== 48)
15147 return SYMBOL_TLSLE48
;
15149 gcc_unreachable ();
15151 case TLS_MODEL_EMULATED
:
15152 case TLS_MODEL_NONE
:
15153 return SYMBOL_FORCE_TO_MEM
;
15156 gcc_unreachable ();
15160 /* Return the correct method for accessing X + OFFSET, where X is either
15161 a SYMBOL_REF or LABEL_REF. */
15163 enum aarch64_symbol_type
15164 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
15166 if (GET_CODE (x
) == LABEL_REF
)
15168 switch (aarch64_cmodel
)
15170 case AARCH64_CMODEL_LARGE
:
15171 return SYMBOL_FORCE_TO_MEM
;
15173 case AARCH64_CMODEL_TINY_PIC
:
15174 case AARCH64_CMODEL_TINY
:
15175 return SYMBOL_TINY_ABSOLUTE
;
15177 case AARCH64_CMODEL_SMALL_SPIC
:
15178 case AARCH64_CMODEL_SMALL_PIC
:
15179 case AARCH64_CMODEL_SMALL
:
15180 return SYMBOL_SMALL_ABSOLUTE
;
15183 gcc_unreachable ();
15187 if (GET_CODE (x
) == SYMBOL_REF
)
15189 if (aarch64_tls_symbol_p (x
))
15190 return aarch64_classify_tls_symbol (x
);
15192 switch (aarch64_cmodel
)
15194 case AARCH64_CMODEL_TINY
:
15195 /* When we retrieve symbol + offset address, we have to make sure
15196 the offset does not cause overflow of the final address. But
15197 we have no way of knowing the address of symbol at compile time
15198 so we can't accurately say if the distance between the PC and
15199 symbol + offset is outside the addressible range of +/-1MB in the
15200 TINY code model. So we limit the maximum offset to +/-64KB and
15201 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
15202 If offset_within_block_p is true we allow larger offsets.
15203 Furthermore force to memory if the symbol is a weak reference to
15204 something that doesn't resolve to a symbol in this module. */
15206 if (SYMBOL_REF_WEAK (x
) && !aarch64_symbol_binds_local_p (x
))
15207 return SYMBOL_FORCE_TO_MEM
;
15208 if (!(IN_RANGE (offset
, -0x10000, 0x10000)
15209 || offset_within_block_p (x
, offset
)))
15210 return SYMBOL_FORCE_TO_MEM
;
15212 return SYMBOL_TINY_ABSOLUTE
;
15214 case AARCH64_CMODEL_SMALL
:
15215 /* Same reasoning as the tiny code model, but the offset cap here is
15216 1MB, allowing +/-3.9GB for the offset to the symbol. */
15218 if (SYMBOL_REF_WEAK (x
) && !aarch64_symbol_binds_local_p (x
))
15219 return SYMBOL_FORCE_TO_MEM
;
15220 if (!(IN_RANGE (offset
, -0x100000, 0x100000)
15221 || offset_within_block_p (x
, offset
)))
15222 return SYMBOL_FORCE_TO_MEM
;
15224 return SYMBOL_SMALL_ABSOLUTE
;
15226 case AARCH64_CMODEL_TINY_PIC
:
15227 if (!aarch64_symbol_binds_local_p (x
))
15228 return SYMBOL_TINY_GOT
;
15229 return SYMBOL_TINY_ABSOLUTE
;
15231 case AARCH64_CMODEL_SMALL_SPIC
:
15232 case AARCH64_CMODEL_SMALL_PIC
:
15233 if (!aarch64_symbol_binds_local_p (x
))
15234 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
15235 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
15236 return SYMBOL_SMALL_ABSOLUTE
;
15238 case AARCH64_CMODEL_LARGE
:
15239 /* This is alright even in PIC code as the constant
15240 pool reference is always PC relative and within
15241 the same translation unit. */
15242 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
15243 return SYMBOL_SMALL_ABSOLUTE
;
15245 return SYMBOL_FORCE_TO_MEM
;
15248 gcc_unreachable ();
15252 /* By default push everything into the constant pool. */
15253 return SYMBOL_FORCE_TO_MEM
;
15257 aarch64_constant_address_p (rtx x
)
15259 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
15263 aarch64_legitimate_pic_operand_p (rtx x
)
15265 if (GET_CODE (x
) == SYMBOL_REF
15266 || (GET_CODE (x
) == CONST
15267 && GET_CODE (XEXP (x
, 0)) == PLUS
15268 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
15274 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
15275 that should be rematerialized rather than spilled. */
15278 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
15280 /* Support CSE and rematerialization of common constants. */
15281 if (CONST_INT_P (x
)
15282 || (CONST_DOUBLE_P (x
) && GET_MODE_CLASS (mode
) == MODE_FLOAT
)
15283 || GET_CODE (x
) == CONST_VECTOR
)
15286 /* Do not allow vector struct mode constants for Advanced SIMD.
15287 We could support 0 and -1 easily, but they need support in
15288 aarch64-simd.md. */
15289 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
15290 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
15293 /* Only accept variable-length vector constants if they can be
15296 ??? It would be possible to handle rematerialization of other
15297 constants via secondary reloads. */
15298 if (vec_flags
& VEC_ANY_SVE
)
15299 return aarch64_simd_valid_immediate (x
, NULL
);
15301 if (GET_CODE (x
) == HIGH
)
15304 /* Accept polynomial constants that can be calculated by using the
15305 destination of a move as the sole temporary. Constants that
15306 require a second temporary cannot be rematerialized (they can't be
15307 forced to memory and also aren't legitimate constants). */
15309 if (poly_int_rtx_p (x
, &offset
))
15310 return aarch64_offset_temporaries (false, offset
) <= 1;
15312 /* If an offset is being added to something else, we need to allow the
15313 base to be moved into the destination register, meaning that there
15314 are no free temporaries for the offset. */
15315 x
= strip_offset (x
, &offset
);
15316 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
15319 /* Do not allow const (plus (anchor_symbol, const_int)). */
15320 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
15323 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
15324 so spilling them is better than rematerialization. */
15325 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
15328 /* Label references are always constant. */
15329 if (GET_CODE (x
) == LABEL_REF
)
15336 aarch64_load_tp (rtx target
)
15339 || GET_MODE (target
) != Pmode
15340 || !register_operand (target
, Pmode
))
15341 target
= gen_reg_rtx (Pmode
);
15343 /* Can return in any reg. */
15344 emit_insn (gen_aarch64_load_tp_hard (target
));
15348 /* On AAPCS systems, this is the "struct __va_list". */
15349 static GTY(()) tree va_list_type
;
15351 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
15352 Return the type to use as __builtin_va_list.
15354 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
15366 aarch64_build_builtin_va_list (void)
15369 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
15371 /* Create the type. */
15372 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
15373 /* Give it the required name. */
15374 va_list_name
= build_decl (BUILTINS_LOCATION
,
15376 get_identifier ("__va_list"),
15378 DECL_ARTIFICIAL (va_list_name
) = 1;
15379 TYPE_NAME (va_list_type
) = va_list_name
;
15380 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
15382 /* Create the fields. */
15383 f_stack
= build_decl (BUILTINS_LOCATION
,
15384 FIELD_DECL
, get_identifier ("__stack"),
15386 f_grtop
= build_decl (BUILTINS_LOCATION
,
15387 FIELD_DECL
, get_identifier ("__gr_top"),
15389 f_vrtop
= build_decl (BUILTINS_LOCATION
,
15390 FIELD_DECL
, get_identifier ("__vr_top"),
15392 f_groff
= build_decl (BUILTINS_LOCATION
,
15393 FIELD_DECL
, get_identifier ("__gr_offs"),
15394 integer_type_node
);
15395 f_vroff
= build_decl (BUILTINS_LOCATION
,
15396 FIELD_DECL
, get_identifier ("__vr_offs"),
15397 integer_type_node
);
15399 /* Tell tree-stdarg pass about our internal offset fields.
15400 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
15401 purpose to identify whether the code is updating va_list internal
15402 offset fields through irregular way. */
15403 va_list_gpr_counter_field
= f_groff
;
15404 va_list_fpr_counter_field
= f_vroff
;
15406 DECL_ARTIFICIAL (f_stack
) = 1;
15407 DECL_ARTIFICIAL (f_grtop
) = 1;
15408 DECL_ARTIFICIAL (f_vrtop
) = 1;
15409 DECL_ARTIFICIAL (f_groff
) = 1;
15410 DECL_ARTIFICIAL (f_vroff
) = 1;
15412 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
15413 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
15414 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
15415 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
15416 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
15418 TYPE_FIELDS (va_list_type
) = f_stack
;
15419 DECL_CHAIN (f_stack
) = f_grtop
;
15420 DECL_CHAIN (f_grtop
) = f_vrtop
;
15421 DECL_CHAIN (f_vrtop
) = f_groff
;
15422 DECL_CHAIN (f_groff
) = f_vroff
;
15424 /* Compute its layout. */
15425 layout_type (va_list_type
);
15427 return va_list_type
;
15430 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
15432 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
15434 const CUMULATIVE_ARGS
*cum
;
15435 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
15436 tree stack
, grtop
, vrtop
, groff
, vroff
;
15438 int gr_save_area_size
= cfun
->va_list_gpr_size
;
15439 int vr_save_area_size
= cfun
->va_list_fpr_size
;
15442 cum
= &crtl
->args
.info
;
15443 if (cfun
->va_list_gpr_size
)
15444 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
15445 cfun
->va_list_gpr_size
);
15446 if (cfun
->va_list_fpr_size
)
15447 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
15448 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
15452 gcc_assert (cum
->aapcs_nvrn
== 0);
15453 vr_save_area_size
= 0;
15456 f_stack
= TYPE_FIELDS (va_list_type_node
);
15457 f_grtop
= DECL_CHAIN (f_stack
);
15458 f_vrtop
= DECL_CHAIN (f_grtop
);
15459 f_groff
= DECL_CHAIN (f_vrtop
);
15460 f_vroff
= DECL_CHAIN (f_groff
);
15462 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
15464 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
15466 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
15468 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
15470 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
15473 /* Emit code to initialize STACK, which points to the next varargs stack
15474 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
15475 by named arguments. STACK is 8-byte aligned. */
15476 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
15477 if (cum
->aapcs_stack_size
> 0)
15478 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
15479 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
15480 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15482 /* Emit code to initialize GRTOP, the top of the GR save area.
15483 virtual_incoming_args_rtx should have been 16 byte aligned. */
15484 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
15485 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
15486 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15488 /* Emit code to initialize VRTOP, the top of the VR save area.
15489 This address is gr_save_area_bytes below GRTOP, rounded
15490 down to the next 16-byte boundary. */
15491 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
15492 vr_offset
= ROUND_UP (gr_save_area_size
,
15493 STACK_BOUNDARY
/ BITS_PER_UNIT
);
15496 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
15497 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
15498 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15500 /* Emit code to initialize GROFF, the offset from GRTOP of the
15501 next GPR argument. */
15502 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
15503 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
15504 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15506 /* Likewise emit code to initialize VROFF, the offset from FTOP
15507 of the next VR argument. */
15508 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
15509 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
15510 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15513 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
15516 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
15517 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
15521 bool is_ha
; /* is HFA or HVA. */
15522 bool dw_align
; /* double-word align. */
15523 machine_mode ag_mode
= VOIDmode
;
15527 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
15528 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
15529 HOST_WIDE_INT size
, rsize
, adjust
, align
;
15530 tree t
, u
, cond1
, cond2
;
15532 indirect_p
= pass_va_arg_by_reference (type
);
15534 type
= build_pointer_type (type
);
15536 mode
= TYPE_MODE (type
);
15538 f_stack
= TYPE_FIELDS (va_list_type_node
);
15539 f_grtop
= DECL_CHAIN (f_stack
);
15540 f_vrtop
= DECL_CHAIN (f_grtop
);
15541 f_groff
= DECL_CHAIN (f_vrtop
);
15542 f_vroff
= DECL_CHAIN (f_groff
);
15544 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
15545 f_stack
, NULL_TREE
);
15546 size
= int_size_in_bytes (type
);
15550 = aarch64_function_arg_alignment (mode
, type
, &abi_break
) / BITS_PER_UNIT
;
15554 if (aarch64_vfp_is_call_or_return_candidate (mode
,
15560 /* No frontends can create types with variable-sized modes, so we
15561 shouldn't be asked to pass or return them. */
15562 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
15564 /* TYPE passed in fp/simd registers. */
15566 aarch64_err_no_fpadvsimd (mode
);
15568 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
15569 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
15570 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
15571 unshare_expr (valist
), f_vroff
, NULL_TREE
);
15573 rsize
= nregs
* UNITS_PER_VREG
;
15577 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
15578 adjust
= UNITS_PER_VREG
- ag_size
;
15580 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
15581 && size
< UNITS_PER_VREG
)
15583 adjust
= UNITS_PER_VREG
- size
;
15588 /* TYPE passed in general registers. */
15589 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
15590 unshare_expr (valist
), f_grtop
, NULL_TREE
);
15591 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
15592 unshare_expr (valist
), f_groff
, NULL_TREE
);
15593 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
15594 nregs
= rsize
/ UNITS_PER_WORD
;
15598 if (abi_break
&& warn_psabi
)
15599 inform (input_location
, "parameter passing for argument of type "
15600 "%qT changed in GCC 9.1", type
);
15604 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
15605 && size
< UNITS_PER_WORD
)
15607 adjust
= UNITS_PER_WORD
- size
;
15611 /* Get a local temporary for the field value. */
15612 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
15614 /* Emit code to branch if off >= 0. */
15615 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
15616 build_int_cst (TREE_TYPE (off
), 0));
15617 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
15621 /* Emit: offs = (offs + 15) & -16. */
15622 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
15623 build_int_cst (TREE_TYPE (off
), 15));
15624 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
15625 build_int_cst (TREE_TYPE (off
), -16));
15626 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
15631 /* Update ap.__[g|v]r_offs */
15632 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
15633 build_int_cst (TREE_TYPE (off
), rsize
));
15634 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
15638 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
15640 /* [cond2] if (ap.__[g|v]r_offs > 0) */
15641 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
15642 build_int_cst (TREE_TYPE (f_off
), 0));
15643 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
15645 /* String up: make sure the assignment happens before the use. */
15646 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
15647 COND_EXPR_ELSE (cond1
) = t
;
15649 /* Prepare the trees handling the argument that is passed on the stack;
15650 the top level node will store in ON_STACK. */
15651 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
15654 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
15655 t
= fold_build_pointer_plus_hwi (arg
, 15);
15656 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
15657 build_int_cst (TREE_TYPE (t
), -16));
15658 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
15662 /* Advance ap.__stack */
15663 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
15664 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
15665 build_int_cst (TREE_TYPE (t
), -8));
15666 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
15667 /* String up roundup and advance. */
15669 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
15670 /* String up with arg */
15671 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
15672 /* Big-endianness related address adjustment. */
15673 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
15674 && size
< UNITS_PER_WORD
)
15676 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
15677 size_int (UNITS_PER_WORD
- size
));
15678 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
15681 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
15682 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
15684 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
15687 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
15688 build_int_cst (TREE_TYPE (off
), adjust
));
15690 t
= fold_convert (sizetype
, t
);
15691 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
15695 /* type ha; // treat as "struct {ftype field[n];}"
15696 ... [computing offs]
15697 for (i = 0; i <nregs; ++i, offs += 16)
15698 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
15701 tree tmp_ha
, field_t
, field_ptr_t
;
15703 /* Declare a local variable. */
15704 tmp_ha
= create_tmp_var_raw (type
, "ha");
15705 gimple_add_tmp_var (tmp_ha
);
15707 /* Establish the base type. */
15711 field_t
= float_type_node
;
15712 field_ptr_t
= float_ptr_type_node
;
15715 field_t
= double_type_node
;
15716 field_ptr_t
= double_ptr_type_node
;
15719 field_t
= long_double_type_node
;
15720 field_ptr_t
= long_double_ptr_type_node
;
15723 field_t
= aarch64_fp16_type_node
;
15724 field_ptr_t
= aarch64_fp16_ptr_type_node
;
15727 field_t
= aarch64_bf16_type_node
;
15728 field_ptr_t
= aarch64_bf16_ptr_type_node
;
15733 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
15734 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
15735 field_ptr_t
= build_pointer_type (field_t
);
15742 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
15743 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
15745 t
= fold_convert (field_ptr_t
, addr
);
15746 t
= build2 (MODIFY_EXPR
, field_t
,
15747 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
15748 build1 (INDIRECT_REF
, field_t
, t
));
15750 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
15751 for (i
= 1; i
< nregs
; ++i
)
15753 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
15754 u
= fold_convert (field_ptr_t
, addr
);
15755 u
= build2 (MODIFY_EXPR
, field_t
,
15756 build2 (MEM_REF
, field_t
, tmp_ha
,
15757 build_int_cst (field_ptr_t
,
15759 int_size_in_bytes (field_t
)))),
15760 build1 (INDIRECT_REF
, field_t
, u
));
15761 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
15764 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
15765 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
15768 COND_EXPR_ELSE (cond2
) = t
;
15769 addr
= fold_convert (build_pointer_type (type
), cond1
);
15770 addr
= build_va_arg_indirect_ref (addr
);
15773 addr
= build_va_arg_indirect_ref (addr
);
15778 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
15781 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
,
15782 const function_arg_info
&arg
,
15783 int *pretend_size ATTRIBUTE_UNUSED
, int no_rtl
)
15785 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
15786 CUMULATIVE_ARGS local_cum
;
15787 int gr_saved
= cfun
->va_list_gpr_size
;
15788 int vr_saved
= cfun
->va_list_fpr_size
;
15790 /* The caller has advanced CUM up to, but not beyond, the last named
15791 argument. Advance a local copy of CUM past the last "real" named
15792 argument, to find out how many registers are left over. */
15794 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), arg
);
15796 /* Found out how many registers we need to save.
15797 Honor tree-stdvar analysis results. */
15798 if (cfun
->va_list_gpr_size
)
15799 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
15800 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
15801 if (cfun
->va_list_fpr_size
)
15802 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
15803 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
15807 gcc_assert (local_cum
.aapcs_nvrn
== 0);
15817 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
15818 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
15819 - gr_saved
* UNITS_PER_WORD
);
15820 mem
= gen_frame_mem (BLKmode
, ptr
);
15821 set_mem_alias_set (mem
, get_varargs_alias_set ());
15823 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
15828 /* We can't use move_block_from_reg, because it will use
15829 the wrong mode, storing D regs only. */
15830 machine_mode mode
= TImode
;
15831 int off
, i
, vr_start
;
15833 /* Set OFF to the offset from virtual_incoming_args_rtx of
15834 the first vector register. The VR save area lies below
15835 the GR one, and is aligned to 16 bytes. */
15836 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
15837 STACK_BOUNDARY
/ BITS_PER_UNIT
);
15838 off
-= vr_saved
* UNITS_PER_VREG
;
15840 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
15841 for (i
= 0; i
< vr_saved
; ++i
)
15845 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
15846 mem
= gen_frame_mem (mode
, ptr
);
15847 set_mem_alias_set (mem
, get_varargs_alias_set ());
15848 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
15849 off
+= UNITS_PER_VREG
;
15854 /* We don't save the size into *PRETEND_SIZE because we want to avoid
15855 any complication of having crtl->args.pretend_args_size changed. */
15856 cfun
->machine
->frame
.saved_varargs_size
15857 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
15858 STACK_BOUNDARY
/ BITS_PER_UNIT
)
15859 + vr_saved
* UNITS_PER_VREG
);
15863 aarch64_conditional_register_usage (void)
15868 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
15871 call_used_regs
[i
] = 1;
15875 for (i
= P0_REGNUM
; i
<= P15_REGNUM
; i
++)
15878 call_used_regs
[i
] = 1;
15881 /* Only allow the FFR and FFRT to be accessed via special patterns. */
15882 CLEAR_HARD_REG_BIT (operand_reg_set
, FFR_REGNUM
);
15883 CLEAR_HARD_REG_BIT (operand_reg_set
, FFRT_REGNUM
);
15885 /* When tracking speculation, we need a couple of call-clobbered registers
15886 to track the speculation state. It would be nice to just use
15887 IP0 and IP1, but currently there are numerous places that just
15888 assume these registers are free for other uses (eg pointer
15889 authentication). */
15890 if (aarch64_track_speculation
)
15892 fixed_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
15893 call_used_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
15894 fixed_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
15895 call_used_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
15899 /* Walk down the type tree of TYPE counting consecutive base elements.
15900 If *MODEP is VOIDmode, then set it to the first valid floating point
15901 type. If a non-floating point type is found, or if a floating point
15902 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
15903 otherwise return the count in the sub-tree. */
15905 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
15908 HOST_WIDE_INT size
;
15910 /* SVE types (and types containing SVE types) must be handled
15911 before calling this function. */
15912 gcc_assert (!aarch64_sve::builtin_type_p (type
));
15914 switch (TREE_CODE (type
))
15917 mode
= TYPE_MODE (type
);
15918 if (mode
!= DFmode
&& mode
!= SFmode
15919 && mode
!= TFmode
&& mode
!= HFmode
)
15922 if (*modep
== VOIDmode
)
15925 if (*modep
== mode
)
15931 mode
= TYPE_MODE (TREE_TYPE (type
));
15932 if (mode
!= DFmode
&& mode
!= SFmode
15933 && mode
!= TFmode
&& mode
!= HFmode
)
15936 if (*modep
== VOIDmode
)
15939 if (*modep
== mode
)
15945 /* Use V2SImode and V4SImode as representatives of all 64-bit
15946 and 128-bit vector types. */
15947 size
= int_size_in_bytes (type
);
15960 if (*modep
== VOIDmode
)
15963 /* Vector modes are considered to be opaque: two vectors are
15964 equivalent for the purposes of being homogeneous aggregates
15965 if they are the same size. */
15966 if (*modep
== mode
)
15974 tree index
= TYPE_DOMAIN (type
);
15976 /* Can't handle incomplete types nor sizes that are not
15978 if (!COMPLETE_TYPE_P (type
)
15979 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
15982 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
15985 || !TYPE_MAX_VALUE (index
)
15986 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
15987 || !TYPE_MIN_VALUE (index
)
15988 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
15992 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
15993 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
15995 /* There must be no padding. */
15996 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
15997 count
* GET_MODE_BITSIZE (*modep
)))
16009 /* Can't handle incomplete types nor sizes that are not
16011 if (!COMPLETE_TYPE_P (type
)
16012 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
16015 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
16017 if (TREE_CODE (field
) != FIELD_DECL
)
16020 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
16023 count
+= sub_count
;
16026 /* There must be no padding. */
16027 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
16028 count
* GET_MODE_BITSIZE (*modep
)))
16035 case QUAL_UNION_TYPE
:
16037 /* These aren't very interesting except in a degenerate case. */
16042 /* Can't handle incomplete types nor sizes that are not
16044 if (!COMPLETE_TYPE_P (type
)
16045 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
16048 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
16050 if (TREE_CODE (field
) != FIELD_DECL
)
16053 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
16056 count
= count
> sub_count
? count
: sub_count
;
16059 /* There must be no padding. */
16060 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
16061 count
* GET_MODE_BITSIZE (*modep
)))
16074 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
16075 type as described in AAPCS64 \S 4.1.2.
16077 See the comment above aarch64_composite_type_p for the notes on MODE. */
16080 aarch64_short_vector_p (const_tree type
,
16083 poly_int64 size
= -1;
16085 if (type
&& aarch64_sve::builtin_type_p (type
))
16088 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
16089 size
= int_size_in_bytes (type
);
16090 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
16091 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
16092 size
= GET_MODE_SIZE (mode
);
16094 return known_eq (size
, 8) || known_eq (size
, 16);
16097 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
16098 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
16099 array types. The C99 floating-point complex types are also considered
16100 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
16101 types, which are GCC extensions and out of the scope of AAPCS64, are
16102 treated as composite types here as well.
16104 Note that MODE itself is not sufficient in determining whether a type
16105 is such a composite type or not. This is because
16106 stor-layout.c:compute_record_mode may have already changed the MODE
16107 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
16108 structure with only one field may have its MODE set to the mode of the
16109 field. Also an integer mode whose size matches the size of the
16110 RECORD_TYPE type may be used to substitute the original mode
16111 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
16112 solely relied on. */
16115 aarch64_composite_type_p (const_tree type
,
16118 if (aarch64_short_vector_p (type
, mode
))
16121 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
16124 if (mode
== BLKmode
16125 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
16126 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
16132 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
16133 shall be passed or returned in simd/fp register(s) (providing these
16134 parameter passing registers are available).
16136 Upon successful return, *COUNT returns the number of needed registers,
16137 *BASE_MODE returns the mode of the individual register and when IS_HAF
16138 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
16139 floating-point aggregate or a homogeneous short-vector aggregate. */
16142 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
16144 machine_mode
*base_mode
,
16148 if (is_ha
!= NULL
) *is_ha
= false;
16150 if (type
&& aarch64_sve::builtin_type_p (type
))
16153 machine_mode new_mode
= VOIDmode
;
16154 bool composite_p
= aarch64_composite_type_p (type
, mode
);
16156 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
16157 || aarch64_short_vector_p (type
, mode
))
16162 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
16164 if (is_ha
!= NULL
) *is_ha
= true;
16166 new_mode
= GET_MODE_INNER (mode
);
16168 else if (type
&& composite_p
)
16170 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
16172 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
16174 if (is_ha
!= NULL
) *is_ha
= true;
16183 *base_mode
= new_mode
;
16187 /* Implement TARGET_STRUCT_VALUE_RTX. */
16190 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
16191 int incoming ATTRIBUTE_UNUSED
)
16193 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
16196 /* Implements target hook vector_mode_supported_p. */
16198 aarch64_vector_mode_supported_p (machine_mode mode
)
16200 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
16201 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
16204 /* Return the full-width SVE vector mode for element mode MODE, if one
16207 aarch64_full_sve_mode (scalar_mode mode
)
16226 return VNx16QImode
;
16228 return opt_machine_mode ();
16232 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
16235 aarch64_vq_mode (scalar_mode mode
)
16256 return opt_machine_mode ();
16260 /* Return appropriate SIMD container
16261 for MODE within a vector of WIDTH bits. */
16262 static machine_mode
16263 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
16266 && maybe_ne (width
, 128)
16267 && known_eq (width
, BITS_PER_SVE_VECTOR
))
16268 return aarch64_full_sve_mode (mode
).else_mode (word_mode
);
16270 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
16273 if (known_eq (width
, 128))
16274 return aarch64_vq_mode (mode
).else_mode (word_mode
);
16297 /* Return 128-bit container as the preferred SIMD mode for MODE. */
16298 static machine_mode
16299 aarch64_preferred_simd_mode (scalar_mode mode
)
16301 poly_int64 bits
= TARGET_SVE
? BITS_PER_SVE_VECTOR
: 128;
16302 return aarch64_simd_container_mode (mode
, bits
);
16305 /* Return a list of possible vector sizes for the vectorizer
16306 to iterate over. */
16307 static unsigned int
16308 aarch64_autovectorize_vector_modes (vector_modes
*modes
, bool)
16310 static const machine_mode sve_modes
[] = {
16311 /* Try using full vectors for all element types. */
16314 /* Try using 16-bit containers for 8-bit elements and full vectors
16315 for wider elements. */
16318 /* Try using 32-bit containers for 8-bit and 16-bit elements and
16319 full vectors for wider elements. */
16322 /* Try using 64-bit containers for all element types. */
16326 static const machine_mode advsimd_modes
[] = {
16327 /* Try using 128-bit vectors for all element types. */
16330 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
16331 for wider elements. */
16334 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
16335 for wider elements.
16337 TODO: We could support a limited form of V4QImode too, so that
16338 we use 32-bit vectors for 8-bit elements. */
16341 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
16342 for 64-bit elements.
16344 TODO: We could similarly support limited forms of V2QImode and V2HImode
16349 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
16352 - If we can't use N-byte Advanced SIMD vectors then the placement
16353 doesn't matter; we'll just continue as though the Advanced SIMD
16354 entry didn't exist.
16356 - If an SVE main loop with N bytes ends up being cheaper than an
16357 Advanced SIMD main loop with N bytes then by default we'll replace
16358 the Advanced SIMD version with the SVE one.
16360 - If an Advanced SIMD main loop with N bytes ends up being cheaper
16361 than an SVE main loop with N bytes then by default we'll try to
16362 use the SVE loop to vectorize the epilogue instead. */
16363 unsigned int sve_i
= TARGET_SVE
? 0 : ARRAY_SIZE (sve_modes
);
16364 unsigned int advsimd_i
= 0;
16365 while (advsimd_i
< ARRAY_SIZE (advsimd_modes
))
16367 if (sve_i
< ARRAY_SIZE (sve_modes
)
16368 && maybe_gt (GET_MODE_NUNITS (sve_modes
[sve_i
]),
16369 GET_MODE_NUNITS (advsimd_modes
[advsimd_i
])))
16370 modes
->safe_push (sve_modes
[sve_i
++]);
16372 modes
->safe_push (advsimd_modes
[advsimd_i
++]);
16374 while (sve_i
< ARRAY_SIZE (sve_modes
))
16375 modes
->safe_push (sve_modes
[sve_i
++]);
16377 unsigned int flags
= 0;
16378 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
16379 can compare SVE against Advanced SIMD and so that we can compare
16380 multiple SVE vectorization approaches against each other. There's
16381 not really any point doing this for Advanced SIMD only, since the
16382 first mode that works should always be the best. */
16383 if (TARGET_SVE
&& aarch64_sve_compare_costs
)
16384 flags
|= VECT_COMPARE_COSTS
;
16388 /* Implement TARGET_MANGLE_TYPE. */
16390 static const char *
16391 aarch64_mangle_type (const_tree type
)
16393 /* The AArch64 ABI documents say that "__va_list" has to be
16394 mangled as if it is in the "std" namespace. */
16395 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
16396 return "St9__va_list";
16398 /* Half-precision floating point types. */
16399 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
16401 if (TYPE_MODE (type
) == BFmode
)
16407 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
16409 if (TYPE_NAME (type
) != NULL
)
16412 if ((res
= aarch64_general_mangle_builtin_type (type
))
16413 || (res
= aarch64_sve::mangle_builtin_type (type
)))
16417 /* Use the default mangling. */
16421 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
16424 aarch64_verify_type_context (location_t loc
, type_context_kind context
,
16425 const_tree type
, bool silent_p
)
16427 return aarch64_sve::verify_type_context (loc
, context
, type
, silent_p
);
16430 /* Find the first rtx_insn before insn that will generate an assembly
16434 aarch64_prev_real_insn (rtx_insn
*insn
)
16441 insn
= prev_real_insn (insn
);
16443 while (insn
&& recog_memoized (insn
) < 0);
16449 is_madd_op (enum attr_type t1
)
16452 /* A number of these may be AArch32 only. */
16453 enum attr_type mlatypes
[] = {
16454 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
16455 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
16456 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
16459 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
16461 if (t1
== mlatypes
[i
])
16468 /* Check if there is a register dependency between a load and the insn
16469 for which we hold recog_data. */
16472 dep_between_memop_and_curr (rtx memop
)
16477 gcc_assert (GET_CODE (memop
) == SET
);
16479 if (!REG_P (SET_DEST (memop
)))
16482 load_reg
= SET_DEST (memop
);
16483 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
16485 rtx operand
= recog_data
.operand
[opno
];
16486 if (REG_P (operand
)
16487 && reg_overlap_mentioned_p (load_reg
, operand
))
16495 /* When working around the Cortex-A53 erratum 835769,
16496 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
16497 instruction and has a preceding memory instruction such that a NOP
16498 should be inserted between them. */
16501 aarch64_madd_needs_nop (rtx_insn
* insn
)
16503 enum attr_type attr_type
;
16507 if (!TARGET_FIX_ERR_A53_835769
)
16510 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
16513 attr_type
= get_attr_type (insn
);
16514 if (!is_madd_op (attr_type
))
16517 prev
= aarch64_prev_real_insn (insn
);
16518 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
16519 Restore recog state to INSN to avoid state corruption. */
16520 extract_constrain_insn_cached (insn
);
16522 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
16525 body
= single_set (prev
);
16527 /* If the previous insn is a memory op and there is no dependency between
16528 it and the DImode madd, emit a NOP between them. If body is NULL then we
16529 have a complex memory operation, probably a load/store pair.
16530 Be conservative for now and emit a NOP. */
16531 if (GET_MODE (recog_data
.operand
[0]) == DImode
16532 && (!body
|| !dep_between_memop_and_curr (body
)))
16540 /* Implement FINAL_PRESCAN_INSN. */
16543 aarch64_final_prescan_insn (rtx_insn
*insn
)
16545 if (aarch64_madd_needs_nop (insn
))
16546 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
16550 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
16554 aarch64_sve_index_immediate_p (rtx base_or_step
)
16556 return (CONST_INT_P (base_or_step
)
16557 && IN_RANGE (INTVAL (base_or_step
), -16, 15));
16560 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
16561 when applied to mode MODE. Negate X first if NEGATE_P is true. */
16564 aarch64_sve_arith_immediate_p (machine_mode mode
, rtx x
, bool negate_p
)
16566 rtx elt
= unwrap_const_vec_duplicate (x
);
16567 if (!CONST_INT_P (elt
))
16570 HOST_WIDE_INT val
= INTVAL (elt
);
16573 val
&= GET_MODE_MASK (GET_MODE_INNER (mode
));
16576 return IN_RANGE (val
, 0, 0xff);
16577 return IN_RANGE (val
, 0, 0xff00);
16580 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
16581 instructions when applied to mode MODE. Negate X first if NEGATE_P
16585 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode
, rtx x
, bool negate_p
)
16587 if (!aarch64_sve_arith_immediate_p (mode
, x
, negate_p
))
16590 /* After the optional negation, the immediate must be nonnegative.
16591 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
16592 instead of SQADD Zn.B, Zn.B, #129. */
16593 rtx elt
= unwrap_const_vec_duplicate (x
);
16594 return negate_p
== (INTVAL (elt
) < 0);
16597 /* Return true if X is a valid immediate operand for an SVE logical
16598 instruction such as AND. */
16601 aarch64_sve_bitmask_immediate_p (rtx x
)
16605 return (const_vec_duplicate_p (x
, &elt
)
16606 && CONST_INT_P (elt
)
16607 && aarch64_bitmask_imm (INTVAL (elt
),
16608 GET_MODE_INNER (GET_MODE (x
))));
16611 /* Return true if X is a valid immediate for the SVE DUP and CPY
16615 aarch64_sve_dup_immediate_p (rtx x
)
16617 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
16618 if (!CONST_INT_P (x
))
16621 HOST_WIDE_INT val
= INTVAL (x
);
16623 return IN_RANGE (val
, -0x80, 0x7f);
16624 return IN_RANGE (val
, -0x8000, 0x7f00);
16627 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
16628 SIGNED_P says whether the operand is signed rather than unsigned. */
16631 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
16633 x
= unwrap_const_vec_duplicate (x
);
16634 return (CONST_INT_P (x
)
16636 ? IN_RANGE (INTVAL (x
), -16, 15)
16637 : IN_RANGE (INTVAL (x
), 0, 127)));
16640 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
16641 instruction. Negate X first if NEGATE_P is true. */
16644 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
16649 if (!const_vec_duplicate_p (x
, &elt
)
16650 || GET_CODE (elt
) != CONST_DOUBLE
)
16653 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
16656 r
= real_value_negate (&r
);
16658 if (real_equal (&r
, &dconst1
))
16660 if (real_equal (&r
, &dconsthalf
))
16665 /* Return true if X is a valid immediate operand for an SVE FMUL
16669 aarch64_sve_float_mul_immediate_p (rtx x
)
16673 return (const_vec_duplicate_p (x
, &elt
)
16674 && GET_CODE (elt
) == CONST_DOUBLE
16675 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
)
16676 || real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconst2
)));
16679 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
16680 for the Advanced SIMD operation described by WHICH and INSN. If INFO
16681 is nonnull, use it to describe valid immediates. */
16683 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
16684 simd_immediate_info
*info
,
16685 enum simd_immediate_check which
,
16686 simd_immediate_info::insn_type insn
)
16688 /* Try a 4-byte immediate with LSL. */
16689 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
16690 if ((val32
& (0xff << shift
)) == val32
)
16693 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
16694 simd_immediate_info::LSL
, shift
);
16698 /* Try a 2-byte immediate with LSL. */
16699 unsigned int imm16
= val32
& 0xffff;
16700 if (imm16
== (val32
>> 16))
16701 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
16702 if ((imm16
& (0xff << shift
)) == imm16
)
16705 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
16706 simd_immediate_info::LSL
, shift
);
16710 /* Try a 4-byte immediate with MSL, except for cases that MVN
16712 if (which
== AARCH64_CHECK_MOV
)
16713 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
16715 unsigned int low
= (1 << shift
) - 1;
16716 if (((val32
& (0xff << shift
)) | low
) == val32
)
16719 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
16720 simd_immediate_info::MSL
, shift
);
16728 /* Return true if replicating VAL64 is a valid immediate for the
16729 Advanced SIMD operation described by WHICH. If INFO is nonnull,
16730 use it to describe valid immediates. */
16732 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
16733 simd_immediate_info
*info
,
16734 enum simd_immediate_check which
)
16736 unsigned int val32
= val64
& 0xffffffff;
16737 unsigned int val16
= val64
& 0xffff;
16738 unsigned int val8
= val64
& 0xff;
16740 if (val32
== (val64
>> 32))
16742 if ((which
& AARCH64_CHECK_ORR
) != 0
16743 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
16744 simd_immediate_info::MOV
))
16747 if ((which
& AARCH64_CHECK_BIC
) != 0
16748 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
16749 simd_immediate_info::MVN
))
16752 /* Try using a replicated byte. */
16753 if (which
== AARCH64_CHECK_MOV
16754 && val16
== (val32
>> 16)
16755 && val8
== (val16
>> 8))
16758 *info
= simd_immediate_info (QImode
, val8
);
16763 /* Try using a bit-to-bytemask. */
16764 if (which
== AARCH64_CHECK_MOV
)
16767 for (i
= 0; i
< 64; i
+= 8)
16769 unsigned char byte
= (val64
>> i
) & 0xff;
16770 if (byte
!= 0 && byte
!= 0xff)
16776 *info
= simd_immediate_info (DImode
, val64
);
16783 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
16784 instruction. If INFO is nonnull, use it to describe valid immediates. */
16787 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
16788 simd_immediate_info
*info
)
16790 scalar_int_mode mode
= DImode
;
16791 unsigned int val32
= val64
& 0xffffffff;
16792 if (val32
== (val64
>> 32))
16795 unsigned int val16
= val32
& 0xffff;
16796 if (val16
== (val32
>> 16))
16799 unsigned int val8
= val16
& 0xff;
16800 if (val8
== (val16
>> 8))
16804 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
16805 if (IN_RANGE (val
, -0x80, 0x7f))
16807 /* DUP with no shift. */
16809 *info
= simd_immediate_info (mode
, val
);
16812 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
16814 /* DUP with LSL #8. */
16816 *info
= simd_immediate_info (mode
, val
);
16819 if (aarch64_bitmask_imm (val64
, mode
))
16823 *info
= simd_immediate_info (mode
, val
);
16829 /* Return true if X is an UNSPEC_PTRUE constant of the form:
16831 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
16833 where PATTERN is the svpattern as a CONST_INT and where ZERO
16834 is a zero constant of the required PTRUE mode (which can have
16835 fewer elements than X's mode, if zero bits are significant).
16837 If so, and if INFO is nonnull, describe the immediate in INFO. */
16839 aarch64_sve_ptrue_svpattern_p (rtx x
, struct simd_immediate_info
*info
)
16841 if (GET_CODE (x
) != CONST
)
16845 if (GET_CODE (x
) != UNSPEC
|| XINT (x
, 1) != UNSPEC_PTRUE
)
16850 aarch64_svpattern pattern
16851 = (aarch64_svpattern
) INTVAL (XVECEXP (x
, 0, 0));
16852 machine_mode pred_mode
= GET_MODE (XVECEXP (x
, 0, 1));
16853 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (pred_mode
);
16854 *info
= simd_immediate_info (int_mode
, pattern
);
16859 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
16860 it to describe valid immediates. */
16863 aarch64_sve_pred_valid_immediate (rtx x
, simd_immediate_info
*info
)
16865 if (aarch64_sve_ptrue_svpattern_p (x
, info
))
16868 if (x
== CONST0_RTX (GET_MODE (x
)))
16871 *info
= simd_immediate_info (DImode
, 0);
16875 /* Analyze the value as a VNx16BImode. This should be relatively
16876 efficient, since rtx_vector_builder has enough built-in capacity
16877 to store all VLA predicate constants without needing the heap. */
16878 rtx_vector_builder builder
;
16879 if (!aarch64_get_sve_pred_bits (builder
, x
))
16882 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
16883 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
16885 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
16886 aarch64_svpattern pattern
= aarch64_svpattern_for_vl (mode
, vl
);
16887 if (pattern
!= AARCH64_NUM_SVPATTERNS
)
16891 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
16892 *info
= simd_immediate_info (int_mode
, pattern
);
16900 /* Return true if OP is a valid SIMD immediate for the operation
16901 described by WHICH. If INFO is nonnull, use it to describe valid
16904 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
16905 enum simd_immediate_check which
)
16907 machine_mode mode
= GET_MODE (op
);
16908 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
16909 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
16912 if (vec_flags
& VEC_SVE_PRED
)
16913 return aarch64_sve_pred_valid_immediate (op
, info
);
16915 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
16917 unsigned int n_elts
;
16918 if (GET_CODE (op
) == CONST_VECTOR
16919 && CONST_VECTOR_DUPLICATE_P (op
))
16920 n_elts
= CONST_VECTOR_NPATTERNS (op
);
16921 else if ((vec_flags
& VEC_SVE_DATA
)
16922 && const_vec_series_p (op
, &base
, &step
))
16924 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
16925 if (!aarch64_sve_index_immediate_p (base
)
16926 || !aarch64_sve_index_immediate_p (step
))
16931 /* Get the corresponding container mode. E.g. an INDEX on V2SI
16932 should yield two integer values per 128-bit block, meaning
16933 that we need to treat it in the same way as V2DI and then
16934 ignore the upper 32 bits of each element. */
16935 elt_mode
= aarch64_sve_container_int_mode (mode
);
16936 *info
= simd_immediate_info (elt_mode
, base
, step
);
16940 else if (GET_CODE (op
) == CONST_VECTOR
16941 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
16942 /* N_ELTS set above. */;
16946 scalar_float_mode elt_float_mode
;
16948 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
16950 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
16951 if (aarch64_float_const_zero_rtx_p (elt
)
16952 || aarch64_float_const_representable_p (elt
))
16955 *info
= simd_immediate_info (elt_float_mode
, elt
);
16960 /* If all elements in an SVE vector have the same value, we have a free
16961 choice between using the element mode and using the container mode.
16962 Using the element mode means that unused parts of the vector are
16963 duplicates of the used elements, while using the container mode means
16964 that the unused parts are an extension of the used elements. Using the
16965 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
16966 for its container mode VNx4SI while 0x00000101 isn't.
16968 If not all elements in an SVE vector have the same value, we need the
16969 transition from one element to the next to occur at container boundaries.
16970 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
16971 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
16972 scalar_int_mode elt_int_mode
;
16973 if ((vec_flags
& VEC_SVE_DATA
) && n_elts
> 1)
16974 elt_int_mode
= aarch64_sve_container_int_mode (mode
);
16976 elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
16978 unsigned int elt_size
= GET_MODE_SIZE (elt_int_mode
);
16982 /* Expand the vector constant out into a byte vector, with the least
16983 significant byte of the register first. */
16984 auto_vec
<unsigned char, 16> bytes
;
16985 bytes
.reserve (n_elts
* elt_size
);
16986 for (unsigned int i
= 0; i
< n_elts
; i
++)
16988 /* The vector is provided in gcc endian-neutral fashion.
16989 For aarch64_be Advanced SIMD, it must be laid out in the vector
16990 register in reverse order. */
16991 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
16992 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
16994 if (elt_mode
!= elt_int_mode
)
16995 elt
= gen_lowpart (elt_int_mode
, elt
);
16997 if (!CONST_INT_P (elt
))
17000 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
17001 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
17003 bytes
.quick_push (elt_val
& 0xff);
17004 elt_val
>>= BITS_PER_UNIT
;
17008 /* The immediate must repeat every eight bytes. */
17009 unsigned int nbytes
= bytes
.length ();
17010 for (unsigned i
= 8; i
< nbytes
; ++i
)
17011 if (bytes
[i
] != bytes
[i
- 8])
17014 /* Get the repeating 8-byte value as an integer. No endian correction
17015 is needed here because bytes is already in lsb-first order. */
17016 unsigned HOST_WIDE_INT val64
= 0;
17017 for (unsigned int i
= 0; i
< 8; i
++)
17018 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
17019 << (i
* BITS_PER_UNIT
));
17021 if (vec_flags
& VEC_SVE_DATA
)
17022 return aarch64_sve_valid_immediate (val64
, info
);
17024 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
17027 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
17028 has a step in the range of INDEX. Return the index expression if so,
17029 otherwise return null. */
17031 aarch64_check_zero_based_sve_index_immediate (rtx x
)
17034 if (const_vec_series_p (x
, &base
, &step
)
17035 && base
== const0_rtx
17036 && aarch64_sve_index_immediate_p (step
))
17041 /* Check of immediate shift constants are within range. */
17043 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
17045 x
= unwrap_const_vec_duplicate (x
);
17046 if (!CONST_INT_P (x
))
17048 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
17050 return IN_RANGE (INTVAL (x
), 0, bit_width
- 1);
17052 return IN_RANGE (INTVAL (x
), 1, bit_width
);
17055 /* Return the bitmask CONST_INT to select the bits required by a zero extract
17056 operation of width WIDTH at bit position POS. */
17059 aarch64_mask_from_zextract_ops (rtx width
, rtx pos
)
17061 gcc_assert (CONST_INT_P (width
));
17062 gcc_assert (CONST_INT_P (pos
));
17064 unsigned HOST_WIDE_INT mask
17065 = ((unsigned HOST_WIDE_INT
) 1 << UINTVAL (width
)) - 1;
17066 return GEN_INT (mask
<< UINTVAL (pos
));
17070 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
17072 if (GET_CODE (x
) == HIGH
17073 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
17076 if (CONST_INT_P (x
))
17079 if (VECTOR_MODE_P (GET_MODE (x
)))
17081 /* Require predicate constants to be VNx16BI before RA, so that we
17082 force everything to have a canonical form. */
17083 if (!lra_in_progress
17084 && !reload_completed
17085 && GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_BOOL
17086 && GET_MODE (x
) != VNx16BImode
)
17089 return aarch64_simd_valid_immediate (x
, NULL
);
17092 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
17095 if (TARGET_SVE
&& aarch64_sve_cnt_immediate_p (x
))
17098 return aarch64_classify_symbolic_expression (x
)
17099 == SYMBOL_TINY_ABSOLUTE
;
17102 /* Return a const_int vector of VAL. */
17104 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
17106 rtx c
= gen_int_mode (val
, GET_MODE_INNER (mode
));
17107 return gen_const_vec_duplicate (mode
, c
);
17110 /* Check OP is a legal scalar immediate for the MOVI instruction. */
17113 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
17115 machine_mode vmode
;
17117 vmode
= aarch64_simd_container_mode (mode
, 64);
17118 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
17119 return aarch64_simd_valid_immediate (op_v
, NULL
);
17122 /* Construct and return a PARALLEL RTX vector with elements numbering the
17123 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
17124 the vector - from the perspective of the architecture. This does not
17125 line up with GCC's perspective on lane numbers, so we end up with
17126 different masks depending on our target endian-ness. The diagram
17127 below may help. We must draw the distinction when building masks
17128 which select one half of the vector. An instruction selecting
17129 architectural low-lanes for a big-endian target, must be described using
17130 a mask selecting GCC high-lanes.
17132 Big-Endian Little-Endian
17134 GCC 0 1 2 3 3 2 1 0
17135 | x | x | x | x | | x | x | x | x |
17136 Architecture 3 2 1 0 3 2 1 0
17138 Low Mask: { 2, 3 } { 0, 1 }
17139 High Mask: { 0, 1 } { 2, 3 }
17141 MODE Is the mode of the vector and NUNITS is the number of units in it. */
17144 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
17146 rtvec v
= rtvec_alloc (nunits
/ 2);
17147 int high_base
= nunits
/ 2;
17153 if (BYTES_BIG_ENDIAN
)
17154 base
= high
? low_base
: high_base
;
17156 base
= high
? high_base
: low_base
;
17158 for (i
= 0; i
< nunits
/ 2; i
++)
17159 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
17161 t1
= gen_rtx_PARALLEL (mode
, v
);
17165 /* Check OP for validity as a PARALLEL RTX vector with elements
17166 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
17167 from the perspective of the architecture. See the diagram above
17168 aarch64_simd_vect_par_cnst_half for more details. */
17171 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
17175 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
17178 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
17179 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
17180 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
17183 if (count_op
!= count_ideal
)
17186 for (i
= 0; i
< count_ideal
; i
++)
17188 rtx elt_op
= XVECEXP (op
, 0, i
);
17189 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
17191 if (!CONST_INT_P (elt_op
)
17192 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
17198 /* Return a PARALLEL containing NELTS elements, with element I equal
17199 to BASE + I * STEP. */
17202 aarch64_gen_stepped_int_parallel (unsigned int nelts
, int base
, int step
)
17204 rtvec vec
= rtvec_alloc (nelts
);
17205 for (unsigned int i
= 0; i
< nelts
; ++i
)
17206 RTVEC_ELT (vec
, i
) = gen_int_mode (base
+ i
* step
, DImode
);
17207 return gen_rtx_PARALLEL (VOIDmode
, vec
);
17210 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
17211 series with step STEP. */
17214 aarch64_stepped_int_parallel_p (rtx op
, int step
)
17216 if (GET_CODE (op
) != PARALLEL
|| !CONST_INT_P (XVECEXP (op
, 0, 0)))
17219 unsigned HOST_WIDE_INT base
= UINTVAL (XVECEXP (op
, 0, 0));
17220 for (int i
= 1; i
< XVECLEN (op
, 0); ++i
)
17221 if (!CONST_INT_P (XVECEXP (op
, 0, i
))
17222 || UINTVAL (XVECEXP (op
, 0, i
)) != base
+ i
* step
)
17228 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
17229 HIGH (exclusive). */
17231 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
17234 HOST_WIDE_INT lane
;
17235 gcc_assert (CONST_INT_P (operand
));
17236 lane
= INTVAL (operand
);
17238 if (lane
< low
|| lane
>= high
)
17241 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
17243 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
17247 /* Peform endian correction on lane number N, which indexes a vector
17248 of mode MODE, and return the result as an SImode rtx. */
17251 aarch64_endian_lane_rtx (machine_mode mode
, unsigned int n
)
17253 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode
), n
), SImode
);
17256 /* Return TRUE if OP is a valid vector addressing mode. */
17259 aarch64_simd_mem_operand_p (rtx op
)
17261 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
17262 || REG_P (XEXP (op
, 0)));
17265 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
17268 aarch64_sve_ld1r_operand_p (rtx op
)
17270 struct aarch64_address_info addr
;
17274 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
17275 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
17276 && addr
.type
== ADDRESS_REG_IMM
17277 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
17280 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
17281 where the size of the read data is specified by `mode` and the size of the
17282 vector elements are specified by `elem_mode`. */
17284 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op
, machine_mode mode
,
17285 scalar_mode elem_mode
)
17287 struct aarch64_address_info addr
;
17289 || !aarch64_classify_address (&addr
, XEXP (op
, 0), elem_mode
, false))
17292 if (addr
.type
== ADDRESS_REG_IMM
)
17293 return offset_4bit_signed_scaled_p (mode
, addr
.const_offset
);
17295 if (addr
.type
== ADDRESS_REG_REG
)
17296 return (1U << addr
.shift
) == GET_MODE_SIZE (elem_mode
);
17301 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
17303 aarch64_sve_ld1rq_operand_p (rtx op
)
17305 return aarch64_sve_ld1rq_ld1ro_operand_p (op
, TImode
,
17306 GET_MODE_INNER (GET_MODE (op
)));
17309 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
17310 accessing a vector where the element size is specified by `elem_mode`. */
17312 aarch64_sve_ld1ro_operand_p (rtx op
, scalar_mode elem_mode
)
17314 return aarch64_sve_ld1rq_ld1ro_operand_p (op
, OImode
, elem_mode
);
17317 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
17319 aarch64_sve_ldff1_operand_p (rtx op
)
17324 struct aarch64_address_info addr
;
17325 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
), false))
17328 if (addr
.type
== ADDRESS_REG_IMM
)
17329 return known_eq (addr
.const_offset
, 0);
17331 return addr
.type
== ADDRESS_REG_REG
;
17334 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
17336 aarch64_sve_ldnf1_operand_p (rtx op
)
17338 struct aarch64_address_info addr
;
17341 && aarch64_classify_address (&addr
, XEXP (op
, 0),
17342 GET_MODE (op
), false)
17343 && addr
.type
== ADDRESS_REG_IMM
);
17346 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
17347 The conditions for STR are the same. */
17349 aarch64_sve_ldr_operand_p (rtx op
)
17351 struct aarch64_address_info addr
;
17354 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
17355 false, ADDR_QUERY_ANY
)
17356 && addr
.type
== ADDRESS_REG_IMM
);
17359 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
17360 addressing memory of mode MODE. */
17362 aarch64_sve_prefetch_operand_p (rtx op
, machine_mode mode
)
17364 struct aarch64_address_info addr
;
17365 if (!aarch64_classify_address (&addr
, op
, mode
, false))
17368 if (addr
.type
== ADDRESS_REG_IMM
)
17369 return known_eq (addr
.const_offset
, 0);
17371 return addr
.type
== ADDRESS_REG_REG
;
17374 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
17375 We need to be able to access the individual pieces, so the range
17376 is different from LD[234] and ST[234]. */
17378 aarch64_sve_struct_memory_operand_p (rtx op
)
17383 machine_mode mode
= GET_MODE (op
);
17384 struct aarch64_address_info addr
;
17385 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
17387 || addr
.type
!= ADDRESS_REG_IMM
)
17390 poly_int64 first
= addr
.const_offset
;
17391 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
17392 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
17393 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
17396 /* Emit a register copy from operand to operand, taking care not to
17397 early-clobber source registers in the process.
17399 COUNT is the number of components into which the copy needs to be
17402 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
17403 unsigned int count
)
17406 int rdest
= REGNO (operands
[0]);
17407 int rsrc
= REGNO (operands
[1]);
17409 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
17411 for (i
= 0; i
< count
; i
++)
17412 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
17413 gen_rtx_REG (mode
, rsrc
+ i
));
17415 for (i
= 0; i
< count
; i
++)
17416 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
17417 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
17420 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
17421 one of VSTRUCT modes: OI, CI, or XI. */
17423 aarch64_simd_attr_length_rglist (machine_mode mode
)
17425 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
17426 return (GET_MODE_SIZE (mode
).to_constant () / UNITS_PER_VREG
) * 4;
17429 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
17430 alignment of a vector to 128 bits. SVE predicates have an alignment of
17432 static HOST_WIDE_INT
17433 aarch64_simd_vector_alignment (const_tree type
)
17435 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
17436 be set for non-predicate vectors of booleans. Modes are the most
17437 direct way we have of identifying real SVE predicate types. */
17438 if (GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
)
17440 widest_int min_size
17441 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type
)));
17442 return wi::umin (min_size
, 128).to_uhwi ();
17445 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
17447 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
17449 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
17451 /* If the length of the vector is fixed, try to align to that length,
17452 otherwise don't try to align at all. */
17453 HOST_WIDE_INT result
;
17454 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
17455 result
= TYPE_ALIGN (TREE_TYPE (type
));
17458 return TYPE_ALIGN (type
);
17461 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
17463 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
17468 /* For fixed-length vectors, check that the vectorizer will aim for
17469 full-vector alignment. This isn't true for generic GCC vectors
17470 that are wider than the ABI maximum of 128 bits. */
17471 poly_uint64 preferred_alignment
=
17472 aarch64_vectorize_preferred_vector_alignment (type
);
17473 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
17474 && maybe_ne (wi::to_widest (TYPE_SIZE (type
)),
17475 preferred_alignment
))
17478 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
17482 /* Return true if the vector misalignment factor is supported by the
17485 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
17486 const_tree type
, int misalignment
,
17489 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
17491 /* Return if movmisalign pattern is not supported for this mode. */
17492 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
17495 /* Misalignment factor is unknown at compile time. */
17496 if (misalignment
== -1)
17499 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
17503 /* If VALS is a vector constant that can be loaded into a register
17504 using DUP, generate instructions to do so and return an RTX to
17505 assign to the register. Otherwise return NULL_RTX. */
17507 aarch64_simd_dup_constant (rtx vals
)
17509 machine_mode mode
= GET_MODE (vals
);
17510 machine_mode inner_mode
= GET_MODE_INNER (mode
);
17513 if (!const_vec_duplicate_p (vals
, &x
))
17516 /* We can load this constant by using DUP and a constant in a
17517 single ARM register. This will be cheaper than a vector
17519 x
= copy_to_mode_reg (inner_mode
, x
);
17520 return gen_vec_duplicate (mode
, x
);
17524 /* Generate code to load VALS, which is a PARALLEL containing only
17525 constants (for vec_init) or CONST_VECTOR, efficiently into a
17526 register. Returns an RTX to copy into the register, or NULL_RTX
17527 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
17529 aarch64_simd_make_constant (rtx vals
)
17531 machine_mode mode
= GET_MODE (vals
);
17533 rtx const_vec
= NULL_RTX
;
17537 if (GET_CODE (vals
) == CONST_VECTOR
)
17539 else if (GET_CODE (vals
) == PARALLEL
)
17541 /* A CONST_VECTOR must contain only CONST_INTs and
17542 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
17543 Only store valid constants in a CONST_VECTOR. */
17544 int n_elts
= XVECLEN (vals
, 0);
17545 for (i
= 0; i
< n_elts
; ++i
)
17547 rtx x
= XVECEXP (vals
, 0, i
);
17548 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
17551 if (n_const
== n_elts
)
17552 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
17555 gcc_unreachable ();
17557 if (const_vec
!= NULL_RTX
17558 && aarch64_simd_valid_immediate (const_vec
, NULL
))
17559 /* Load using MOVI/MVNI. */
17561 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
17562 /* Loaded using DUP. */
17564 else if (const_vec
!= NULL_RTX
)
17565 /* Load from constant pool. We cannot take advantage of single-cycle
17566 LD1 because we need a PC-relative addressing mode. */
17569 /* A PARALLEL containing something not valid inside CONST_VECTOR.
17570 We cannot construct an initializer. */
17574 /* Expand a vector initialisation sequence, such that TARGET is
17575 initialised to contain VALS. */
17578 aarch64_expand_vector_init (rtx target
, rtx vals
)
17580 machine_mode mode
= GET_MODE (target
);
17581 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
17582 /* The number of vector elements. */
17583 int n_elts
= XVECLEN (vals
, 0);
17584 /* The number of vector elements which are not constant. */
17586 rtx any_const
= NULL_RTX
;
17587 /* The first element of vals. */
17588 rtx v0
= XVECEXP (vals
, 0, 0);
17589 bool all_same
= true;
17591 /* This is a special vec_init<M><N> where N is not an element mode but a
17592 vector mode with half the elements of M. We expect to find two entries
17593 of mode N in VALS and we must put their concatentation into TARGET. */
17594 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
17596 gcc_assert (known_eq (GET_MODE_SIZE (mode
),
17597 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals
, 0, 0)))));
17598 rtx lo
= XVECEXP (vals
, 0, 0);
17599 rtx hi
= XVECEXP (vals
, 0, 1);
17600 machine_mode narrow_mode
= GET_MODE (lo
);
17601 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
);
17602 gcc_assert (narrow_mode
== GET_MODE (hi
));
17604 /* When we want to concatenate a half-width vector with zeroes we can
17605 use the aarch64_combinez[_be] patterns. Just make sure that the
17606 zeroes are in the right half. */
17607 if (BYTES_BIG_ENDIAN
17608 && aarch64_simd_imm_zero (lo
, narrow_mode
)
17609 && general_operand (hi
, narrow_mode
))
17610 emit_insn (gen_aarch64_combinez_be (narrow_mode
, target
, hi
, lo
));
17611 else if (!BYTES_BIG_ENDIAN
17612 && aarch64_simd_imm_zero (hi
, narrow_mode
)
17613 && general_operand (lo
, narrow_mode
))
17614 emit_insn (gen_aarch64_combinez (narrow_mode
, target
, lo
, hi
));
17617 /* Else create the two half-width registers and combine them. */
17619 lo
= force_reg (GET_MODE (lo
), lo
);
17621 hi
= force_reg (GET_MODE (hi
), hi
);
17623 if (BYTES_BIG_ENDIAN
)
17624 std::swap (lo
, hi
);
17625 emit_insn (gen_aarch64_simd_combine (narrow_mode
, target
, lo
, hi
));
17630 /* Count the number of variable elements to initialise. */
17631 for (int i
= 0; i
< n_elts
; ++i
)
17633 rtx x
= XVECEXP (vals
, 0, i
);
17634 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
17639 all_same
&= rtx_equal_p (x
, v0
);
17642 /* No variable elements, hand off to aarch64_simd_make_constant which knows
17643 how best to handle this. */
17646 rtx constant
= aarch64_simd_make_constant (vals
);
17647 if (constant
!= NULL_RTX
)
17649 emit_move_insn (target
, constant
);
17654 /* Splat a single non-constant element if we can. */
17657 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
17658 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
17662 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
17663 gcc_assert (icode
!= CODE_FOR_nothing
);
17665 /* If there are only variable elements, try to optimize
17666 the insertion using dup for the most common element
17667 followed by insertions. */
17669 /* The algorithm will fill matches[*][0] with the earliest matching element,
17670 and matches[X][1] with the count of duplicate elements (if X is the
17671 earliest element which has duplicates). */
17673 if (n_var
== n_elts
&& n_elts
<= 16)
17675 int matches
[16][2] = {0};
17676 for (int i
= 0; i
< n_elts
; i
++)
17678 for (int j
= 0; j
<= i
; j
++)
17680 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
17688 int maxelement
= 0;
17690 for (int i
= 0; i
< n_elts
; i
++)
17691 if (matches
[i
][1] > maxv
)
17694 maxv
= matches
[i
][1];
17697 /* Create a duplicate of the most common element, unless all elements
17698 are equally useless to us, in which case just immediately set the
17699 vector register using the first element. */
17703 /* For vectors of two 64-bit elements, we can do even better. */
17705 && (inner_mode
== E_DImode
17706 || inner_mode
== E_DFmode
))
17709 rtx x0
= XVECEXP (vals
, 0, 0);
17710 rtx x1
= XVECEXP (vals
, 0, 1);
17711 /* Combine can pick up this case, but handling it directly
17712 here leaves clearer RTL.
17714 This is load_pair_lanes<mode>, and also gives us a clean-up
17715 for store_pair_lanes<mode>. */
17716 if (memory_operand (x0
, inner_mode
)
17717 && memory_operand (x1
, inner_mode
)
17718 && !STRICT_ALIGNMENT
17719 && rtx_equal_p (XEXP (x1
, 0),
17720 plus_constant (Pmode
,
17722 GET_MODE_SIZE (inner_mode
))))
17725 if (inner_mode
== DFmode
)
17726 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
17728 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
17733 /* The subreg-move sequence below will move into lane zero of the
17734 vector register. For big-endian we want that position to hold
17735 the last element of VALS. */
17736 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
17737 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
17738 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
17742 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
17743 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
17746 /* Insert the rest. */
17747 for (int i
= 0; i
< n_elts
; i
++)
17749 rtx x
= XVECEXP (vals
, 0, i
);
17750 if (matches
[i
][0] == maxelement
)
17752 x
= copy_to_mode_reg (inner_mode
, x
);
17753 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
17758 /* Initialise a vector which is part-variable. We want to first try
17759 to build those lanes which are constant in the most efficient way we
17761 if (n_var
!= n_elts
)
17763 rtx copy
= copy_rtx (vals
);
17765 /* Load constant part of vector. We really don't care what goes into the
17766 parts we will overwrite, but we're more likely to be able to load the
17767 constant efficiently if it has fewer, larger, repeating parts
17768 (see aarch64_simd_valid_immediate). */
17769 for (int i
= 0; i
< n_elts
; i
++)
17771 rtx x
= XVECEXP (vals
, 0, i
);
17772 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
17774 rtx subst
= any_const
;
17775 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
17777 /* Look in the copied vector, as more elements are const. */
17778 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
17779 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
17785 XVECEXP (copy
, 0, i
) = subst
;
17787 aarch64_expand_vector_init (target
, copy
);
17790 /* Insert the variable lanes directly. */
17791 for (int i
= 0; i
< n_elts
; i
++)
17793 rtx x
= XVECEXP (vals
, 0, i
);
17794 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
17796 x
= copy_to_mode_reg (inner_mode
, x
);
17797 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
17801 /* Emit RTL corresponding to:
17802 insr TARGET, ELEM. */
17805 emit_insr (rtx target
, rtx elem
)
17807 machine_mode mode
= GET_MODE (target
);
17808 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
17809 elem
= force_reg (elem_mode
, elem
);
17811 insn_code icode
= optab_handler (vec_shl_insert_optab
, mode
);
17812 gcc_assert (icode
!= CODE_FOR_nothing
);
17813 emit_insn (GEN_FCN (icode
) (target
, target
, elem
));
17816 /* Subroutine of aarch64_sve_expand_vector_init for handling
17817 trailing constants.
17818 This function works as follows:
17819 (a) Create a new vector consisting of trailing constants.
17820 (b) Initialize TARGET with the constant vector using emit_move_insn.
17821 (c) Insert remaining elements in TARGET using insr.
17822 NELTS is the total number of elements in original vector while
17823 while NELTS_REQD is the number of elements that are actually
17826 ??? The heuristic used is to do above only if number of constants
17827 is at least half the total number of elements. May need fine tuning. */
17830 aarch64_sve_expand_vector_init_handle_trailing_constants
17831 (rtx target
, const rtx_vector_builder
&builder
, int nelts
, int nelts_reqd
)
17833 machine_mode mode
= GET_MODE (target
);
17834 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
17835 int n_trailing_constants
= 0;
17837 for (int i
= nelts_reqd
- 1;
17838 i
>= 0 && aarch64_legitimate_constant_p (elem_mode
, builder
.elt (i
));
17840 n_trailing_constants
++;
17842 if (n_trailing_constants
>= nelts_reqd
/ 2)
17844 rtx_vector_builder
v (mode
, 1, nelts
);
17845 for (int i
= 0; i
< nelts
; i
++)
17846 v
.quick_push (builder
.elt (i
+ nelts_reqd
- n_trailing_constants
));
17847 rtx const_vec
= v
.build ();
17848 emit_move_insn (target
, const_vec
);
17850 for (int i
= nelts_reqd
- n_trailing_constants
- 1; i
>= 0; i
--)
17851 emit_insr (target
, builder
.elt (i
));
17859 /* Subroutine of aarch64_sve_expand_vector_init.
17861 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
17862 (b) Skip trailing elements from BUILDER, which are the same as
17863 element NELTS_REQD - 1.
17864 (c) Insert earlier elements in reverse order in TARGET using insr. */
17867 aarch64_sve_expand_vector_init_insert_elems (rtx target
,
17868 const rtx_vector_builder
&builder
,
17871 machine_mode mode
= GET_MODE (target
);
17872 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
17874 struct expand_operand ops
[2];
17875 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
17876 gcc_assert (icode
!= CODE_FOR_nothing
);
17878 create_output_operand (&ops
[0], target
, mode
);
17879 create_input_operand (&ops
[1], builder
.elt (nelts_reqd
- 1), elem_mode
);
17880 expand_insn (icode
, 2, ops
);
17882 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
17883 for (int i
= nelts_reqd
- ndups
- 1; i
>= 0; i
--)
17884 emit_insr (target
, builder
.elt (i
));
17887 /* Subroutine of aarch64_sve_expand_vector_init to handle case
17888 when all trailing elements of builder are same.
17889 This works as follows:
17890 (a) Use expand_insn interface to broadcast last vector element in TARGET.
17891 (b) Insert remaining elements in TARGET using insr.
17893 ??? The heuristic used is to do above if number of same trailing elements
17894 is at least 3/4 of total number of elements, loosely based on
17895 heuristic from mostly_zeros_p. May need fine-tuning. */
17898 aarch64_sve_expand_vector_init_handle_trailing_same_elem
17899 (rtx target
, const rtx_vector_builder
&builder
, int nelts_reqd
)
17901 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
17902 if (ndups
>= (3 * nelts_reqd
) / 4)
17904 aarch64_sve_expand_vector_init_insert_elems (target
, builder
,
17905 nelts_reqd
- ndups
+ 1);
17912 /* Initialize register TARGET from BUILDER. NELTS is the constant number
17913 of elements in BUILDER.
17915 The function tries to initialize TARGET from BUILDER if it fits one
17916 of the special cases outlined below.
17918 Failing that, the function divides BUILDER into two sub-vectors:
17919 v_even = even elements of BUILDER;
17920 v_odd = odd elements of BUILDER;
17922 and recursively calls itself with v_even and v_odd.
17924 if (recursive call succeeded for v_even or v_odd)
17925 TARGET = zip (v_even, v_odd)
17927 The function returns true if it managed to build TARGET from BUILDER
17928 with one of the special cases, false otherwise.
17930 Example: {a, 1, b, 2, c, 3, d, 4}
17932 The vector gets divided into:
17933 v_even = {a, b, c, d}
17934 v_odd = {1, 2, 3, 4}
17936 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
17937 initialize tmp2 from constant vector v_odd using emit_move_insn.
17939 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
17940 4 elements, so we construct tmp1 from v_even using insr:
17947 TARGET = zip (tmp1, tmp2)
17948 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
17951 aarch64_sve_expand_vector_init (rtx target
, const rtx_vector_builder
&builder
,
17952 int nelts
, int nelts_reqd
)
17954 machine_mode mode
= GET_MODE (target
);
17956 /* Case 1: Vector contains trailing constants. */
17958 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17959 (target
, builder
, nelts
, nelts_reqd
))
17962 /* Case 2: Vector contains leading constants. */
17964 rtx_vector_builder
rev_builder (mode
, 1, nelts_reqd
);
17965 for (int i
= 0; i
< nelts_reqd
; i
++)
17966 rev_builder
.quick_push (builder
.elt (nelts_reqd
- i
- 1));
17967 rev_builder
.finalize ();
17969 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17970 (target
, rev_builder
, nelts
, nelts_reqd
))
17972 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
17976 /* Case 3: Vector contains trailing same element. */
17978 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17979 (target
, builder
, nelts_reqd
))
17982 /* Case 4: Vector contains leading same element. */
17984 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17985 (target
, rev_builder
, nelts_reqd
) && nelts_reqd
== nelts
)
17987 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
17991 /* Avoid recursing below 4-elements.
17992 ??? The threshold 4 may need fine-tuning. */
17994 if (nelts_reqd
<= 4)
17997 rtx_vector_builder
v_even (mode
, 1, nelts
);
17998 rtx_vector_builder
v_odd (mode
, 1, nelts
);
18000 for (int i
= 0; i
< nelts
* 2; i
+= 2)
18002 v_even
.quick_push (builder
.elt (i
));
18003 v_odd
.quick_push (builder
.elt (i
+ 1));
18006 v_even
.finalize ();
18009 rtx tmp1
= gen_reg_rtx (mode
);
18010 bool did_even_p
= aarch64_sve_expand_vector_init (tmp1
, v_even
,
18011 nelts
, nelts_reqd
/ 2);
18013 rtx tmp2
= gen_reg_rtx (mode
);
18014 bool did_odd_p
= aarch64_sve_expand_vector_init (tmp2
, v_odd
,
18015 nelts
, nelts_reqd
/ 2);
18017 if (!did_even_p
&& !did_odd_p
)
18020 /* Initialize v_even and v_odd using INSR if it didn't match any of the
18021 special cases and zip v_even, v_odd. */
18024 aarch64_sve_expand_vector_init_insert_elems (tmp1
, v_even
, nelts_reqd
/ 2);
18027 aarch64_sve_expand_vector_init_insert_elems (tmp2
, v_odd
, nelts_reqd
/ 2);
18029 rtvec v
= gen_rtvec (2, tmp1
, tmp2
);
18030 emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
18034 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
18037 aarch64_sve_expand_vector_init (rtx target
, rtx vals
)
18039 machine_mode mode
= GET_MODE (target
);
18040 int nelts
= XVECLEN (vals
, 0);
18042 rtx_vector_builder
v (mode
, 1, nelts
);
18043 for (int i
= 0; i
< nelts
; i
++)
18044 v
.quick_push (XVECEXP (vals
, 0, i
));
18047 /* If neither sub-vectors of v could be initialized specially,
18048 then use INSR to insert all elements from v into TARGET.
18049 ??? This might not be optimal for vectors with large
18050 initializers like 16-element or above.
18051 For nelts < 4, it probably isn't useful to handle specially. */
18054 || !aarch64_sve_expand_vector_init (target
, v
, nelts
, nelts
))
18055 aarch64_sve_expand_vector_init_insert_elems (target
, v
, nelts
);
18058 /* Check whether VALUE is a vector constant in which every element
18059 is either a power of 2 or a negated power of 2. If so, return
18060 a constant vector of log2s, and flip CODE between PLUS and MINUS
18061 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
18064 aarch64_convert_mult_to_shift (rtx value
, rtx_code
&code
)
18066 if (GET_CODE (value
) != CONST_VECTOR
)
18069 rtx_vector_builder builder
;
18070 if (!builder
.new_unary_operation (GET_MODE (value
), value
, false))
18073 scalar_mode int_mode
= GET_MODE_INNER (GET_MODE (value
));
18074 /* 1 if the result of the multiplication must be negated,
18075 0 if it mustn't, or -1 if we don't yet care. */
18077 unsigned int encoded_nelts
= const_vector_encoded_nelts (value
);
18078 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
18080 rtx elt
= CONST_VECTOR_ENCODED_ELT (value
, i
);
18081 if (!CONST_SCALAR_INT_P (elt
))
18083 rtx_mode_t
val (elt
, int_mode
);
18084 wide_int pow2
= wi::neg (val
);
18087 /* It matters whether we negate or not. Make that choice,
18088 and make sure that it's consistent with previous elements. */
18089 if (negate
== !wi::neg_p (val
))
18091 negate
= wi::neg_p (val
);
18095 /* POW2 is now the value that we want to be a power of 2. */
18096 int shift
= wi::exact_log2 (pow2
);
18099 builder
.quick_push (gen_int_mode (shift
, int_mode
));
18102 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
18104 else if (negate
== 1)
18105 code
= code
== PLUS
? MINUS
: PLUS
;
18106 return builder
.build ();
18109 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
18110 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
18111 operands array, in the same order as for fma_optab. Return true if
18112 the function emitted all the necessary instructions, false if the caller
18113 should generate the pattern normally with the new OPERANDS array. */
18116 aarch64_prepare_sve_int_fma (rtx
*operands
, rtx_code code
)
18118 machine_mode mode
= GET_MODE (operands
[0]);
18119 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[2], code
))
18121 rtx product
= expand_binop (mode
, vashl_optab
, operands
[1], shifts
,
18122 NULL_RTX
, true, OPTAB_DIRECT
);
18123 force_expand_binop (mode
, code
== PLUS
? add_optab
: sub_optab
,
18124 operands
[3], product
, operands
[0], true,
18128 operands
[2] = force_reg (mode
, operands
[2]);
18132 /* Likewise, but for a conditional pattern. */
18135 aarch64_prepare_sve_cond_int_fma (rtx
*operands
, rtx_code code
)
18137 machine_mode mode
= GET_MODE (operands
[0]);
18138 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[3], code
))
18140 rtx product
= expand_binop (mode
, vashl_optab
, operands
[2], shifts
,
18141 NULL_RTX
, true, OPTAB_DIRECT
);
18142 emit_insn (gen_cond (code
, mode
, operands
[0], operands
[1],
18143 operands
[4], product
, operands
[5]));
18146 operands
[3] = force_reg (mode
, operands
[3]);
18150 static unsigned HOST_WIDE_INT
18151 aarch64_shift_truncation_mask (machine_mode mode
)
18153 if (!SHIFT_COUNT_TRUNCATED
|| aarch64_vector_data_mode_p (mode
))
18155 return GET_MODE_UNIT_BITSIZE (mode
) - 1;
18158 /* Select a format to encode pointers in exception handling data. */
18160 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
18163 switch (aarch64_cmodel
)
18165 case AARCH64_CMODEL_TINY
:
18166 case AARCH64_CMODEL_TINY_PIC
:
18167 case AARCH64_CMODEL_SMALL
:
18168 case AARCH64_CMODEL_SMALL_PIC
:
18169 case AARCH64_CMODEL_SMALL_SPIC
:
18170 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
18172 type
= DW_EH_PE_sdata4
;
18175 /* No assumptions here. 8-byte relocs required. */
18176 type
= DW_EH_PE_sdata8
;
18179 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
18182 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
18185 aarch64_asm_output_variant_pcs (FILE *stream
, const tree decl
, const char* name
)
18187 if (TREE_CODE (decl
) == FUNCTION_DECL
)
18189 arm_pcs pcs
= (arm_pcs
) fndecl_abi (decl
).id ();
18190 if (pcs
== ARM_PCS_SIMD
|| pcs
== ARM_PCS_SVE
)
18192 fprintf (stream
, "\t.variant_pcs\t");
18193 assemble_name (stream
, name
);
18194 fprintf (stream
, "\n");
/* The last .arch and .tune assembly strings that we printed.  Cached so
   that we only emit a new directive when the value actually changes.  */
static std::string aarch64_last_printed_arch_string;
static std::string aarch64_last_printed_tune_string;
18203 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
18204 by the function fndecl. */
18207 aarch64_declare_function_name (FILE *stream
, const char* name
,
18210 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
18212 struct cl_target_option
*targ_options
;
18214 targ_options
= TREE_TARGET_OPTION (target_parts
);
18216 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
18217 gcc_assert (targ_options
);
18219 const struct processor
*this_arch
18220 = aarch64_get_arch (targ_options
->x_explicit_arch
);
18222 uint64_t isa_flags
= targ_options
->x_aarch64_isa_flags
;
18223 std::string extension
18224 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
18226 /* Only update the assembler .arch string if it is distinct from the last
18227 such string we printed. */
18228 std::string to_print
= this_arch
->name
+ extension
;
18229 if (to_print
!= aarch64_last_printed_arch_string
)
18231 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
18232 aarch64_last_printed_arch_string
= to_print
;
18235 /* Print the cpu name we're tuning for in the comments, might be
18236 useful to readers of the generated asm. Do it only when it changes
18237 from function to function and verbose assembly is requested. */
18238 const struct processor
*this_tune
18239 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
18241 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
18243 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
18245 aarch64_last_printed_tune_string
= this_tune
->name
;
18248 aarch64_asm_output_variant_pcs (stream
, fndecl
, name
);
18250 /* Don't forget the type directive for ELF. */
18251 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
18252 ASM_OUTPUT_LABEL (stream
, name
);
18254 cfun
->machine
->label_is_assembled
= true;
18257 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
18258 the function label and emit a BTI if necessary. */
18261 aarch64_print_patchable_function_entry (FILE *file
,
18262 unsigned HOST_WIDE_INT patch_area_size
,
18265 if (cfun
->machine
->label_is_assembled
18266 && aarch64_bti_enabled ()
18267 && !cgraph_node::get (cfun
->decl
)->only_called_directly_p ())
18269 /* Remove the BTI that follows the patch area and insert a new BTI
18270 before the patch area right after the function label. */
18271 rtx_insn
*insn
= next_real_nondebug_insn (get_insns ());
18274 && GET_CODE (PATTERN (insn
)) == UNSPEC_VOLATILE
18275 && XINT (PATTERN (insn
), 1) == UNSPECV_BTI_C
)
18276 delete_insn (insn
);
18277 asm_fprintf (file
, "\thint\t34 // bti c\n");
18280 default_print_patchable_function_entry (file
, patch_area_size
, record_p
);
18283 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
18286 aarch64_asm_output_alias (FILE *stream
, const tree decl
, const tree target
)
18288 const char *name
= XSTR (XEXP (DECL_RTL (decl
), 0), 0);
18289 const char *value
= IDENTIFIER_POINTER (target
);
18290 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
18291 ASM_OUTPUT_DEF (stream
, name
, value
);
18294 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
18295 function symbol references. */
18298 aarch64_asm_output_external (FILE *stream
, tree decl
, const char* name
)
18300 default_elf_asm_output_external (stream
, decl
, name
);
18301 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
18304 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
18305 Used to output the .cfi_b_key_frame directive when signing the current
18306 function with the B key. */
18309 aarch64_post_cfi_startproc (FILE *f
, tree ignored ATTRIBUTE_UNUSED
)
18311 if (cfun
->machine
->frame
.laid_out
&& aarch64_return_address_signing_enabled ()
18312 && aarch64_ra_sign_key
== AARCH64_KEY_B
)
18313 asm_fprintf (f
, "\t.cfi_b_key_frame\n");
18316 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
18319 aarch64_start_file (void)
18321 struct cl_target_option
*default_options
18322 = TREE_TARGET_OPTION (target_option_default_node
);
18324 const struct processor
*default_arch
18325 = aarch64_get_arch (default_options
->x_explicit_arch
);
18326 uint64_t default_isa_flags
= default_options
->x_aarch64_isa_flags
;
18327 std::string extension
18328 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
18329 default_arch
->flags
);
18331 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
18332 aarch64_last_printed_tune_string
= "";
18333 asm_fprintf (asm_out_file
, "\t.arch %s\n",
18334 aarch64_last_printed_arch_string
.c_str ());
18336 default_file_start ();
18339 /* Emit load exclusive. */
18342 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
18343 rtx mem
, rtx model_rtx
)
18345 if (mode
== TImode
)
18346 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode
, rval
),
18347 gen_highpart (DImode
, rval
),
18350 emit_insn (gen_aarch64_load_exclusive (mode
, rval
, mem
, model_rtx
));
18353 /* Emit store exclusive. */
18356 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
18357 rtx mem
, rtx rval
, rtx model_rtx
)
18359 if (mode
== TImode
)
18360 emit_insn (gen_aarch64_store_exclusive_pair
18361 (bval
, mem
, operand_subword (rval
, 0, 0, TImode
),
18362 operand_subword (rval
, 1, 0, TImode
), model_rtx
));
18364 emit_insn (gen_aarch64_store_exclusive (mode
, bval
, mem
, rval
, model_rtx
));
18367 /* Mark the previous jump instruction as unlikely. */
18370 aarch64_emit_unlikely_jump (rtx insn
)
18372 rtx_insn
*jump
= emit_jump_insn (insn
);
18373 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
18376 /* We store the names of the various atomic helpers in a 5x4 array.
18377 Return the libcall function given MODE, MODEL and NAMES. */
18380 aarch64_atomic_ool_func(machine_mode mode
, rtx model_rtx
,
18381 const atomic_ool_names
*names
)
18383 memmodel model
= memmodel_base (INTVAL (model_rtx
));
18384 int mode_idx
, model_idx
;
18404 gcc_unreachable ();
18409 case MEMMODEL_RELAXED
:
18412 case MEMMODEL_CONSUME
:
18413 case MEMMODEL_ACQUIRE
:
18416 case MEMMODEL_RELEASE
:
18419 case MEMMODEL_ACQ_REL
:
18420 case MEMMODEL_SEQ_CST
:
18424 gcc_unreachable ();
18427 return init_one_libfunc_visibility (names
->str
[mode_idx
][model_idx
],
18428 VISIBILITY_HIDDEN
);
18431 #define DEF0(B, N) \
18432 { "__aarch64_" #B #N "_relax", \
18433 "__aarch64_" #B #N "_acq", \
18434 "__aarch64_" #B #N "_rel", \
18435 "__aarch64_" #B #N "_acq_rel" }
18437 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
18438 { NULL, NULL, NULL, NULL }
18439 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
18441 static const atomic_ool_names aarch64_ool_cas_names
= { { DEF5(cas
) } };
18442 const atomic_ool_names aarch64_ool_swp_names
= { { DEF4(swp
) } };
18443 const atomic_ool_names aarch64_ool_ldadd_names
= { { DEF4(ldadd
) } };
18444 const atomic_ool_names aarch64_ool_ldset_names
= { { DEF4(ldset
) } };
18445 const atomic_ool_names aarch64_ool_ldclr_names
= { { DEF4(ldclr
) } };
18446 const atomic_ool_names aarch64_ool_ldeor_names
= { { DEF4(ldeor
) } };
18452 /* Expand a compare and swap pattern. */
18455 aarch64_expand_compare_and_swap (rtx operands
[])
18457 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
, cc_reg
;
18458 machine_mode mode
, r_mode
;
18460 bval
= operands
[0];
18461 rval
= operands
[1];
18463 oldval
= operands
[3];
18464 newval
= operands
[4];
18465 is_weak
= operands
[5];
18466 mod_s
= operands
[6];
18467 mod_f
= operands
[7];
18468 mode
= GET_MODE (mem
);
18470 /* Normally the succ memory model must be stronger than fail, but in the
18471 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
18472 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
18473 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
18474 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
18475 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
18478 if (mode
== QImode
|| mode
== HImode
)
18481 rval
= gen_reg_rtx (r_mode
);
18486 /* The CAS insn requires oldval and rval overlap, but we need to
18487 have a copy of oldval saved across the operation to tell if
18488 the operation is successful. */
18489 if (reg_overlap_mentioned_p (rval
, oldval
))
18490 rval
= copy_to_mode_reg (r_mode
, oldval
);
18492 emit_move_insn (rval
, gen_lowpart (r_mode
, oldval
));
18494 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
,
18496 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
18498 else if (TARGET_OUTLINE_ATOMICS
)
18500 /* Oldval must satisfy compare afterward. */
18501 if (!aarch64_plus_operand (oldval
, mode
))
18502 oldval
= force_reg (mode
, oldval
);
18503 rtx func
= aarch64_atomic_ool_func (mode
, mod_s
, &aarch64_ool_cas_names
);
18504 rval
= emit_library_call_value (func
, NULL_RTX
, LCT_NORMAL
, r_mode
,
18505 oldval
, mode
, newval
, mode
,
18506 XEXP (mem
, 0), Pmode
);
18507 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
18511 /* The oldval predicate varies by mode. Test it and force to reg. */
18512 insn_code code
= code_for_aarch64_compare_and_swap (mode
);
18513 if (!insn_data
[code
].operand
[2].predicate (oldval
, mode
))
18514 oldval
= force_reg (mode
, oldval
);
18516 emit_insn (GEN_FCN (code
) (rval
, mem
, oldval
, newval
,
18517 is_weak
, mod_s
, mod_f
));
18518 cc_reg
= gen_rtx_REG (CCmode
, CC_REGNUM
);
18521 if (r_mode
!= mode
)
18522 rval
= gen_lowpart (mode
, rval
);
18523 emit_move_insn (operands
[1], rval
);
18525 x
= gen_rtx_EQ (SImode
, cc_reg
, const0_rtx
);
18526 emit_insn (gen_rtx_SET (bval
, x
));
18529 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
18530 sequence implementing an atomic operation. */
18533 aarch64_emit_post_barrier (enum memmodel model
)
18535 const enum memmodel base_model
= memmodel_base (model
);
18537 if (is_mm_sync (model
)
18538 && (base_model
== MEMMODEL_ACQUIRE
18539 || base_model
== MEMMODEL_ACQ_REL
18540 || base_model
== MEMMODEL_SEQ_CST
))
18542 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
18546 /* Split a compare and swap pattern. */
18549 aarch64_split_compare_and_swap (rtx operands
[])
18551 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
18552 gcc_assert (epilogue_completed
);
18554 rtx rval
, mem
, oldval
, newval
, scratch
, x
, model_rtx
;
18557 rtx_code_label
*label1
, *label2
;
18558 enum memmodel model
;
18560 rval
= operands
[0];
18562 oldval
= operands
[2];
18563 newval
= operands
[3];
18564 is_weak
= (operands
[4] != const0_rtx
);
18565 model_rtx
= operands
[5];
18566 scratch
= operands
[7];
18567 mode
= GET_MODE (mem
);
18568 model
= memmodel_from_int (INTVAL (model_rtx
));
18570 /* When OLDVAL is zero and we want the strong version we can emit a tighter
18573 LD[A]XR rval, [mem]
18575 ST[L]XR scratch, newval, [mem]
18576 CBNZ scratch, .label1
18579 bool strong_zero_p
= (!is_weak
&& !aarch64_track_speculation
&&
18580 oldval
== const0_rtx
&& mode
!= TImode
);
18585 label1
= gen_label_rtx ();
18586 emit_label (label1
);
18588 label2
= gen_label_rtx ();
18590 /* The initial load can be relaxed for a __sync operation since a final
18591 barrier will be emitted to stop code hoisting. */
18592 if (is_mm_sync (model
))
18593 aarch64_emit_load_exclusive (mode
, rval
, mem
, GEN_INT (MEMMODEL_RELAXED
));
18595 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
18598 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
18601 rtx cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
18602 x
= gen_rtx_NE (VOIDmode
, cc_reg
, const0_rtx
);
18604 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
18605 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
18606 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
18608 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
18612 if (aarch64_track_speculation
)
18614 /* Emit an explicit compare instruction, so that we can correctly
18615 track the condition codes. */
18616 rtx cc_reg
= aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
18617 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
18620 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
18622 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
18623 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
18624 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
18627 aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
18629 emit_label (label2
);
18631 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
18632 to set the condition flags. If this is not used it will be removed by
18635 aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
18637 /* Emit any final barrier needed for a __sync operation. */
18638 if (is_mm_sync (model
))
18639 aarch64_emit_post_barrier (model
);
18642 /* Split an atomic operation. */
18645 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
18646 rtx value
, rtx model_rtx
, rtx cond
)
18648 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
18649 gcc_assert (epilogue_completed
);
18651 machine_mode mode
= GET_MODE (mem
);
18652 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
18653 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
18654 const bool is_sync
= is_mm_sync (model
);
18655 rtx_code_label
*label
;
18658 /* Split the atomic operation into a sequence. */
18659 label
= gen_label_rtx ();
18660 emit_label (label
);
18663 new_out
= gen_lowpart (wmode
, new_out
);
18665 old_out
= gen_lowpart (wmode
, old_out
);
18668 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
18670 /* The initial load can be relaxed for a __sync operation since a final
18671 barrier will be emitted to stop code hoisting. */
18673 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
18674 GEN_INT (MEMMODEL_RELAXED
));
18676 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
18685 x
= gen_rtx_AND (wmode
, old_out
, value
);
18686 emit_insn (gen_rtx_SET (new_out
, x
));
18687 x
= gen_rtx_NOT (wmode
, new_out
);
18688 emit_insn (gen_rtx_SET (new_out
, x
));
18692 if (CONST_INT_P (value
))
18694 value
= GEN_INT (-INTVAL (value
));
18697 /* Fall through. */
18700 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
18701 emit_insn (gen_rtx_SET (new_out
, x
));
18705 aarch64_emit_store_exclusive (mode
, cond
, mem
,
18706 gen_lowpart (mode
, new_out
), model_rtx
);
18708 if (aarch64_track_speculation
)
18710 /* Emit an explicit compare instruction, so that we can correctly
18711 track the condition codes. */
18712 rtx cc_reg
= aarch64_gen_compare_reg (NE
, cond
, const0_rtx
);
18713 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
18716 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
18718 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
18719 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
18720 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
18722 /* Emit any final barrier needed for a __sync operation. */
18724 aarch64_emit_post_barrier (model
);
18728 aarch64_init_libfuncs (void)
18730 /* Half-precision float operations. The compiler handles all operations
18731 with NULL libfuncs by converting to SFmode. */
18734 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
18735 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
18738 set_optab_libfunc (add_optab
, HFmode
, NULL
);
18739 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
18740 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
18741 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
18742 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
18745 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
18746 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
18747 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
18748 set_optab_libfunc (le_optab
, HFmode
, NULL
);
18749 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
18750 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
18751 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
18754 /* Target hook for c_mode_for_suffix. */
18755 static machine_mode
18756 aarch64_c_mode_for_suffix (char suffix
)
18764 /* We can only represent floating point constants which will fit in
18765 "quarter-precision" values. These values are characterised by
18766 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
18769 (-1)^s * (n/16) * 2^r
18772 's' is the sign bit.
18773 'n' is an integer in the range 16 <= n <= 31.
18774 'r' is an integer in the range -3 <= r <= 4. */
18776 /* Return true iff X can be represented by a quarter-precision
18777 floating point immediate operand X. Note, we cannot represent 0.0. */
18779 aarch64_float_const_representable_p (rtx x
)
18781 /* This represents our current view of how many bits
18782 make up the mantissa. */
18783 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
18785 unsigned HOST_WIDE_INT mantissa
, mask
;
18786 REAL_VALUE_TYPE r
, m
;
18789 x
= unwrap_const_vec_duplicate (x
);
18790 if (!CONST_DOUBLE_P (x
))
18793 if (GET_MODE (x
) == VOIDmode
18794 || (GET_MODE (x
) == HFmode
&& !TARGET_FP_F16INST
))
18797 r
= *CONST_DOUBLE_REAL_VALUE (x
);
18799 /* We cannot represent infinities, NaNs or +/-zero. We won't
18800 know if we have +zero until we analyse the mantissa, but we
18801 can reject the other invalid values. */
18802 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
18803 || REAL_VALUE_MINUS_ZERO (r
))
18806 /* Extract exponent. */
18807 r
= real_value_abs (&r
);
18808 exponent
= REAL_EXP (&r
);
18810 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
18811 highest (sign) bit, with a fixed binary point at bit point_pos.
18812 m1 holds the low part of the mantissa, m2 the high part.
18813 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
18814 bits for the mantissa, this can fail (low bits will be lost). */
18815 real_ldexp (&m
, &r
, point_pos
- exponent
);
18816 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
18818 /* If the low part of the mantissa has bits set we cannot represent
18820 if (w
.ulow () != 0)
18822 /* We have rejected the lower HOST_WIDE_INT, so update our
18823 understanding of how many bits lie in the mantissa and
18824 look only at the high HOST_WIDE_INT. */
18825 mantissa
= w
.elt (1);
18826 point_pos
-= HOST_BITS_PER_WIDE_INT
;
18828 /* We can only represent values with a mantissa of the form 1.xxxx. */
18829 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
18830 if ((mantissa
& mask
) != 0)
18833 /* Having filtered unrepresentable values, we may now remove all
18834 but the highest 5 bits. */
18835 mantissa
>>= point_pos
- 5;
18837 /* We cannot represent the value 0.0, so reject it. This is handled
18842 /* Then, as bit 4 is always set, we can mask it off, leaving
18843 the mantissa in the range [0, 15]. */
18844 mantissa
&= ~(1 << 4);
18845 gcc_assert (mantissa
<= 15);
18847 /* GCC internally does not use IEEE754-like encoding (where normalized
18848 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
18849 Our mantissa values are shifted 4 places to the left relative to
18850 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
18851 by 5 places to correct for GCC's representation. */
18852 exponent
= 5 - exponent
;
18854 return (exponent
>= 0 && exponent
<= 7);
18857 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
18858 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
18859 output MOVI/MVNI, ORR or BIC immediate. */
18861 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
18862 enum simd_immediate_check which
)
18865 static char templ
[40];
18866 const char *mnemonic
;
18867 const char *shift_op
;
18868 unsigned int lane_count
= 0;
18871 struct simd_immediate_info info
;
18873 /* This will return true to show const_vector is legal for use as either
18874 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
18875 It will also update INFO to show how the immediate should be generated.
18876 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
18877 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
18878 gcc_assert (is_valid
);
18880 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
18881 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
18883 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
18885 gcc_assert (info
.insn
== simd_immediate_info::MOV
18886 && info
.u
.mov
.shift
== 0);
18887 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
18888 move immediate path. */
18889 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
18890 info
.u
.mov
.value
= GEN_INT (0);
18893 const unsigned int buf_size
= 20;
18894 char float_buf
[buf_size
] = {'\0'};
18895 real_to_decimal_for_mode (float_buf
,
18896 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
18897 buf_size
, buf_size
, 1, info
.elt_mode
);
18899 if (lane_count
== 1)
18900 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
18902 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
18903 lane_count
, element_char
, float_buf
);
18908 gcc_assert (CONST_INT_P (info
.u
.mov
.value
));
18910 if (which
== AARCH64_CHECK_MOV
)
18912 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
18913 shift_op
= (info
.u
.mov
.modifier
== simd_immediate_info::MSL
18915 if (lane_count
== 1)
18916 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
18917 mnemonic
, UINTVAL (info
.u
.mov
.value
));
18918 else if (info
.u
.mov
.shift
)
18919 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
18920 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
18921 element_char
, UINTVAL (info
.u
.mov
.value
), shift_op
,
18924 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
18925 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
18926 element_char
, UINTVAL (info
.u
.mov
.value
));
18930 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
18931 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
18932 if (info
.u
.mov
.shift
)
18933 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
18934 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
18935 element_char
, UINTVAL (info
.u
.mov
.value
), "lsl",
18938 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
18939 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
18940 element_char
, UINTVAL (info
.u
.mov
.value
));
18946 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
18949 /* If a floating point number was passed and we desire to use it in an
18950 integer mode do the conversion to integer. */
18951 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
18953 unsigned HOST_WIDE_INT ival
;
18954 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
18955 gcc_unreachable ();
18956 immediate
= gen_int_mode (ival
, mode
);
18959 machine_mode vmode
;
18960 /* use a 64 bit mode for everything except for DI/DF mode, where we use
18961 a 128 bit vector mode. */
18962 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
18964 vmode
= aarch64_simd_container_mode (mode
, width
);
18965 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
18966 return aarch64_output_simd_mov_immediate (v_op
, width
);
18969 /* Return the output string to use for moving immediate CONST_VECTOR
18970 into an SVE register. */
18973 aarch64_output_sve_mov_immediate (rtx const_vector
)
18975 static char templ
[40];
18976 struct simd_immediate_info info
;
18979 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
18980 gcc_assert (is_valid
);
18982 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
18984 machine_mode vec_mode
= GET_MODE (const_vector
);
18985 if (aarch64_sve_pred_mode_p (vec_mode
))
18987 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
18988 if (info
.insn
== simd_immediate_info::MOV
)
18990 gcc_assert (info
.u
.mov
.value
== const0_rtx
);
18991 snprintf (buf
, sizeof (buf
), "pfalse\t%%0.b");
18995 gcc_assert (info
.insn
== simd_immediate_info::PTRUE
);
18996 unsigned int total_bytes
;
18997 if (info
.u
.pattern
== AARCH64_SV_ALL
18998 && BYTES_PER_SVE_VECTOR
.is_constant (&total_bytes
))
18999 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", element_char
,
19000 total_bytes
/ GET_MODE_SIZE (info
.elt_mode
));
19002 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, %s", element_char
,
19003 svpattern_token (info
.u
.pattern
));
19008 if (info
.insn
== simd_immediate_info::INDEX
)
19010 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
19011 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
19012 element_char
, INTVAL (info
.u
.index
.base
),
19013 INTVAL (info
.u
.index
.step
));
19017 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
19019 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
19020 info
.u
.mov
.value
= GEN_INT (0);
19023 const int buf_size
= 20;
19024 char float_buf
[buf_size
] = {};
19025 real_to_decimal_for_mode (float_buf
,
19026 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
19027 buf_size
, buf_size
, 1, info
.elt_mode
);
19029 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
19030 element_char
, float_buf
);
19035 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
19036 element_char
, INTVAL (info
.u
.mov
.value
));
19040 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
19041 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
19045 aarch64_output_sve_ptrues (rtx const_unspec
)
19047 static char templ
[40];
19049 struct simd_immediate_info info
;
19050 bool is_valid
= aarch64_simd_valid_immediate (const_unspec
, &info
);
19051 gcc_assert (is_valid
&& info
.insn
== simd_immediate_info::PTRUE
);
19053 char element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
19054 snprintf (templ
, sizeof (templ
), "ptrues\t%%0.%c, %s", element_char
,
19055 svpattern_token (info
.u
.pattern
));
19059 /* Split operands into moves from op[1] + op[2] into op[0]. */
19062 aarch64_split_combinev16qi (rtx operands
[3])
19064 unsigned int dest
= REGNO (operands
[0]);
19065 unsigned int src1
= REGNO (operands
[1]);
19066 unsigned int src2
= REGNO (operands
[2]);
19067 machine_mode halfmode
= GET_MODE (operands
[1]);
19068 unsigned int halfregs
= REG_NREGS (operands
[1]);
19069 rtx destlo
, desthi
;
19071 gcc_assert (halfmode
== V16QImode
);
19073 if (src1
== dest
&& src2
== dest
+ halfregs
)
19075 /* No-op move. Can't split to nothing; emit something. */
19076 emit_note (NOTE_INSN_DELETED
);
19080 /* Preserve register attributes for variable tracking. */
19081 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
19082 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
19083 GET_MODE_SIZE (halfmode
));
19085 /* Special case of reversed high/low parts. */
19086 if (reg_overlap_mentioned_p (operands
[2], destlo
)
19087 && reg_overlap_mentioned_p (operands
[1], desthi
))
19089 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
19090 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
19091 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
19093 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
19095 /* Try to avoid unnecessary moves if part of the result
19096 is in the right place already. */
19098 emit_move_insn (destlo
, operands
[1]);
19099 if (src2
!= dest
+ halfregs
)
19100 emit_move_insn (desthi
, operands
[2]);
19104 if (src2
!= dest
+ halfregs
)
19105 emit_move_insn (desthi
, operands
[2]);
19107 emit_move_insn (destlo
, operands
[1]);
19111 /* vec_perm support. */
19113 struct expand_vec_perm_d
19115 rtx target
, op0
, op1
;
19116 vec_perm_indices perm
;
19117 machine_mode vmode
;
19118 unsigned int vec_flags
;
19123 /* Generate a variable permutation. */
19126 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
19128 machine_mode vmode
= GET_MODE (target
);
19129 bool one_vector_p
= rtx_equal_p (op0
, op1
);
19131 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
19132 gcc_checking_assert (GET_MODE (op0
) == vmode
);
19133 gcc_checking_assert (GET_MODE (op1
) == vmode
);
19134 gcc_checking_assert (GET_MODE (sel
) == vmode
);
19135 gcc_checking_assert (TARGET_SIMD
);
19139 if (vmode
== V8QImode
)
19141 /* Expand the argument to a V16QI mode by duplicating it. */
19142 rtx pair
= gen_reg_rtx (V16QImode
);
19143 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
19144 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
19148 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
19155 if (vmode
== V8QImode
)
19157 pair
= gen_reg_rtx (V16QImode
);
19158 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
19159 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
19163 pair
= gen_reg_rtx (OImode
);
19164 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
19165 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
19170 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
19171 NELT is the number of elements in the vector. */
19174 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
19177 machine_mode vmode
= GET_MODE (target
);
19178 bool one_vector_p
= rtx_equal_p (op0
, op1
);
19181 /* The TBL instruction does not use a modulo index, so we must take care
19182 of that ourselves. */
19183 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
19184 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
19185 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
19187 /* For big-endian, we also need to reverse the index within the vector
19188 (but not which vector). */
19189 if (BYTES_BIG_ENDIAN
)
19191 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
19193 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
19194 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
19195 NULL
, 0, OPTAB_LIB_WIDEN
);
19197 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
19200 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
19203 emit_unspec2 (rtx target
, int code
, rtx op0
, rtx op1
)
19205 emit_insn (gen_rtx_SET (target
,
19206 gen_rtx_UNSPEC (GET_MODE (target
),
19207 gen_rtvec (2, op0
, op1
), code
)));
19210 /* Expand an SVE vec_perm with the given operands. */
19213 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
19215 machine_mode data_mode
= GET_MODE (target
);
19216 machine_mode sel_mode
= GET_MODE (sel
);
19217 /* Enforced by the pattern condition. */
19218 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
19220 /* Note: vec_perm indices are supposed to wrap when they go beyond the
19221 size of the two value vectors, i.e. the upper bits of the indices
19222 are effectively ignored. SVE TBL instead produces 0 for any
19223 out-of-range indices, so we need to modulo all the vec_perm indices
19224 to ensure they are all in range. */
19225 rtx sel_reg
= force_reg (sel_mode
, sel
);
19227 /* Check if the sel only references the first values vector. */
19228 if (GET_CODE (sel
) == CONST_VECTOR
19229 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
19231 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
19235 /* Check if the two values vectors are the same. */
19236 if (rtx_equal_p (op0
, op1
))
19238 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
19239 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
19240 NULL
, 0, OPTAB_DIRECT
);
19241 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
19245 /* Run TBL on for each value vector and combine the results. */
19247 rtx res0
= gen_reg_rtx (data_mode
);
19248 rtx res1
= gen_reg_rtx (data_mode
);
19249 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
19250 if (GET_CODE (sel
) != CONST_VECTOR
19251 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
19253 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
19255 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
19256 NULL
, 0, OPTAB_DIRECT
);
19258 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
19259 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
19260 NULL
, 0, OPTAB_DIRECT
);
19261 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
19262 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
19263 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
19265 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
19268 /* Recognize patterns suitable for the TRN instructions. */
19270 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
19273 poly_uint64 nelt
= d
->perm
.length ();
19274 rtx out
, in0
, in1
, x
;
19275 machine_mode vmode
= d
->vmode
;
19277 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
19280 /* Note that these are little-endian tests.
19281 We correct for big-endian later. */
19282 if (!d
->perm
[0].is_constant (&odd
)
19283 || (odd
!= 0 && odd
!= 1)
19284 || !d
->perm
.series_p (0, 2, odd
, 2)
19285 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
19294 /* We don't need a big-endian lane correction for SVE; see the comment
19295 at the head of aarch64-sve.md for details. */
19296 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
19298 x
= in0
, in0
= in1
, in1
= x
;
19303 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
19304 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
19308 /* Recognize patterns suitable for the UZP instructions. */
19310 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
19313 rtx out
, in0
, in1
, x
;
19314 machine_mode vmode
= d
->vmode
;
19316 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
19319 /* Note that these are little-endian tests.
19320 We correct for big-endian later. */
19321 if (!d
->perm
[0].is_constant (&odd
)
19322 || (odd
!= 0 && odd
!= 1)
19323 || !d
->perm
.series_p (0, 1, odd
, 2))
19332 /* We don't need a big-endian lane correction for SVE; see the comment
19333 at the head of aarch64-sve.md for details. */
19334 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
19336 x
= in0
, in0
= in1
, in1
= x
;
19341 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
19342 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
19346 /* Recognize patterns suitable for the ZIP instructions. */
19348 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
19351 poly_uint64 nelt
= d
->perm
.length ();
19352 rtx out
, in0
, in1
, x
;
19353 machine_mode vmode
= d
->vmode
;
19355 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
19358 /* Note that these are little-endian tests.
19359 We correct for big-endian later. */
19360 poly_uint64 first
= d
->perm
[0];
19361 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
19362 || !d
->perm
.series_p (0, 2, first
, 1)
19363 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
19365 high
= maybe_ne (first
, 0U);
19373 /* We don't need a big-endian lane correction for SVE; see the comment
19374 at the head of aarch64-sve.md for details. */
19375 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
19377 x
= in0
, in0
= in1
, in1
= x
;
19382 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
19383 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
19387 /* Recognize patterns for the EXT insn. */
19390 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
19392 HOST_WIDE_INT location
;
19395 /* The first element always refers to the first vector.
19396 Check if the extracted indices are increasing by one. */
19397 if (d
->vec_flags
== VEC_SVE_PRED
19398 || !d
->perm
[0].is_constant (&location
)
19399 || !d
->perm
.series_p (0, 1, location
, 1))
19406 /* The case where (location == 0) is a no-op for both big- and little-endian,
19407 and is removed by the mid-end at optimization levels -O1 and higher.
19409 We don't need a big-endian lane correction for SVE; see the comment
19410 at the head of aarch64-sve.md for details. */
19411 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
19413 /* After setup, we want the high elements of the first vector (stored
19414 at the LSB end of the register), and the low elements of the second
19415 vector (stored at the MSB end of the register). So swap. */
19416 std::swap (d
->op0
, d
->op1
);
19417 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
19418 to_constant () is safe since this is restricted to Advanced SIMD
19420 location
= d
->perm
.length ().to_constant () - location
;
19423 offset
= GEN_INT (location
);
19424 emit_set_insn (d
->target
,
19425 gen_rtx_UNSPEC (d
->vmode
,
19426 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
19431 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
19432 within each 64-bit, 32-bit or 16-bit granule. */
19435 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
19437 HOST_WIDE_INT diff
;
19438 unsigned int i
, size
, unspec
;
19439 machine_mode pred_mode
;
19441 if (d
->vec_flags
== VEC_SVE_PRED
19442 || !d
->one_vector_p
19443 || !d
->perm
[0].is_constant (&diff
))
19446 size
= (diff
+ 1) * GET_MODE_UNIT_SIZE (d
->vmode
);
19449 unspec
= UNSPEC_REV64
;
19450 pred_mode
= VNx2BImode
;
19452 else if (size
== 4)
19454 unspec
= UNSPEC_REV32
;
19455 pred_mode
= VNx4BImode
;
19457 else if (size
== 2)
19459 unspec
= UNSPEC_REV16
;
19460 pred_mode
= VNx8BImode
;
19465 unsigned int step
= diff
+ 1;
19466 for (i
= 0; i
< step
; ++i
)
19467 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
19474 if (d
->vec_flags
== VEC_SVE_DATA
)
19476 machine_mode int_mode
= aarch64_sve_int_mode (pred_mode
);
19477 rtx target
= gen_reg_rtx (int_mode
);
19478 if (BYTES_BIG_ENDIAN
)
19479 /* The act of taking a subreg between INT_MODE and d->vmode
19480 is itself a reversing operation on big-endian targets;
19481 see the comment at the head of aarch64-sve.md for details.
19482 First reinterpret OP0 as INT_MODE without using a subreg
19483 and without changing the contents. */
19484 emit_insn (gen_aarch64_sve_reinterpret (int_mode
, target
, d
->op0
));
19487 /* For SVE we use REV[BHW] unspecs derived from the element size
19488 of v->mode and vector modes whose elements have SIZE bytes.
19489 This ensures that the vector modes match the predicate modes. */
19490 int unspec
= aarch64_sve_rev_unspec (d
->vmode
);
19491 rtx pred
= aarch64_ptrue_reg (pred_mode
);
19492 emit_insn (gen_aarch64_pred (unspec
, int_mode
, target
, pred
,
19493 gen_lowpart (int_mode
, d
->op0
)));
19495 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
19498 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
19499 emit_set_insn (d
->target
, src
);
19503 /* Recognize patterns for the REV insn, which reverses elements within
19507 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
19509 poly_uint64 nelt
= d
->perm
.length ();
19511 if (!d
->one_vector_p
|| d
->vec_flags
== VEC_ADVSIMD
)
19514 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
19521 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
19522 emit_set_insn (d
->target
, src
);
19527 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
19529 rtx out
= d
->target
;
19532 machine_mode vmode
= d
->vmode
;
19535 if (d
->vec_flags
== VEC_SVE_PRED
19536 || d
->perm
.encoding ().encoded_nelts () != 1
19537 || !d
->perm
[0].is_constant (&elt
))
19540 if (d
->vec_flags
== VEC_SVE_DATA
&& elt
>= 64 * GET_MODE_UNIT_SIZE (vmode
))
19547 /* The generic preparation in aarch64_expand_vec_perm_const_1
19548 swaps the operand order and the permute indices if it finds
19549 d->perm[0] to be in the second operand. Thus, we can always
19550 use d->op0 and need not do any extra arithmetic to get the
19551 correct lane number. */
19553 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
19555 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
19556 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
19557 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
19562 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
19564 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
19565 machine_mode vmode
= d
->vmode
;
19567 /* Make sure that the indices are constant. */
19568 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
19569 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
19570 if (!d
->perm
[i
].is_constant ())
19576 /* Generic code will try constant permutation twice. Once with the
19577 original mode and again with the elements lowered to QImode.
19578 So wait and don't do the selector expansion ourselves. */
19579 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
19582 /* to_constant is safe since this routine is specific to Advanced SIMD
19584 unsigned int nelt
= d
->perm
.length ().to_constant ();
19585 for (unsigned int i
= 0; i
< nelt
; ++i
)
19586 /* If big-endian and two vectors we end up with a weird mixed-endian
19587 mode on NEON. Reverse the index within each word but not the word
19588 itself. to_constant is safe because we checked is_constant above. */
19589 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
19590 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
19591 : d
->perm
[i
].to_constant ());
19593 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
19594 sel
= force_reg (vmode
, sel
);
19596 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
19600 /* Try to implement D using an SVE TBL instruction. */
19603 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
19605 unsigned HOST_WIDE_INT nelt
;
19607 /* Permuting two variable-length vectors could overflow the
19609 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
19615 machine_mode sel_mode
= related_int_vector_mode (d
->vmode
).require ();
19616 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
19617 if (d
->one_vector_p
)
19618 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
19620 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
19624 /* Try to implement D using SVE SEL instruction. */
19627 aarch64_evpc_sel (struct expand_vec_perm_d
*d
)
19629 machine_mode vmode
= d
->vmode
;
19630 int unit_size
= GET_MODE_UNIT_SIZE (vmode
);
19632 if (d
->vec_flags
!= VEC_SVE_DATA
19636 int n_patterns
= d
->perm
.encoding ().npatterns ();
19637 poly_int64 vec_len
= d
->perm
.length ();
19639 for (int i
= 0; i
< n_patterns
; ++i
)
19640 if (!known_eq (d
->perm
[i
], i
)
19641 && !known_eq (d
->perm
[i
], vec_len
+ i
))
19644 for (int i
= n_patterns
; i
< n_patterns
* 2; i
++)
19645 if (!d
->perm
.series_p (i
, n_patterns
, i
, n_patterns
)
19646 && !d
->perm
.series_p (i
, n_patterns
, vec_len
+ i
, n_patterns
))
19652 machine_mode pred_mode
= aarch64_sve_pred_mode (vmode
);
19654 /* Build a predicate that is true when op0 elements should be used. */
19655 rtx_vector_builder
builder (pred_mode
, n_patterns
, 2);
19656 for (int i
= 0; i
< n_patterns
* 2; i
++)
19658 rtx elem
= known_eq (d
->perm
[i
], i
) ? CONST1_RTX (BImode
)
19659 : CONST0_RTX (BImode
);
19660 builder
.quick_push (elem
);
19663 rtx const_vec
= builder
.build ();
19664 rtx pred
= force_reg (pred_mode
, const_vec
);
19665 /* TARGET = PRED ? OP0 : OP1. */
19666 emit_insn (gen_vcond_mask (vmode
, vmode
, d
->target
, d
->op0
, d
->op1
, pred
));
19671 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
19673 /* The pattern matching functions above are written to look for a small
19674 number to begin the sequence (0, 1, N/2). If we begin with an index
19675 from the second operand, we can swap the operands. */
19676 poly_int64 nelt
= d
->perm
.length ();
19677 if (known_ge (d
->perm
[0], nelt
))
19679 d
->perm
.rotate_inputs (1);
19680 std::swap (d
->op0
, d
->op1
);
19683 if ((d
->vec_flags
== VEC_ADVSIMD
19684 || d
->vec_flags
== VEC_SVE_DATA
19685 || d
->vec_flags
== VEC_SVE_PRED
)
19686 && known_gt (nelt
, 1))
19688 if (aarch64_evpc_rev_local (d
))
19690 else if (aarch64_evpc_rev_global (d
))
19692 else if (aarch64_evpc_ext (d
))
19694 else if (aarch64_evpc_dup (d
))
19696 else if (aarch64_evpc_zip (d
))
19698 else if (aarch64_evpc_uzp (d
))
19700 else if (aarch64_evpc_trn (d
))
19702 else if (aarch64_evpc_sel (d
))
19704 if (d
->vec_flags
== VEC_SVE_DATA
)
19705 return aarch64_evpc_sve_tbl (d
);
19706 else if (d
->vec_flags
== VEC_ADVSIMD
)
19707 return aarch64_evpc_tbl (d
);
19712 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19715 aarch64_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
19716 rtx op1
, const vec_perm_indices
&sel
)
19718 struct expand_vec_perm_d d
;
19720 /* Check whether the mask can be applied to a single vector. */
19721 if (sel
.ninputs () == 1
19722 || (op0
&& rtx_equal_p (op0
, op1
)))
19723 d
.one_vector_p
= true;
19724 else if (sel
.all_from_input_p (0))
19726 d
.one_vector_p
= true;
19729 else if (sel
.all_from_input_p (1))
19731 d
.one_vector_p
= true;
19735 d
.one_vector_p
= false;
19737 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
19738 sel
.nelts_per_input ());
19740 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
19744 d
.testing_p
= !target
;
19747 return aarch64_expand_vec_perm_const_1 (&d
);
19749 rtx_insn
*last
= get_last_insn ();
19750 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
19751 gcc_assert (last
== get_last_insn ());
19756 /* Generate a byte permute mask for a register of mode MODE,
19757 which has NUNITS units. */
19760 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
19762 /* We have to reverse each vector because we dont have
19763 a permuted load that can reverse-load according to ABI rules. */
19765 rtvec v
= rtvec_alloc (16);
19767 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
19769 gcc_assert (BYTES_BIG_ENDIAN
);
19770 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
19772 for (i
= 0; i
< nunits
; i
++)
19773 for (j
= 0; j
< usize
; j
++)
19774 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
19775 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
19776 return force_reg (V16QImode
, mask
);
19779 /* Expand an SVE integer comparison using the SVE equivalent of:
19781 (set TARGET (CODE OP0 OP1)). */
19784 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
19786 machine_mode pred_mode
= GET_MODE (target
);
19787 machine_mode data_mode
= GET_MODE (op0
);
19788 rtx res
= aarch64_sve_emit_int_cmp (target
, pred_mode
, code
, data_mode
,
19790 if (!rtx_equal_p (target
, res
))
19791 emit_move_insn (target
, res
);
19794 /* Return the UNSPEC_COND_* code for comparison CODE. */
19796 static unsigned int
19797 aarch64_unspec_cond_code (rtx_code code
)
19802 return UNSPEC_COND_FCMNE
;
19804 return UNSPEC_COND_FCMEQ
;
19806 return UNSPEC_COND_FCMLT
;
19808 return UNSPEC_COND_FCMGT
;
19810 return UNSPEC_COND_FCMLE
;
19812 return UNSPEC_COND_FCMGE
;
19814 return UNSPEC_COND_FCMUO
;
19816 gcc_unreachable ();
19822 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19824 where <X> is the operation associated with comparison CODE.
19825 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19828 aarch64_emit_sve_fp_cond (rtx target
, rtx_code code
, rtx pred
,
19829 bool known_ptrue_p
, rtx op0
, rtx op1
)
19831 rtx flag
= gen_int_mode (known_ptrue_p
, SImode
);
19832 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (pred
),
19833 gen_rtvec (4, pred
, flag
, op0
, op1
),
19834 aarch64_unspec_cond_code (code
));
19835 emit_set_insn (target
, unspec
);
19838 /* Emit the SVE equivalent of:
19840 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
19841 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
19842 (set TARGET (ior:PRED_MODE TMP1 TMP2))
19844 where <Xi> is the operation associated with comparison CODEi.
19845 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19848 aarch64_emit_sve_or_fp_conds (rtx target
, rtx_code code1
, rtx_code code2
,
19849 rtx pred
, bool known_ptrue_p
, rtx op0
, rtx op1
)
19851 machine_mode pred_mode
= GET_MODE (pred
);
19852 rtx tmp1
= gen_reg_rtx (pred_mode
);
19853 aarch64_emit_sve_fp_cond (tmp1
, code1
, pred
, known_ptrue_p
, op0
, op1
);
19854 rtx tmp2
= gen_reg_rtx (pred_mode
);
19855 aarch64_emit_sve_fp_cond (tmp2
, code2
, pred
, known_ptrue_p
, op0
, op1
);
19856 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
19859 /* Emit the SVE equivalent of:
19861 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19862 (set TARGET (not TMP))
19864 where <X> is the operation associated with comparison CODE.
19865 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19868 aarch64_emit_sve_invert_fp_cond (rtx target
, rtx_code code
, rtx pred
,
19869 bool known_ptrue_p
, rtx op0
, rtx op1
)
19871 machine_mode pred_mode
= GET_MODE (pred
);
19872 rtx tmp
= gen_reg_rtx (pred_mode
);
19873 aarch64_emit_sve_fp_cond (tmp
, code
, pred
, known_ptrue_p
, op0
, op1
);
19874 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
19877 /* Expand an SVE floating-point comparison using the SVE equivalent of:
19879 (set TARGET (CODE OP0 OP1))
19881 If CAN_INVERT_P is true, the caller can also handle inverted results;
19882 return true if the result is in fact inverted. */
19885 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
19886 rtx op0
, rtx op1
, bool can_invert_p
)
19888 machine_mode pred_mode
= GET_MODE (target
);
19889 machine_mode data_mode
= GET_MODE (op0
);
19891 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
19895 /* UNORDERED has no immediate form. */
19896 op1
= force_reg (data_mode
, op1
);
19905 /* There is native support for the comparison. */
19906 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
19911 /* This is a trapping operation (LT or GT). */
19912 aarch64_emit_sve_or_fp_conds (target
, LT
, GT
, ptrue
, true, op0
, op1
);
19916 if (!flag_trapping_math
)
19918 /* This would trap for signaling NaNs. */
19919 op1
= force_reg (data_mode
, op1
);
19920 aarch64_emit_sve_or_fp_conds (target
, UNORDERED
, EQ
,
19921 ptrue
, true, op0
, op1
);
19929 if (flag_trapping_math
)
19931 /* Work out which elements are ordered. */
19932 rtx ordered
= gen_reg_rtx (pred_mode
);
19933 op1
= force_reg (data_mode
, op1
);
19934 aarch64_emit_sve_invert_fp_cond (ordered
, UNORDERED
,
19935 ptrue
, true, op0
, op1
);
19937 /* Test the opposite condition for the ordered elements,
19938 then invert the result. */
19942 code
= reverse_condition_maybe_unordered (code
);
19945 aarch64_emit_sve_fp_cond (target
, code
,
19946 ordered
, false, op0
, op1
);
19949 aarch64_emit_sve_invert_fp_cond (target
, code
,
19950 ordered
, false, op0
, op1
);
19956 /* ORDERED has no immediate form. */
19957 op1
= force_reg (data_mode
, op1
);
19961 gcc_unreachable ();
19964 /* There is native support for the inverse comparison. */
19965 code
= reverse_condition_maybe_unordered (code
);
19968 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
19971 aarch64_emit_sve_invert_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
19975 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
19976 of the data being selected and CMP_MODE is the mode of the values being
19980 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
19983 machine_mode pred_mode
= aarch64_get_mask_mode (cmp_mode
).require ();
19984 rtx pred
= gen_reg_rtx (pred_mode
);
19985 if (FLOAT_MODE_P (cmp_mode
))
19987 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
19988 ops
[4], ops
[5], true))
19989 std::swap (ops
[1], ops
[2]);
19992 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
19994 if (!aarch64_sve_reg_or_dup_imm (ops
[1], data_mode
))
19995 ops
[1] = force_reg (data_mode
, ops
[1]);
19996 /* The "false" value can only be zero if the "true" value is a constant. */
19997 if (register_operand (ops
[1], data_mode
)
19998 || !aarch64_simd_reg_or_zero (ops
[2], data_mode
))
19999 ops
[2] = force_reg (data_mode
, ops
[2]);
20001 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
20002 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
20005 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
20006 true. However due to issues with register allocation it is preferable
20007 to avoid tieing integer scalar and FP scalar modes. Executing integer
20008 operations in general registers is better than treating them as scalar
20009 vector operations. This reduces latency and avoids redundant int<->FP
20010 moves. So tie modes if they are either the same class, or vector modes
20011 with other vector modes, vector structs or any scalar mode. */
20014 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
20016 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
20019 /* We specifically want to allow elements of "structure" modes to
20020 be tieable to the structure. This more general condition allows
20021 other rarer situations too. The reason we don't extend this to
20022 predicate modes is that there are no predicate structure modes
20023 nor any specific instructions for extracting part of a predicate
20025 if (aarch64_vector_data_mode_p (mode1
)
20026 && aarch64_vector_data_mode_p (mode2
))
20029 /* Also allow any scalar modes with vectors. */
20030 if (aarch64_vector_mode_supported_p (mode1
)
20031 || aarch64_vector_mode_supported_p (mode2
))
20037 /* Return a new RTX holding the result of moving POINTER forward by
20041 aarch64_move_pointer (rtx pointer
, poly_int64 amount
)
20043 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
20045 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
20049 /* Return a new RTX holding the result of moving POINTER forward by the
20050 size of the mode it points to. */
20053 aarch64_progress_pointer (rtx pointer
)
20055 return aarch64_move_pointer (pointer
, GET_MODE_SIZE (GET_MODE (pointer
)));
20058 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
20062 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
20065 rtx reg
= gen_reg_rtx (mode
);
20067 /* "Cast" the pointers to the correct mode. */
20068 *src
= adjust_address (*src
, mode
, 0);
20069 *dst
= adjust_address (*dst
, mode
, 0);
20070 /* Emit the memcpy. */
20071 emit_move_insn (reg
, *src
);
20072 emit_move_insn (*dst
, reg
);
20073 /* Move the pointers forward. */
20074 *src
= aarch64_progress_pointer (*src
);
20075 *dst
= aarch64_progress_pointer (*dst
);
20078 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
20079 we succeed, otherwise return false. */
20082 aarch64_expand_cpymem (rtx
*operands
)
20085 rtx dst
= operands
[0];
20086 rtx src
= operands
[1];
20088 machine_mode cur_mode
= BLKmode
, next_mode
;
20089 bool speed_p
= !optimize_function_for_size_p (cfun
);
20091 /* When optimizing for size, give a better estimate of the length of a
20092 memcpy call, but use the default otherwise. Moves larger than 8 bytes
20093 will always require an even number of instructions to do now. And each
20094 operation requires both a load+store, so devide the max number by 2. */
20095 int max_num_moves
= (speed_p
? 16 : AARCH64_CALL_RATIO
) / 2;
20097 /* We can't do anything smart if the amount to copy is not constant. */
20098 if (!CONST_INT_P (operands
[2]))
20101 n
= INTVAL (operands
[2]);
20103 /* Try to keep the number of instructions low. For all cases we will do at
20104 most two moves for the residual amount, since we'll always overlap the
20106 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_num_moves
)
20109 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
20110 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
20112 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
20113 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
20115 /* Convert n to bits to make the rest of the code simpler. */
20116 n
= n
* BITS_PER_UNIT
;
20118 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
20119 larger than TImode, but we should not use them for loads/stores here. */
20120 const int copy_limit
= GET_MODE_BITSIZE (TImode
);
20124 /* Find the largest mode in which to do the copy in without over reading
20126 opt_scalar_int_mode mode_iter
;
20127 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
20128 if (GET_MODE_BITSIZE (mode_iter
.require ()) <= MIN (n
, copy_limit
))
20129 cur_mode
= mode_iter
.require ();
20131 gcc_assert (cur_mode
!= BLKmode
);
20133 mode_bits
= GET_MODE_BITSIZE (cur_mode
).to_constant ();
20134 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, cur_mode
);
20138 /* Do certain trailing copies as overlapping if it's going to be
20139 cheaper. i.e. less instructions to do so. For instance doing a 15
20140 byte copy it's more efficient to do two overlapping 8 byte copies than
20142 if (n
> 0 && n
<= 8 * BITS_PER_UNIT
)
20144 next_mode
= smallest_mode_for_size (n
, MODE_INT
);
20145 int n_bits
= GET_MODE_BITSIZE (next_mode
).to_constant ();
20146 src
= aarch64_move_pointer (src
, (n
- n_bits
) / BITS_PER_UNIT
);
20147 dst
= aarch64_move_pointer (dst
, (n
- n_bits
) / BITS_PER_UNIT
);
20155 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
20156 SImode stores. Handle the case when the constant has identical
20157 bottom and top halves. This is beneficial when the two stores can be
20158 merged into an STP and we avoid synthesising potentially expensive
20159 immediates twice. Return true if such a split is possible. */
20162 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
20164 rtx lo
= gen_lowpart (SImode
, src
);
20165 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
20167 bool size_p
= optimize_function_for_size_p (cfun
);
20169 if (!rtx_equal_p (lo
, hi
))
20172 unsigned int orig_cost
20173 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
20174 unsigned int lo_cost
20175 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
20177 /* We want to transform:
20179 MOVK x1, 0x140, lsl 16
20180 MOVK x1, 0xc0da, lsl 32
20181 MOVK x1, 0x140, lsl 48
20185 MOVK w1, 0x140, lsl 16
20187 So we want to perform this only when we save two instructions
20188 or more. When optimizing for size, however, accept any code size
20190 if (size_p
&& orig_cost
<= lo_cost
)
20194 && (orig_cost
<= lo_cost
+ 1))
20197 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
20198 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
20201 rtx tmp_reg
= gen_reg_rtx (SImode
);
20202 aarch64_expand_mov_immediate (tmp_reg
, lo
);
20203 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
20204 /* Don't emit an explicit store pair as this may not be always profitable.
20205 Let the sched-fusion logic decide whether to merge them. */
20206 emit_move_insn (mem_lo
, tmp_reg
);
20207 emit_move_insn (mem_hi
, tmp_reg
);
20212 /* Generate RTL for a conditional branch with rtx comparison CODE in
20213 mode CC_MODE. The destination of the unlikely conditional branch
20217 aarch64_gen_unlikely_cbranch (enum rtx_code code
, machine_mode cc_mode
,
20221 x
= gen_rtx_fmt_ee (code
, VOIDmode
,
20222 gen_rtx_REG (cc_mode
, CC_REGNUM
),
20225 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
20226 gen_rtx_LABEL_REF (VOIDmode
, label_ref
),
20228 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
20231 /* Generate DImode scratch registers for 128-bit (TImode) addition.
20233 OP1 represents the TImode destination operand 1
20234 OP2 represents the TImode destination operand 2
20235 LOW_DEST represents the low half (DImode) of TImode operand 0
20236 LOW_IN1 represents the low half (DImode) of TImode operand 1
20237 LOW_IN2 represents the low half (DImode) of TImode operand 2
20238 HIGH_DEST represents the high half (DImode) of TImode operand 0
20239 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20240 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20243 aarch64_addti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
20244 rtx
*low_in1
, rtx
*low_in2
,
20245 rtx
*high_dest
, rtx
*high_in1
,
20248 *low_dest
= gen_reg_rtx (DImode
);
20249 *low_in1
= gen_lowpart (DImode
, op1
);
20250 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
20251 subreg_lowpart_offset (DImode
, TImode
));
20252 *high_dest
= gen_reg_rtx (DImode
);
20253 *high_in1
= gen_highpart (DImode
, op1
);
20254 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
20255 subreg_highpart_offset (DImode
, TImode
));
20258 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
20260 This function differs from 'arch64_addti_scratch_regs' in that
20261 OP1 can be an immediate constant (zero). We must call
20262 subreg_highpart_offset with DImode and TImode arguments, otherwise
20263 VOIDmode will be used for the const_int which generates an internal
20264 error from subreg_size_highpart_offset which does not expect a size of zero.
20266 OP1 represents the TImode destination operand 1
20267 OP2 represents the TImode destination operand 2
20268 LOW_DEST represents the low half (DImode) of TImode operand 0
20269 LOW_IN1 represents the low half (DImode) of TImode operand 1
20270 LOW_IN2 represents the low half (DImode) of TImode operand 2
20271 HIGH_DEST represents the high half (DImode) of TImode operand 0
20272 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20273 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20277 aarch64_subvti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
20278 rtx
*low_in1
, rtx
*low_in2
,
20279 rtx
*high_dest
, rtx
*high_in1
,
20282 *low_dest
= gen_reg_rtx (DImode
);
20283 *low_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
20284 subreg_lowpart_offset (DImode
, TImode
));
20286 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
20287 subreg_lowpart_offset (DImode
, TImode
));
20288 *high_dest
= gen_reg_rtx (DImode
);
20290 *high_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
20291 subreg_highpart_offset (DImode
, TImode
));
20292 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
20293 subreg_highpart_offset (DImode
, TImode
));
20296 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
20298 OP0 represents the TImode destination operand 0
20299 LOW_DEST represents the low half (DImode) of TImode operand 0
20300 LOW_IN1 represents the low half (DImode) of TImode operand 1
20301 LOW_IN2 represents the low half (DImode) of TImode operand 2
20302 HIGH_DEST represents the high half (DImode) of TImode operand 0
20303 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20304 HIGH_IN2 represents the high half (DImode) of TImode operand 2
20305 UNSIGNED_P is true if the operation is being performed on unsigned
20308 aarch64_expand_subvti (rtx op0
, rtx low_dest
, rtx low_in1
,
20309 rtx low_in2
, rtx high_dest
, rtx high_in1
,
20310 rtx high_in2
, bool unsigned_p
)
20312 if (low_in2
== const0_rtx
)
20314 low_dest
= low_in1
;
20315 high_in2
= force_reg (DImode
, high_in2
);
20317 emit_insn (gen_subdi3_compare1 (high_dest
, high_in1
, high_in2
));
20319 emit_insn (gen_subvdi_insn (high_dest
, high_in1
, high_in2
));
20323 if (aarch64_plus_immediate (low_in2
, DImode
))
20324 emit_insn (gen_subdi3_compare1_imm (low_dest
, low_in1
, low_in2
,
20325 GEN_INT (-INTVAL (low_in2
))));
20328 low_in2
= force_reg (DImode
, low_in2
);
20329 emit_insn (gen_subdi3_compare1 (low_dest
, low_in1
, low_in2
));
20331 high_in2
= force_reg (DImode
, high_in2
);
20334 emit_insn (gen_usubdi3_carryinC (high_dest
, high_in1
, high_in2
));
20336 emit_insn (gen_subdi3_carryinV (high_dest
, high_in1
, high_in2
));
20339 emit_move_insn (gen_lowpart (DImode
, op0
), low_dest
);
20340 emit_move_insn (gen_highpart (DImode
, op0
), high_dest
);
20344 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
20346 static unsigned HOST_WIDE_INT
20347 aarch64_asan_shadow_offset (void)
20350 return (HOST_WIDE_INT_1
<< 29);
20352 return (HOST_WIDE_INT_1
<< 36);
20356 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
20357 int code
, tree treeop0
, tree treeop1
)
20359 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
20361 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
20363 struct expand_operand ops
[4];
20366 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
20368 op_mode
= GET_MODE (op0
);
20369 if (op_mode
== VOIDmode
)
20370 op_mode
= GET_MODE (op1
);
20378 icode
= CODE_FOR_cmpsi
;
20383 icode
= CODE_FOR_cmpdi
;
20388 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
20389 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
20394 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
20395 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
20403 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
20404 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
20410 *prep_seq
= get_insns ();
20413 create_fixed_operand (&ops
[0], op0
);
20414 create_fixed_operand (&ops
[1], op1
);
20417 if (!maybe_expand_insn (icode
, 2, ops
))
20422 *gen_seq
= get_insns ();
20425 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
20426 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
20430 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
20431 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
20433 rtx op0
, op1
, target
;
20434 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
20435 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
20437 struct expand_operand ops
[6];
20440 push_to_sequence (*prep_seq
);
20441 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
20443 op_mode
= GET_MODE (op0
);
20444 if (op_mode
== VOIDmode
)
20445 op_mode
= GET_MODE (op1
);
20461 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
20466 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
20474 icode
= code_for_ccmp (cc_mode
, cmp_mode
);
20476 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
20477 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
20483 *prep_seq
= get_insns ();
20486 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
20487 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
20489 if (bit_code
!= AND
)
20491 /* Treat the ccmp patterns as canonical and use them where possible,
20492 but fall back to ccmp_rev patterns if there's no other option. */
20493 rtx_code prev_code
= GET_CODE (prev
);
20494 machine_mode prev_mode
= GET_MODE (XEXP (prev
, 0));
20495 if ((prev_mode
== CCFPmode
|| prev_mode
== CCFPEmode
)
20496 && !(prev_code
== EQ
20498 || prev_code
== ORDERED
20499 || prev_code
== UNORDERED
))
20500 icode
= code_for_ccmp_rev (cc_mode
, cmp_mode
);
20503 rtx_code code
= reverse_condition (prev_code
);
20504 prev
= gen_rtx_fmt_ee (code
, VOIDmode
, XEXP (prev
, 0), const0_rtx
);
20506 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
20509 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
20510 create_fixed_operand (&ops
[1], target
);
20511 create_fixed_operand (&ops
[2], op0
);
20512 create_fixed_operand (&ops
[3], op1
);
20513 create_fixed_operand (&ops
[4], prev
);
20514 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
20516 push_to_sequence (*gen_seq
);
20517 if (!maybe_expand_insn (icode
, 6, ops
))
20523 *gen_seq
= get_insns ();
20526 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
20529 #undef TARGET_GEN_CCMP_FIRST
20530 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
20532 #undef TARGET_GEN_CCMP_NEXT
20533 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
20535 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
20536 instruction fusion of some sort. */
20539 aarch64_macro_fusion_p (void)
20541 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
20545 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
20546 should be kept together during scheduling. */
20549 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
20552 rtx prev_set
= single_set (prev
);
20553 rtx curr_set
= single_set (curr
);
20554 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
20555 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
20557 if (!aarch64_macro_fusion_p ())
20560 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
20562 /* We are trying to match:
20563 prev (mov) == (set (reg r0) (const_int imm16))
20564 curr (movk) == (set (zero_extract (reg r0)
20567 (const_int imm16_1)) */
20569 set_dest
= SET_DEST (curr_set
);
20571 if (GET_CODE (set_dest
) == ZERO_EXTRACT
20572 && CONST_INT_P (SET_SRC (curr_set
))
20573 && CONST_INT_P (SET_SRC (prev_set
))
20574 && CONST_INT_P (XEXP (set_dest
, 2))
20575 && INTVAL (XEXP (set_dest
, 2)) == 16
20576 && REG_P (XEXP (set_dest
, 0))
20577 && REG_P (SET_DEST (prev_set
))
20578 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
20584 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
20587 /* We're trying to match:
20588 prev (adrp) == (set (reg r1)
20589 (high (symbol_ref ("SYM"))))
20590 curr (add) == (set (reg r0)
20592 (symbol_ref ("SYM"))))
20593 Note that r0 need not necessarily be the same as r1, especially
20594 during pre-regalloc scheduling. */
20596 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
20597 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
20599 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
20600 && REG_P (XEXP (SET_SRC (curr_set
), 0))
20601 && REGNO (XEXP (SET_SRC (curr_set
), 0))
20602 == REGNO (SET_DEST (prev_set
))
20603 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
20604 XEXP (SET_SRC (curr_set
), 1)))
20609 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
20612 /* We're trying to match:
20613 prev (movk) == (set (zero_extract (reg r0)
20616 (const_int imm16_1))
20617 curr (movk) == (set (zero_extract (reg r0)
20620 (const_int imm16_2)) */
20622 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
20623 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
20624 && REG_P (XEXP (SET_DEST (prev_set
), 0))
20625 && REG_P (XEXP (SET_DEST (curr_set
), 0))
20626 && REGNO (XEXP (SET_DEST (prev_set
), 0))
20627 == REGNO (XEXP (SET_DEST (curr_set
), 0))
20628 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
20629 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
20630 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
20631 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
20632 && CONST_INT_P (SET_SRC (prev_set
))
20633 && CONST_INT_P (SET_SRC (curr_set
)))
20637 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
20639 /* We're trying to match:
20640 prev (adrp) == (set (reg r0)
20641 (high (symbol_ref ("SYM"))))
20642 curr (ldr) == (set (reg r1)
20643 (mem (lo_sum (reg r0)
20644 (symbol_ref ("SYM")))))
20646 curr (ldr) == (set (reg r1)
20649 (symbol_ref ("SYM")))))) */
20650 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
20651 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
20653 rtx curr_src
= SET_SRC (curr_set
);
20655 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
20656 curr_src
= XEXP (curr_src
, 0);
20658 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
20659 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
20660 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
20661 == REGNO (SET_DEST (prev_set
))
20662 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
20663 XEXP (SET_SRC (prev_set
), 0)))
20668 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
20669 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
20670 && prev_set
&& curr_set
&& any_condjump_p (curr
)
20671 && GET_CODE (SET_SRC (prev_set
)) == COMPARE
20672 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set
), 0)))
20673 && reg_referenced_p (SET_DEST (prev_set
), PATTERN (curr
)))
20676 /* Fuse flag-setting ALU instructions and conditional branch. */
20677 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
20678 && any_condjump_p (curr
))
20680 unsigned int condreg1
, condreg2
;
20682 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
20683 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
20685 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
20687 && modified_in_p (cc_reg_1
, prev
))
20689 enum attr_type prev_type
= get_attr_type (prev
);
20691 /* FIXME: this misses some instructions which are considered simple
20692 arithmetic instructions for ThunderX.  Simple shifts are missed here. */
20693 if (prev_type
== TYPE_ALUS_SREG
20694 || prev_type
== TYPE_ALUS_IMM
20695 || prev_type
== TYPE_LOGICS_REG
20696 || prev_type
== TYPE_LOGICS_IMM
)
20701 /* Fuse ALU instructions and CBZ/CBNZ. */
20704 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ
)
20705 && any_condjump_p (curr
))
20707 /* We're trying to match:
20708 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
20709 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
20711 (label_ref ("SYM"))
20713 if (SET_DEST (curr_set
) == (pc_rtx
)
20714 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
20715 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
20716 && REG_P (SET_DEST (prev_set
))
20717 && REGNO (SET_DEST (prev_set
))
20718 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
20720 /* Fuse ALU operations followed by conditional branch instruction. */
20721 switch (get_attr_type (prev
))
20724 case TYPE_ALU_SREG
:
20727 case TYPE_ADCS_REG
:
20728 case TYPE_ADCS_IMM
:
20729 case TYPE_LOGIC_REG
:
20730 case TYPE_LOGIC_IMM
:
20734 case TYPE_SHIFT_REG
:
20735 case TYPE_SHIFT_IMM
:
20750 /* Return true iff the instruction fusion described by OP is enabled. */
20753 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op
)
20755 return (aarch64_tune_params
.fusible_ops
& op
) != 0;
20758 /* If MEM is in the form of [base+offset], extract the two parts
20759 of address and set to BASE and OFFSET, otherwise return false
20760 after clearing BASE and OFFSET. */
20763 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
20767 gcc_assert (MEM_P (mem
));
20769 addr
= XEXP (mem
, 0);
20774 *offset
= const0_rtx
;
20778 if (GET_CODE (addr
) == PLUS
20779 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
20781 *base
= XEXP (addr
, 0);
20782 *offset
= XEXP (addr
, 1);
20787 *offset
= NULL_RTX
;
20792 /* Types for scheduling fusion. */
20793 enum sched_fusion_type
20795 SCHED_FUSION_NONE
= 0,
20796 SCHED_FUSION_LD_SIGN_EXTEND
,
20797 SCHED_FUSION_LD_ZERO_EXTEND
,
20803 /* If INSN is a load or store of address in the form of [base+offset],
20804 extract the two parts and set to BASE and OFFSET. Return scheduling
20805 fusion type this INSN is. */
20807 static enum sched_fusion_type
20808 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
20811 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
20813 gcc_assert (INSN_P (insn
));
20814 x
= PATTERN (insn
);
20815 if (GET_CODE (x
) != SET
)
20816 return SCHED_FUSION_NONE
;
20819 dest
= SET_DEST (x
);
20821 machine_mode dest_mode
= GET_MODE (dest
);
20823 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
20824 return SCHED_FUSION_NONE
;
20826 if (GET_CODE (src
) == SIGN_EXTEND
)
20828 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
20829 src
= XEXP (src
, 0);
20830 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
20831 return SCHED_FUSION_NONE
;
20833 else if (GET_CODE (src
) == ZERO_EXTEND
)
20835 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
20836 src
= XEXP (src
, 0);
20837 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
20838 return SCHED_FUSION_NONE
;
20841 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
20842 extract_base_offset_in_addr (src
, base
, offset
);
20843 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
20845 fusion
= SCHED_FUSION_ST
;
20846 extract_base_offset_in_addr (dest
, base
, offset
);
20849 return SCHED_FUSION_NONE
;
20851 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
20852 fusion
= SCHED_FUSION_NONE
;
20857 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
20859 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
20860 and PRI are only calculated for these instructions.  For other instructions,
20861 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
20862 type instruction fusion can be added by returning different priorities.
20864 It's important that irrelevant instructions get the largest FUSION_PRI. */
20867 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
20868 int *fusion_pri
, int *pri
)
20872 enum sched_fusion_type fusion
;
20874 gcc_assert (INSN_P (insn
));
20877 fusion
= fusion_load_store (insn
, &base
, &offset
);
20878 if (fusion
== SCHED_FUSION_NONE
)
20885 /* Set FUSION_PRI according to fusion type and base register. */
20886 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
20888 /* Calculate PRI. */
20891 /* INSN with smaller offset goes first. */
20892 off_val
= (int)(INTVAL (offset
));
20894 tmp
-= (off_val
& 0xfffff);
20896 tmp
+= ((- off_val
) & 0xfffff);
20902 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
20903 Adjust priority of sha1h instructions so they are scheduled before
20904 other SHA1 instructions. */
20907 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
20909 rtx x
= PATTERN (insn
);
20911 if (GET_CODE (x
) == SET
)
20915 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
20916 return priority
+ 10;
20922 /* Given OPERANDS of consecutive load/store, check if we can merge
20923 them into ldp/stp. LOAD is true if they are load instructions.
20924 MODE is the mode of memory operands. */
20927 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
20930 HOST_WIDE_INT offval_1
, offval_2
, msize
;
20931 enum reg_class rclass_1
, rclass_2
;
20932 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
20936 mem_1
= operands
[1];
20937 mem_2
= operands
[3];
20938 reg_1
= operands
[0];
20939 reg_2
= operands
[2];
20940 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
20941 if (REGNO (reg_1
) == REGNO (reg_2
))
20946 mem_1
= operands
[0];
20947 mem_2
= operands
[2];
20948 reg_1
= operands
[1];
20949 reg_2
= operands
[3];
20952 /* The mems cannot be volatile. */
20953 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
20956 /* If we have SImode and slow unaligned ldp,
20957 check the alignment to be at least 8 byte. */
20959 && (aarch64_tune_params
.extra_tuning_flags
20960 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
20962 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
20965 /* Check if the addresses are in the form of [base+offset]. */
20966 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
20967 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
20969 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
20970 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
20973 /* Check if the bases are same. */
20974 if (!rtx_equal_p (base_1
, base_2
))
20977 /* The operands must be of the same size. */
20978 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
20979 GET_MODE_SIZE (GET_MODE (mem_2
))));
20981 offval_1
= INTVAL (offset_1
);
20982 offval_2
= INTVAL (offset_2
);
20983 /* We should only be trying this for fixed-sized modes. There is no
20984 SVE LDP/STP instruction. */
20985 msize
= GET_MODE_SIZE (mode
).to_constant ();
20986 /* Check if the offsets are consecutive. */
20987 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
20990 /* Check if the addresses are clobbered by load. */
20993 if (reg_mentioned_p (reg_1
, mem_1
))
20996 /* In increasing order, the last load can clobber the address. */
20997 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
21001 /* One of the memory accesses must be a mempair operand.
21002 If it is not the first one, they need to be swapped by the
21004 if (!aarch64_mem_pair_operand (mem_1
, GET_MODE (mem_1
))
21005 && !aarch64_mem_pair_operand (mem_2
, GET_MODE (mem_2
)))
21008 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
21009 rclass_1
= FP_REGS
;
21011 rclass_1
= GENERAL_REGS
;
21013 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
21014 rclass_2
= FP_REGS
;
21016 rclass_2
= GENERAL_REGS
;
21018 /* Check if the registers are of same class. */
21019 if (rclass_1
!= rclass_2
)
21025 /* Given OPERANDS of consecutive load/store that can be merged,
21026 swap them if they are not in ascending order. */
21028 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
21030 rtx mem_1
, mem_2
, base_1
, base_2
, offset_1
, offset_2
;
21031 HOST_WIDE_INT offval_1
, offval_2
;
21035 mem_1
= operands
[1];
21036 mem_2
= operands
[3];
21040 mem_1
= operands
[0];
21041 mem_2
= operands
[2];
21044 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
21045 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
21047 offval_1
= INTVAL (offset_1
);
21048 offval_2
= INTVAL (offset_2
);
21050 if (offval_1
> offval_2
)
21052 /* Irrespective of whether this is a load or a store,
21053 we do the same swap. */
21054 std::swap (operands
[0], operands
[2]);
21055 std::swap (operands
[1], operands
[3]);
21059 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
21060 comparison between the two. */
21062 aarch64_host_wide_int_compare (const void *x
, const void *y
)
21064 return wi::cmps (* ((const HOST_WIDE_INT
*) x
),
21065 * ((const HOST_WIDE_INT
*) y
));
21068 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
21069 other pointing to a REG rtx containing an offset, compare the offsets
21074 1 iff offset (X) > offset (Y)
21075 0 iff offset (X) == offset (Y)
21076 -1 iff offset (X) < offset (Y) */
21078 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
21080 const rtx
* operands_1
= (const rtx
*) x
;
21081 const rtx
* operands_2
= (const rtx
*) y
;
21082 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
21084 if (MEM_P (operands_1
[0]))
21085 mem_1
= operands_1
[0];
21087 mem_1
= operands_1
[1];
21089 if (MEM_P (operands_2
[0]))
21090 mem_2
= operands_2
[0];
21092 mem_2
= operands_2
[1];
21094 /* Extract the offsets. */
21095 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
21096 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
21098 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
21100 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
21103 /* Given OPERANDS of consecutive load/store, check if we can merge
21104 them into ldp/stp by adjusting the offset. LOAD is true if they
21105 are load instructions. MODE is the mode of memory operands.
21107 Given below consecutive stores:
21109 str w1, [xb, 0x100]
21110 str w1, [xb, 0x104]
21111 str w1, [xb, 0x108]
21112 str w1, [xb, 0x10c]
21114 Though the offsets are out of the range supported by stp, we can
21115 still pair them after adjusting the offset, like:
21117 add scratch, xb, 0x100
21118 stp w1, w1, [scratch]
21119 stp w1, w1, [scratch, 0x8]
21121 The peephole patterns detecting this opportunity should guarantee
21122 the scratch register is available. */
21125 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
21128 const int num_insns
= 4;
21129 enum reg_class rclass
;
21130 HOST_WIDE_INT offvals
[num_insns
], msize
;
21131 rtx mem
[num_insns
], reg
[num_insns
], base
[num_insns
], offset
[num_insns
];
21135 for (int i
= 0; i
< num_insns
; i
++)
21137 reg
[i
] = operands
[2 * i
];
21138 mem
[i
] = operands
[2 * i
+ 1];
21140 gcc_assert (REG_P (reg
[i
]));
21143 /* Do not attempt to merge the loads if the loads clobber each other. */
21144 for (int i
= 0; i
< 8; i
+= 2)
21145 for (int j
= i
+ 2; j
< 8; j
+= 2)
21146 if (reg_overlap_mentioned_p (operands
[i
], operands
[j
]))
21150 for (int i
= 0; i
< num_insns
; i
++)
21152 mem
[i
] = operands
[2 * i
];
21153 reg
[i
] = operands
[2 * i
+ 1];
21156 /* Skip if memory operand is by itself valid for ldp/stp. */
21157 if (!MEM_P (mem
[0]) || aarch64_mem_pair_operand (mem
[0], mode
))
21160 for (int i
= 0; i
< num_insns
; i
++)
21162 /* The mems cannot be volatile. */
21163 if (MEM_VOLATILE_P (mem
[i
]))
21166 /* Check if the addresses are in the form of [base+offset]. */
21167 extract_base_offset_in_addr (mem
[i
], base
+ i
, offset
+ i
);
21168 if (base
[i
] == NULL_RTX
|| offset
[i
] == NULL_RTX
)
21172 /* Check if the registers are of same class. */
21173 rclass
= REG_P (reg
[0]) && FP_REGNUM_P (REGNO (reg
[0]))
21174 ? FP_REGS
: GENERAL_REGS
;
21176 for (int i
= 1; i
< num_insns
; i
++)
21177 if (REG_P (reg
[i
]) && FP_REGNUM_P (REGNO (reg
[i
])))
21179 if (rclass
!= FP_REGS
)
21184 if (rclass
!= GENERAL_REGS
)
21188 /* Only the last register in the order in which they occur
21189 may be clobbered by the load. */
21190 if (rclass
== GENERAL_REGS
&& load
)
21191 for (int i
= 0; i
< num_insns
- 1; i
++)
21192 if (reg_mentioned_p (reg
[i
], mem
[i
]))
21195 /* Check if the bases are same. */
21196 for (int i
= 0; i
< num_insns
- 1; i
++)
21197 if (!rtx_equal_p (base
[i
], base
[i
+ 1]))
21200 for (int i
= 0; i
< num_insns
; i
++)
21201 offvals
[i
] = INTVAL (offset
[i
]);
21203 msize
= GET_MODE_SIZE (mode
);
21205 /* Check if the offsets can be put in the right order to do a ldp/stp. */
21206 qsort (offvals
, num_insns
, sizeof (HOST_WIDE_INT
),
21207 aarch64_host_wide_int_compare
);
21209 if (!(offvals
[1] == offvals
[0] + msize
21210 && offvals
[3] == offvals
[2] + msize
))
21213 /* Check that offsets are within range of each other. The ldp/stp
21214 instructions have 7 bit immediate offsets, so use 0x80. */
21215 if (offvals
[2] - offvals
[0] >= msize
* 0x80)
21218 /* The offsets must be aligned with respect to each other. */
21219 if (offvals
[0] % msize
!= offvals
[2] % msize
)
21222 /* If we have SImode and slow unaligned ldp,
21223 check the alignment to be at least 8 byte. */
21225 && (aarch64_tune_params
.extra_tuning_flags
21226 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
21228 && MEM_ALIGN (mem
[0]) < 8 * BITS_PER_UNIT
)
21234 /* Given OPERANDS of consecutive load/store, this function pairs them
21235 into LDP/STP after adjusting the offset. It depends on the fact
21236 that the operands can be sorted so the offsets are correct for STP.
21237 MODE is the mode of memory operands. CODE is the rtl operator
21238 which should be applied to all memory operands, it's SIGN_EXTEND,
21239 ZERO_EXTEND or UNKNOWN. */
21242 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
21243 scalar_mode mode
, RTX_CODE code
)
21245 rtx base
, offset_1
, offset_3
, t1
, t2
;
21246 rtx mem_1
, mem_2
, mem_3
, mem_4
;
21247 rtx temp_operands
[8];
21248 HOST_WIDE_INT off_val_1
, off_val_3
, base_off
, new_off_1
, new_off_3
,
21249 stp_off_upper_limit
, stp_off_lower_limit
, msize
;
21251 /* We make changes on a copy as we may still bail out. */
21252 for (int i
= 0; i
< 8; i
++)
21253 temp_operands
[i
] = operands
[i
];
21255 /* Sort the operands. */
21256 qsort (temp_operands
, 4, 2 * sizeof (rtx
*), aarch64_ldrstr_offset_compare
);
21258 /* Copy the memory operands so that if we have to bail for some
21259 reason the original addresses are unchanged. */
21262 mem_1
= copy_rtx (temp_operands
[1]);
21263 mem_2
= copy_rtx (temp_operands
[3]);
21264 mem_3
= copy_rtx (temp_operands
[5]);
21265 mem_4
= copy_rtx (temp_operands
[7]);
21269 mem_1
= copy_rtx (temp_operands
[0]);
21270 mem_2
= copy_rtx (temp_operands
[2]);
21271 mem_3
= copy_rtx (temp_operands
[4]);
21272 mem_4
= copy_rtx (temp_operands
[6]);
21273 gcc_assert (code
== UNKNOWN
);
21276 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
21277 extract_base_offset_in_addr (mem_3
, &base
, &offset_3
);
21278 gcc_assert (base
!= NULL_RTX
&& offset_1
!= NULL_RTX
21279 && offset_3
!= NULL_RTX
);
21281 /* Adjust offset so it can fit in LDP/STP instruction. */
21282 msize
= GET_MODE_SIZE (mode
);
21283 stp_off_upper_limit
= msize
* (0x40 - 1);
21284 stp_off_lower_limit
= - msize
* 0x40;
21286 off_val_1
= INTVAL (offset_1
);
21287 off_val_3
= INTVAL (offset_3
);
21289 /* The base offset is optimally half way between the two STP/LDP offsets. */
21291 base_off
= (off_val_1
+ off_val_3
) / 2;
21293 /* However, due to issues with negative LDP/STP offset generation for
21294 larger modes (DF, DI and vector modes), we must not use negative
21295 addresses smaller than 9 signed unadjusted bits can store. This
21296 provides the most range in this case. */
21297 base_off
= off_val_1
;
21299 /* Adjust the base so that it is aligned with the addresses but still
21301 if (base_off
% msize
!= off_val_1
% msize
)
21302 /* Fix the offset, bearing in mind we want to make it bigger not
21304 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
21305 else if (msize
<= 4)
21306 /* The negative range of LDP/STP is one larger than the positive range. */
21309 /* Check if base offset is too big or too small. We can attempt to resolve
21310 this issue by setting it to the maximum value and seeing if the offsets
21312 if (base_off
>= 0x1000)
21314 base_off
= 0x1000 - 1;
21315 /* We must still make sure that the base offset is aligned with respect
21316 to the address. But it may not be made any bigger. */
21317 base_off
-= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
21320 /* Likewise for the case where the base is too small. */
21321 if (base_off
<= -0x1000)
21323 base_off
= -0x1000 + 1;
21324 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
21327 /* Offset of the first STP/LDP. */
21328 new_off_1
= off_val_1
- base_off
;
21330 /* Offset of the second STP/LDP. */
21331 new_off_3
= off_val_3
- base_off
;
21333 /* The offsets must be within the range of the LDP/STP instructions. */
21334 if (new_off_1
> stp_off_upper_limit
|| new_off_1
< stp_off_lower_limit
21335 || new_off_3
> stp_off_upper_limit
|| new_off_3
< stp_off_lower_limit
)
21338 replace_equiv_address_nv (mem_1
, plus_constant (Pmode
, operands
[8],
21340 replace_equiv_address_nv (mem_2
, plus_constant (Pmode
, operands
[8],
21341 new_off_1
+ msize
), true);
21342 replace_equiv_address_nv (mem_3
, plus_constant (Pmode
, operands
[8],
21344 replace_equiv_address_nv (mem_4
, plus_constant (Pmode
, operands
[8],
21345 new_off_3
+ msize
), true);
21347 if (!aarch64_mem_pair_operand (mem_1
, mode
)
21348 || !aarch64_mem_pair_operand (mem_3
, mode
))
21351 if (code
== ZERO_EXTEND
)
21353 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
21354 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
21355 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
21356 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
21358 else if (code
== SIGN_EXTEND
)
21360 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
21361 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
21362 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
21363 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
21368 operands
[0] = temp_operands
[0];
21369 operands
[1] = mem_1
;
21370 operands
[2] = temp_operands
[2];
21371 operands
[3] = mem_2
;
21372 operands
[4] = temp_operands
[4];
21373 operands
[5] = mem_3
;
21374 operands
[6] = temp_operands
[6];
21375 operands
[7] = mem_4
;
21379 operands
[0] = mem_1
;
21380 operands
[1] = temp_operands
[1];
21381 operands
[2] = mem_2
;
21382 operands
[3] = temp_operands
[3];
21383 operands
[4] = mem_3
;
21384 operands
[5] = temp_operands
[5];
21385 operands
[6] = mem_4
;
21386 operands
[7] = temp_operands
[7];
21389 /* Emit adjusting instruction. */
21390 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, base_off
)));
21391 /* Emit ldp/stp instructions. */
21392 t1
= gen_rtx_SET (operands
[0], operands
[1]);
21393 t2
= gen_rtx_SET (operands
[2], operands
[3]);
21394 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
21395 t1
= gen_rtx_SET (operands
[4], operands
[5]);
21396 t2
= gen_rtx_SET (operands
[6], operands
[7]);
21397 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
21401 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
21402 it isn't worth branching around empty masked ops (including masked
21406 aarch64_empty_mask_is_expensive (unsigned)
21411 /* Return 1 if pseudo register should be created and used to hold
21412 GOT address for PIC code. */
21415 aarch64_use_pseudo_pic_reg (void)
21417 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
;
21420 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
21423 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
21425 switch (XINT (x
, 1))
21427 case UNSPEC_GOTSMALLPIC
:
21428 case UNSPEC_GOTSMALLPIC28K
:
21429 case UNSPEC_GOTTINYPIC
:
21435 return default_unspec_may_trap_p (x
, flags
);
21439 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
21440 return the log2 of that value. Otherwise return -1. */
21443 aarch64_fpconst_pow_of_2 (rtx x
)
21445 const REAL_VALUE_TYPE
*r
;
21447 if (!CONST_DOUBLE_P (x
))
21450 r
= CONST_DOUBLE_REAL_VALUE (x
);
21452 if (REAL_VALUE_NEGATIVE (*r
)
21453 || REAL_VALUE_ISNAN (*r
)
21454 || REAL_VALUE_ISINF (*r
)
21455 || !real_isinteger (r
, DFmode
))
21458 return exact_log2 (real_to_integer (r
));
21461 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
21462 power of 2 (i.e 1/2^n) return the number of float bits. e.g. for x==(1/2^n)
21463 return n. Otherwise return -1. */
21466 aarch64_fpconst_pow2_recip (rtx x
)
21468 REAL_VALUE_TYPE r0
;
21470 if (!CONST_DOUBLE_P (x
))
21473 r0
= *CONST_DOUBLE_REAL_VALUE (x
);
21474 if (exact_real_inverse (DFmode
, &r0
)
21475 && !REAL_VALUE_NEGATIVE (r0
))
21477 int ret
= exact_log2 (real_to_integer (&r0
));
21478 if (ret
>= 1 && ret
<= 32)
21484 /* If X is a vector of equal CONST_DOUBLE values and that value is
21485 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
21488 aarch64_vec_fpconst_pow_of_2 (rtx x
)
21491 if (GET_CODE (x
) != CONST_VECTOR
21492 || !CONST_VECTOR_NUNITS (x
).is_constant (&nelts
))
21495 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
21498 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
21502 for (int i
= 1; i
< nelts
; i
++)
21503 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
21509 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
21512 __fp16 always promotes through this hook.
21513 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
21514 through the generic excess precision logic rather than here. */
21517 aarch64_promoted_type (const_tree t
)
21519 if (SCALAR_FLOAT_TYPE_P (t
)
21520 && TYPE_MAIN_VARIANT (t
) == aarch64_fp16_type_node
)
21521 return float_type_node
;
21526 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
21529 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
21530 optimization_type opt_type
)
21535 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
21542 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
21544 static unsigned int
21545 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
21548 /* Polynomial invariant 1 == (VG / 2) - 1. */
21549 gcc_assert (i
== 1);
21552 return AARCH64_DWARF_VG
;
21555 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
21556 if MODE is HFmode, and punt to the generic implementation otherwise. */
21559 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
21561 return (mode
== HFmode
21563 : default_libgcc_floating_mode_supported_p (mode
));
21566 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
21567 if MODE is HFmode, and punt to the generic implementation otherwise. */
21570 aarch64_scalar_mode_supported_p (scalar_mode mode
)
21572 return (mode
== HFmode
21574 : default_scalar_mode_supported_p (mode
));
21577 /* Set the value of FLT_EVAL_METHOD.
21578 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
21580 0: evaluate all operations and constants, whose semantic type has at
21581 most the range and precision of type float, to the range and
21582 precision of float; evaluate all other operations and constants to
21583 the range and precision of the semantic type;
21585 N, where _FloatN is a supported interchange floating type
21586 evaluate all operations and constants, whose semantic type has at
21587 most the range and precision of _FloatN type, to the range and
21588 precision of the _FloatN type; evaluate all other operations and
21589 constants to the range and precision of the semantic type;
21591 If we have the ARMv8.2-A extensions then we support _Float16 in native
21592 precision, so we should set this to 16. Otherwise, we support the type,
21593 but want to evaluate expressions in float precision, so set this to
21596 static enum flt_eval_method
21597 aarch64_excess_precision (enum excess_precision_type type
)
21601 case EXCESS_PRECISION_TYPE_FAST
:
21602 case EXCESS_PRECISION_TYPE_STANDARD
:
21603 /* We can calculate either in 16-bit range and precision or
21604 32-bit range and precision. Make that decision based on whether
21605 we have native support for the ARMv8.2-A 16-bit floating-point
21606 instructions or not. */
21607 return (TARGET_FP_F16INST
21608 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
21609 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
21610 case EXCESS_PRECISION_TYPE_IMPLICIT
:
21611 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
21613 gcc_unreachable ();
21615 return FLT_EVAL_METHOD_UNPREDICTABLE
;
21618 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
21619 scheduled for speculative execution. Reject the long-running division
21620 and square-root instructions. */
21623 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
21625 switch (get_attr_type (insn
))
21633 case TYPE_NEON_FP_SQRT_S
:
21634 case TYPE_NEON_FP_SQRT_D
:
21635 case TYPE_NEON_FP_SQRT_S_Q
:
21636 case TYPE_NEON_FP_SQRT_D_Q
:
21637 case TYPE_NEON_FP_DIV_S
:
21638 case TYPE_NEON_FP_DIV_D
:
21639 case TYPE_NEON_FP_DIV_S_Q
:
21640 case TYPE_NEON_FP_DIV_D_Q
:
21647 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
21650 aarch64_compute_pressure_classes (reg_class
*classes
)
21653 classes
[i
++] = GENERAL_REGS
;
21654 classes
[i
++] = FP_REGS
;
21655 /* PR_REGS isn't a useful pressure class because many predicate pseudo
21656 registers need to go in PR_LO_REGS at some point during their
21657 lifetime. Splitting it into two halves has the effect of making
21658 all predicates count against PR_LO_REGS, so that we try whenever
21659 possible to restrict the number of live predicates to 8. This
21660 greatly reduces the amount of spilling in certain loops. */
21661 classes
[i
++] = PR_LO_REGS
;
21662 classes
[i
++] = PR_HI_REGS
;
21666 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
21669 aarch64_can_change_mode_class (machine_mode from
,
21670 machine_mode to
, reg_class_t
)
21672 unsigned int from_flags
= aarch64_classify_vector_mode (from
);
21673 unsigned int to_flags
= aarch64_classify_vector_mode (to
);
21675 bool from_sve_p
= (from_flags
& VEC_ANY_SVE
);
21676 bool to_sve_p
= (to_flags
& VEC_ANY_SVE
);
21678 bool from_partial_sve_p
= from_sve_p
&& (from_flags
& VEC_PARTIAL
);
21679 bool to_partial_sve_p
= to_sve_p
&& (to_flags
& VEC_PARTIAL
);
21681 /* Don't allow changes between partial SVE modes and other modes.
21682 The contents of partial SVE modes are distributed evenly across
21683 the register, whereas GCC expects them to be clustered together. */
21684 if (from_partial_sve_p
!= to_partial_sve_p
)
21687 /* Similarly reject changes between partial SVE modes that have
21688 different patterns of significant and insignificant bits. */
21689 if (from_partial_sve_p
21690 && (aarch64_sve_container_bits (from
) != aarch64_sve_container_bits (to
)
21691 || GET_MODE_UNIT_SIZE (from
) != GET_MODE_UNIT_SIZE (to
)))
21694 if (BYTES_BIG_ENDIAN
)
21696 /* Don't allow changes between SVE data modes and non-SVE modes.
21697 See the comment at the head of aarch64-sve.md for details. */
21698 if (from_sve_p
!= to_sve_p
)
21701 /* Don't allow changes in element size: lane 0 of the new vector
21702 would not then be lane 0 of the old vector. See the comment
21703 above aarch64_maybe_expand_sve_subreg_move for a more detailed
21706 In the worst case, this forces a register to be spilled in
21707 one mode and reloaded in the other, which handles the
21708 endianness correctly. */
21709 if (from_sve_p
&& GET_MODE_UNIT_SIZE (from
) != GET_MODE_UNIT_SIZE (to
))
21715 /* Implement TARGET_EARLY_REMAT_MODES. */
21718 aarch64_select_early_remat_modes (sbitmap modes
)
21720 /* SVE values are not normally live across a call, so it should be
21721 worth doing early rematerialization even in VL-specific mode. */
21722 for (int i
= 0; i
< NUM_MACHINE_MODES
; ++i
)
21723 if (aarch64_sve_mode_p ((machine_mode
) i
))
21724 bitmap_set_bit (modes
, i
);
21727 /* Override the default target speculation_safe_value. */
21729 aarch64_speculation_safe_value (machine_mode mode
,
21730 rtx result
, rtx val
, rtx failval
)
21732 /* Maybe we should warn if falling back to hard barriers. They are
21733 likely to be noticably more expensive than the alternative below. */
21734 if (!aarch64_track_speculation
)
21735 return default_speculation_safe_value (mode
, result
, val
, failval
);
21738 val
= copy_to_mode_reg (mode
, val
);
21740 if (!aarch64_reg_or_zero (failval
, mode
))
21741 failval
= copy_to_mode_reg (mode
, failval
);
21743 emit_insn (gen_despeculate_copy (mode
, result
, val
, failval
));
21747 /* Implement TARGET_ESTIMATED_POLY_VALUE.
21748 Look into the tuning structure for an estimate.
21749 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
21750 Advanced SIMD 128 bits. */
21752 static HOST_WIDE_INT
21753 aarch64_estimated_poly_value (poly_int64 val
)
21755 enum aarch64_sve_vector_bits_enum width_source
21756 = aarch64_tune_params
.sve_width
;
21758 /* If we still don't have an estimate, use the default. */
21759 if (width_source
== SVE_SCALABLE
)
21760 return default_estimated_poly_value (val
);
21762 HOST_WIDE_INT over_128
= width_source
- 128;
21763 return val
.coeffs
[0] + val
.coeffs
[1] * over_128
/ 128;
21767 /* Return true for types that could be supported as SIMD return or
21771 supported_simd_type (tree t
)
21773 if (SCALAR_FLOAT_TYPE_P (t
) || INTEGRAL_TYPE_P (t
) || POINTER_TYPE_P (t
))
21775 HOST_WIDE_INT s
= tree_to_shwi (TYPE_SIZE_UNIT (t
));
21776 return s
== 1 || s
== 2 || s
== 4 || s
== 8;
21781 /* Return true for types that currently are supported as SIMD return
21782 or argument types. */
21785 currently_supported_simd_type (tree t
, tree b
)
21787 if (COMPLEX_FLOAT_TYPE_P (t
))
21790 if (TYPE_SIZE (t
) != TYPE_SIZE (b
))
21793 return supported_simd_type (t
);
21796 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
21799 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node
*node
,
21800 struct cgraph_simd_clone
*clonei
,
21801 tree base_type
, int num
)
21803 tree t
, ret_type
, arg_type
;
21804 unsigned int elt_bits
, vec_bits
, count
;
21809 if (clonei
->simdlen
21810 && (clonei
->simdlen
< 2
21811 || clonei
->simdlen
> 1024
21812 || (clonei
->simdlen
& (clonei
->simdlen
- 1)) != 0))
21814 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
21815 "unsupported simdlen %d", clonei
->simdlen
);
21819 ret_type
= TREE_TYPE (TREE_TYPE (node
->decl
));
21820 if (TREE_CODE (ret_type
) != VOID_TYPE
21821 && !currently_supported_simd_type (ret_type
, base_type
))
21823 if (TYPE_SIZE (ret_type
) != TYPE_SIZE (base_type
))
21824 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
21825 "GCC does not currently support mixed size types "
21826 "for %<simd%> functions");
21827 else if (supported_simd_type (ret_type
))
21828 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
21829 "GCC does not currently support return type %qT "
21830 "for %<simd%> functions", ret_type
);
21832 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
21833 "unsupported return type %qT for %<simd%> functions",
21838 for (t
= DECL_ARGUMENTS (node
->decl
); t
; t
= DECL_CHAIN (t
))
21840 arg_type
= TREE_TYPE (t
);
21842 if (!currently_supported_simd_type (arg_type
, base_type
))
21844 if (TYPE_SIZE (arg_type
) != TYPE_SIZE (base_type
))
21845 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
21846 "GCC does not currently support mixed size types "
21847 "for %<simd%> functions");
21849 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
21850 "GCC does not currently support argument type %qT "
21851 "for %<simd%> functions", arg_type
);
21856 clonei
->vecsize_mangle
= 'n';
21857 clonei
->mask_mode
= VOIDmode
;
21858 elt_bits
= GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type
));
21859 if (clonei
->simdlen
== 0)
21862 vec_bits
= (num
== 0 ? 64 : 128);
21863 clonei
->simdlen
= vec_bits
/ elt_bits
;
21868 vec_bits
= clonei
->simdlen
* elt_bits
;
21869 if (vec_bits
!= 64 && vec_bits
!= 128)
21871 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
21872 "GCC does not currently support simdlen %d for type %qT",
21873 clonei
->simdlen
, base_type
);
21877 clonei
->vecsize_int
= vec_bits
;
21878 clonei
->vecsize_float
= vec_bits
;
21882 /* Implement TARGET_SIMD_CLONE_ADJUST. */
21885 aarch64_simd_clone_adjust (struct cgraph_node
*node
)
21887 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
21888 use the correct ABI. */
21890 tree t
= TREE_TYPE (node
->decl
);
21891 TYPE_ATTRIBUTES (t
) = make_attribute ("aarch64_vector_pcs", "default",
21892 TYPE_ATTRIBUTES (t
));
21895 /* Implement TARGET_SIMD_CLONE_USABLE. */
21898 aarch64_simd_clone_usable (struct cgraph_node
*node
)
21900 switch (node
->simdclone
->vecsize_mangle
)
21907 gcc_unreachable ();
21911 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
21914 aarch64_comp_type_attributes (const_tree type1
, const_tree type2
)
21916 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1
))
21917 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2
)))
21922 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
21924 static const char *
21925 aarch64_get_multilib_abi_name (void)
21927 if (TARGET_BIG_END
)
21928 return TARGET_ILP32
? "aarch64_be_ilp32" : "aarch64_be";
21929 return TARGET_ILP32
? "aarch64_ilp32" : "aarch64";
21932 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
21933 global variable based guard use the default else
21934 return a null tree. */
21936 aarch64_stack_protect_guard (void)
21938 if (aarch64_stack_protector_guard
== SSP_GLOBAL
)
21939 return default_stack_protect_guard ();
21944 /* Return the diagnostic message string if conversion from FROMTYPE to
21945 TOTYPE is not allowed, NULL otherwise. */
21947 static const char *
21948 aarch64_invalid_conversion (const_tree fromtype
, const_tree totype
)
21950 if (element_mode (fromtype
) != element_mode (totype
))
21952 /* Do no allow conversions to/from BFmode scalar types. */
21953 if (TYPE_MODE (fromtype
) == BFmode
)
21954 return N_("invalid conversion from type %<bfloat16_t%>");
21955 if (TYPE_MODE (totype
) == BFmode
)
21956 return N_("invalid conversion to type %<bfloat16_t%>");
21959 /* Conversion allowed. */
21963 /* Return the diagnostic message string if the unary operation OP is
21964 not permitted on TYPE, NULL otherwise. */
21966 static const char *
21967 aarch64_invalid_unary_op (int op
, const_tree type
)
21969 /* Reject all single-operand operations on BFmode except for &. */
21970 if (element_mode (type
) == BFmode
&& op
!= ADDR_EXPR
)
21971 return N_("operation not permitted on type %<bfloat16_t%>");
21973 /* Operation allowed. */
21977 /* Return the diagnostic message string if the binary operation OP is
21978 not permitted on TYPE1 and TYPE2, NULL otherwise. */
21980 static const char *
21981 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED
, const_tree type1
,
21984 /* Reject all 2-operand operations on BFmode. */
21985 if (element_mode (type1
) == BFmode
21986 || element_mode (type2
) == BFmode
)
21987 return N_("operation not permitted on type %<bfloat16_t%>");
21989 /* Operation allowed. */
21993 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
21994 section at the end if needed. */
21995 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
21996 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
21997 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
21999 aarch64_file_end_indicate_exec_stack ()
22001 file_end_indicate_exec_stack ();
22003 unsigned feature_1_and
= 0;
22004 if (aarch64_bti_enabled ())
22005 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_BTI
;
22007 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
)
22008 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_PAC
;
22012 /* Generate .note.gnu.property section. */
22013 switch_to_section (get_section (".note.gnu.property",
22014 SECTION_NOTYPE
, NULL
));
22016 /* PT_NOTE header: namesz, descsz, type.
22017 namesz = 4 ("GNU\0")
22018 descsz = 16 (Size of the program property array)
22019 [(12 + padding) * Number of array elements]
22020 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
22021 assemble_align (POINTER_SIZE
);
22022 assemble_integer (GEN_INT (4), 4, 32, 1);
22023 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES
)), 4, 32, 1);
22024 assemble_integer (GEN_INT (5), 4, 32, 1);
22026 /* PT_NOTE name. */
22027 assemble_string ("GNU", 4);
22029 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
22030 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
22032 data = feature_1_and. */
22033 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND
), 4, 32, 1);
22034 assemble_integer (GEN_INT (4), 4, 32, 1);
22035 assemble_integer (GEN_INT (feature_1_and
), 4, 32, 1);
22037 /* Pad the size of the note to the required alignment. */
22038 assemble_align (POINTER_SIZE
);
22041 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
22042 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
22043 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */
22091 #undef TARGET_STACK_PROTECT_GUARD
22092 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
22094 #undef TARGET_ADDRESS_COST
22095 #define TARGET_ADDRESS_COST aarch64_address_cost
22097 /* This hook will determines whether unnamed bitfields affect the alignment
22098 of the containing structure. The hook returns true if the structure
22099 should inherit the alignment requirements of an unnamed bitfield's
22101 #undef TARGET_ALIGN_ANON_BITFIELD
22102 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
22104 #undef TARGET_ASM_ALIGNED_DI_OP
22105 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
22107 #undef TARGET_ASM_ALIGNED_HI_OP
22108 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
22110 #undef TARGET_ASM_ALIGNED_SI_OP
22111 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
22113 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
22114 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
22115 hook_bool_const_tree_hwi_hwi_const_tree_true
22117 #undef TARGET_ASM_FILE_START
22118 #define TARGET_ASM_FILE_START aarch64_start_file
22120 #undef TARGET_ASM_OUTPUT_MI_THUNK
22121 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
22123 #undef TARGET_ASM_SELECT_RTX_SECTION
22124 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
22126 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
22127 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
22129 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
22130 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
22132 #undef TARGET_BUILD_BUILTIN_VA_LIST
22133 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
22135 #undef TARGET_CALLEE_COPIES
22136 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
22138 #undef TARGET_CAN_ELIMINATE
22139 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
22141 #undef TARGET_CAN_INLINE_P
22142 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
22144 #undef TARGET_CANNOT_FORCE_CONST_MEM
22145 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
22147 #undef TARGET_CASE_VALUES_THRESHOLD
22148 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
22150 #undef TARGET_CONDITIONAL_REGISTER_USAGE
22151 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
22153 /* Only the least significant bit is used for initialization guard
22155 #undef TARGET_CXX_GUARD_MASK_BIT
22156 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
22158 #undef TARGET_C_MODE_FOR_SUFFIX
22159 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
22161 #ifdef TARGET_BIG_ENDIAN_DEFAULT
22162 #undef TARGET_DEFAULT_TARGET_FLAGS
22163 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
22166 #undef TARGET_CLASS_MAX_NREGS
22167 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
22169 #undef TARGET_BUILTIN_DECL
22170 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
22172 #undef TARGET_BUILTIN_RECIPROCAL
22173 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
22175 #undef TARGET_C_EXCESS_PRECISION
22176 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
22178 #undef TARGET_EXPAND_BUILTIN
22179 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
22181 #undef TARGET_EXPAND_BUILTIN_VA_START
22182 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
22184 #undef TARGET_FOLD_BUILTIN
22185 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
22187 #undef TARGET_FUNCTION_ARG
22188 #define TARGET_FUNCTION_ARG aarch64_function_arg
22190 #undef TARGET_FUNCTION_ARG_ADVANCE
22191 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
22193 #undef TARGET_FUNCTION_ARG_BOUNDARY
22194 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
22196 #undef TARGET_FUNCTION_ARG_PADDING
22197 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
22199 #undef TARGET_GET_RAW_RESULT_MODE
22200 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
22201 #undef TARGET_GET_RAW_ARG_MODE
22202 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
22204 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
22205 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
22207 #undef TARGET_FUNCTION_VALUE
22208 #define TARGET_FUNCTION_VALUE aarch64_function_value
22210 #undef TARGET_FUNCTION_VALUE_REGNO_P
22211 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
22213 #undef TARGET_GIMPLE_FOLD_BUILTIN
22214 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
22216 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
22217 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
22219 #undef TARGET_INIT_BUILTINS
22220 #define TARGET_INIT_BUILTINS aarch64_init_builtins
22222 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
22223 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
22224 aarch64_ira_change_pseudo_allocno_class
22226 #undef TARGET_LEGITIMATE_ADDRESS_P
22227 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
22229 #undef TARGET_LEGITIMATE_CONSTANT_P
22230 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
22232 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
22233 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
22234 aarch64_legitimize_address_displacement
22236 #undef TARGET_LIBGCC_CMP_RETURN_MODE
22237 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
22239 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
22240 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
22241 aarch64_libgcc_floating_mode_supported_p
22243 #undef TARGET_MANGLE_TYPE
22244 #define TARGET_MANGLE_TYPE aarch64_mangle_type
22246 #undef TARGET_INVALID_CONVERSION
22247 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
22249 #undef TARGET_INVALID_UNARY_OP
22250 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
22252 #undef TARGET_INVALID_BINARY_OP
22253 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
22255 #undef TARGET_VERIFY_TYPE_CONTEXT
22256 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
22258 #undef TARGET_MEMORY_MOVE_COST
22259 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
22261 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
22262 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
22264 #undef TARGET_MUST_PASS_IN_STACK
22265 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
22267 /* This target hook should return true if accesses to volatile bitfields
22268 should use the narrowest mode possible. It should return false if these
22269 accesses should use the bitfield container type. */
22270 #undef TARGET_NARROW_VOLATILE_BITFIELD
22271 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
22273 #undef TARGET_OPTION_OVERRIDE
22274 #define TARGET_OPTION_OVERRIDE aarch64_override_options
22276 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
22277 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
22278 aarch64_override_options_after_change
22280 #undef TARGET_OPTION_SAVE
22281 #define TARGET_OPTION_SAVE aarch64_option_save
22283 #undef TARGET_OPTION_RESTORE
22284 #define TARGET_OPTION_RESTORE aarch64_option_restore
22286 #undef TARGET_OPTION_PRINT
22287 #define TARGET_OPTION_PRINT aarch64_option_print
22289 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
22290 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
22292 #undef TARGET_SET_CURRENT_FUNCTION
22293 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
22295 #undef TARGET_PASS_BY_REFERENCE
22296 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
22298 #undef TARGET_PREFERRED_RELOAD_CLASS
22299 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
22301 #undef TARGET_SCHED_REASSOCIATION_WIDTH
22302 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
22304 #undef TARGET_PROMOTED_TYPE
22305 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
22307 #undef TARGET_SECONDARY_RELOAD
22308 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
22310 #undef TARGET_SHIFT_TRUNCATION_MASK
22311 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
22313 #undef TARGET_SETUP_INCOMING_VARARGS
22314 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
22316 #undef TARGET_STRUCT_VALUE_RTX
22317 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
22319 #undef TARGET_REGISTER_MOVE_COST
22320 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
22322 #undef TARGET_RETURN_IN_MEMORY
22323 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
22325 #undef TARGET_RETURN_IN_MSB
22326 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
22328 #undef TARGET_RTX_COSTS
22329 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
22331 #undef TARGET_SCALAR_MODE_SUPPORTED_P
22332 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
22334 #undef TARGET_SCHED_ISSUE_RATE
22335 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
22337 #undef TARGET_SCHED_VARIABLE_ISSUE
22338 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
22340 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
22341 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
22342 aarch64_sched_first_cycle_multipass_dfa_lookahead
22344 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
22345 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
22346 aarch64_first_cycle_multipass_dfa_lookahead_guard
22348 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
22349 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
22350 aarch64_get_separate_components
22352 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
22353 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
22354 aarch64_components_for_bb
22356 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
22357 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
22358 aarch64_disqualify_components
22360 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
22361 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
22362 aarch64_emit_prologue_components
22364 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
22365 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
22366 aarch64_emit_epilogue_components
22368 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
22369 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
22370 aarch64_set_handled_components
22372 #undef TARGET_TRAMPOLINE_INIT
22373 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
22375 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
22376 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
22378 #undef TARGET_VECTOR_MODE_SUPPORTED_P
22379 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
22381 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
22382 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
22384 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
22385 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
22386 aarch64_builtin_support_vector_misalignment
22388 #undef TARGET_ARRAY_MODE
22389 #define TARGET_ARRAY_MODE aarch64_array_mode
22391 #undef TARGET_ARRAY_MODE_SUPPORTED_P
22392 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
22394 #undef TARGET_VECTORIZE_ADD_STMT_COST
22395 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
22397 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
22398 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
22399 aarch64_builtin_vectorization_cost
22401 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
22402 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
22404 #undef TARGET_VECTORIZE_BUILTINS
22405 #define TARGET_VECTORIZE_BUILTINS
22407 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
22408 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
22409 aarch64_builtin_vectorized_function
22411 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
22412 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
22413 aarch64_autovectorize_vector_modes
22415 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
22416 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
22417 aarch64_atomic_assign_expand_fenv
22419 /* Section anchor support. */
22421 #undef TARGET_MIN_ANCHOR_OFFSET
22422 #define TARGET_MIN_ANCHOR_OFFSET -256
22424 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
22425 byte offset; we can do much more for larger data types, but have no way
22426 to determine the size of the access. We assume accesses are aligned. */
22427 #undef TARGET_MAX_ANCHOR_OFFSET
22428 #define TARGET_MAX_ANCHOR_OFFSET 4095
22430 #undef TARGET_VECTOR_ALIGNMENT
22431 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
22433 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
22434 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
22435 aarch64_vectorize_preferred_vector_alignment
22436 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
22437 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
22438 aarch64_simd_vector_alignment_reachable
22440 /* vec_perm support. */
22442 #undef TARGET_VECTORIZE_VEC_PERM_CONST
22443 #define TARGET_VECTORIZE_VEC_PERM_CONST \
22444 aarch64_vectorize_vec_perm_const
22446 #undef TARGET_VECTORIZE_RELATED_MODE
22447 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
22448 #undef TARGET_VECTORIZE_GET_MASK_MODE
22449 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
22450 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
22451 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
22452 aarch64_empty_mask_is_expensive
22453 #undef TARGET_PREFERRED_ELSE_VALUE
22454 #define TARGET_PREFERRED_ELSE_VALUE \
22455 aarch64_preferred_else_value
22457 #undef TARGET_INIT_LIBFUNCS
22458 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
22460 #undef TARGET_FIXED_CONDITION_CODE_REGS
22461 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
22463 #undef TARGET_FLAGS_REGNUM
22464 #define TARGET_FLAGS_REGNUM CC_REGNUM
22466 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
22467 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
22469 #undef TARGET_ASAN_SHADOW_OFFSET
22470 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_INSN_CALLEE_ABI
#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#undef TARGET_FNTYPE_ABI
#define TARGET_FNTYPE_ABI aarch64_fntype_abi

/* The selftest hook is only available when GCC is configured with
   internal consistency checking (--enable-checking); the guard below
   matches the closing "#endif" comment and keeps the conditional
   balanced.  */
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true

#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
22592 struct gcc_target targetm
= TARGET_INITIALIZER
;
22594 #include "gt-aarch64.h"