1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
24 #define INCLUDE_STRING
26 #include "coretypes.h"
37 #include "stringpool.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
55 #include "langhooks.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Information about a legitimate vector immediate operand. */
82 struct simd_immediate_info
84 enum insn_type
{ MOV
, MVN
};
85 enum modifier_type
{ LSL
, MSL
};
87 simd_immediate_info () {}
88 simd_immediate_info (scalar_float_mode
, rtx
);
89 simd_immediate_info (scalar_int_mode
, unsigned HOST_WIDE_INT
,
90 insn_type
= MOV
, modifier_type
= LSL
,
92 simd_immediate_info (scalar_mode
, rtx
, rtx
);
94 /* The mode of the elements. */
97 /* The value of each element if all elements are the same, or the
98 first value if the constant is a series. */
101 /* The value of the step if the constant is a series, null otherwise. */
104 /* The instruction to use to move the immediate into a vector. */
107 /* The kind of shift modifier to use, and the number of bits to shift.
108 This is (LSL, 0) if no shift is needed. */
109 modifier_type modifier
;
113 /* Construct a floating-point immediate in which each element has mode
114 ELT_MODE_IN and value VALUE_IN. */
115 inline simd_immediate_info
116 ::simd_immediate_info (scalar_float_mode elt_mode_in
, rtx value_in
)
117 : elt_mode (elt_mode_in
), value (value_in
), step (NULL_RTX
), insn (MOV
),
118 modifier (LSL
), shift (0)
121 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
122 and value VALUE_IN. The other parameters are as for the structure
124 inline simd_immediate_info
125 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
126 unsigned HOST_WIDE_INT value_in
,
127 insn_type insn_in
, modifier_type modifier_in
,
128 unsigned int shift_in
)
129 : elt_mode (elt_mode_in
), value (gen_int_mode (value_in
, elt_mode_in
)),
130 step (NULL_RTX
), insn (insn_in
), modifier (modifier_in
), shift (shift_in
)
133 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
134 and where element I is equal to VALUE_IN + I * STEP_IN. */
135 inline simd_immediate_info
136 ::simd_immediate_info (scalar_mode elt_mode_in
, rtx value_in
, rtx step_in
)
137 : elt_mode (elt_mode_in
), value (value_in
), step (step_in
), insn (MOV
),
138 modifier (LSL
), shift (0)
141 /* The current code model. */
142 enum aarch64_code_model aarch64_cmodel
;
144 /* The number of 64-bit elements in an SVE vector. */
145 poly_uint16 aarch64_sve_vg
;
148 #undef TARGET_HAVE_TLS
149 #define TARGET_HAVE_TLS 1
152 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
153 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
155 machine_mode
*, int *,
157 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
158 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
159 static void aarch64_override_options_after_change (void);
160 static bool aarch64_vector_mode_supported_p (machine_mode
);
161 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
162 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
166 static machine_mode
aarch64_simd_container_mode (scalar_mode
, poly_int64
);
167 static bool aarch64_print_address_internal (FILE*, machine_mode
, rtx
,
168 aarch64_addr_query_type
);
169 static HOST_WIDE_INT
aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
);
171 /* Major revision number of the ARM Architecture implemented by the target. */
172 unsigned aarch64_architecture_version
;
174 /* The processor for which instructions should be scheduled. */
175 enum aarch64_processor aarch64_tune
= cortexa53
;
177 /* Mask to specify which instruction scheduling options should be used. */
178 unsigned long aarch64_tune_flags
= 0;
180 /* Global flag for PC relative loads. */
181 bool aarch64_pcrelative_literal_loads
;
183 /* Global flag for whether frame pointer is enabled. */
184 bool aarch64_use_frame_pointer
;
186 #define BRANCH_PROTECT_STR_MAX 255
187 char *accepted_branch_protection_string
= NULL
;
189 static enum aarch64_parse_opt_result
190 aarch64_parse_branch_protection (const char*, char**);
192 /* Support for command line parsing of boolean flags in the tuning
194 struct aarch64_flag_desc
200 #define AARCH64_FUSION_PAIR(name, internal_name) \
201 { name, AARCH64_FUSE_##internal_name },
202 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
204 { "none", AARCH64_FUSE_NOTHING
},
205 #include "aarch64-fusion-pairs.def"
206 { "all", AARCH64_FUSE_ALL
},
207 { NULL
, AARCH64_FUSE_NOTHING
}
210 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
211 { name, AARCH64_EXTRA_TUNE_##internal_name },
212 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
214 { "none", AARCH64_EXTRA_TUNE_NONE
},
215 #include "aarch64-tuning-flags.def"
216 { "all", AARCH64_EXTRA_TUNE_ALL
},
217 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
220 /* Tuning parameters. */
222 static const struct cpu_addrcost_table generic_addrcost_table
=
232 0, /* register_offset */
233 0, /* register_sextend */
234 0, /* register_zextend */
238 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
248 1, /* register_offset */
249 1, /* register_sextend */
250 2, /* register_zextend */
254 static const struct cpu_addrcost_table xgene1_addrcost_table
=
264 0, /* register_offset */
265 1, /* register_sextend */
266 1, /* register_zextend */
270 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
280 2, /* register_offset */
281 3, /* register_sextend */
282 3, /* register_zextend */
286 static const struct cpu_addrcost_table tsv110_addrcost_table
=
296 0, /* register_offset */
297 1, /* register_sextend */
298 1, /* register_zextend */
302 static const struct cpu_addrcost_table qdf24xx_addrcost_table
=
312 3, /* register_offset */
313 3, /* register_sextend */
314 3, /* register_zextend */
318 static const struct cpu_regmove_cost generic_regmove_cost
=
321 /* Avoid the use of slow int<->fp moves for spilling by setting
322 their cost higher than memmov_cost. */
328 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
331 /* Avoid the use of slow int<->fp moves for spilling by setting
332 their cost higher than memmov_cost. */
338 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
341 /* Avoid the use of slow int<->fp moves for spilling by setting
342 their cost higher than memmov_cost. */
348 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
351 /* Avoid the use of slow int<->fp moves for spilling by setting
352 their cost higher than memmov_cost (actual, 4 and 9). */
358 static const struct cpu_regmove_cost thunderx_regmove_cost
=
366 static const struct cpu_regmove_cost xgene1_regmove_cost
=
369 /* Avoid the use of slow int<->fp moves for spilling by setting
370 their cost higher than memmov_cost. */
376 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
379 /* Avoid the use of int<->fp moves for spilling. */
385 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
388 /* Avoid the use of int<->fp moves for spilling. */
394 static const struct cpu_regmove_cost tsv110_regmove_cost
=
397 /* Avoid the use of slow int<->fp moves for spilling by setting
398 their cost higher than memmov_cost. */
404 /* Generic costs for vector insn classes. */
405 static const struct cpu_vector_cost generic_vector_cost
=
407 1, /* scalar_int_stmt_cost */
408 1, /* scalar_fp_stmt_cost */
409 1, /* scalar_load_cost */
410 1, /* scalar_store_cost */
411 1, /* vec_int_stmt_cost */
412 1, /* vec_fp_stmt_cost */
413 2, /* vec_permute_cost */
414 1, /* vec_to_scalar_cost */
415 1, /* scalar_to_vec_cost */
416 1, /* vec_align_load_cost */
417 1, /* vec_unalign_load_cost */
418 1, /* vec_unalign_store_cost */
419 1, /* vec_store_cost */
420 3, /* cond_taken_branch_cost */
421 1 /* cond_not_taken_branch_cost */
424 /* QDF24XX costs for vector insn classes. */
425 static const struct cpu_vector_cost qdf24xx_vector_cost
=
427 1, /* scalar_int_stmt_cost */
428 1, /* scalar_fp_stmt_cost */
429 1, /* scalar_load_cost */
430 1, /* scalar_store_cost */
431 1, /* vec_int_stmt_cost */
432 3, /* vec_fp_stmt_cost */
433 2, /* vec_permute_cost */
434 1, /* vec_to_scalar_cost */
435 1, /* scalar_to_vec_cost */
436 1, /* vec_align_load_cost */
437 1, /* vec_unalign_load_cost */
438 1, /* vec_unalign_store_cost */
439 1, /* vec_store_cost */
440 3, /* cond_taken_branch_cost */
441 1 /* cond_not_taken_branch_cost */
444 /* ThunderX costs for vector insn classes. */
445 static const struct cpu_vector_cost thunderx_vector_cost
=
447 1, /* scalar_int_stmt_cost */
448 1, /* scalar_fp_stmt_cost */
449 3, /* scalar_load_cost */
450 1, /* scalar_store_cost */
451 4, /* vec_int_stmt_cost */
452 1, /* vec_fp_stmt_cost */
453 4, /* vec_permute_cost */
454 2, /* vec_to_scalar_cost */
455 2, /* scalar_to_vec_cost */
456 3, /* vec_align_load_cost */
457 5, /* vec_unalign_load_cost */
458 5, /* vec_unalign_store_cost */
459 1, /* vec_store_cost */
460 3, /* cond_taken_branch_cost */
461 3 /* cond_not_taken_branch_cost */
464 static const struct cpu_vector_cost tsv110_vector_cost
=
466 1, /* scalar_int_stmt_cost */
467 1, /* scalar_fp_stmt_cost */
468 5, /* scalar_load_cost */
469 1, /* scalar_store_cost */
470 2, /* vec_int_stmt_cost */
471 2, /* vec_fp_stmt_cost */
472 2, /* vec_permute_cost */
473 3, /* vec_to_scalar_cost */
474 2, /* scalar_to_vec_cost */
475 5, /* vec_align_load_cost */
476 5, /* vec_unalign_load_cost */
477 1, /* vec_unalign_store_cost */
478 1, /* vec_store_cost */
479 1, /* cond_taken_branch_cost */
480 1 /* cond_not_taken_branch_cost */
483 /* Generic costs for vector insn classes. */
484 static const struct cpu_vector_cost cortexa57_vector_cost
=
486 1, /* scalar_int_stmt_cost */
487 1, /* scalar_fp_stmt_cost */
488 4, /* scalar_load_cost */
489 1, /* scalar_store_cost */
490 2, /* vec_int_stmt_cost */
491 2, /* vec_fp_stmt_cost */
492 3, /* vec_permute_cost */
493 8, /* vec_to_scalar_cost */
494 8, /* scalar_to_vec_cost */
495 4, /* vec_align_load_cost */
496 4, /* vec_unalign_load_cost */
497 1, /* vec_unalign_store_cost */
498 1, /* vec_store_cost */
499 1, /* cond_taken_branch_cost */
500 1 /* cond_not_taken_branch_cost */
503 static const struct cpu_vector_cost exynosm1_vector_cost
=
505 1, /* scalar_int_stmt_cost */
506 1, /* scalar_fp_stmt_cost */
507 5, /* scalar_load_cost */
508 1, /* scalar_store_cost */
509 3, /* vec_int_stmt_cost */
510 3, /* vec_fp_stmt_cost */
511 3, /* vec_permute_cost */
512 3, /* vec_to_scalar_cost */
513 3, /* scalar_to_vec_cost */
514 5, /* vec_align_load_cost */
515 5, /* vec_unalign_load_cost */
516 1, /* vec_unalign_store_cost */
517 1, /* vec_store_cost */
518 1, /* cond_taken_branch_cost */
519 1 /* cond_not_taken_branch_cost */
522 /* Generic costs for vector insn classes. */
523 static const struct cpu_vector_cost xgene1_vector_cost
=
525 1, /* scalar_int_stmt_cost */
526 1, /* scalar_fp_stmt_cost */
527 5, /* scalar_load_cost */
528 1, /* scalar_store_cost */
529 2, /* vec_int_stmt_cost */
530 2, /* vec_fp_stmt_cost */
531 2, /* vec_permute_cost */
532 4, /* vec_to_scalar_cost */
533 4, /* scalar_to_vec_cost */
534 10, /* vec_align_load_cost */
535 10, /* vec_unalign_load_cost */
536 2, /* vec_unalign_store_cost */
537 2, /* vec_store_cost */
538 2, /* cond_taken_branch_cost */
539 1 /* cond_not_taken_branch_cost */
542 /* Costs for vector insn classes for Vulcan. */
543 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
545 1, /* scalar_int_stmt_cost */
546 6, /* scalar_fp_stmt_cost */
547 4, /* scalar_load_cost */
548 1, /* scalar_store_cost */
549 5, /* vec_int_stmt_cost */
550 6, /* vec_fp_stmt_cost */
551 3, /* vec_permute_cost */
552 6, /* vec_to_scalar_cost */
553 5, /* scalar_to_vec_cost */
554 8, /* vec_align_load_cost */
555 8, /* vec_unalign_load_cost */
556 4, /* vec_unalign_store_cost */
557 4, /* vec_store_cost */
558 2, /* cond_taken_branch_cost */
559 1 /* cond_not_taken_branch_cost */
562 /* Generic costs for branch instructions. */
563 static const struct cpu_branch_cost generic_branch_cost
=
565 1, /* Predictable. */
566 3 /* Unpredictable. */
569 /* Generic approximation modes. */
570 static const cpu_approx_modes generic_approx_modes
=
572 AARCH64_APPROX_NONE
, /* division */
573 AARCH64_APPROX_NONE
, /* sqrt */
574 AARCH64_APPROX_NONE
/* recip_sqrt */
577 /* Approximation modes for Exynos M1. */
578 static const cpu_approx_modes exynosm1_approx_modes
=
580 AARCH64_APPROX_NONE
, /* division */
581 AARCH64_APPROX_ALL
, /* sqrt */
582 AARCH64_APPROX_ALL
/* recip_sqrt */
585 /* Approximation modes for X-Gene 1. */
586 static const cpu_approx_modes xgene1_approx_modes
=
588 AARCH64_APPROX_NONE
, /* division */
589 AARCH64_APPROX_NONE
, /* sqrt */
590 AARCH64_APPROX_ALL
/* recip_sqrt */
593 /* Generic prefetch settings (which disable prefetch). */
594 static const cpu_prefetch_tune generic_prefetch_tune
=
597 -1, /* l1_cache_size */
598 -1, /* l1_cache_line_size */
599 -1, /* l2_cache_size */
600 true, /* prefetch_dynamic_strides */
601 -1, /* minimum_stride */
602 -1 /* default_opt_level */
605 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
608 -1, /* l1_cache_size */
609 64, /* l1_cache_line_size */
610 -1, /* l2_cache_size */
611 true, /* prefetch_dynamic_strides */
612 -1, /* minimum_stride */
613 -1 /* default_opt_level */
616 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
619 32, /* l1_cache_size */
620 64, /* l1_cache_line_size */
621 512, /* l2_cache_size */
622 false, /* prefetch_dynamic_strides */
623 2048, /* minimum_stride */
624 3 /* default_opt_level */
627 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
630 32, /* l1_cache_size */
631 128, /* l1_cache_line_size */
632 16*1024, /* l2_cache_size */
633 true, /* prefetch_dynamic_strides */
634 -1, /* minimum_stride */
635 3 /* default_opt_level */
638 static const cpu_prefetch_tune thunderx_prefetch_tune
=
641 32, /* l1_cache_size */
642 128, /* l1_cache_line_size */
643 -1, /* l2_cache_size */
644 true, /* prefetch_dynamic_strides */
645 -1, /* minimum_stride */
646 -1 /* default_opt_level */
649 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
652 32, /* l1_cache_size */
653 64, /* l1_cache_line_size */
654 256, /* l2_cache_size */
655 true, /* prefetch_dynamic_strides */
656 -1, /* minimum_stride */
657 -1 /* default_opt_level */
660 static const cpu_prefetch_tune tsv110_prefetch_tune
=
663 64, /* l1_cache_size */
664 64, /* l1_cache_line_size */
665 512, /* l2_cache_size */
666 true, /* prefetch_dynamic_strides */
667 -1, /* minimum_stride */
668 -1 /* default_opt_level */
671 static const cpu_prefetch_tune xgene1_prefetch_tune
=
674 32, /* l1_cache_size */
675 64, /* l1_cache_line_size */
676 256, /* l2_cache_size */
677 true, /* prefetch_dynamic_strides */
678 -1, /* minimum_stride */
679 -1 /* default_opt_level */
682 static const struct tune_params generic_tunings
=
684 &cortexa57_extra_costs
,
685 &generic_addrcost_table
,
686 &generic_regmove_cost
,
687 &generic_vector_cost
,
688 &generic_branch_cost
,
689 &generic_approx_modes
,
690 SVE_NOT_IMPLEMENTED
, /* sve_width */
693 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
694 "8", /* function_align. */
695 "4", /* jump_align. */
696 "8", /* loop_align. */
697 2, /* int_reassoc_width. */
698 4, /* fp_reassoc_width. */
699 1, /* vec_reassoc_width. */
700 2, /* min_div_recip_mul_sf. */
701 2, /* min_div_recip_mul_df. */
702 0, /* max_case_values. */
703 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
704 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
705 &generic_prefetch_tune
708 static const struct tune_params cortexa35_tunings
=
710 &cortexa53_extra_costs
,
711 &generic_addrcost_table
,
712 &cortexa53_regmove_cost
,
713 &generic_vector_cost
,
714 &generic_branch_cost
,
715 &generic_approx_modes
,
716 SVE_NOT_IMPLEMENTED
, /* sve_width */
719 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
720 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
721 "16", /* function_align. */
722 "4", /* jump_align. */
723 "8", /* loop_align. */
724 2, /* int_reassoc_width. */
725 4, /* fp_reassoc_width. */
726 1, /* vec_reassoc_width. */
727 2, /* min_div_recip_mul_sf. */
728 2, /* min_div_recip_mul_df. */
729 0, /* max_case_values. */
730 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
731 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
732 &generic_prefetch_tune
735 static const struct tune_params cortexa53_tunings
=
737 &cortexa53_extra_costs
,
738 &generic_addrcost_table
,
739 &cortexa53_regmove_cost
,
740 &generic_vector_cost
,
741 &generic_branch_cost
,
742 &generic_approx_modes
,
743 SVE_NOT_IMPLEMENTED
, /* sve_width */
746 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
747 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
748 "16", /* function_align. */
749 "4", /* jump_align. */
750 "8", /* loop_align. */
751 2, /* int_reassoc_width. */
752 4, /* fp_reassoc_width. */
753 1, /* vec_reassoc_width. */
754 2, /* min_div_recip_mul_sf. */
755 2, /* min_div_recip_mul_df. */
756 0, /* max_case_values. */
757 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
758 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
759 &generic_prefetch_tune
762 static const struct tune_params cortexa57_tunings
=
764 &cortexa57_extra_costs
,
765 &generic_addrcost_table
,
766 &cortexa57_regmove_cost
,
767 &cortexa57_vector_cost
,
768 &generic_branch_cost
,
769 &generic_approx_modes
,
770 SVE_NOT_IMPLEMENTED
, /* sve_width */
773 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
774 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
775 "16", /* function_align. */
776 "4", /* jump_align. */
777 "8", /* loop_align. */
778 2, /* int_reassoc_width. */
779 4, /* fp_reassoc_width. */
780 1, /* vec_reassoc_width. */
781 2, /* min_div_recip_mul_sf. */
782 2, /* min_div_recip_mul_df. */
783 0, /* max_case_values. */
784 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
785 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
786 &generic_prefetch_tune
789 static const struct tune_params cortexa72_tunings
=
791 &cortexa57_extra_costs
,
792 &generic_addrcost_table
,
793 &cortexa57_regmove_cost
,
794 &cortexa57_vector_cost
,
795 &generic_branch_cost
,
796 &generic_approx_modes
,
797 SVE_NOT_IMPLEMENTED
, /* sve_width */
800 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
801 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
802 "16", /* function_align. */
803 "4", /* jump_align. */
804 "8", /* loop_align. */
805 2, /* int_reassoc_width. */
806 4, /* fp_reassoc_width. */
807 1, /* vec_reassoc_width. */
808 2, /* min_div_recip_mul_sf. */
809 2, /* min_div_recip_mul_df. */
810 0, /* max_case_values. */
811 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
812 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
813 &generic_prefetch_tune
816 static const struct tune_params cortexa73_tunings
=
818 &cortexa57_extra_costs
,
819 &generic_addrcost_table
,
820 &cortexa57_regmove_cost
,
821 &cortexa57_vector_cost
,
822 &generic_branch_cost
,
823 &generic_approx_modes
,
824 SVE_NOT_IMPLEMENTED
, /* sve_width */
825 4, /* memmov_cost. */
827 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
828 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
829 "16", /* function_align. */
830 "4", /* jump_align. */
831 "8", /* loop_align. */
832 2, /* int_reassoc_width. */
833 4, /* fp_reassoc_width. */
834 1, /* vec_reassoc_width. */
835 2, /* min_div_recip_mul_sf. */
836 2, /* min_div_recip_mul_df. */
837 0, /* max_case_values. */
838 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
839 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
840 &generic_prefetch_tune
845 static const struct tune_params exynosm1_tunings
=
847 &exynosm1_extra_costs
,
848 &exynosm1_addrcost_table
,
849 &exynosm1_regmove_cost
,
850 &exynosm1_vector_cost
,
851 &generic_branch_cost
,
852 &exynosm1_approx_modes
,
853 SVE_NOT_IMPLEMENTED
, /* sve_width */
856 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
857 "4", /* function_align. */
858 "4", /* jump_align. */
859 "4", /* loop_align. */
860 2, /* int_reassoc_width. */
861 4, /* fp_reassoc_width. */
862 1, /* vec_reassoc_width. */
863 2, /* min_div_recip_mul_sf. */
864 2, /* min_div_recip_mul_df. */
865 48, /* max_case_values. */
866 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
867 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
868 &exynosm1_prefetch_tune
871 static const struct tune_params thunderxt88_tunings
=
873 &thunderx_extra_costs
,
874 &generic_addrcost_table
,
875 &thunderx_regmove_cost
,
876 &thunderx_vector_cost
,
877 &generic_branch_cost
,
878 &generic_approx_modes
,
879 SVE_NOT_IMPLEMENTED
, /* sve_width */
882 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
883 "8", /* function_align. */
884 "8", /* jump_align. */
885 "8", /* loop_align. */
886 2, /* int_reassoc_width. */
887 4, /* fp_reassoc_width. */
888 1, /* vec_reassoc_width. */
889 2, /* min_div_recip_mul_sf. */
890 2, /* min_div_recip_mul_df. */
891 0, /* max_case_values. */
892 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
893 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
894 &thunderxt88_prefetch_tune
897 static const struct tune_params thunderx_tunings
=
899 &thunderx_extra_costs
,
900 &generic_addrcost_table
,
901 &thunderx_regmove_cost
,
902 &thunderx_vector_cost
,
903 &generic_branch_cost
,
904 &generic_approx_modes
,
905 SVE_NOT_IMPLEMENTED
, /* sve_width */
908 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
909 "8", /* function_align. */
910 "8", /* jump_align. */
911 "8", /* loop_align. */
912 2, /* int_reassoc_width. */
913 4, /* fp_reassoc_width. */
914 1, /* vec_reassoc_width. */
915 2, /* min_div_recip_mul_sf. */
916 2, /* min_div_recip_mul_df. */
917 0, /* max_case_values. */
918 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
919 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
920 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
921 &thunderx_prefetch_tune
924 static const struct tune_params tsv110_tunings
=
927 &tsv110_addrcost_table
,
928 &tsv110_regmove_cost
,
930 &generic_branch_cost
,
931 &generic_approx_modes
,
932 SVE_NOT_IMPLEMENTED
, /* sve_width */
935 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_CMP_BRANCH
936 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
937 "16", /* function_align. */
938 "4", /* jump_align. */
939 "8", /* loop_align. */
940 2, /* int_reassoc_width. */
941 4, /* fp_reassoc_width. */
942 1, /* vec_reassoc_width. */
943 2, /* min_div_recip_mul_sf. */
944 2, /* min_div_recip_mul_df. */
945 0, /* max_case_values. */
946 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
947 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
948 &tsv110_prefetch_tune
951 static const struct tune_params xgene1_tunings
=
954 &xgene1_addrcost_table
,
955 &xgene1_regmove_cost
,
957 &generic_branch_cost
,
958 &xgene1_approx_modes
,
959 SVE_NOT_IMPLEMENTED
, /* sve_width */
962 AARCH64_FUSE_NOTHING
, /* fusible_ops */
963 "16", /* function_align. */
964 "16", /* jump_align. */
965 "16", /* loop_align. */
966 2, /* int_reassoc_width. */
967 4, /* fp_reassoc_width. */
968 1, /* vec_reassoc_width. */
969 2, /* min_div_recip_mul_sf. */
970 2, /* min_div_recip_mul_df. */
971 17, /* max_case_values. */
972 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
973 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
974 &xgene1_prefetch_tune
977 static const struct tune_params emag_tunings
=
980 &xgene1_addrcost_table
,
981 &xgene1_regmove_cost
,
983 &generic_branch_cost
,
984 &xgene1_approx_modes
,
988 AARCH64_FUSE_NOTHING
, /* fusible_ops */
989 "16", /* function_align. */
990 "16", /* jump_align. */
991 "16", /* loop_align. */
992 2, /* int_reassoc_width. */
993 4, /* fp_reassoc_width. */
994 1, /* vec_reassoc_width. */
995 2, /* min_div_recip_mul_sf. */
996 2, /* min_div_recip_mul_df. */
997 17, /* max_case_values. */
998 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
999 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1000 &xgene1_prefetch_tune
1003 static const struct tune_params qdf24xx_tunings
=
1005 &qdf24xx_extra_costs
,
1006 &qdf24xx_addrcost_table
,
1007 &qdf24xx_regmove_cost
,
1008 &qdf24xx_vector_cost
,
1009 &generic_branch_cost
,
1010 &generic_approx_modes
,
1011 SVE_NOT_IMPLEMENTED
, /* sve_width */
1012 4, /* memmov_cost */
1014 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1015 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
1016 "16", /* function_align. */
1017 "8", /* jump_align. */
1018 "16", /* loop_align. */
1019 2, /* int_reassoc_width. */
1020 4, /* fp_reassoc_width. */
1021 1, /* vec_reassoc_width. */
1022 2, /* min_div_recip_mul_sf. */
1023 2, /* min_div_recip_mul_df. */
1024 0, /* max_case_values. */
1025 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1026 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS
, /* tune_flags. */
1027 &qdf24xx_prefetch_tune
1030 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1032 static const struct tune_params saphira_tunings
=
1034 &generic_extra_costs
,
1035 &generic_addrcost_table
,
1036 &generic_regmove_cost
,
1037 &generic_vector_cost
,
1038 &generic_branch_cost
,
1039 &generic_approx_modes
,
1040 SVE_NOT_IMPLEMENTED
, /* sve_width */
1041 4, /* memmov_cost */
1043 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1044 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
1045 "16", /* function_align. */
1046 "8", /* jump_align. */
1047 "16", /* loop_align. */
1048 2, /* int_reassoc_width. */
1049 4, /* fp_reassoc_width. */
1050 1, /* vec_reassoc_width. */
1051 2, /* min_div_recip_mul_sf. */
1052 2, /* min_div_recip_mul_df. */
1053 0, /* max_case_values. */
1054 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1055 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1056 &generic_prefetch_tune
1059 static const struct tune_params thunderx2t99_tunings
=
1061 &thunderx2t99_extra_costs
,
1062 &thunderx2t99_addrcost_table
,
1063 &thunderx2t99_regmove_cost
,
1064 &thunderx2t99_vector_cost
,
1065 &generic_branch_cost
,
1066 &generic_approx_modes
,
1067 SVE_NOT_IMPLEMENTED
, /* sve_width */
1068 4, /* memmov_cost. */
1069 4, /* issue_rate. */
1070 (AARCH64_FUSE_CMP_BRANCH
| AARCH64_FUSE_AES_AESMC
1071 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
1072 "16", /* function_align. */
1073 "8", /* jump_align. */
1074 "16", /* loop_align. */
1075 3, /* int_reassoc_width. */
1076 2, /* fp_reassoc_width. */
1077 2, /* vec_reassoc_width. */
1078 2, /* min_div_recip_mul_sf. */
1079 2, /* min_div_recip_mul_df. */
1080 0, /* max_case_values. */
1081 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1082 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1083 &thunderx2t99_prefetch_tune
1086 /* Support for fine-grained override of the tuning structures. */
1087 struct aarch64_tuning_override_function
1090 void (*parse_override
)(const char*, struct tune_params
*);
1093 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
1094 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
1095 static void aarch64_parse_sve_width_string (const char*, struct tune_params
*);
1097 static const struct aarch64_tuning_override_function
1098 aarch64_tuning_override_functions
[] =
1100 { "fuse", aarch64_parse_fuse_string
},
1101 { "tune", aarch64_parse_tune_string
},
1102 { "sve_width", aarch64_parse_sve_width_string
},
1106 /* A processor implementing AArch64. */
1109 const char *const name
;
1110 enum aarch64_processor ident
;
1111 enum aarch64_processor sched_core
;
1112 enum aarch64_arch arch
;
1113 unsigned architecture_version
;
1114 const unsigned long flags
;
1115 const struct tune_params
*const tune
;
1118 /* Architectures implementing AArch64. */
1119 static const struct processor all_architectures
[] =
1121 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1122 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1123 #include "aarch64-arches.def"
1124 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1127 /* Processor cores implementing AArch64. */
1128 static const struct processor all_cores
[] =
1130 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1131 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1132 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1133 FLAGS, &COSTS##_tunings},
1134 #include "aarch64-cores.def"
1135 {"generic", generic
, cortexa53
, AARCH64_ARCH_8A
, 8,
1136 AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
1137 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1141 /* Target specification. These are populated by the -march, -mtune, -mcpu
1142 handling code or by target attributes. */
1143 static const struct processor
*selected_arch
;
1144 static const struct processor
*selected_cpu
;
1145 static const struct processor
*selected_tune
;
1147 /* The current tuning set. */
1148 struct tune_params aarch64_tune_params
= generic_tunings
;
1150 /* Table of machine attributes. */
1151 static const struct attribute_spec aarch64_attribute_table
[] =
1153 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1154 affects_type_identity, handler, exclude } */
1155 { "aarch64_vector_pcs", 0, 0, false, true, true, false, NULL
, NULL
},
1156 { NULL
, 0, 0, false, false, false, false, NULL
, NULL
}
1159 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1161 /* An ISA extension in the co-processor and main instruction set space. */
1162 struct aarch64_option_extension
1164 const char *const name
;
1165 const unsigned long flags_on
;
1166 const unsigned long flags_off
;
1169 typedef enum aarch64_cond_code
1171 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
1172 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
1173 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
1177 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1179 struct aarch64_branch_protect_type
1181 /* The type's name that the user passes to the branch-protection option
1184 /* Function to handle the protection type and set global variables.
1185 First argument is the string token corresponding with this type and the
1186 second argument is the next token in the option string.
1188 * AARCH64_PARSE_OK: Handling was successful.
1189 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1190 should print an error.
1191 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1193 enum aarch64_parse_opt_result (*handler
)(char*, char*);
1194 /* A list of types that can follow this type in the option string. */
1195 const aarch64_branch_protect_type
* subtypes
;
1196 unsigned int num_subtypes
;
/* Handler for -mbranch-protection=none: disable return-address signing
   and BTI.  Rejects any trailing token REST after "none".  */
1199 static enum aarch64_parse_opt_result
1200 aarch64_handle_no_branch_protection (char* str
, char* rest
)
1202 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
1203 aarch64_enable_bti
= 0;
/* NOTE(review): the conditional guarding this error (non-null REST) is
   not visible in this extract.  */
1206 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1207 return AARCH64_PARSE_INVALID_FEATURE
;
1209 return AARCH64_PARSE_OK
;
/* Handler for -mbranch-protection=standard: enable return-address
   signing for non-leaf functions and BTI.  No trailing token allowed.  */
1212 static enum aarch64_parse_opt_result
1213 aarch64_handle_standard_branch_protection (char* str
, char* rest
)
1215 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1216 aarch64_enable_bti
= 1;
/* NOTE(review): the conditional guarding this error (non-null REST) is
   not visible in this extract.  */
1219 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1220 return AARCH64_PARSE_INVALID_FEATURE
;
1222 return AARCH64_PARSE_OK
;
/* Handler for the "pac-ret" branch-protection type: sign the return
   address in non-leaf functions.  Subtypes (e.g. "leaf") may refine it.  */
1225 static enum aarch64_parse_opt_result
1226 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED
,
1227 char* rest ATTRIBUTE_UNUSED
)
1229 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1230 return AARCH64_PARSE_OK
;
/* Handler for the "leaf" subtype of "pac-ret": extend return-address
   signing to leaf functions as well.  */
1233 static enum aarch64_parse_opt_result
1234 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED
,
1235 char* rest ATTRIBUTE_UNUSED
)
1237 aarch64_ra_sign_scope
= AARCH64_FUNCTION_ALL
;
1238 return AARCH64_PARSE_OK
;
/* Handler for the "bti" branch-protection type: enable Branch Target
   Identification landing pads.  */
1241 static enum aarch64_parse_opt_result
1242 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED
,
1243 char* rest ATTRIBUTE_UNUSED
)
1245 aarch64_enable_bti
= 1;
1246 return AARCH64_PARSE_OK
;
/* Subtypes that may follow "pac-ret" in -mbranch-protection; NULL-terminated.  */
1249 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes
[] = {
1250 { "leaf", aarch64_handle_pac_ret_leaf
, NULL
, 0 },
1251 { NULL
, NULL
, NULL
, 0 }
/* Top-level -mbranch-protection types; NULL-terminated.  "pac-ret" is
   the only entry with subtypes.  */
1254 static const struct aarch64_branch_protect_type aarch64_branch_protect_types
[] = {
1255 { "none", aarch64_handle_no_branch_protection
, NULL
, 0 },
1256 { "standard", aarch64_handle_standard_branch_protection
, NULL
, 0 },
1257 { "pac-ret", aarch64_handle_pac_ret_protection
, aarch64_pac_ret_subtypes
,
1258 ARRAY_SIZE (aarch64_pac_ret_subtypes
) },
1259 { "bti", aarch64_handle_bti_protection
, NULL
, 0 },
1260 { NULL
, NULL
, NULL
, 0 }
1263 /* The condition codes of the processor, and the inverse function. */
/* Indexed by aarch64_cond_code; order matches the enum above.  */
1264 static const char * const aarch64_condition_codes
[] =
1266 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1267 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1270 /* Generate code to enable conditional branches in functions over 1 MiB. */
/* Emits the inverted short-range branch in BRANCH_FORMAT to a fresh
   local label, then an unconditional "b" to the real target DEST_LABEL
   (operand POS_LABEL), followed by the local label definition.
   NOTE(review): the declaration of BUFFER used below is not visible in
   this extract — presumably a local char array; confirm upstream.  */
1272 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
1273 const char * branch_format
)
1275 rtx_code_label
* tmp_label
= gen_label_rtx ();
1276 char label_buf
[256];
1278 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
1279 CODE_LABEL_NUMBER (tmp_label
));
1280 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
/* Temporarily redirect operand POS_LABEL at the local label.  */
1281 rtx dest_label
= operands
[pos_label
];
1282 operands
[pos_label
] = tmp_label
;
1284 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
1285 output_asm_insn (buffer
, operands
);
/* Long jump to the original destination, then place the local label.  */
1287 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
1288 operands
[pos_label
] = dest_label
;
1289 output_asm_insn (buffer
, operands
);
/* Report that MODE needs the FP/SIMD register file but it is disabled,
   choosing the message by whether -mgeneral-regs-only or +nofp is the
   cause and whether MODE is floating-point or vector.  */
1294 aarch64_err_no_fpadvsimd (machine_mode mode
)
1296 if (TARGET_GENERAL_REGS_ONLY
)
1297 if (FLOAT_MODE_P (mode
))
1298 error ("%qs is incompatible with the use of floating-point types",
1299 "-mgeneral-regs-only");
1301 error ("%qs is incompatible with the use of vector types",
1302 "-mgeneral-regs-only");
1304 if (FLOAT_MODE_P (mode
))
1305 error ("%qs feature modifier is incompatible with the use of"
1306 " floating-point types", "+nofp");
1308 error ("%qs feature modifier is incompatible with the use of"
1309 " vector types", "+nofp");
1312 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1313 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1314 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1315 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1316 and GENERAL_REGS is lower than the memory cost (in this case the best class
1317 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1318 cost results in bad allocations with many redundant int<->FP moves which
1319 are expensive on various cores.
1320 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1321 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1322 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1323 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1324 The result of this is that it is no longer inefficient to have a higher
1325 memory move cost than the register move cost.
1329 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
1330 reg_class_t best_class
)
/* ALLOCNO_CLASS narrower than POINTER_AND_FP_REGS: keep it as-is.  */
1334 if (!reg_class_subset_p (GENERAL_REGS
, allocno_class
)
1335 || !reg_class_subset_p (FP_REGS
, allocno_class
))
1336 return allocno_class
;
/* NOTE(review): the "return best_class" for this branch is not visible
   in this extract.  */
1338 if (!reg_class_subset_p (GENERAL_REGS
, best_class
)
1339 || !reg_class_subset_p (FP_REGS
, best_class
))
/* Both classes were POINTER_AND_FP_REGS: decide by the pseudo's mode.  */
1342 mode
= PSEUDO_REGNO_MODE (regno
);
1343 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
/* Return the tuning-specific minimum number of divisions that makes a
   reciprocal-multiply sequence worthwhile: SF threshold for 4-byte
   element modes, DF threshold otherwise.  */
1347 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1349 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1350 return aarch64_tune_params
.min_div_recip_mul_sf
;
1351 return aarch64_tune_params
.min_div_recip_mul_df
;
1354 /* Return the reassociation width of treeop OPC with mode MODE. */
/* Picks the tuning-table width for vector, integer or (non-PLUS)
   floating-point operations.  NOTE(review): the final default return
   for remaining cases is not visible in this extract.  */
1356 aarch64_reassociation_width (unsigned opc
, machine_mode mode
)
1358 if (VECTOR_MODE_P (mode
))
1359 return aarch64_tune_params
.vec_reassoc_width
;
1360 if (INTEGRAL_MODE_P (mode
))
1361 return aarch64_tune_params
.int_reassoc_width
;
1362 /* Avoid reassociating floating point addition so we emit more FMAs. */
1363 if (FLOAT_MODE_P (mode
) && opc
!= PLUS_EXPR
)
1364 return aarch64_tune_params
.fp_reassoc_width
;
1368 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1370 aarch64_dbx_register_number (unsigned regno
)
/* General-purpose, SP, FP/SIMD, SVE predicate and VG registers each
   map onto their own contiguous DWARF number ranges.  */
1372 if (GP_REGNUM_P (regno
))
1373 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1374 else if (regno
== SP_REGNUM
)
1375 return AARCH64_DWARF_SP
;
1376 else if (FP_REGNUM_P (regno
))
1377 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1378 else if (PR_REGNUM_P (regno
))
1379 return AARCH64_DWARF_P0
+ regno
- P0_REGNUM
;
1380 else if (regno
== VG_REGNUM
)
1381 return AARCH64_DWARF_VG
;
1383 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1384 equivalent DWARF register. */
1385 return DWARF_FRAME_REGISTERS
;
1388 /* Return true if MODE is any of the Advanced SIMD structure modes. */
/* OImode/CImode/XImode are the 2-, 3- and 4-vector tuple modes.
   NOTE(review): the function head and the first operand of the
   conjunction are not visible in this extract.  */
1390 aarch64_advsimd_struct_mode_p (machine_mode mode
)
1393 && (mode
== OImode
|| mode
== CImode
|| mode
== XImode
));
1396 /* Return true if MODE is an SVE predicate mode. */
/* The four VNxKBI modes cover predicate element widths of 1-8 bytes.
   NOTE(review): the leading TARGET_SVE-style guard is not visible in
   this extract.  */
1398 aarch64_sve_pred_mode_p (machine_mode mode
)
1401 && (mode
== VNx16BImode
1402 || mode
== VNx8BImode
1403 || mode
== VNx4BImode
1404 || mode
== VNx2BImode
));
1407 /* Three mutually-exclusive flags describing a vector or predicate type. */
1408 const unsigned int VEC_ADVSIMD
= 1;
1409 const unsigned int VEC_SVE_DATA
= 2;
1410 const unsigned int VEC_SVE_PRED
= 4;
1411 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1412 a structure of 2, 3 or 4 vectors. */
1413 const unsigned int VEC_STRUCT
= 8;
1414 /* Useful combinations of the above. */
1415 const unsigned int VEC_ANY_SVE
= VEC_SVE_DATA
| VEC_SVE_PRED
;
1416 const unsigned int VEC_ANY_DATA
= VEC_ADVSIMD
| VEC_SVE_DATA
;
1418 /* Return a set of flags describing the vector properties of mode MODE.
1419 Ignore modes that are not supported by the current target. */
1421 aarch64_classify_vector_mode (machine_mode mode
)
1423 if (aarch64_advsimd_struct_mode_p (mode
))
1424 return VEC_ADVSIMD
| VEC_STRUCT
;
1426 if (aarch64_sve_pred_mode_p (mode
))
1427 return VEC_SVE_PRED
;
/* Distinguish single SVE vectors from 2-4 vector tuples by bit size.
   NOTE(review): several guard/element-type conditions between here and
   the size checks are not visible in this extract.  */
1429 scalar_mode inner
= GET_MODE_INNER (mode
);
1430 if (VECTOR_MODE_P (mode
)
1437 || inner
== DFmode
))
1441 if (known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
))
1442 return VEC_SVE_DATA
;
1443 if (known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 2)
1444 || known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 3)
1445 || known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 4))
1446 return VEC_SVE_DATA
| VEC_STRUCT
;
1449 /* This includes V1DF but not V1DI (which doesn't exist). */
/* 64- and 128-bit totals correspond to Advanced SIMD D and Q regs.  */
1451 && (known_eq (GET_MODE_BITSIZE (mode
), 64)
1452 || known_eq (GET_MODE_BITSIZE (mode
), 128)))
1459 /* Return true if MODE is any of the data vector modes, including
/* ...i.e. any Advanced SIMD or SVE data mode (predicates excluded).  */
1462 aarch64_vector_data_mode_p (machine_mode mode
)
1464 return aarch64_classify_vector_mode (mode
) & VEC_ANY_DATA
;
1467 /* Return true if MODE is an SVE data vector mode; either a single vector
1468 or a structure of vectors. */
1470 aarch64_sve_data_mode_p (machine_mode mode
)
1472 return aarch64_classify_vector_mode (mode
) & VEC_SVE_DATA
;
1475 /* Implement target hook TARGET_ARRAY_MODE. */
/* For arrays of 2-4 single SVE vectors, return the wider vector mode
   with NELEMS times the element count; otherwise defer to the default.  */
1476 static opt_machine_mode
1477 aarch64_array_mode (machine_mode mode
, unsigned HOST_WIDE_INT nelems
)
1479 if (aarch64_classify_vector_mode (mode
) == VEC_SVE_DATA
1480 && IN_RANGE (nelems
, 2, 4))
1481 return mode_for_vector (GET_MODE_INNER (mode
),
1482 GET_MODE_NUNITS (mode
) * nelems
);
1484 return opt_machine_mode ();
1487 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
/* True for 2-4 element arrays of valid Advanced SIMD D/Q register
   modes.  NOTE(review): the leading TARGET_SIMD-style condition is not
   visible in this extract.  */
1489 aarch64_array_mode_supported_p (machine_mode mode
,
1490 unsigned HOST_WIDE_INT nelems
)
1493 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1494 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1495 && (nelems
>= 2 && nelems
<= 4))
1501 /* Return the SVE predicate mode to use for elements that have
1502 ELEM_NBYTES bytes, if such a mode exists. */
/* NOTE(review): the per-size return statements (presumably the VNxKBI
   modes) are not visible in this extract; only the size tests remain.  */
1505 aarch64_sve_pred_mode (unsigned int elem_nbytes
)
1509 if (elem_nbytes
== 1)
1511 if (elem_nbytes
== 2)
1513 if (elem_nbytes
== 4)
1515 if (elem_nbytes
== 8)
1518 return opt_machine_mode ();
1521 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
/* Use an SVE predicate mode for full SVE vectors; otherwise fall back
   to the generic mask-mode computation.  */
1523 static opt_machine_mode
1524 aarch64_get_mask_mode (poly_uint64 nunits
, poly_uint64 nbytes
)
1526 if (TARGET_SVE
&& known_eq (nbytes
, BYTES_PER_SVE_VECTOR
))
1528 unsigned int elem_nbytes
= vector_element_size (nbytes
, nunits
);
1529 machine_mode pred_mode
;
1530 if (aarch64_sve_pred_mode (elem_nbytes
).exists (&pred_mode
))
1534 return default_get_mask_mode (nunits
, nbytes
);
1537 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1538 prefer to use the first arithmetic operand as the else value if
1539 the else value doesn't matter, since that exactly matches the SVE
1540 destructive merging form. For ternary operations we could either
1541 pick the first operand and use FMAD-like instructions or the last
1542 operand and use FMLA-like instructions; the latter seems more
1546 aarch64_preferred_else_value (unsigned, tree
, unsigned int nops
, tree
*ops
)
1548 return nops
== 3 ? ops
[2] : ops
[0];
1551 /* Implement TARGET_HARD_REGNO_NREGS. */
1554 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1556 /* ??? Logically we should only need to provide a value when
1557 HARD_REGNO_MODE_OK says that the combination is valid,
1558 but at the moment we need to handle all modes. Just ignore
1559 any runtime parts for registers that can't store them. */
1560 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
1561 switch (aarch64_regno_regclass (regno
))
/* FP/vector registers: SVE data modes occupy an exact number of
   SVE vectors; other modes are measured in Advanced SIMD Q regs.  */
1565 if (aarch64_sve_data_mode_p (mode
))
1566 return exact_div (GET_MODE_SIZE (mode
),
1567 BYTES_PER_SVE_VECTOR
).to_constant ();
1568 return CEIL (lowest_size
, UNITS_PER_VREG
);
/* Default: count in 64-bit general-purpose words.  */
1574 return CEIL (lowest_size
, UNITS_PER_WORD
);
1579 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1582 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
/* Condition-code modes live only in CC_REGNUM.  */
1584 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1585 return regno
== CC_REGNUM
;
1587 if (regno
== VG_REGNUM
)
1588 /* This must have the same size as _Unwind_Word. */
1589 return mode
== DImode
;
/* SVE predicate modes go only in predicate registers, and predicate
   registers hold only predicate modes.  */
1591 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1592 if (vec_flags
& VEC_SVE_PRED
)
1593 return PR_REGNUM_P (regno
);
1595 if (PR_REGNUM_P (regno
))
1598 if (regno
== SP_REGNUM
)
1599 /* The purpose of comparing with ptr_mode is to support the
1600 global register variable associated with the stack pointer
1601 register via the syntax of asm ("wsp") in ILP32. */
1602 return mode
== Pmode
|| mode
== ptr_mode
;
1604 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
1605 return mode
== Pmode
;
/* GP regs take up to 8 bytes singly, 16 bytes in even-numbered pairs.  */
1607 if (GP_REGNUM_P (regno
))
1609 if (known_le (GET_MODE_SIZE (mode
), 8))
1611 else if (known_le (GET_MODE_SIZE (mode
), 16))
1612 return (regno
& 1) == 0;
1614 else if (FP_REGNUM_P (regno
))
/* Vector tuples must fit entirely within V0-V31.  */
1616 if (vec_flags
& VEC_STRUCT
)
1617 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
1619 return !VECTOR_MODE_P (mode
) || vec_flags
!= 0;
1625 /* Return true if this is a definition of a vectorized simd function. */
/* NOTE(review): the early-out checks on FNDECL before this point are
   not visible in this extract.  */
1628 aarch64_simd_decl_p (tree fndecl
)
1634 fntype
= TREE_TYPE (fndecl
);
1638 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1639 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype
)) != NULL
)
1645 /* Return the mode a register save/restore should use. DImode for integer
1646 registers, DFmode for FP registers in non-SIMD functions (they only save
1647 the bottom half of a 128 bit register), or TFmode for FP registers in
1651 aarch64_reg_save_mode (tree fndecl
, unsigned regno
)
1653 return GP_REGNUM_P (regno
)
1655 : (aarch64_simd_decl_p (fndecl
) ? E_TFmode
: E_DFmode
);
1658 /* Return true if the instruction is a call to a SIMD function, false
1659 if it is not a SIMD function or if we do not know anything about
/* Extracts the called symbol from the CALL rtx; only direct symbol
   calls with a known decl can be identified as SIMD-ABI calls.  */
1663 aarch64_simd_call_p (rtx_insn
*insn
)
1669 gcc_assert (CALL_P (insn
));
1670 call
= get_call_rtx_from (insn
);
1671 symbol
= XEXP (XEXP (call
, 0), 0);
1672 if (GET_CODE (symbol
) != SYMBOL_REF
)
1674 fndecl
= SYMBOL_REF_DECL (symbol
);
1678 return aarch64_simd_decl_p (fndecl
);
1681 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1682 a function that uses the SIMD ABI, take advantage of the extra
1683 call-preserved registers that the ABI provides. */
1686 aarch64_remove_extra_call_preserved_regs (rtx_insn
*insn
,
1687 HARD_REG_SET
*return_set
)
1689 if (aarch64_simd_call_p (insn
))
/* The SIMD ABI preserves additional FP/SIMD registers; drop them
   from the clobber set.  */
1691 for (int regno
= 0; regno
< FIRST_PSEUDO_REGISTER
; regno
++)
1692 if (FP_SIMD_SAVED_REGNUM_P (regno
))
1693 CLEAR_HARD_REG_BIT (*return_set
, regno
);
1697 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1698 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1699 clobbers the top 64 bits when restoring the bottom 64 bits. */
1702 aarch64_hard_regno_call_part_clobbered (unsigned int regno
, machine_mode mode
)
1704 return FP_REGNUM_P (regno
) && maybe_gt (GET_MODE_SIZE (mode
), 8);
1707 /* Implement REGMODE_NATURAL_SIZE. */
1709 aarch64_regmode_natural_size (machine_mode mode
)
1711 /* The natural size for SVE data modes is one SVE data vector,
1712 and similarly for predicates. We can't independently modify
1713 anything smaller than that. */
1714 /* ??? For now, only do this for variable-width SVE registers.
1715 Doing it for constant-sized registers breaks lower-subreg.c. */
1716 /* ??? And once that's fixed, we should probably have similar
1717 code for Advanced SIMD. */
1718 if (!aarch64_sve_vg
.is_constant ())
1720 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1721 if (vec_flags
& VEC_SVE_PRED
)
1722 return BYTES_PER_SVE_PRED
;
1723 if (vec_flags
& VEC_SVE_DATA
)
1724 return BYTES_PER_SVE_VECTOR
;
1726 return UNITS_PER_WORD
;
1729 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
/* NOTE(review): the remaining parameters and the per-case return
   values are not visible in this extract.  */
1731 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned,
1734 /* The predicate mode determines which bits are significant and
1735 which are "don't care". Decreasing the number of lanes would
1736 lose data while increasing the number of lanes would make bits
1737 unnecessarily significant. */
1738 if (PR_REGNUM_P (regno
))
1740 if (known_ge (GET_MODE_SIZE (mode
), 4))
1746 /* Return true if I's bits are consecutive ones from the MSB. */
/* Such values are exactly -2^k, so -I must be a power of two;
   exact_log2 returns HOST_WIDE_INT_M1 (-1) for non-powers.  */
1748 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i
)
1750 return exact_log2 (-i
) != HOST_WIDE_INT_M1
;
1753 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1754 that strcpy from constants will be faster. */
1756 static HOST_WIDE_INT
1757 aarch64_constant_alignment (const_tree exp
, HOST_WIDE_INT align
)
/* Skip the widening when optimizing for size.  */
1759 if (TREE_CODE (exp
) == STRING_CST
&& !optimize_size
)
1760 return MAX (align
, BITS_PER_WORD
);
1764 /* Return true if calls to DECL should be treated as
1765 long-calls (ie called via a register). */
/* NOTE(review): the body (presumably a constant false) is not visible
   in this extract.  */
1767 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
1772 /* Return true if calls to symbol-ref SYM should be treated as
1773 long-calls (ie called via a register). */
1775 aarch64_is_long_call_p (rtx sym
)
1777 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
1780 /* Return true if calls to symbol-ref SYM should not go through
/* ...the PLT: the decl carries a "noplt" attribute (among other
   conditions not visible in this extract) and does not bind locally.  */
1784 aarch64_is_noplt_call_p (rtx sym
)
1786 const_tree decl
= SYMBOL_REF_DECL (sym
);
1791 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
1792 && !targetm
.binds_local_p (decl
))
1798 /* Return true if the offsets to a zero/sign-extract operation
1799 represent an expression that matches an extend operation. The
1800 operands represent the parameters from
1802 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1804 aarch64_is_extend_from_extract (scalar_int_mode mode
, rtx mult_imm
,
1807 HOST_WIDE_INT mult_val
, extract_val
;
/* Both immediates must be compile-time integer constants.  */
1809 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
1812 mult_val
= INTVAL (mult_imm
);
1813 extract_val
= INTVAL (extract_imm
);
/* The multiplier must equal 1 << shift, where shift (the low three
   bits of EXTRACT_VAL) is at most 4.  */
1816 && extract_val
< GET_MODE_BITSIZE (mode
)
1817 && exact_log2 (extract_val
& ~7) > 0
1818 && (extract_val
& 7) <= 4
1819 && mult_val
== (1 << (extract_val
& 7)))
1825 /* Emit an insn that's a simple single-set. Both the operands must be
1826 known to be valid. */
1827 inline static rtx_insn
*
1828 emit_set_insn (rtx x
, rtx y
)
1830 return emit_insn (gen_rtx_SET (x
, y
));
1833 /* X and Y are two things to compare using CODE. Emit the compare insn and
1834 return the rtx for register 0 in the proper mode. */
1836 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
/* Pick the CC mode implied by CODE and the operands, then set the
   flags register from the comparison.  */
1838 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
1839 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
1841 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
1845 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1848 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code
, rtx x
, rtx y
,
1849 machine_mode y_mode
)
/* Narrow (QI/HI) Y: either mask a constant down to Y_MODE's bits, or
   emit a ZERO_EXTEND compare in CC_SWPmode.  */
1851 if (y_mode
== E_QImode
|| y_mode
== E_HImode
)
1853 if (CONST_INT_P (y
))
1854 y
= GEN_INT (INTVAL (y
) & GET_MODE_MASK (y_mode
));
1858 machine_mode cc_mode
;
1860 t
= gen_rtx_ZERO_EXTEND (SImode
, y
);
/* Note the swapped operand order, hence CC_SWPmode.  */
1861 t
= gen_rtx_COMPARE (CC_SWPmode
, t
, x
);
1862 cc_mode
= CC_SWPmode
;
1863 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
1864 emit_set_insn (cc_reg
, t
);
/* Wide enough already: fall back to the plain compare.  */
1869 return aarch64_gen_compare_reg (code
, x
, y
);
1872 /* Build the SYMBOL_REF for __tls_get_addr. */
/* Cached across calls; GTY-rooted so it survives garbage collection.  */
1874 static GTY(()) rtx tls_get_addr_libfunc
;
1877 aarch64_tls_get_addr (void)
1879 if (!tls_get_addr_libfunc
)
1880 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
1881 return tls_get_addr_libfunc
;
1884 /* Return the TLS model to use for ADDR. */
1886 static enum tls_model
1887 tls_symbolic_operand_type (rtx addr
)
1889 enum tls_model tls_kind
= TLS_MODEL_NONE
;
/* A CONST wraps symbol+offset; strip the offset before inspecting.  */
1890 if (GET_CODE (addr
) == CONST
)
1893 rtx sym
= strip_offset (addr
, &addend
);
1894 if (GET_CODE (sym
) == SYMBOL_REF
)
1895 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
1897 else if (GET_CODE (addr
) == SYMBOL_REF
)
1898 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
1903 /* We'll allow lo_sum's in addresses in our legitimate addresses
1904 so that combine would take care of combining addresses where
1905 necessary, but for generation purposes, we'll generate the address
1908 tmp = hi (symbol_ref); adrp x1, foo
1909 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1913 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1914 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1918 Load TLS symbol, depending on TLS mechanism and TLS access model.
1920 Global Dynamic - Traditional TLS:
1921 adrp tmp, :tlsgd:imm
1922 add dest, tmp, #:tlsgd_lo12:imm
1925 Global Dynamic - TLS Descriptors:
1926 adrp dest, :tlsdesc:imm
1927 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1928 add dest, dest, #:tlsdesc_lo12:imm
1935 adrp tmp, :gottprel:imm
1936 ldr dest, [tmp, #:gottprel_lo12:imm]
1941 add t0, tp, #:tprel_hi12:imm, lsl #12
1942 add t0, t0, #:tprel_lo12_nc:imm
/* Emit the insn sequence that loads symbol IMM into DEST according to
   its classification TYPE (absolute, GOT, or one of the TLS models).  */
1946 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
1947 enum aarch64_symbol_type type
)
1951 case SYMBOL_SMALL_ABSOLUTE
:
1953 /* In ILP32, the mode of dest can be either SImode or DImode. */
1955 machine_mode mode
= GET_MODE (dest
);
1957 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1959 if (can_create_pseudo_p ())
1960 tmp_reg
= gen_reg_rtx (mode
);
/* adrp into TMP_REG, then add the low 12 bits into DEST.  */
1962 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1963 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
1967 case SYMBOL_TINY_ABSOLUTE
:
1968 emit_insn (gen_rtx_SET (dest
, imm
));
1971 case SYMBOL_SMALL_GOT_28K
:
1973 machine_mode mode
= GET_MODE (dest
);
1974 rtx gp_rtx
= pic_offset_table_rtx
;
1978 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1979 here before rtl expand. Tree IVOPT will generate rtl pattern to
1980 decide rtx costs, in which case pic_offset_table_rtx is not
1981 initialized. For that case no need to generate the first adrp
1982 instruction as the final cost for global variable access is
1986 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1987 using the page base as GOT base, the first page may be wasted,
1988 in the worst scenario, there is only 28K space for GOT).
1990 The generate instruction sequence for accessing global variable
1993 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1995 Only one instruction needed. But we must initialize
1996 pic_offset_table_rtx properly. We generate initialize insn for
1997 every global access, and allow CSE to remove all redundant.
1999 The final instruction sequences will look like the following
2000 for multiply global variables access.
2002 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2004 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2005 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2006 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2009 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
2010 crtl
->uses_pic_offset_table
= 1;
2011 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
2013 if (mode
!= GET_MODE (gp_rtx
))
2014 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
/* Pick the GOT-load pattern matching the dest/pointer mode combo.  */
2018 if (mode
== ptr_mode
)
2021 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
2023 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
2025 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2029 gcc_assert (mode
== Pmode
);
2031 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
2032 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2035 /* The operand is expected to be MEM. Whenever the related insn
2036 pattern changed, above code which calculate mem should be
2038 gcc_assert (GET_CODE (mem
) == MEM
);
/* GOT entries are constant after load time.  */
2039 MEM_READONLY_P (mem
) = 1;
2040 MEM_NOTRAP_P (mem
) = 1;
2045 case SYMBOL_SMALL_GOT_4G
:
2047 /* In ILP32, the mode of dest can be either SImode or DImode,
2048 while the got entry is always of SImode size. The mode of
2049 dest depends on how dest is used: if dest is assigned to a
2050 pointer (e.g. in the memory), it has SImode; it may have
2051 DImode if dest is dereferenced to access the memory.
2052 This is why we have to handle three different ldr_got_small
2053 patterns here (two patterns for ILP32). */
2058 machine_mode mode
= GET_MODE (dest
);
2060 if (can_create_pseudo_p ())
2061 tmp_reg
= gen_reg_rtx (mode
);
2063 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2064 if (mode
== ptr_mode
)
2067 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
2069 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
2071 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2075 gcc_assert (mode
== Pmode
);
2077 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
2078 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2081 gcc_assert (GET_CODE (mem
) == MEM
);
2082 MEM_READONLY_P (mem
) = 1;
2083 MEM_NOTRAP_P (mem
) = 1;
2088 case SYMBOL_SMALL_TLSGD
:
2091 machine_mode mode
= GET_MODE (dest
);
/* Traditional global-dynamic: call __tls_get_addr, result in r0.  */
2092 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
2096 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
2098 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
2099 insns
= get_insns ();
/* Mark const so CSE may share repeated address computations.  */
2102 RTL_CONST_CALL_P (insns
) = 1;
2103 emit_libcall_block (insns
, dest
, result
, imm
);
2107 case SYMBOL_SMALL_TLSDESC
:
2109 machine_mode mode
= GET_MODE (dest
);
/* TLS descriptors fix the result register at r0.  */
2110 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
2113 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2115 /* In ILP32, the got entry is always of SImode size. Unlike
2116 small GOT, the dest is fixed at reg 0. */
2118 emit_insn (gen_tlsdesc_small_si (imm
));
2120 emit_insn (gen_tlsdesc_small_di (imm
));
2121 tp
= aarch64_load_tp (NULL
);
2124 tp
= gen_lowpart (mode
, tp
);
/* DEST = thread pointer + descriptor offset.  */
2126 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
2128 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2132 case SYMBOL_SMALL_TLSIE
:
2134 /* In ILP32, the mode of dest can be either SImode or DImode,
2135 while the got entry is always of SImode size. The mode of
2136 dest depends on how dest is used: if dest is assigned to a
2137 pointer (e.g. in the memory), it has SImode; it may have
2138 DImode if dest is dereferenced to access the memory.
2139 This is why we have to handle three different tlsie_small
2140 patterns here (two patterns for ILP32). */
2141 machine_mode mode
= GET_MODE (dest
);
2142 rtx tmp_reg
= gen_reg_rtx (mode
);
2143 rtx tp
= aarch64_load_tp (NULL
);
2145 if (mode
== ptr_mode
)
2148 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
2151 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
2152 tp
= gen_lowpart (mode
, tp
);
2157 gcc_assert (mode
== Pmode
);
2158 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
2161 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
2163 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2167 case SYMBOL_TLSLE12
:
2168 case SYMBOL_TLSLE24
:
2169 case SYMBOL_TLSLE32
:
2170 case SYMBOL_TLSLE48
:
2172 machine_mode mode
= GET_MODE (dest
);
2173 rtx tp
= aarch64_load_tp (NULL
);
2176 tp
= gen_lowpart (mode
, tp
);
/* Local-exec: add the tprel offset(s) to the thread pointer, with
   the insn variant chosen by the offset's bit width.  */
2180 case SYMBOL_TLSLE12
:
2181 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
2184 case SYMBOL_TLSLE24
:
2185 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
2188 case SYMBOL_TLSLE32
:
2189 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
2191 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2194 case SYMBOL_TLSLE48
:
2195 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
2197 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2205 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2209 case SYMBOL_TINY_GOT
:
2210 emit_insn (gen_ldr_got_tiny (dest
, imm
));
2213 case SYMBOL_TINY_TLSIE
:
2215 machine_mode mode
= GET_MODE (dest
);
2216 rtx tp
= aarch64_load_tp (NULL
);
2218 if (mode
== ptr_mode
)
2221 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
2224 tp
= gen_lowpart (mode
, tp
);
2225 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
2230 gcc_assert (mode
== Pmode
);
2231 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
2235 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2244 /* Emit a move from SRC to DEST. Assume that the move expanders can
2245 handle all moves if !can_create_pseudo_p (). The distinction is
2246 important because, unlike emit_move_insn, the move expanders know
2247 how to force Pmode objects into the constant pool even when the
2248 constant pool address is not itself legitimate. */
2250 aarch64_emit_move (rtx dest
, rtx src
)
2252 return (can_create_pseudo_p ()
2253 ? emit_move_insn (dest
, src
)
2254 : emit_move_insn_1 (dest
, src
));
2257 /* Apply UNOPTAB to OP and store the result in DEST. */
2260 aarch64_emit_unop (rtx dest
, optab unoptab
, rtx op
)
/* expand_unop may return a different register than DEST; copy back
   if so.  */
2262 rtx tmp
= expand_unop (GET_MODE (dest
), unoptab
, op
, dest
, 0);
2264 emit_move_insn (dest
, tmp
);
2267 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2270 aarch64_emit_binop (rtx dest
, optab binoptab
, rtx op0
, rtx op1
)
/* expand_binop may return a different register than DEST; copy back
   if so.  */
2272 rtx tmp
= expand_binop (GET_MODE (dest
), binoptab
, op0
, op1
, dest
, 0,
2275 emit_move_insn (dest
, tmp
);
2278 /* Split a 128-bit move operation into two 64-bit move operations,
2279 taking care to handle partial overlap of register to register
2280 copies. Special cases are needed when moving between GP regs and
2281 FP regs. SRC can be a register, constant or memory; DST a register
2282 or memory. If either operand is memory it must not have any side
2285 aarch64_split_128bit_move (rtx dst
, rtx src
)
2290 machine_mode mode
= GET_MODE (dst
);
2292 gcc_assert (mode
== TImode
|| mode
== TFmode
);
2293 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
2294 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
2296 if (REG_P (dst
) && REG_P (src
))
2298 int src_regno
= REGNO (src
);
2299 int dst_regno
= REGNO (dst
);
2301 /* Handle FP <-> GP regs. */
/* GP -> FP: move each 64-bit half into the vector register lanes.  */
2302 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
2304 src_lo
= gen_lowpart (word_mode
, src
);
2305 src_hi
= gen_highpart (word_mode
, src
);
2307 emit_insn (gen_aarch64_movlow_di (mode
, dst
, src_lo
));
2308 emit_insn (gen_aarch64_movhigh_di (mode
, dst
, src_hi
));
/* FP -> GP: extract each 64-bit lane into a GP register.  */
2311 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
2313 dst_lo
= gen_lowpart (word_mode
, dst
);
2314 dst_hi
= gen_highpart (word_mode
, dst
);
2316 emit_insn (gen_aarch64_movdi_low (mode
, dst_lo
, src
));
2317 emit_insn (gen_aarch64_movdi_high (mode
, dst_hi
, src
));
/* Generic case: split into two word moves, ordered so an
   overlapping half is not clobbered before it is read.  */
2322 dst_lo
= gen_lowpart (word_mode
, dst
);
2323 dst_hi
= gen_highpart (word_mode
, dst
);
2324 src_lo
= gen_lowpart (word_mode
, src
);
2325 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
2327 /* At most one pairing may overlap. */
2328 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
2330 aarch64_emit_move (dst_hi
, src_hi
);
2331 aarch64_emit_move (dst_lo
, src_lo
);
2335 aarch64_emit_move (dst_lo
, src_lo
);
2336 aarch64_emit_move (dst_hi
, src_hi
);
/* Return true if a 128-bit move needs splitting: everything except an
   FP-to-FP register copy (which a single instruction can do).  */
2341 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
2343 return (! REG_P (src
)
2344 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
2347 /* Split a complex SIMD combine. */
/* Combine SRC1 and SRC2 (both SRC_MODE vectors) into the double-width
   vector DST via the aarch64_simd_combine pattern.  */
2350 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
2352 machine_mode src_mode
= GET_MODE (src1
);
2353 machine_mode dst_mode
= GET_MODE (dst
);
2355 gcc_assert (VECTOR_MODE_P (dst_mode
));
2356 gcc_assert (register_operand (dst
, dst_mode
)
2357 && register_operand (src1
, src_mode
)
2358 && register_operand (src2
, src_mode
));
2360 emit_insn (gen_aarch64_simd_combine (src_mode
, dst
, src1
, src2
));
2364 /* Split a complex SIMD move. */
/* For register-to-register vector moves, defer to the mode-specific
   aarch64_split_simd_mov expander.  */
2367 aarch64_split_simd_move (rtx dst
, rtx src
)
2369 machine_mode src_mode
= GET_MODE (src
);
2370 machine_mode dst_mode
= GET_MODE (dst
);
2372 gcc_assert (VECTOR_MODE_P (dst_mode
));
2374 if (REG_P (dst
) && REG_P (src
))
2376 gcc_assert (VECTOR_MODE_P (src_mode
));
2377 emit_insn (gen_aarch64_split_simd_mov (src_mode
, dst
, src
));
/* Return true if constant X (in XMODE) equals constant Y (in YMODE)
   zero-extended to XMODE.  */
2382 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
2383 machine_mode ymode
, rtx y
)
/* Constant-fold the extension; it must succeed for constants.  */
2385 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
2386 gcc_assert (r
!= NULL
);
2387 return rtx_equal_p (x
, r
);
/* Put VALUE into a register: a fresh pseudo when allowed, otherwise
   reuse the fixed scratch register X.  */
2392 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
2394 if (can_create_pseudo_p ())
2395 return force_reg (mode
, value
);
2399 aarch64_emit_move (x
, value
);
2404 /* Return true if we can move VALUE into a register using a single
2405 CNT[BHWD] instruction. */
2408 aarch64_sve_cnt_immediate_p (poly_int64 value
)
2410 HOST_WIDE_INT factor
= value
.coeffs
[0];
2411 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
/* Both poly coefficients must match (value scales with VL), be even,
   in range, and be at most 16 times their lowest set bit.  */
2412 return (value
.coeffs
[1] == factor
2413 && IN_RANGE (factor
, 2, 16 * 16)
2414 && (factor
& 1) == 0
2415 && factor
<= 16 * (factor
& -factor
));
2418 /* Likewise for rtx X. */
2421 aarch64_sve_cnt_immediate_p (rtx x
)
2424 return poly_int_rtx_p (x
, &value
) && aarch64_sve_cnt_immediate_p (value
);
2427 /* Return the asm string for an instruction with a CNT-like vector size
2428 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2429 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2430 first part of the operands template (the part that comes before the
2431 vector size itself). FACTOR is the number of quadwords.
2432 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2433 If it is zero, we can use any element size. */
2436 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2437 unsigned int factor
,
2438 unsigned int nelts_per_vq
)
2440 static char buffer
[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2442 if (nelts_per_vq
== 0)
2443 /* There is some overlap in the ranges of the four CNT instructions.
2444 Here we always use the smallest possible element size, so that the
2445 multiplier is 1 whereever possible. */
2446 nelts_per_vq
= factor
& -factor
;
2447 int shift
= std::min (exact_log2 (nelts_per_vq
), 4);
2448 gcc_assert (IN_RANGE (shift
, 1, 4));
2449 char suffix
= "dwhb"[shift
- 1];
2452 unsigned int written
;
2454 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s",
2455 prefix
, suffix
, operands
);
2457 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, all, mul #%d",
2458 prefix
, suffix
, operands
, factor
);
2459 gcc_assert (written
< sizeof (buffer
));
2463 /* Return the asm string for an instruction with a CNT-like vector size
2464 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2465 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2466 first part of the operands template (the part that comes before the
2467 vector size itself). X is the value of the vector size operand,
2468 as a polynomial integer rtx. */
2471 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2474 poly_int64 value
= rtx_to_poly_int64 (x
);
2475 gcc_assert (aarch64_sve_cnt_immediate_p (value
));
2476 return aarch64_output_sve_cnt_immediate (prefix
, operands
,
2477 value
.coeffs
[1], 0);
2480 /* Return true if we can add VALUE to a register using a single ADDVL
2481 or ADDPL instruction. */
2484 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value
)
2486 HOST_WIDE_INT factor
= value
.coeffs
[0];
2487 if (factor
== 0 || value
.coeffs
[1] != factor
)
2489 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2490 and a value of 16 is one vector width. */
2491 return (((factor
& 15) == 0 && IN_RANGE (factor
, -32 * 16, 31 * 16))
2492 || ((factor
& 1) == 0 && IN_RANGE (factor
, -32 * 2, 31 * 2)));
2495 /* Likewise for rtx X. */
2498 aarch64_sve_addvl_addpl_immediate_p (rtx x
)
2501 return (poly_int_rtx_p (x
, &value
)
2502 && aarch64_sve_addvl_addpl_immediate_p (value
));
2505 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2506 and storing the result in operand 0. */
2509 aarch64_output_sve_addvl_addpl (rtx dest
, rtx base
, rtx offset
)
2511 static char buffer
[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2512 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
2513 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value
));
2515 /* Use INC or DEC if possible. */
2516 if (rtx_equal_p (dest
, base
) && GP_REGNUM_P (REGNO (dest
)))
2518 if (aarch64_sve_cnt_immediate_p (offset_value
))
2519 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2520 offset_value
.coeffs
[1], 0);
2521 if (aarch64_sve_cnt_immediate_p (-offset_value
))
2522 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2523 -offset_value
.coeffs
[1], 0);
2526 int factor
= offset_value
.coeffs
[1];
2527 if ((factor
& 15) == 0)
2528 snprintf (buffer
, sizeof (buffer
), "addvl\t%%x0, %%x1, #%d", factor
/ 16);
2530 snprintf (buffer
, sizeof (buffer
), "addpl\t%%x0, %%x1, #%d", factor
/ 2);
2534 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2535 instruction. If it is, store the number of elements in each vector
2536 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2537 factor in *FACTOR_OUT (if nonnull). */
2540 aarch64_sve_inc_dec_immediate_p (rtx x
, int *factor_out
,
2541 unsigned int *nelts_per_vq_out
)
2546 if (!const_vec_duplicate_p (x
, &elt
)
2547 || !poly_int_rtx_p (elt
, &value
))
2550 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x
));
2551 if (nelts_per_vq
!= 8 && nelts_per_vq
!= 4 && nelts_per_vq
!= 2)
2552 /* There's no vector INCB. */
2555 HOST_WIDE_INT factor
= value
.coeffs
[0];
2556 if (value
.coeffs
[1] != factor
)
2559 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2560 if ((factor
% nelts_per_vq
) != 0
2561 || !IN_RANGE (abs (factor
), nelts_per_vq
, 16 * nelts_per_vq
))
2565 *factor_out
= factor
;
2566 if (nelts_per_vq_out
)
2567 *nelts_per_vq_out
= nelts_per_vq
;
2571 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2575 aarch64_sve_inc_dec_immediate_p (rtx x
)
2577 return aarch64_sve_inc_dec_immediate_p (x
, NULL
, NULL
);
2580 /* Return the asm template for an SVE vector INC or DEC instruction.
2581 OPERANDS gives the operands before the vector count and X is the
2582 value of the vector count operand itself. */
2585 aarch64_output_sve_inc_dec_immediate (const char *operands
, rtx x
)
2588 unsigned int nelts_per_vq
;
2589 if (!aarch64_sve_inc_dec_immediate_p (x
, &factor
, &nelts_per_vq
))
2592 return aarch64_output_sve_cnt_immediate ("dec", operands
, -factor
,
2595 return aarch64_output_sve_cnt_immediate ("inc", operands
, factor
,
2600 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
2601 scalar_int_mode mode
)
2604 unsigned HOST_WIDE_INT val
, val2
, mask
;
2605 int one_match
, zero_match
;
2610 if (aarch64_move_imm (val
, mode
))
2613 emit_insn (gen_rtx_SET (dest
, imm
));
2617 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2618 (with XXXX non-zero). In that case check to see if the move can be done in
2620 val2
= val
& 0xffffffff;
2622 && aarch64_move_imm (val2
, SImode
)
2623 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
2626 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
2628 /* Check if we have to emit a second instruction by checking to see
2629 if any of the upper 32 bits of the original DI mode value is set. */
2633 i
= (val
>> 48) ? 48 : 32;
2636 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2637 GEN_INT ((val
>> i
) & 0xffff)));
2642 if ((val
>> 32) == 0 || mode
== SImode
)
2646 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
2648 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
2649 GEN_INT ((val
>> 16) & 0xffff)));
2651 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
2652 GEN_INT ((val
>> 16) & 0xffff)));
2657 /* Remaining cases are all for DImode. */
2660 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
2661 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
2662 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
2663 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
2665 if (zero_match
!= 2 && one_match
!= 2)
2667 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2668 For a 64-bit bitmask try whether changing 16 bits to all ones or
2669 zeroes creates a valid bitmask. To check any repeated bitmask,
2670 try using 16 bits from the other 32-bit half of val. */
2672 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
2675 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2678 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2680 val2
= val2
& ~mask
;
2681 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
2682 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2689 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
2690 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2691 GEN_INT ((val
>> i
) & 0xffff)));
2697 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2698 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2699 otherwise skip zero bits. */
2703 val2
= one_match
> zero_match
? ~val
: val
;
2704 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
2707 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
2708 ? (val
| ~(mask
<< i
))
2709 : (val
& (mask
<< i
)))));
2710 for (i
+= 16; i
< 64; i
+= 16)
2712 if ((val2
& (mask
<< i
)) == 0)
2715 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2716 GEN_INT ((val
>> i
) & 0xffff)));
2723 /* Return whether imm is a 128-bit immediate which is simple enough to
2726 aarch64_mov128_immediate (rtx imm
)
2728 if (GET_CODE (imm
) == CONST_INT
)
2731 gcc_assert (CONST_WIDE_INT_NUNITS (imm
) == 2);
2733 rtx lo
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 0));
2734 rtx hi
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 1));
2736 return aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, DImode
)
2737 + aarch64_internal_mov_immediate (NULL_RTX
, hi
, false, DImode
) <= 4;
2741 /* Return the number of temporary registers that aarch64_add_offset_1
2742 would need to add OFFSET to a register. */
2745 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset
)
2747 return abs_hwi (offset
) < 0x1000000 ? 0 : 1;
2750 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2751 a non-polynomial OFFSET. MODE is the mode of the addition.
2752 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2753 be set and CFA adjustments added to the generated instructions.
2755 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2756 temporary if register allocation is already complete. This temporary
2757 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2758 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2759 the immediate again.
2761 Since this function may be used to adjust the stack pointer, we must
2762 ensure that it cannot cause transient stack deallocation (for example
2763 by first incrementing SP and then decrementing when adjusting by a
2764 large immediate). */
2767 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
2768 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
2769 bool frame_related_p
, bool emit_move_imm
)
2771 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
2772 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
2774 HOST_WIDE_INT moffset
= abs_hwi (offset
);
2779 if (!rtx_equal_p (dest
, src
))
2781 insn
= emit_insn (gen_rtx_SET (dest
, src
));
2782 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2787 /* Single instruction adjustment. */
2788 if (aarch64_uimm12_shift (moffset
))
2790 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
2791 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2795 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2798 a) the offset cannot be loaded by a 16-bit move or
2799 b) there is no spare register into which we can move it. */
2800 if (moffset
< 0x1000000
2801 && ((!temp1
&& !can_create_pseudo_p ())
2802 || !aarch64_move_imm (moffset
, mode
)))
2804 HOST_WIDE_INT low_off
= moffset
& 0xfff;
2806 low_off
= offset
< 0 ? -low_off
: low_off
;
2807 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
2808 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2809 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
2810 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2814 /* Emit a move immediate if required and an addition/subtraction. */
2817 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
2818 temp1
= aarch64_force_temporary (mode
, temp1
, GEN_INT (moffset
));
2820 insn
= emit_insn (offset
< 0
2821 ? gen_sub3_insn (dest
, src
, temp1
)
2822 : gen_add3_insn (dest
, src
, temp1
));
2823 if (frame_related_p
)
2825 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2826 rtx adj
= plus_constant (mode
, src
, offset
);
2827 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
2831 /* Return the number of temporary registers that aarch64_add_offset
2832 would need to move OFFSET into a register or add OFFSET to a register;
2833 ADD_P is true if we want the latter rather than the former. */
2836 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
2838 /* This follows the same structure as aarch64_add_offset. */
2839 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
2842 unsigned int count
= 0;
2843 HOST_WIDE_INT factor
= offset
.coeffs
[1];
2844 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
2845 poly_int64
poly_offset (factor
, factor
);
2846 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
2847 /* Need one register for the ADDVL/ADDPL result. */
2849 else if (factor
!= 0)
2851 factor
= abs (factor
);
2852 if (factor
> 16 * (factor
& -factor
))
2853 /* Need one register for the CNT result and one for the multiplication
2854 factor. If necessary, the second temporary can be reused for the
2855 constant part of the offset. */
2857 /* Need one register for the CNT result (which might then
2861 return count
+ aarch64_add_offset_1_temporaries (constant
);
2864 /* If X can be represented as a poly_int64, return the number
2865 of temporaries that are required to add it to a register.
2866 Return -1 otherwise. */
2869 aarch64_add_offset_temporaries (rtx x
)
2872 if (!poly_int_rtx_p (x
, &offset
))
2874 return aarch64_offset_temporaries (true, offset
);
2877 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2878 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2879 be set and CFA adjustments added to the generated instructions.
2881 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2882 temporary if register allocation is already complete. This temporary
2883 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2884 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2885 false to avoid emitting the immediate again.
2887 TEMP2, if nonnull, is a second temporary register that doesn't
2888 overlap either DEST or REG.
2890 Since this function may be used to adjust the stack pointer, we must
2891 ensure that it cannot cause transient stack deallocation (for example
2892 by first incrementing SP and then decrementing when adjusting by a
2893 large immediate). */
2896 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
2897 poly_int64 offset
, rtx temp1
, rtx temp2
,
2898 bool frame_related_p
, bool emit_move_imm
= true)
2900 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
2901 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
2902 gcc_assert (temp1
== NULL_RTX
2904 || !reg_overlap_mentioned_p (temp1
, dest
));
2905 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
2907 /* Try using ADDVL or ADDPL to add the whole value. */
2908 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
2910 rtx offset_rtx
= gen_int_mode (offset
, mode
);
2911 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2912 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2916 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2917 SVE vector register, over and above the minimum size of 128 bits.
2918 This is equivalent to half the value returned by CNTD with a
2919 vector shape of ALL. */
2920 HOST_WIDE_INT factor
= offset
.coeffs
[1];
2921 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
2923 /* Try using ADDVL or ADDPL to add the VG-based part. */
2924 poly_int64
poly_offset (factor
, factor
);
2925 if (src
!= const0_rtx
2926 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
2928 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
2929 if (frame_related_p
)
2931 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2932 RTX_FRAME_RELATED_P (insn
) = true;
2937 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
2938 src
= aarch64_force_temporary (mode
, temp1
, addr
);
2943 /* Otherwise use a CNT-based sequence. */
2944 else if (factor
!= 0)
2946 /* Use a subtraction if we have a negative factor. */
2947 rtx_code code
= PLUS
;
2954 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2955 into the multiplication. */
2959 /* Use a right shift by 1. */
2963 HOST_WIDE_INT low_bit
= factor
& -factor
;
2964 if (factor
<= 16 * low_bit
)
2966 if (factor
> 16 * 8)
2968 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2969 the value with the minimum multiplier and shift it into
2971 int extra_shift
= exact_log2 (low_bit
);
2972 shift
+= extra_shift
;
2973 factor
>>= extra_shift
;
2975 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
2979 /* Use CNTD, then multiply it by FACTOR. */
2980 val
= gen_int_mode (poly_int64 (2, 2), mode
);
2981 val
= aarch64_force_temporary (mode
, temp1
, val
);
2983 /* Go back to using a negative multiplication factor if we have
2984 no register from which to subtract. */
2985 if (code
== MINUS
&& src
== const0_rtx
)
2990 rtx coeff1
= gen_int_mode (factor
, mode
);
2991 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
2992 val
= gen_rtx_MULT (mode
, val
, coeff1
);
2997 /* Multiply by 1 << SHIFT. */
2998 val
= aarch64_force_temporary (mode
, temp1
, val
);
2999 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
3001 else if (shift
== -1)
3004 val
= aarch64_force_temporary (mode
, temp1
, val
);
3005 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
3008 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3009 if (src
!= const0_rtx
)
3011 val
= aarch64_force_temporary (mode
, temp1
, val
);
3012 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
3014 else if (code
== MINUS
)
3016 val
= aarch64_force_temporary (mode
, temp1
, val
);
3017 val
= gen_rtx_NEG (mode
, val
);
3020 if (constant
== 0 || frame_related_p
)
3022 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
3023 if (frame_related_p
)
3025 RTX_FRAME_RELATED_P (insn
) = true;
3026 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
3027 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
3036 src
= aarch64_force_temporary (mode
, temp1
, val
);
3041 emit_move_imm
= true;
3044 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
3045 frame_related_p
, emit_move_imm
);
3048 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3049 than a poly_int64. */
3052 aarch64_split_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
3053 rtx offset_rtx
, rtx temp1
, rtx temp2
)
3055 aarch64_add_offset (mode
, dest
, src
, rtx_to_poly_int64 (offset_rtx
),
3056 temp1
, temp2
, false);
3059 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3060 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3061 if TEMP1 already contains abs (DELTA). */
3064 aarch64_add_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool emit_move_imm
)
3066 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, delta
,
3067 temp1
, temp2
, true, emit_move_imm
);
3070 /* Subtract DELTA from the stack pointer, marking the instructions
3071 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3075 aarch64_sub_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool frame_related_p
,
3076 bool emit_move_imm
= true)
3078 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, -delta
,
3079 temp1
, temp2
, frame_related_p
, emit_move_imm
);
3082 /* Set DEST to (vec_series BASE STEP). */
3085 aarch64_expand_vec_series (rtx dest
, rtx base
, rtx step
)
3087 machine_mode mode
= GET_MODE (dest
);
3088 scalar_mode inner
= GET_MODE_INNER (mode
);
3090 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3091 if (!aarch64_sve_index_immediate_p (base
))
3092 base
= force_reg (inner
, base
);
3093 if (!aarch64_sve_index_immediate_p (step
))
3094 step
= force_reg (inner
, step
);
3096 emit_set_insn (dest
, gen_rtx_VEC_SERIES (mode
, base
, step
));
3099 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3100 integer of mode INT_MODE. Return true on success. */
3103 aarch64_expand_sve_widened_duplicate (rtx dest
, scalar_int_mode src_mode
,
3106 /* If the constant is smaller than 128 bits, we can do the move
3107 using a vector of SRC_MODEs. */
3108 if (src_mode
!= TImode
)
3110 poly_uint64 count
= exact_div (GET_MODE_SIZE (GET_MODE (dest
)),
3111 GET_MODE_SIZE (src_mode
));
3112 machine_mode dup_mode
= mode_for_vector (src_mode
, count
).require ();
3113 emit_move_insn (gen_lowpart (dup_mode
, dest
),
3114 gen_const_vec_duplicate (dup_mode
, src
));
3118 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3119 src
= force_const_mem (src_mode
, src
);
3123 /* Make sure that the address is legitimate. */
3124 if (!aarch64_sve_ld1r_operand_p (src
))
3126 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
3127 src
= replace_equiv_address (src
, addr
);
3130 machine_mode mode
= GET_MODE (dest
);
3131 unsigned int elem_bytes
= GET_MODE_UNIT_SIZE (mode
);
3132 machine_mode pred_mode
= aarch64_sve_pred_mode (elem_bytes
).require ();
3133 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
3134 src
= gen_rtx_UNSPEC (mode
, gen_rtvec (2, ptrue
, src
), UNSPEC_LD1RQ
);
3135 emit_insn (gen_rtx_SET (dest
, src
));
3139 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3140 isn't a simple duplicate or series. */
3143 aarch64_expand_sve_const_vector (rtx dest
, rtx src
)
3145 machine_mode mode
= GET_MODE (src
);
3146 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
3147 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
3148 gcc_assert (npatterns
> 1);
3150 if (nelts_per_pattern
== 1)
3152 /* The constant is a repeating seqeuence of at least two elements,
3153 where the repeating elements occupy no more than 128 bits.
3154 Get an integer representation of the replicated value. */
3155 scalar_int_mode int_mode
;
3156 if (BYTES_BIG_ENDIAN
)
3157 /* For now, always use LD1RQ to load the value on big-endian
3158 targets, since the handling of smaller integers includes a
3159 subreg that is semantically an element reverse. */
3163 unsigned int int_bits
= GET_MODE_UNIT_BITSIZE (mode
) * npatterns
;
3164 gcc_assert (int_bits
<= 128);
3165 int_mode
= int_mode_for_size (int_bits
, 0).require ();
3167 rtx int_value
= simplify_gen_subreg (int_mode
, src
, mode
, 0);
3169 && aarch64_expand_sve_widened_duplicate (dest
, int_mode
, int_value
))
3173 /* Expand each pattern individually. */
3174 rtx_vector_builder builder
;
3175 auto_vec
<rtx
, 16> vectors (npatterns
);
3176 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3178 builder
.new_vector (mode
, 1, nelts_per_pattern
);
3179 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
3180 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
3181 vectors
.quick_push (force_reg (mode
, builder
.build ()));
3184 /* Use permutes to interleave the separate vectors. */
3185 while (npatterns
> 1)
3188 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3190 rtx tmp
= (npatterns
== 1 ? dest
: gen_reg_rtx (mode
));
3191 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
3192 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
3196 gcc_assert (vectors
[0] == dest
);
3199 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3200 is a pattern that can be used to set DEST to a replicated scalar
3204 aarch64_expand_mov_immediate (rtx dest
, rtx imm
,
3205 rtx (*gen_vec_duplicate
) (rtx
, rtx
))
3207 machine_mode mode
= GET_MODE (dest
);
3209 /* Check on what type of symbol it is. */
3210 scalar_int_mode int_mode
;
3211 if ((GET_CODE (imm
) == SYMBOL_REF
3212 || GET_CODE (imm
) == LABEL_REF
3213 || GET_CODE (imm
) == CONST
3214 || GET_CODE (imm
) == CONST_POLY_INT
)
3215 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
3219 HOST_WIDE_INT const_offset
;
3220 enum aarch64_symbol_type sty
;
3222 /* If we have (const (plus symbol offset)), separate out the offset
3223 before we start classifying the symbol. */
3224 rtx base
= strip_offset (imm
, &offset
);
3226 /* We must always add an offset involving VL separately, rather than
3227 folding it into the relocation. */
3228 if (!offset
.is_constant (&const_offset
))
3230 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
3231 emit_insn (gen_rtx_SET (dest
, imm
));
3234 /* Do arithmetic on 32-bit values if the result is smaller
3236 if (partial_subreg_p (int_mode
, SImode
))
3238 /* It is invalid to do symbol calculations in modes
3239 narrower than SImode. */
3240 gcc_assert (base
== const0_rtx
);
3241 dest
= gen_lowpart (SImode
, dest
);
3244 if (base
!= const0_rtx
)
3246 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3247 aarch64_add_offset (int_mode
, dest
, base
, offset
,
3248 NULL_RTX
, NULL_RTX
, false);
3251 aarch64_add_offset (int_mode
, dest
, base
, offset
,
3252 dest
, NULL_RTX
, false);
3257 sty
= aarch64_classify_symbol (base
, const_offset
);
3260 case SYMBOL_FORCE_TO_MEM
:
3261 if (const_offset
!= 0
3262 && targetm
.cannot_force_const_mem (int_mode
, imm
))
3264 gcc_assert (can_create_pseudo_p ());
3265 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3266 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
3267 NULL_RTX
, NULL_RTX
, false);
3271 mem
= force_const_mem (ptr_mode
, imm
);
3274 /* If we aren't generating PC relative literals, then
3275 we need to expand the literal pool access carefully.
3276 This is something that needs to be done in a number
3277 of places, so could well live as a separate function. */
3278 if (!aarch64_pcrelative_literal_loads
)
3280 gcc_assert (can_create_pseudo_p ());
3281 base
= gen_reg_rtx (ptr_mode
);
3282 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
3283 if (ptr_mode
!= Pmode
)
3284 base
= convert_memory_address (Pmode
, base
);
3285 mem
= gen_rtx_MEM (ptr_mode
, base
);
3288 if (int_mode
!= ptr_mode
)
3289 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
3291 emit_insn (gen_rtx_SET (dest
, mem
));
3295 case SYMBOL_SMALL_TLSGD
:
3296 case SYMBOL_SMALL_TLSDESC
:
3297 case SYMBOL_SMALL_TLSIE
:
3298 case SYMBOL_SMALL_GOT_28K
:
3299 case SYMBOL_SMALL_GOT_4G
:
3300 case SYMBOL_TINY_GOT
:
3301 case SYMBOL_TINY_TLSIE
:
3302 if (const_offset
!= 0)
3304 gcc_assert(can_create_pseudo_p ());
3305 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3306 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
3307 NULL_RTX
, NULL_RTX
, false);
3312 case SYMBOL_SMALL_ABSOLUTE
:
3313 case SYMBOL_TINY_ABSOLUTE
:
3314 case SYMBOL_TLSLE12
:
3315 case SYMBOL_TLSLE24
:
3316 case SYMBOL_TLSLE32
:
3317 case SYMBOL_TLSLE48
:
3318 aarch64_load_symref_appropriately (dest
, imm
, sty
);
3326 if (!CONST_INT_P (imm
))
3328 rtx base
, step
, value
;
3329 if (GET_CODE (imm
) == HIGH
3330 || aarch64_simd_valid_immediate (imm
, NULL
))
3331 emit_insn (gen_rtx_SET (dest
, imm
));
3332 else if (const_vec_series_p (imm
, &base
, &step
))
3333 aarch64_expand_vec_series (dest
, base
, step
);
3334 else if (const_vec_duplicate_p (imm
, &value
))
3336 /* If the constant is out of range of an SVE vector move,
3337 load it from memory if we can, otherwise move it into
3338 a register and use a DUP. */
3339 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
3340 rtx op
= force_const_mem (inner_mode
, value
);
3342 op
= force_reg (inner_mode
, value
);
3343 else if (!aarch64_sve_ld1r_operand_p (op
))
3345 rtx addr
= force_reg (Pmode
, XEXP (op
, 0));
3346 op
= replace_equiv_address (op
, addr
);
3348 emit_insn (gen_vec_duplicate (dest
, op
));
3350 else if (GET_CODE (imm
) == CONST_VECTOR
3351 && !GET_MODE_NUNITS (GET_MODE (imm
)).is_constant ())
3352 aarch64_expand_sve_const_vector (dest
, imm
);
3355 rtx mem
= force_const_mem (mode
, imm
);
3357 emit_move_insn (dest
, mem
);
3363 aarch64_internal_mov_immediate (dest
, imm
, true,
3364 as_a
<scalar_int_mode
> (mode
));
3367 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3368 that is known to contain PTRUE. */
3371 aarch64_emit_sve_pred_move (rtx dest
, rtx pred
, rtx src
)
3373 emit_insn (gen_rtx_SET (dest
, gen_rtx_UNSPEC (GET_MODE (dest
),
3374 gen_rtvec (2, pred
, src
),
3375 UNSPEC_MERGE_PTRUE
)));
3378 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3379 operand is in memory. In this case we need to use the predicated LD1
3380 and ST1 instead of LDR and STR, both for correctness on big-endian
3381 targets and because LD1 and ST1 support a wider range of addressing modes.
3382 PRED_MODE is the mode of the predicate.
3384 See the comment at the head of aarch64-sve.md for details about the
3385 big-endian handling. */
3388 aarch64_expand_sve_mem_move (rtx dest
, rtx src
, machine_mode pred_mode
)
3390 machine_mode mode
= GET_MODE (dest
);
3391 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
3392 if (!register_operand (src
, mode
)
3393 && !register_operand (dest
, mode
))
3395 rtx tmp
= gen_reg_rtx (mode
);
3397 aarch64_emit_sve_pred_move (tmp
, ptrue
, src
);
3399 emit_move_insn (tmp
, src
);
3402 aarch64_emit_sve_pred_move (dest
, ptrue
, src
);
3405 /* Called only on big-endian targets. See whether an SVE vector move
3406 from SRC to DEST is effectively a REV[BHW] instruction, because at
3407 least one operand is a subreg of an SVE vector that has wider or
3408 narrower elements. Return true and emit the instruction if so.
3412 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3414 represents a VIEW_CONVERT between the following vectors, viewed
3417 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3418 R1: { [0], [1], [2], [3], ... }
3420 The high part of lane X in R2 should therefore correspond to lane X*2
3421 of R1, but the register representations are:
3424 R2: ...... [1].high [1].low [0].high [0].low
3425 R1: ...... [3] [2] [1] [0]
3427 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3428 We therefore need a reverse operation to swap the high and low values
3431 This is purely an optimization. Without it we would spill the
3432 subreg operand to the stack in one mode and reload it in the
3433 other mode, which has the same effect as the REV. */
3436 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
3438 gcc_assert (BYTES_BIG_ENDIAN
);
3439 if (GET_CODE (dest
) == SUBREG
)
3440 dest
= SUBREG_REG (dest
);
3441 if (GET_CODE (src
) == SUBREG
)
3442 src
= SUBREG_REG (src
);
3444 /* The optimization handles two single SVE REGs with different element
3448 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
3449 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
3450 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
3451 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
3454 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3455 rtx ptrue
= force_reg (VNx16BImode
, CONSTM1_RTX (VNx16BImode
));
3456 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
3458 emit_insn (gen_rtx_SET (dest
, unspec
));
3462 /* Return a copy of X with mode MODE, without changing its other
3463 attributes. Unlike gen_lowpart, this doesn't care whether the
3464 mode change is valid. */
3467 aarch64_replace_reg_mode (rtx x
, machine_mode mode
)
3469 if (GET_MODE (x
) == mode
)
3472 x
= shallow_copy_rtx (x
);
3473 set_mode_and_regno (x
, mode
, REGNO (x
));
3477 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3481 aarch64_split_sve_subreg_move (rtx dest
, rtx ptrue
, rtx src
)
3483 /* Decide which REV operation we need. The mode with narrower elements
3484 determines the mode of the operands and the mode with the wider
3485 elements determines the reverse width. */
3486 machine_mode mode_with_wider_elts
= GET_MODE (dest
);
3487 machine_mode mode_with_narrower_elts
= GET_MODE (src
);
3488 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts
)
3489 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts
))
3490 std::swap (mode_with_wider_elts
, mode_with_narrower_elts
);
3492 unsigned int wider_bytes
= GET_MODE_UNIT_SIZE (mode_with_wider_elts
);
3493 unsigned int unspec
;
3494 if (wider_bytes
== 8)
3495 unspec
= UNSPEC_REV64
;
3496 else if (wider_bytes
== 4)
3497 unspec
= UNSPEC_REV32
;
3498 else if (wider_bytes
== 2)
3499 unspec
= UNSPEC_REV16
;
3502 machine_mode pred_mode
= aarch64_sve_pred_mode (wider_bytes
).require ();
3506 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3507 UNSPEC_MERGE_PTRUE))
3509 with the appropriate modes. */
3510 ptrue
= gen_lowpart (pred_mode
, ptrue
);
3511 dest
= aarch64_replace_reg_mode (dest
, mode_with_narrower_elts
);
3512 src
= aarch64_replace_reg_mode (src
, mode_with_narrower_elts
);
3513 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (1, src
), unspec
);
3514 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (2, ptrue
, src
),
3515 UNSPEC_MERGE_PTRUE
);
3516 emit_insn (gen_rtx_SET (dest
, src
));
3520 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
3521 tree exp ATTRIBUTE_UNUSED
)
3523 if (aarch64_simd_decl_p (cfun
->decl
) != aarch64_simd_decl_p (decl
))
3529 /* Implement TARGET_PASS_BY_REFERENCE. */
3532 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
3535 bool named ATTRIBUTE_UNUSED
)
3538 machine_mode dummymode
;
3541 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3542 if (mode
== BLKmode
&& type
)
3543 size
= int_size_in_bytes (type
);
3545 /* No frontends can create types with variable-sized modes, so we
3546 shouldn't be asked to pass or return them. */
3547 size
= GET_MODE_SIZE (mode
).to_constant ();
3549 /* Aggregates are passed by reference based on their size. */
3550 if (type
&& AGGREGATE_TYPE_P (type
))
3552 size
= int_size_in_bytes (type
);
3555 /* Variable sized arguments are always returned by reference. */
3559 /* Can this be a candidate to be passed in fp/simd register(s)? */
3560 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
3565 /* Arguments which are variable sized or larger than 2 registers are
3566 passed by reference unless they are a homogenous floating point
3568 return size
> 2 * UNITS_PER_WORD
;
3571 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3573 aarch64_return_in_msb (const_tree valtype
)
3575 machine_mode dummy_mode
;
3578 /* Never happens in little-endian mode. */
3579 if (!BYTES_BIG_ENDIAN
)
3582 /* Only composite types smaller than or equal to 16 bytes can
3583 be potentially returned in registers. */
3584 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
3585 || int_size_in_bytes (valtype
) <= 0
3586 || int_size_in_bytes (valtype
) > 16)
3589 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3590 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3591 is always passed/returned in the least significant bits of fp/simd
3593 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
3594 &dummy_mode
, &dummy_int
, NULL
))
3600 /* Implement TARGET_FUNCTION_VALUE.
3601 Define how to find the value returned by a function. */
3604 aarch64_function_value (const_tree type
, const_tree func
,
3605 bool outgoing ATTRIBUTE_UNUSED
)
3610 machine_mode ag_mode
;
3612 mode
= TYPE_MODE (type
);
3613 if (INTEGRAL_TYPE_P (type
))
3614 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
3616 if (aarch64_return_in_msb (type
))
3618 HOST_WIDE_INT size
= int_size_in_bytes (type
);
3620 if (size
% UNITS_PER_WORD
!= 0)
3622 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
3623 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
3627 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
3628 &ag_mode
, &count
, NULL
))
3630 if (!aarch64_composite_type_p (type
, mode
))
3632 gcc_assert (count
== 1 && mode
== ag_mode
);
3633 return gen_rtx_REG (mode
, V0_REGNUM
);
3640 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
3641 for (i
= 0; i
< count
; i
++)
3643 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
3644 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
3645 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3646 XVECEXP (par
, 0, i
) = tmp
;
3652 return gen_rtx_REG (mode
, R0_REGNUM
);
3655 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3656 Return true if REGNO is the number of a hard register in which the values
3657 of called function may come back. */
3660 aarch64_function_value_regno_p (const unsigned int regno
)
3662 /* Maximum of 16 bytes can be returned in the general registers. Examples
3663 of 16-byte return values are: 128-bit integers and 16-byte small
3664 structures (excluding homogeneous floating-point aggregates). */
3665 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
3668 /* Up to four fp/simd registers can return a function value, e.g. a
3669 homogeneous floating-point aggregate having four members. */
3670 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
3671 return TARGET_FLOAT
;
3676 /* Implement TARGET_RETURN_IN_MEMORY.
3678 If the type T of the result of a function is such that
3680 would require that arg be passed as a value in a register (or set of
3681 registers) according to the parameter passing rules, then the result
3682 is returned in the same registers as would be used for such an
3686 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
3689 machine_mode ag_mode
;
3692 if (!AGGREGATE_TYPE_P (type
)
3693 && TREE_CODE (type
) != COMPLEX_TYPE
3694 && TREE_CODE (type
) != VECTOR_TYPE
)
3695 /* Simple scalar types always returned in registers. */
3698 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
3705 /* Types larger than 2 registers returned in memory. */
3706 size
= int_size_in_bytes (type
);
3707 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
3711 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
3712 const_tree type
, int *nregs
)
3714 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3715 return aarch64_vfp_is_call_or_return_candidate (mode
,
3717 &pcum
->aapcs_vfp_rmode
,
3722 /* Given MODE and TYPE of a function argument, return the alignment in
3723 bits. The idea is to suppress any stronger alignment requested by
3724 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3725 This is a helper function for local use only. */
3728 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
)
3731 return GET_MODE_ALIGNMENT (mode
);
3733 if (integer_zerop (TYPE_SIZE (type
)))
3736 gcc_assert (TYPE_MODE (type
) == mode
);
3738 if (!AGGREGATE_TYPE_P (type
))
3739 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
3741 if (TREE_CODE (type
) == ARRAY_TYPE
)
3742 return TYPE_ALIGN (TREE_TYPE (type
));
3744 unsigned int alignment
= 0;
3745 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
3746 if (TREE_CODE (field
) == FIELD_DECL
)
3747 alignment
= std::max (alignment
, DECL_ALIGN (field
));
3752 /* Layout a function argument according to the AAPCS64 rules. The rule
3753 numbers refer to the rule numbers in the AAPCS64. */
3756 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
3758 bool named ATTRIBUTE_UNUSED
)
3760 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3761 int ncrn
, nvrn
, nregs
;
3762 bool allocate_ncrn
, allocate_nvrn
;
3765 /* We need to do this once per argument. */
3766 if (pcum
->aapcs_arg_processed
)
3769 pcum
->aapcs_arg_processed
= true;
3771 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3773 size
= int_size_in_bytes (type
);
3775 /* No frontends can create types with variable-sized modes, so we
3776 shouldn't be asked to pass or return them. */
3777 size
= GET_MODE_SIZE (mode
).to_constant ();
3778 size
= ROUND_UP (size
, UNITS_PER_WORD
);
3780 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
3781 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
3786 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3787 The following code thus handles passing by SIMD/FP registers first. */
3789 nvrn
= pcum
->aapcs_nvrn
;
3791 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
3792 and homogenous short-vector aggregates (HVA). */
3796 aarch64_err_no_fpadvsimd (mode
);
3798 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
3800 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
3801 if (!aarch64_composite_type_p (type
, mode
))
3803 gcc_assert (nregs
== 1);
3804 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
3810 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3811 for (i
= 0; i
< nregs
; i
++)
3813 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
3814 V0_REGNUM
+ nvrn
+ i
);
3815 rtx offset
= gen_int_mode
3816 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
3817 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3818 XVECEXP (par
, 0, i
) = tmp
;
3820 pcum
->aapcs_reg
= par
;
3826 /* C.3 NSRN is set to 8. */
3827 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
3832 ncrn
= pcum
->aapcs_ncrn
;
3833 nregs
= size
/ UNITS_PER_WORD
;
3835 /* C6 - C9. though the sign and zero extension semantics are
3836 handled elsewhere. This is the case where the argument fits
3837 entirely general registers. */
3838 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
3841 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
3843 /* C.8 if the argument has an alignment of 16 then the NGRN is
3844 rounded up to the next even number. */
3847 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3848 comparison is there because for > 16 * BITS_PER_UNIT
3849 alignment nregs should be > 2 and therefore it should be
3850 passed by reference rather than value. */
3851 && aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
3854 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
3857 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3858 A reg is still generated for it, but the caller should be smart
3859 enough not to use it. */
3860 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
3861 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
3867 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3868 for (i
= 0; i
< nregs
; i
++)
3870 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
3871 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
3872 GEN_INT (i
* UNITS_PER_WORD
));
3873 XVECEXP (par
, 0, i
) = tmp
;
3875 pcum
->aapcs_reg
= par
;
3878 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
3883 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
3885 /* The argument is passed on stack; record the needed number of words for
3886 this argument and align the total size if necessary. */
3888 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
3890 if (aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
3891 pcum
->aapcs_stack_size
= ROUND_UP (pcum
->aapcs_stack_size
,
3892 16 / UNITS_PER_WORD
);
3896 /* Implement TARGET_FUNCTION_ARG. */
3899 aarch64_function_arg (cumulative_args_t pcum_v
, machine_mode mode
,
3900 const_tree type
, bool named
)
3902 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3903 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
3905 if (mode
== VOIDmode
)
3908 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
3909 return pcum
->aapcs_reg
;
3913 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
3914 const_tree fntype ATTRIBUTE_UNUSED
,
3915 rtx libname ATTRIBUTE_UNUSED
,
3916 const_tree fndecl ATTRIBUTE_UNUSED
,
3917 unsigned n_named ATTRIBUTE_UNUSED
)
3919 pcum
->aapcs_ncrn
= 0;
3920 pcum
->aapcs_nvrn
= 0;
3921 pcum
->aapcs_nextncrn
= 0;
3922 pcum
->aapcs_nextnvrn
= 0;
3923 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
3924 pcum
->aapcs_reg
= NULL_RTX
;
3925 pcum
->aapcs_arg_processed
= false;
3926 pcum
->aapcs_stack_words
= 0;
3927 pcum
->aapcs_stack_size
= 0;
3930 && fndecl
&& TREE_PUBLIC (fndecl
)
3931 && fntype
&& fntype
!= error_mark_node
)
3933 const_tree type
= TREE_TYPE (fntype
);
3934 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
3935 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
3936 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
3937 &mode
, &nregs
, NULL
))
3938 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
3944 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
3949 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3950 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
3952 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
3953 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
3954 != (pcum
->aapcs_stack_words
!= 0));
3955 pcum
->aapcs_arg_processed
= false;
3956 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
3957 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
3958 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
3959 pcum
->aapcs_stack_words
= 0;
3960 pcum
->aapcs_reg
= NULL_RTX
;
3965 aarch64_function_arg_regno_p (unsigned regno
)
3967 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
3968 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
3971 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3972 PARM_BOUNDARY bits of alignment, but will be given anything up
3973 to STACK_BOUNDARY bits if the type requires it. This makes sure
3974 that both before and after the layout of each argument, the Next
3975 Stacked Argument Address (NSAA) will have a minimum alignment of
3979 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
3981 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
);
3982 return MIN (MAX (alignment
, PARM_BOUNDARY
), STACK_BOUNDARY
);
3985 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3987 static fixed_size_mode
3988 aarch64_get_reg_raw_mode (int regno
)
3990 if (TARGET_SVE
&& FP_REGNUM_P (regno
))
3991 /* Don't use the SVE part of the register for __builtin_apply and
3992 __builtin_return. The SVE registers aren't used by the normal PCS,
3993 so using them there would be a waste of time. The PCS extensions
3994 for SVE types are fundamentally incompatible with the
3995 __builtin_return/__builtin_apply interface. */
3996 return as_a
<fixed_size_mode
> (V16QImode
);
3997 return default_get_reg_raw_mode (regno
);
4000 /* Implement TARGET_FUNCTION_ARG_PADDING.
4002 Small aggregate types are placed in the lowest memory address.
4004 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4006 static pad_direction
4007 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
4009 /* On little-endian targets, the least significant byte of every stack
4010 argument is passed at the lowest byte address of the stack slot. */
4011 if (!BYTES_BIG_ENDIAN
)
4014 /* Otherwise, integral, floating-point and pointer types are padded downward:
4015 the least significant byte of a stack argument is passed at the highest
4016 byte address of the stack slot. */
4018 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
4019 || POINTER_TYPE_P (type
))
4020 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
4021 return PAD_DOWNWARD
;
4023 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4027 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4029 It specifies padding for the last (may also be the only)
4030 element of a block move between registers and memory. If
4031 assuming the block is in the memory, padding upward means that
4032 the last element is padded after its highest significant byte,
4033 while in downward padding, the last element is padded at the
4034 its least significant byte side.
4036 Small aggregates and small complex types are always padded
4039 We don't need to worry about homogeneous floating-point or
4040 short-vector aggregates; their move is not affected by the
4041 padding direction determined here. Regardless of endianness,
4042 each element of such an aggregate is put in the least
4043 significant bits of a fp/simd register.
4045 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4046 register has useful data, and return the opposite if the most
4047 significant byte does. */
4050 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
4051 bool first ATTRIBUTE_UNUSED
)
4054 /* Small composite types are always padded upward. */
4055 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
4059 size
= int_size_in_bytes (type
);
4061 /* No frontends can create types with variable-sized modes, so we
4062 shouldn't be asked to pass or return them. */
4063 size
= GET_MODE_SIZE (mode
).to_constant ();
4064 if (size
< 2 * UNITS_PER_WORD
)
4068 /* Otherwise, use the default padding. */
4069 return !BYTES_BIG_ENDIAN
;
4072 static scalar_int_mode
4073 aarch64_libgcc_cmp_return_mode (void)
4078 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4080 /* We use the 12-bit shifted immediate arithmetic instructions so values
4081 must be multiple of (1 << 12), i.e. 4096. */
4082 #define ARITH_FACTOR 4096
4084 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4085 #error Cannot use simple address calculation for stack probing
4088 /* The pair of scratch registers used for stack probing. */
4089 #define PROBE_STACK_FIRST_REG R9_REGNUM
4090 #define PROBE_STACK_SECOND_REG R10_REGNUM
4092 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4093 inclusive. These are offsets from the current stack pointer. */
4096 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
4099 if (!poly_size
.is_constant (&size
))
4101 sorry ("stack probes for SVE frames");
4105 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
4107 /* See the same assertion on PROBE_INTERVAL above. */
4108 gcc_assert ((first
% ARITH_FACTOR
) == 0);
4110 /* See if we have a constant small number of probes to generate. If so,
4111 that's the easy case. */
4112 if (size
<= PROBE_INTERVAL
)
4114 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
4116 emit_set_insn (reg1
,
4117 plus_constant (Pmode
,
4118 stack_pointer_rtx
, -(first
+ base
)));
4119 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
4122 /* The run-time loop is made up of 8 insns in the generic case while the
4123 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
4124 else if (size
<= 4 * PROBE_INTERVAL
)
4126 HOST_WIDE_INT i
, rem
;
4128 emit_set_insn (reg1
,
4129 plus_constant (Pmode
,
4131 -(first
+ PROBE_INTERVAL
)));
4132 emit_stack_probe (reg1
);
4134 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4135 it exceeds SIZE. If only two probes are needed, this will not
4136 generate any code. Then probe at FIRST + SIZE. */
4137 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
4139 emit_set_insn (reg1
,
4140 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
4141 emit_stack_probe (reg1
);
4144 rem
= size
- (i
- PROBE_INTERVAL
);
4147 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
4149 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
4150 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
4153 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
4156 /* Otherwise, do the same as above, but in a loop. Note that we must be
4157 extra careful with variables wrapping around because we might be at
4158 the very top (or the very bottom) of the address space and we have
4159 to be able to handle this case properly; in particular, we use an
4160 equality test for the loop condition. */
4163 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
4165 /* Step 1: round SIZE to the previous multiple of the interval. */
4167 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
4170 /* Step 2: compute initial and final value of the loop counter. */
4172 /* TEST_ADDR = SP + FIRST. */
4173 emit_set_insn (reg1
,
4174 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
4176 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4177 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
4178 if (! aarch64_uimm12_shift (adjustment
))
4180 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
4182 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
4185 emit_set_insn (reg2
,
4186 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
4192 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4195 while (TEST_ADDR != LAST_ADDR)
4197 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4198 until it is equal to ROUNDED_SIZE. */
4200 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
4203 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4204 that SIZE is equal to ROUNDED_SIZE. */
4206 if (size
!= rounded_size
)
4208 HOST_WIDE_INT rem
= size
- rounded_size
;
4212 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
4214 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
4215 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
4218 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
4222 /* Make sure nothing is scheduled before we are done. */
4223 emit_insn (gen_blockage ());
4226 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4227 absolute addresses. */
4230 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
4232 static int labelno
= 0;
4236 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
4239 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
4241 HOST_WIDE_INT stack_clash_probe_interval
4242 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
4244 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4246 HOST_WIDE_INT interval
;
4247 if (flag_stack_clash_protection
)
4248 interval
= stack_clash_probe_interval
;
4250 interval
= PROBE_INTERVAL
;
4252 gcc_assert (aarch64_uimm12_shift (interval
));
4253 xops
[1] = GEN_INT (interval
);
4255 output_asm_insn ("sub\t%0, %0, %1", xops
);
4257 /* If doing stack clash protection then we probe up by the ABI specified
4258 amount. We do this because we're dropping full pages at a time in the
4259 loop. But if we're doing non-stack clash probing, probe at SP 0. */
4260 if (flag_stack_clash_protection
)
4261 xops
[1] = GEN_INT (STACK_CLASH_CALLER_GUARD
);
4263 xops
[1] = CONST0_RTX (GET_MODE (xops
[1]));
4265 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4266 by this amount for each iteration. */
4267 output_asm_insn ("str\txzr, [%0, %1]", xops
);
4269 /* Test if TEST_ADDR == LAST_ADDR. */
4271 output_asm_insn ("cmp\t%0, %1", xops
);
4274 fputs ("\tb.ne\t", asm_out_file
);
4275 assemble_name_raw (asm_out_file
, loop_lab
);
4276 fputc ('\n', asm_out_file
);
4281 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4282 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4283 of GUARD_SIZE. When a probe is emitted it is done at most
4284 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4285 at most MIN_PROBE_THRESHOLD. By the end of this function
4286 BASE = BASE - ADJUSTMENT. */
4289 aarch64_output_probe_sve_stack_clash (rtx base
, rtx adjustment
,
4290 rtx min_probe_threshold
, rtx guard_size
)
4292 /* This function is not allowed to use any instruction generation function
4293 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4294 so instead emit the code you want using output_asm_insn. */
4295 gcc_assert (flag_stack_clash_protection
);
4296 gcc_assert (CONST_INT_P (min_probe_threshold
) && CONST_INT_P (guard_size
));
4297 gcc_assert (INTVAL (guard_size
) > INTVAL (min_probe_threshold
));
4299 /* The minimum required allocation before the residual requires probing. */
4300 HOST_WIDE_INT residual_probe_guard
= INTVAL (min_probe_threshold
);
4302 /* Clamp the value down to the nearest value that can be used with a cmp. */
4303 residual_probe_guard
= aarch64_clamp_to_uimm12_shift (residual_probe_guard
);
4304 rtx probe_offset_value_rtx
= gen_int_mode (residual_probe_guard
, Pmode
);
4306 gcc_assert (INTVAL (min_probe_threshold
) >= residual_probe_guard
);
4307 gcc_assert (aarch64_uimm12_shift (residual_probe_guard
));
4309 static int labelno
= 0;
4310 char loop_start_lab
[32];
4311 char loop_end_lab
[32];
4314 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab
, "SVLPSPL", labelno
);
4315 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab
, "SVLPEND", labelno
++);
4317 /* Emit loop start label. */
4318 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_start_lab
);
4320 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4321 xops
[0] = adjustment
;
4322 xops
[1] = probe_offset_value_rtx
;
4323 output_asm_insn ("cmp\t%0, %1", xops
);
4325 /* Branch to end if not enough adjustment to probe. */
4326 fputs ("\tb.lt\t", asm_out_file
);
4327 assemble_name_raw (asm_out_file
, loop_end_lab
);
4328 fputc ('\n', asm_out_file
);
4330 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4332 xops
[1] = probe_offset_value_rtx
;
4333 output_asm_insn ("sub\t%0, %0, %1", xops
);
4335 /* Probe at BASE. */
4336 xops
[1] = const0_rtx
;
4337 output_asm_insn ("str\txzr, [%0, %1]", xops
);
4339 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4340 xops
[0] = adjustment
;
4341 xops
[1] = probe_offset_value_rtx
;
4342 output_asm_insn ("sub\t%0, %0, %1", xops
);
4344 /* Branch to start if still more bytes to allocate. */
4345 fputs ("\tb\t", asm_out_file
);
4346 assemble_name_raw (asm_out_file
, loop_start_lab
);
4347 fputc ('\n', asm_out_file
);
4349 /* No probe leave. */
4350 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_end_lab
);
4352 /* BASE = BASE - ADJUSTMENT. */
4354 xops
[1] = adjustment
;
4355 output_asm_insn ("sub\t%0, %0, %1", xops
);
4359 /* Determine whether a frame chain needs to be generated. */
4361 aarch64_needs_frame_chain (void)
4363 /* Force a frame chain for EH returns so the return address is at FP+8. */
4364 if (frame_pointer_needed
|| crtl
->calls_eh_return
)
4367 /* A leaf function cannot have calls or write LR. */
4368 bool is_leaf
= crtl
->is_leaf
&& !df_regs_ever_live_p (LR_REGNUM
);
4370 /* Don't use a frame chain in leaf functions if leaf frame pointers
4372 if (flag_omit_leaf_frame_pointer
&& is_leaf
)
4375 return aarch64_use_frame_pointer
;
4378 /* Mark the registers that need to be saved by the callee and calculate
4379 the size of the callee-saved registers area and frame record (both FP
4380 and LR may be omitted). */
4382 aarch64_layout_frame (void)
4384 HOST_WIDE_INT offset
= 0;
4385 int regno
, last_fp_reg
= INVALID_REGNUM
;
4386 bool simd_function
= aarch64_simd_decl_p (cfun
->decl
);
4388 cfun
->machine
->frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
4390 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4391 the mid-end is doing. */
4392 crtl
->outgoing_args_size
= STACK_DYNAMIC_OFFSET (cfun
);
4394 #define SLOT_NOT_REQUIRED (-2)
4395 #define SLOT_REQUIRED (-1)
4397 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
4398 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
4400 /* If this is a non-leaf simd function with calls we assume that
4401 at least one of those calls is to a non-simd function and thus
4402 we must save V8 to V23 in the prologue. */
4404 if (simd_function
&& !crtl
->is_leaf
)
4406 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4407 if (FP_SIMD_SAVED_REGNUM_P (regno
))
4408 df_set_regs_ever_live (regno
, true);
4411 /* First mark all the registers that really need to be saved... */
4412 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4413 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
4415 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4416 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
4418 /* ... that includes the eh data registers (if needed)... */
4419 if (crtl
->calls_eh_return
)
4420 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
4421 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
4424 /* ... and any callee saved register that dataflow says is live. */
4425 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4426 if (df_regs_ever_live_p (regno
)
4427 && (regno
== R30_REGNUM
4428 || !call_used_regs
[regno
]))
4429 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
4431 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4432 if (df_regs_ever_live_p (regno
)
4433 && (!call_used_regs
[regno
]
4434 || (simd_function
&& FP_SIMD_SAVED_REGNUM_P (regno
))))
4436 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
4437 last_fp_reg
= regno
;
4440 if (cfun
->machine
->frame
.emit_frame_chain
)
4442 /* FP and LR are placed in the linkage record. */
4443 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
4444 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
4445 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
4446 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
4447 offset
= 2 * UNITS_PER_WORD
;
4450 /* With stack-clash, LR must be saved in non-leaf functions. */
4451 gcc_assert (crtl
->is_leaf
4452 || (cfun
->machine
->frame
.reg_offset
[R30_REGNUM
]
4453 != SLOT_NOT_REQUIRED
));
4455 /* Now assign stack slots for them. */
4456 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4457 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
4459 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
4460 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
4461 cfun
->machine
->frame
.wb_candidate1
= regno
;
4462 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
)
4463 cfun
->machine
->frame
.wb_candidate2
= regno
;
4464 offset
+= UNITS_PER_WORD
;
4467 HOST_WIDE_INT max_int_offset
= offset
;
4468 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
4469 bool has_align_gap
= offset
!= max_int_offset
;
4471 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4472 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
4474 /* If there is an alignment gap between integer and fp callee-saves,
4475 allocate the last fp register to it if possible. */
4476 if (regno
== last_fp_reg
4479 && (offset
& 8) == 0)
4481 cfun
->machine
->frame
.reg_offset
[regno
] = max_int_offset
;
4485 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
4486 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
4487 cfun
->machine
->frame
.wb_candidate1
= regno
;
4488 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
4489 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
4490 cfun
->machine
->frame
.wb_candidate2
= regno
;
4491 offset
+= simd_function
? UNITS_PER_VREG
: UNITS_PER_WORD
;
4494 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
4496 cfun
->machine
->frame
.saved_regs_size
= offset
;
4498 HOST_WIDE_INT varargs_and_saved_regs_size
4499 = offset
+ cfun
->machine
->frame
.saved_varargs_size
;
4501 cfun
->machine
->frame
.hard_fp_offset
4502 = aligned_upper_bound (varargs_and_saved_regs_size
4503 + get_frame_size (),
4504 STACK_BOUNDARY
/ BITS_PER_UNIT
);
4506 /* Both these values are already aligned. */
4507 gcc_assert (multiple_p (crtl
->outgoing_args_size
,
4508 STACK_BOUNDARY
/ BITS_PER_UNIT
));
4509 cfun
->machine
->frame
.frame_size
4510 = (cfun
->machine
->frame
.hard_fp_offset
4511 + crtl
->outgoing_args_size
);
4513 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.saved_varargs_size
;
4515 cfun
->machine
->frame
.initial_adjust
= 0;
4516 cfun
->machine
->frame
.final_adjust
= 0;
4517 cfun
->machine
->frame
.callee_adjust
= 0;
4518 cfun
->machine
->frame
.callee_offset
= 0;
4520 HOST_WIDE_INT max_push_offset
= 0;
4521 if (cfun
->machine
->frame
.wb_candidate2
!= INVALID_REGNUM
)
4522 max_push_offset
= 512;
4523 else if (cfun
->machine
->frame
.wb_candidate1
!= INVALID_REGNUM
)
4524 max_push_offset
= 256;
4526 HOST_WIDE_INT const_size
, const_fp_offset
;
4527 if (cfun
->machine
->frame
.frame_size
.is_constant (&const_size
)
4528 && const_size
< max_push_offset
4529 && known_eq (crtl
->outgoing_args_size
, 0))
4531 /* Simple, small frame with no outgoing arguments:
4532 stp reg1, reg2, [sp, -frame_size]!
4533 stp reg3, reg4, [sp, 16] */
4534 cfun
->machine
->frame
.callee_adjust
= const_size
;
4536 else if (known_lt (crtl
->outgoing_args_size
4537 + cfun
->machine
->frame
.saved_regs_size
, 512)
4538 && !(cfun
->calls_alloca
4539 && known_lt (cfun
->machine
->frame
.hard_fp_offset
,
4542 /* Frame with small outgoing arguments:
4543 sub sp, sp, frame_size
4544 stp reg1, reg2, [sp, outgoing_args_size]
4545 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4546 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.frame_size
;
4547 cfun
->machine
->frame
.callee_offset
4548 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.hard_fp_offset
;
4550 else if (cfun
->machine
->frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
4551 && const_fp_offset
< max_push_offset
)
4553 /* Frame with large outgoing arguments but a small local area:
4554 stp reg1, reg2, [sp, -hard_fp_offset]!
4555 stp reg3, reg4, [sp, 16]
4556 sub sp, sp, outgoing_args_size */
4557 cfun
->machine
->frame
.callee_adjust
= const_fp_offset
;
4558 cfun
->machine
->frame
.final_adjust
4559 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
4563 /* Frame with large local area and outgoing arguments using frame pointer:
4564 sub sp, sp, hard_fp_offset
4565 stp x29, x30, [sp, 0]
4567 stp reg3, reg4, [sp, 16]
4568 sub sp, sp, outgoing_args_size */
4569 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
4570 cfun
->machine
->frame
.final_adjust
4571 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.initial_adjust
;
4574 cfun
->machine
->frame
.laid_out
= true;
/* Return true if the register REGNO is saved on entry to
   the current function.  A register is saved iff the frame layout
   assigned it a non-negative slot offset (unsaved registers keep a
   sentinel negative offset).  Only valid after the frame is laid out.  */

static bool
aarch64_register_saved_on_entry (int regno)
{
  return cfun->machine->frame.reg_offset[regno] >= 0;
}
/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  Returns LIMIT + 1 if no saved register remains in range.  */

static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  /* Skip registers the frame layout did not allocate a save slot for.  */
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}
/* Push the register number REGNO of mode MODE to the stack with write-back
   adjusting the stack by ADJUSTMENT.  Emits a single pre-modify store
   (e.g. "str reg, [sp, -adjustment]!") and marks it frame-related so the
   unwinder sees the stack adjustment.  */

static void
aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
			   HOST_WIDE_INT adjustment)
{
  rtx base_rtx = stack_pointer_rtx;
  rtx insn, reg, mem;

  reg = gen_rtx_REG (mode, regno);
  /* Pre-modify address: decrement SP by ADJUSTMENT, then store at the new
     SP, in one instruction.  */
  mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
			    plus_constant (Pmode, base_rtx, -adjustment));
  mem = gen_frame_mem (mode, mem);

  insn = emit_move_insn (mem, reg);
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Generate and return an instruction to store the pair of registers
   REG and REG2 of mode MODE to location BASE with write-back adjusting
   the stack location BASE by ADJUSTMENT.  The second offset operand is
   the slot of REG2 relative to the post-adjustment base.  */

static rtx
aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			  HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case E_DImode:
      return gen_storewb_pairdi_di (base, base, reg, reg2,
				    GEN_INT (-adjustment),
				    GEN_INT (UNITS_PER_WORD - adjustment));
    case E_DFmode:
      return gen_storewb_pairdf_di (base, base, reg, reg2,
				    GEN_INT (-adjustment),
				    GEN_INT (UNITS_PER_WORD - adjustment));
    case E_TFmode:
      /* TFmode is used for the full 128-bit vector saves of SIMD
	 functions, hence the vector register slot size.  */
      return gen_storewb_pairtf_di (base, base, reg, reg2,
				    GEN_INT (-adjustment),
				    GEN_INT (UNITS_PER_VREG - adjustment));
    default:
      gcc_unreachable ();
    }
}
/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
   stack pointer by ADJUSTMENT.  If REGNO2 is INVALID_REGNUM only REGNO1
   is pushed (single write-back store); otherwise a write-back store-pair
   is emitted and each register save inside the PARALLEL is marked
   frame-related for correct CFI.  */

static void
aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
{
  rtx_insn *insn;
  machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);

  if (regno2 == INVALID_REGNUM)
    return aarch64_pushwb_single_reg (mode, regno1, adjustment);

  rtx reg1 = gen_rtx_REG (mode, regno1);
  rtx reg2 = gen_rtx_REG (mode, regno2);

  insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
					      reg2, adjustment));
  /* Mark both register stores in the parallel as frame-related; only the
     first part of a parallel is implicitly frame-related.  */
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
   adjusting it by ADJUSTMENT afterwards (post-index load pair).  */

static rtx
aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			 HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case E_DImode:
      return gen_loadwb_pairdi_di (base, base, reg, reg2,
				   GEN_INT (adjustment),
				   GEN_INT (UNITS_PER_WORD));
    case E_DFmode:
      return gen_loadwb_pairdf_di (base, base, reg, reg2,
				   GEN_INT (adjustment),
				   GEN_INT (UNITS_PER_WORD));
    case E_TFmode:
      /* Full vector-register restores for SIMD functions.  */
      return gen_loadwb_pairtf_di (base, base, reg, reg2,
				   GEN_INT (adjustment),
				   GEN_INT (UNITS_PER_VREG));
    default:
      gcc_unreachable ();
    }
}
/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
   afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
   into CFI_OPS.  If REGNO2 is INVALID_REGNUM a single post-modify load is
   used instead of a load pair.  */

static void
aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
		  rtx *cfi_ops)
{
  machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
  rtx reg1 = gen_rtx_REG (mode, regno1);

  /* Record the restore so the caller can attach CFI to the final insn.  */
  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);

  if (regno2 == INVALID_REGNUM)
    {
      /* Post-modify load: read at SP, then bump SP by ADJUSTMENT.  */
      rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
      mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
      emit_move_insn (reg1, gen_frame_mem (mode, mem));
    }
  else
    {
      rtx reg2 = gen_rtx_REG (mode, regno2);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
      emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
					  reg2, adjustment));
    }
}
/* Generate and return a store pair instruction of mode MODE to store
   register REG1 to MEM1 and register REG2 to MEM2.  */

static rtx
aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
			rtx reg2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);

    case E_DFmode:
      return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);

    case E_TFmode:
      return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);

    default:
      gcc_unreachable ();
    }
}
/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */

static rtx
aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
		       rtx mem2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);

    case E_DFmode:
      return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);

    case E_TFmode:
      return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);

    default:
      gcc_unreachable ();
    }
}
/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */

bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after the frame is laid out,
     because the decision depends on whether LR has a save slot.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
     function if its LR is pushed onto the stack (reg_offset >= 0 means LR
     has a save slot, so the return address lives in memory and needs
     protecting).  */
  return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
	  || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
	      && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
}
/* Return TRUE if the Branch Target Identification Mechanism is enabled
   (-mbranch-protection=bti or equivalent).  */

bool
aarch64_bti_enabled (void)
{
  return (aarch64_enable_bti == 1);
}
/* Emit code to save the callee-saved registers from register number START
   to LIMIT to the stack at the location starting at offset START_OFFSET,
   skipping any write-back candidates if SKIP_WB is true.  Registers handled
   by separate shrink-wrapping are skipped.  Adjacent saves of the right
   size are merged into STP pairs.  */

static void
aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
			   unsigned start, unsigned limit, bool skip_wb)
{
  rtx_insn *insn;
  unsigned regno;
  unsigned regno2;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      rtx reg, mem;
      poly_int64 offset;
      HOST_WIDE_INT offset_diff;

      /* The write-back candidates are stored by the push with adjustment;
	 don't store them again here.  */
      if (skip_wb
	  && (regno == cfun->machine->frame.wb_candidate1
	      || regno == cfun->machine->frame.wb_candidate2))
	continue;

      /* Separately shrink-wrapped registers are saved elsewhere.  */
      if (cfun->machine->reg_is_wrapped_separately[regno])
	continue;

      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
						offset));

      regno2 = aarch64_next_callee_save (regno + 1, limit);
      offset_diff = cfun->machine->frame.reg_offset[regno2]
		    - cfun->machine->frame.reg_offset[regno];

      /* Pair this save with the next one when the slots are adjacent and
	 the next register is not separately wrapped.  */
      if (regno2 <= limit
	  && !cfun->machine->reg_is_wrapped_separately[regno2]
	  && known_eq (GET_MODE_SIZE (mode), offset_diff))
	{
	  rtx mem2;
	  rtx reg2 = gen_rtx_REG (mode, regno2);

	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
						     offset));
	  insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
						    reg2));

	  /* The first part of a frame-related parallel insn is
	     always assumed to be relevant to the frame
	     calculations; subsequent parts, are only
	     frame-related if explicitly marked.  */
	  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
	  regno = regno2;
	}
      else
	insn = emit_move_insn (mem, reg);

      RTX_FRAME_RELATED_P (insn) = 1;
    }
}
/* Emit code to restore the callee registers of mode MODE from register
   number START up to and including LIMIT.  Restore from the stack offset
   START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
   Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  Mirrors
   aarch64_save_callee_saves, merging adjacent restores into LDP pairs.  */

static void
aarch64_restore_callee_saves (machine_mode mode,
			      poly_int64 start_offset, unsigned start,
			      unsigned limit, bool skip_wb, rtx *cfi_ops)
{
  rtx base_rtx = stack_pointer_rtx;
  unsigned regno;
  unsigned regno2;
  poly_int64 offset;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      /* Separately shrink-wrapped registers are restored elsewhere.  */
      if (cfun->machine->reg_is_wrapped_separately[regno])
	continue;

      rtx reg, mem;
      HOST_WIDE_INT offset_diff;

      /* Write-back candidates are restored by the final pop.  */
      if (skip_wb
	  && (regno == cfun->machine->frame.wb_candidate1
	      || regno == cfun->machine->frame.wb_candidate2))
	continue;

      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));

      regno2 = aarch64_next_callee_save (regno + 1, limit);
      offset_diff = cfun->machine->frame.reg_offset[regno2]
		    - cfun->machine->frame.reg_offset[regno];

      /* Pair with the next restore when the slots are adjacent.  */
      if (regno2 <= limit
	  && !cfun->machine->reg_is_wrapped_separately[regno2]
	  && known_eq (GET_MODE_SIZE (mode), offset_diff))
	{
	  rtx reg2 = gen_rtx_REG (mode, regno2);
	  rtx mem2;

	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
	  emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

	  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
	  regno = regno2;
	}
      else
	emit_move_insn (reg, mem);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
    }
}
/* Return true if OFFSET is a signed 4-bit value multiplied by the size
   of MODE.  */

static bool
offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -8, 7));
}
/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
   of MODE.  */

static bool
offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, 0, 63));
}
/* Return true if OFFSET is a signed 7-bit value multiplied by the size
   of MODE (the LDP/STP immediate range).  */

bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -64, 63));
}
/* Return true if OFFSET is a signed 9-bit value (the unscaled LDUR/STUR
   immediate range).  MODE is unused but kept for interface symmetry with
   the other offset predicates.  */

bool
aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
				       poly_int64 offset)
{
  HOST_WIDE_INT const_offset;
  return (offset.is_constant (&const_offset)
	  && IN_RANGE (const_offset, -256, 255));
}
/* Return true if OFFSET is a signed 9-bit value multiplied by the size
   of MODE.  */

static bool
offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -256, 255));
}
/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
   of MODE (the unsigned-offset LDR/STR immediate range).  */

static bool
offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, 0, 4095));
}
/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  Return the set of
   saved registers that can be shrink-wrapped separately: those whose stack
   slot is reachable with a single unsigned-offset load/store, excluding the
   frame pointer, the write-back candidates, LR and SP.  */

static sbitmap
aarch64_get_separate_components (void)
{
  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* The registers we need saved to the frame.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (aarch64_register_saved_on_entry (regno))
      {
	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
	/* Without a frame pointer the slots are addressed from SP, so
	   rebase the offset from the hard FP position.  */
	if (!frame_pointer_needed)
	  offset += cfun->machine->frame.frame_size
		    - cfun->machine->frame.hard_fp_offset;
	/* Check that we can access the stack slot of the register with one
	   direct load with no adjustments needed.  */
	if (offset_12bit_unsigned_scaled_p (DImode, offset))
	  bitmap_set_bit (components, regno);
      }

  /* Don't mess with the hard frame pointer.  */
  if (frame_pointer_needed)
    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);

  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  /* If registers have been chosen to be stored/restored with
     writeback don't interfere with them to avoid having to output explicit
     stack adjustment instructions.  */
  if (reg2 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg2);
  if (reg1 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg1);

  bitmap_clear_bit (components, LR_REGNUM);
  bitmap_clear_bit (components, SP_REGNUM);

  return components;
}
/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  Return the set of
   callee-saved registers that basic block BB needs: those live-in,
   generated or killed in BB, plus the partner of each such register when
   the pair shares a 16-byte-aligned slot (to keep LDP/STP pairing).  */

static sbitmap
aarch64_components_for_bb (basic_block bb)
{
  bitmap in = DF_LIVE_IN (bb);
  bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
  bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
  bool simd_function = aarch64_simd_decl_p (cfun->decl);

  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if ((!call_used_regs[regno]
	 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
	&& (bitmap_bit_p (in, regno)
	    || bitmap_bit_p (gen, regno)
	    || bitmap_bit_p (kill, regno)))
      {
	unsigned regno2, offset, offset2;
	bitmap_set_bit (components, regno);

	/* If there is a callee-save at an adjacent offset, add it too
	   to increase the use of LDP/STP.  */
	offset = cfun->machine->frame.reg_offset[regno];
	/* Lower half of a 16-byte slot pairs with the next register,
	   upper half with the previous one.  */
	regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;

	if (regno2 <= LAST_SAVED_REGNUM)
	  {
	    offset2 = cfun->machine->frame.reg_offset[regno2];
	    /* Same 16-byte slot (offsets differ only in bit 3).  */
	    if ((offset & ~8) == (offset2 & ~8))
	      bitmap_set_bit (components, regno2);
	  }
      }

  return components;
}
/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
   Nothing to do for aarch64.  */

static void
aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
{
}
/* Return the next set bit in BMP from START onwards.  Return the total number
   of bits in BMP if no set bit is found at or after START.  */

static unsigned int
aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
{
  unsigned int nbits = SBITMAP_SIZE (bmp);
  /* START == NBITS is the "past the end" sentinel used by callers.  */
  if (start == nbits)
    return start;

  gcc_assert (start < nbits);
  for (unsigned int i = start; i < nbits; i++)
    if (bitmap_bit_p (bmp, i))
      return i;

  return nbits;
}
/* Do the work for aarch64_emit_prologue_components and
   aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
   to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
   for these components or the epilogue sequence.  That is, it determines
   whether we should emit stores or loads and what kind of CFA notes to attach
   to the insns.  Otherwise the logic for the two sequences is very
   similar.  */

static void
aarch64_process_components (sbitmap components, bool prologue_p)
{
  /* Address the slots from the hard FP when it exists, otherwise SP.  */
  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
			     ? HARD_FRAME_POINTER_REGNUM
			     : STACK_POINTER_REGNUM);

  unsigned last_regno = SBITMAP_SIZE (components);
  unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
  rtx_insn *insn = NULL;

  while (regno != last_regno)
    {
      /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
	 so DFmode for the vector registers is enough.  For simd functions
	 we want to save the low 128 bits.  */
      machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);

      rtx reg = gen_rtx_REG (mode, regno);
      poly_int64 offset = cfun->machine->frame.reg_offset[regno];
      /* Rebase from the hard-FP position when addressing via SP.  */
      if (!frame_pointer_needed)
	offset += cfun->machine->frame.frame_size
		  - cfun->machine->frame.hard_fp_offset;
      rtx addr = plus_constant (Pmode, ptr_reg, offset);
      rtx mem = gen_frame_mem (mode, addr);

      rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
      unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
      /* No more registers to handle after REGNO.
	 Emit a single save/restore and exit.  */
      if (regno2 == last_regno)
	{
	  insn = emit_insn (set);
	  RTX_FRAME_RELATED_P (insn) = 1;
	  if (prologue_p)
	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	  else
	    add_reg_note (insn, REG_CFA_RESTORE, reg);
	  break;
	}

      poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
      /* The next register is not of the same class or its offset is not
	 mergeable with the current one into a pair.  */
      if (!satisfies_constraint_Ump (mem)
	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
	  || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
		       GET_MODE_SIZE (mode)))
	{
	  insn = emit_insn (set);
	  RTX_FRAME_RELATED_P (insn) = 1;
	  if (prologue_p)
	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	  else
	    add_reg_note (insn, REG_CFA_RESTORE, reg);

	  regno = regno2;
	  continue;
	}

      /* REGNO2 can be saved/restored in a pair with REGNO.  */
      rtx reg2 = gen_rtx_REG (mode, regno2);
      if (!frame_pointer_needed)
	offset2 += cfun->machine->frame.frame_size
		   - cfun->machine->frame.hard_fp_offset;
      rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
      rtx mem2 = gen_frame_mem (mode, addr2);
      rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
			    : gen_rtx_SET (reg2, mem2);

      if (prologue_p)
	insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
      else
	insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

      RTX_FRAME_RELATED_P (insn) = 1;
      if (prologue_p)
	{
	  add_reg_note (insn, REG_CFA_OFFSET, set);
	  add_reg_note (insn, REG_CFA_OFFSET, set2);
	}
      else
	{
	  add_reg_note (insn, REG_CFA_RESTORE, reg);
	  add_reg_note (insn, REG_CFA_RESTORE, reg2);
	}

      regno = aarch64_get_next_set_bit (components, regno2 + 1);
    }
}
/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  Emit the stores
   for the separately shrink-wrapped registers in COMPONENTS.  */

static void
aarch64_emit_prologue_components (sbitmap components)
{
  aarch64_process_components (components, true);
}
/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  Emit the loads
   for the separately shrink-wrapped registers in COMPONENTS.  */

static void
aarch64_emit_epilogue_components (sbitmap components)
{
  aarch64_process_components (components, false);
}
/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  Record which
   registers the shrink-wrapping pass takes responsibility for, so the
   normal prologue/epilogue code skips them.  */

static void
aarch64_set_handled_components (sbitmap components)
{
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (bitmap_bit_p (components, regno))
      cfun->machine->reg_is_wrapped_separately[regno] = true;
}
/* On AArch64 we have an ABI defined safe buffer.  This constant is used in
   determining the probe offset for alloca.  */

static HOST_WIDE_INT
aarch64_stack_clash_protection_alloca_probe_range (void)
{
  return STACK_CLASH_CALLER_GUARD;
}
/* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
   registers.  If POLY_SIZE is not large enough to require a probe this
   function will only adjust the stack.  When allocating the stack space
   FRAME_RELATED_P is then used to indicate if the allocation is frame
   related.  FINAL_ADJUSTMENT_P indicates whether we are allocating the
   outgoing arguments.  If we are then we ensure that any allocation larger
   than the ABI defined buffer needs a probe so that the invariant of having
   a 1KB buffer is maintained.

   We emit barriers after each stack adjustment to prevent optimizations from
   breaking the invariant that we never drop the stack more than a page.  This
   invariant is needed to make it easier to correctly handle asynchronous
   events, e.g. if we were to allow the stack to be dropped by more than a page
   and then have multiple probes up and we take a signal somewhere in between
   then the signal handler doesn't know the state of the stack and can make no
   assumptions about which pages have been probed.  */

static void
aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
					poly_int64 poly_size,
					bool frame_related_p,
					bool final_adjustment_p)
{
  HOST_WIDE_INT guard_size
    = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
  HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
  /* When doing the final adjustment for the outgoing argument size we can't
     assume that LR was saved at position 0.  So subtract its offset from the
     ABI safe buffer so that we don't accidentally allow an adjustment that
     would result in an allocation larger than the ABI buffer without
     probing.  */
  HOST_WIDE_INT min_probe_threshold
    = final_adjustment_p
      ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
      : guard_size - guard_used_by_caller;

  poly_int64 frame_size = cfun->machine->frame.frame_size;

  /* We should always have a positive probe threshold.  */
  gcc_assert (min_probe_threshold > 0);

  if (flag_stack_clash_protection && !final_adjustment_p)
    {
      poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
      poly_int64 final_adjust = cfun->machine->frame.final_adjust;

      if (known_eq (frame_size, 0))
	dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
      else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
	       && known_lt (final_adjust, guard_used_by_caller))
	dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
    }

  /* If SIZE is not large enough to require probing, just adjust the stack and
     exit.  */
  if (known_lt (poly_size, min_probe_threshold)
      || !flag_stack_clash_protection)
    {
      aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
      return;
    }

  HOST_WIDE_INT size;
  /* Handle the SVE non-constant case first.  */
  if (!poly_size.is_constant (&size))
    {
      if (dump_file)
	{
	  fprintf (dump_file, "Stack clash SVE prologue: ");
	  print_dec (poly_size, dump_file);
	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
	}

      /* First calculate the amount of bytes we're actually spilling.  */
      aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
			  poly_size, temp1, temp2, false, true);

      rtx_insn *insn = get_last_insn ();

      if (frame_related_p)
	{
	  /* This is done to provide unwinding information for the stack
	     adjustments we're about to do, however to prevent the optimizers
	     from removing the R15 move and leaving the CFA note (which would
	     be very wrong) we tie the old and new stack pointer together.
	     The tie will expand to nothing but the optimizers will not touch
	     the instruction.  */
	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));

	  /* We want the CFA independent of the stack pointer for the
	     duration of the loop.  */
	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
      rtx guard_const = gen_int_mode (guard_size, Pmode);

      insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
						   stack_pointer_rtx, temp1,
						   probe_const, guard_const));

      /* Now reset the CFA register if needed.  */
      if (frame_related_p)
	{
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
				      gen_int_mode (poly_size, Pmode)));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
	     " bytes, probing will be required.\n", size);

  /* Round size to the nearest multiple of guard_size, and calculate the
     residual as the difference between the original size and the rounded
     size.  */
  HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
  HOST_WIDE_INT residual = size - rounded_size;

  /* We can handle a small number of allocations/probes inline.  Otherwise
     punt to a loop.  */
  if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
    {
      for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
	{
	  aarch64_sub_sp (NULL, temp2, guard_size, true);
	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
					   guard_used_by_caller));
	  emit_insn (gen_blockage ());
	}
      dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
    }
  else
    {
      /* Compute the ending address.  */
      aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
			  temp1, NULL, false, true);
      rtx_insn *insn = get_last_insn ();

      /* For the initial allocation, we don't have a frame pointer
	 set up, so we always need CFI notes.  If we're doing the
	 final allocation, then we may have a frame pointer, in which
	 case it is the CFA, otherwise we need CFI notes.

	 We can determine which allocation we are doing by looking at
	 the value of FRAME_RELATED_P since the final allocations are not
	 frame related.  */
      if (frame_related_p)
	{
	  /* We want the CFA independent of the stack pointer for the
	     duration of the loop.  */
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			plus_constant (Pmode, temp1, rounded_size));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      /* This allocates and probes the stack.  Note that this re-uses some of
	 the existing Ada stack protection code.  However we are guaranteed not
	 to enter the non loop or residual branches of that code.

	 The non-loop part won't be entered because if our allocation amount
	 doesn't require a loop, the case above would handle it.

	 The residual amount won't be entered because TEMP1 is a multiple of
	 the allocation size.  The residual will always be 0.  As such, the
	 only part we are actually using from that code is the loop setup.
	 The actual probing is done in aarch64_output_probe_stack_range.  */
      insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
					       stack_pointer_rtx, temp1));

      /* Now reset the CFA register if needed.  */
      if (frame_related_p)
	{
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			plus_constant (Pmode, stack_pointer_rtx,
				       rounded_size));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      emit_insn (gen_blockage ());
      dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
    }

  /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
     be probed.  This maintains the requirement that each page is probed at
     least once.  For initial probing we probe only if the allocation is
     more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
     if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
     GUARD_SIZE.  This works that for any allocation that is large enough to
     trigger a probe here, we'll have at least one, and if they're not large
     enough for this code to emit anything for them, the page would have been
     probed by the saving of FP/LR either by this function or any callees.  If
     we don't have any callees then we won't have more stack adjustments and
     so are still safe.  */
  if (residual)
    {
      HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
      /* If we're doing final adjustments, and we've done any full page
	 allocations then any residual needs to be probed.  */
      if (final_adjustment_p && rounded_size != 0)
	min_probe_threshold = 0;
      /* If doing a small final adjustment, we always probe at offset 0.
	 This is done to avoid issues when LR is not at position 0 or when
	 the final adjustment is smaller than the probing offset.  */
      else if (final_adjustment_p && rounded_size == 0)
	residual_probe_offset = 0;

      aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
      if (residual >= min_probe_threshold)
	{
	  if (dump_file)
	    fprintf (dump_file,
		     "Stack clash AArch64 prologue residuals: "
		     HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
		     "\n", residual);
	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
					   residual_probe_offset));
	  emit_insn (gen_blockage ());
	}
    }
}
/* Return 1 if the register is used by the epilogue.  We need to say the
   return register is used, but only after epilogue generation is complete.
   Note that in the case of sibcalls, the values "used by the epilogue" are
   considered live at the start of the called function.

   For SIMD functions we need to return 1 for FP registers that are saved and
   restored by a function but are not zero in call_used_regs.  If we do not do
   this optimizations may remove the restore of the register.  */

int
aarch64_epilogue_uses (int regno)
{
  if (epilogue_completed)
    {
      if (regno == LR_REGNUM)
	return 1;
      if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
	return 1;
    }
  return 0;
}
/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
   is saved at BASE + OFFSET.  Used when the store's own RTL cannot be
   interpreted directly by the CFI machinery.  */

static void
aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
			    rtx base, poly_int64 offset)
{
  rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
  add_reg_note (insn, REG_CFA_EXPRESSION,
		gen_rtx_SET (mem, regno_reg_rtx[reg]));
}
5485 /* AArch64 stack frames generated by this compiler look like:
5487 +-------------------------------+
5489 | incoming stack arguments |
5491 +-------------------------------+
5492 | | <-- incoming stack pointer (aligned)
5493 | callee-allocated save area |
5494 | for register varargs |
5496 +-------------------------------+
5497 | local variables | <-- frame_pointer_rtx
5499 +-------------------------------+
5501 +-------------------------------+ |
5502 | callee-saved registers | | frame.saved_regs_size
5503 +-------------------------------+ |
5505 +-------------------------------+ |
5506 | FP' | / <- hard_frame_pointer_rtx (aligned)
5507 +-------------------------------+
5508 | dynamic allocation |
5509 +-------------------------------+
5511 +-------------------------------+
5512 | outgoing stack arguments | <-- arg_pointer
5514 +-------------------------------+
5515 | | <-- stack_pointer_rtx (aligned)
5517 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5518 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5521 By default for stack-clash we assume the guard is at least 64KB, but this
5522 value is configurable to either 4KB or 64KB. We also force the guard size to
5523 be the same as the probing interval and both values are kept in sync.
5525 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5526 on the guard size) of stack space without probing.
5528 When probing is needed, we emit a probe at the start of the prologue
5529 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5531 We have to track how much space has been allocated and the only stores
5532 to the stack we track as implicit probes are the FP/LR stores.
5534 For outgoing arguments we probe if the size is larger than 1KB, such that
5535 the ABI specified buffer is maintained for the next callee. */
5537 /* Generate the prologue instructions for entry into a function.
5538 Establish the stack frame by decreasing the stack pointer with a
5539 properly calculated size and, if necessary, create a frame record
5540 filled with the values of LR and previous frame pointer. The
5541 current FP is also set up if it is in use. */
5544 aarch64_expand_prologue (void)
/* Frame layout values previously computed for this function and cached
   in cfun->machine->frame.  */
5546 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
5547 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
5548 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
5549 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
5550 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
/* Candidate registers for the store-pair-with-writeback push below.  */
5551 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
5552 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
5553 bool emit_frame_chain
= cfun
->machine
->frame
.emit_frame_chain
;
5556 /* Sign return address for functions. */
5557 if (aarch64_return_address_signing_enabled ())
5559 insn
= emit_insn (gen_pacisp ())
;
/* Record the RA-signing toggle in the unwind info.  */
5560 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
5561 RTX_FRAME_RELATED_P (insn
) = 1;
5564 if (flag_stack_usage_info
)
5565 current_function_static_stack_size
= constant_lower_bound (frame_size
);
5567 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
5569 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
5571 if (maybe_gt (frame_size
, PROBE_INTERVAL
)
5572 && maybe_gt (frame_size
, get_stack_check_protect ()))
5573 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5575 - get_stack_check_protect ()));
5577 else if (maybe_gt (frame_size
, 0))
5578 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size
);
/* Scratch registers used by the offset/probing helpers below.  */
5581 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
5582 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
5584 /* In theory we should never have both an initial adjustment
5585 and a callee save adjustment. Verify that is the case since the
5586 code below does not handle it for -fstack-clash-protection. */
5587 gcc_assert (known_eq (initial_adjust
, 0) || callee_adjust
== 0);
5589 /* Will only probe if the initial adjustment is larger than the guard
5590 less the amount of the guard reserved for use by the caller's
5592 aarch64_allocate_and_probe_stack_space (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
5595 if (callee_adjust
!= 0)
5596 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
5598 if (emit_frame_chain
)
5600 poly_int64 reg_offset
= callee_adjust
;
5601 if (callee_adjust
== 0)
5605 reg_offset
= callee_offset
;
5606 aarch64_save_callee_saves (DImode
, reg_offset
, reg1
, reg2
, false);
/* Set up the frame pointer: FP = SP + callee_offset.  */
5608 aarch64_add_offset (Pmode
, hard_frame_pointer_rtx
,
5609 stack_pointer_rtx
, callee_offset
,
5610 tmp1_rtx
, tmp0_rtx
, frame_pointer_needed
);
5611 if (frame_pointer_needed
&& !frame_size
.is_constant ())
5613 /* Variable-sized frames need to describe the save slot
5614 address using DW_CFA_expression rather than DW_CFA_offset.
5615 This means that, without taking further action, the
5616 locations of the registers that we've already saved would
5617 remain based on the stack pointer even after we redefine
5618 the CFA based on the frame pointer. We therefore need new
5619 DW_CFA_expressions to re-express the save slots with addresses
5620 based on the frame pointer. */
5621 rtx_insn
*insn
= get_last_insn ();
5622 gcc_assert (RTX_FRAME_RELATED_P (insn
));
5624 /* Add an explicit CFA definition if this was previously
5626 if (!find_reg_note (insn
, REG_CFA_ADJUST_CFA
, NULL_RTX
))
5628 rtx src
= plus_constant (Pmode
, stack_pointer_rtx
,
5630 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
5631 gen_rtx_SET (hard_frame_pointer_rtx
, src
));
5634 /* Change the save slot expressions for the registers that
5635 we've already saved. */
5636 reg_offset
-= callee_offset
;
5637 aarch64_add_cfa_expression (insn
, reg2
, hard_frame_pointer_rtx
,
5638 reg_offset
+ UNITS_PER_WORD
);
5639 aarch64_add_cfa_expression (insn
, reg1
, hard_frame_pointer_rtx
,
/* Keep the scheduler from moving stack accesses across the FP setup.  */
5642 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
5645 aarch64_save_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
5646 callee_adjust
!= 0 || emit_frame_chain
);
/* SIMD (aarch64_vector_pcs) functions save full 128-bit vector registers
   (TFmode); other functions only need the low 64 bits (DFmode).  */
5647 if (aarch64_simd_decl_p (cfun
->decl
))
5648 aarch64_save_callee_saves (TFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
5649 callee_adjust
!= 0 || emit_frame_chain
);
5651 aarch64_save_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
5652 callee_adjust
!= 0 || emit_frame_chain
);
5654 /* We may need to probe the final adjustment if it is larger than the guard
5655 that is assumed by the callee. */
5656 aarch64_allocate_and_probe_stack_space (tmp1_rtx
, tmp0_rtx
, final_adjust
,
5657 !frame_pointer_needed
, true);
5660 /* Return TRUE if we can use a simple_return insn.
5662 This function checks whether the callee saved stack is empty, which
5663 means no restore actions are needed. The pro_and_epilogue will use
5664 this to check whether shrink-wrapping opt is feasible. */
5667 aarch64_use_return_insn_p (void)
/* The frame layout is only final after reload.  */
5669 if (!reload_completed
)
/* A zero-sized frame has nothing to tear down, so a bare RET suffices.  */
5675 return known_eq (cfun
->machine
->frame
.frame_size
, 0);
5678 /* Return false for non-leaf SIMD functions in order to avoid
5679 shrink-wrapping them. Doing this will lose the necessary
5680 save/restore of FP registers. */
5683 aarch64_use_simple_return_insn_p (void)
/* Non-leaf SIMD (aarch64_vector_pcs) functions must run the full
   epilogue so the extra FP register restores are not skipped.  */
5685 if (aarch64_simd_decl_p (cfun
->decl
) && !crtl
->is_leaf
)
5691 /* Generate the epilogue instructions for returning from a function.
5692 This is almost exactly the reverse of the prolog sequence, except
5693 that we need to insert barriers to avoid scheduling loads that read
5694 from a deallocated stack, and we optimize the unwind records by
5695 emitting them all together if possible. */
5697 aarch64_expand_epilogue (bool for_sibcall
)
/* Frame layout values previously computed for this function and cached
   in cfun->machine->frame.  */
5699 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
5700 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
5701 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
5702 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
5703 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
5704 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
5707 /* A stack clash protection prologue may not have left EP0_REGNUM or
5708 EP1_REGNUM in a usable state. The same is true for allocations
5709 with an SVE component, since we then need both temporary registers
5710 for each allocation. For stack clash we are in a usable state if
5711 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5712 HOST_WIDE_INT guard_size
5713 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
5714 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
5716 /* We can re-use the registers when the allocation amount is smaller than
5717 guard_size - guard_used_by_caller because we won't be doing any probes
5718 then. In such situations the register should remain live with the correct
5720 bool can_inherit_p
= (initial_adjust
.is_constant ()
5721 && final_adjust
.is_constant ())
5722 && (!flag_stack_clash_protection
5723 || known_lt (initial_adjust
,
5724 guard_size
- guard_used_by_caller
));
5726 /* We need to add memory barrier to prevent read from deallocated stack. */
5728 = maybe_ne (get_frame_size ()
5729 + cfun
->machine
->frame
.saved_varargs_size
, 0);
5731 /* Emit a barrier to prevent loads from a deallocated stack. */
5732 if (maybe_gt (final_adjust
, crtl
->outgoing_args_size
)
5733 || cfun
->calls_alloca
5734 || crtl
->calls_eh_return
)
5736 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
/* The barrier has been emitted up front; don't emit it again later.  */
5737 need_barrier_p
= false;
5740 /* Restore the stack pointer from the frame pointer if it may not
5741 be the same as the stack pointer. */
5742 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
5743 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
5744 if (frame_pointer_needed
5745 && (maybe_ne (final_adjust
, 0) || cfun
->calls_alloca
))
5746 /* If writeback is used when restoring callee-saves, the CFA
5747 is restored on the instruction doing the writeback. */
5748 aarch64_add_offset (Pmode
, stack_pointer_rtx
,
5749 hard_frame_pointer_rtx
, -callee_offset
,
5750 tmp1_rtx
, tmp0_rtx
, callee_adjust
== 0);
5752 /* The case where we need to re-use the register here is very rare, so
5753 avoid the complicated condition and just always emit a move if the
5754 immediate doesn't fit. */
5755 aarch64_add_sp (tmp1_rtx
, tmp0_rtx
, final_adjust
, true);
/* Restore the callee saves; CFI restore notes are accumulated in
   cfi_ops so they can be attached to a single insn further down.  */
5757 aarch64_restore_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
5758 callee_adjust
!= 0, &cfi_ops
);
/* SIMD (aarch64_vector_pcs) functions restore full 128-bit vector
   registers (TFmode); others only the low 64 bits (DFmode).  */
5759 if (aarch64_simd_decl_p (cfun
->decl
))
5760 aarch64_restore_callee_saves (TFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
5761 callee_adjust
!= 0, &cfi_ops
);
5763 aarch64_restore_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
5764 callee_adjust
!= 0, &cfi_ops
);
5767 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
5769 if (callee_adjust
!= 0)
5770 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
5772 if (callee_adjust
!= 0 || maybe_gt (initial_adjust
, 65536))
5774 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5775 insn
= get_last_insn ();
5776 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
5777 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
5778 RTX_FRAME_RELATED_P (insn
) = 1;
5782 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
5783 add restriction on emit_move optimization to leaf functions. */
5784 aarch64_add_sp (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
5785 (!can_inherit_p
|| !crtl
->is_leaf
5786 || df_regs_ever_live_p (EP0_REGNUM
)));
5790 /* Emit delayed restores and reset the CFA to be SP. */
5791 insn
= get_last_insn ();
5792 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
5793 REG_NOTES (insn
) = cfi_ops
;
5794 RTX_FRAME_RELATED_P (insn
) = 1;
5797 /* We prefer to emit the combined return/authenticate instruction RETAA,
5798 however there are three cases in which we must instead emit an explicit
5799 authentication instruction.
5801 1) Sibcalls don't return in a normal way, so if we're about to call one
5802 we must authenticate.
5804 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5805 generating code for !TARGET_ARMV8_3 we can't use it and must
5806 explicitly authenticate.
5808 3) On an eh_return path we make extra stack adjustments to update the
5809 canonical frame address to be the exception handler's CFA. We want
5810 to authenticate using the CFA of the function which calls eh_return.
5812 if (aarch64_return_address_signing_enabled ()
5813 && (for_sibcall
|| !TARGET_ARMV8_3
|| crtl
->calls_eh_return
))
5815 insn
= emit_insn (gen_autisp ());
5816 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
5817 RTX_FRAME_RELATED_P (insn
) = 1;
5820 /* Stack adjustment for exception handler. */
5821 if (crtl
->calls_eh_return
)
5823 /* We need to unwind the stack by the offset computed by
5824 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5825 to be SP; letting the CFA move during this adjustment
5826 is just as correct as retaining the CFA from the body
5827 of the function. Therefore, do nothing special. */
5828 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
/* Keep LR live up to the return so nothing clobbers it.  */
5831 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
5833 emit_jump_insn (ret_rtx
);
5836 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5837 normally or return to a previous frame after unwinding.
5839 An EH return uses a single shared return sequence. The epilogue is
5840 exactly like a normal epilogue except that it has an extra input
5841 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5842 that must be applied after the frame has been destroyed. An extra label
5843 is inserted before the epilogue which initializes this register to zero,
5844 and this is the entry point for a normal return.
5846 An actual EH return updates the return address, initializes the stack
5847 adjustment and jumps directly into the epilogue (bypassing the zeroing
5848 of the adjustment). Since the return address is typically saved on the
5849 stack when a function makes a call, the saved LR must be updated outside
5852 This poses problems as the store is generated well before the epilogue,
5853 so the offset of LR is not known yet. Also optimizations will remove the
5854 store as it appears dead, even after the epilogue is generated (as the
5855 base or offset for loading LR is different in many cases).
5857 To avoid these problems this implementation forces the frame pointer
5858 in eh_return functions so that the location of LR is fixed and known early.
5859 It also marks the store volatile, so no optimization is permitted to
5860 remove the store. */
5862 aarch64_eh_return_handler_rtx (void)
/* LR is saved at [FP + UNITS_PER_WORD], i.e. the second slot of the
   frame record, so the handler slot is addressed relative to FP.  */
5864 rtx tmp
= gen_frame_mem (Pmode
,
5865 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
5867 /* Mark the store volatile, so no optimization is permitted to remove it. */
5868 MEM_VOLATILE_P (tmp
) = true;
5872 /* Output code to add DELTA to the first argument, and then jump
5873 to FUNCTION. Used for C++ multiple inheritance. */
5875 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
5876 HOST_WIDE_INT delta
,
5877 HOST_WIDE_INT vcall_offset
,
5880 /* The this pointer is always in x0. Note that this differs from
5881 Arm where the this pointer maybe bumped to r1 if r0 is required
5882 to return a pointer to an aggregate. On AArch64 a result value
5883 pointer will be in x8. */
5884 int this_regno
= R0_REGNUM
;
5885 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
/* Pretend reload has already run so post-reload emit paths are used;
   this is undone at the end of the function.  */
5888 reload_completed
= 1;
5889 emit_note (NOTE_INSN_PROLOGUE_END
);
5891 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
5892 temp0
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
5893 temp1
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
/* Simple case: no virtual call offset, just bump "this" by DELTA.  */
5895 if (vcall_offset
== 0)
5896 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
, temp1
, temp0
, false);
5899 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
/* Small deltas can be folded into the vtable-pointer load below via a
   pre-modify addressing mode; larger ones need a separate add.  */
5904 if (delta
>= -256 && delta
< 256)
5905 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
5906 plus_constant (Pmode
, this_rtx
, delta
));
5908 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
,
5909 temp1
, temp0
, false);
/* Load the vtable pointer; when Pmode differs from ptr_mode (ILP32)
   the 32-bit pointer is zero-extended to the 64-bit Pmode.  */
5912 if (Pmode
== ptr_mode
)
5913 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
5915 aarch64_emit_move (temp0
,
5916 gen_rtx_ZERO_EXTEND (Pmode
,
5917 gen_rtx_MEM (ptr_mode
, addr
)));
/* Address the vcall slot directly if the offset is encodable,
   otherwise materialize it in temp1 first.  */
5919 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
5920 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
5923 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
5925 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
5928 if (Pmode
== ptr_mode
)
5929 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
5931 aarch64_emit_move (temp1
,
5932 gen_rtx_SIGN_EXTEND (Pmode
,
5933 gen_rtx_MEM (ptr_mode
, addr
)));
/* Apply the loaded vcall adjustment to "this".  */
5935 emit_insn (gen_add2_insn (this_rtx
, temp1
));
5938 /* Generate a tail call to the target function. */
5939 if (!TREE_USED (function
))
5941 assemble_external (function
);
5942 TREE_USED (function
) = 1;
5944 funexp
= XEXP (DECL_RTL (function
), 0);
5945 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
5946 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
5947 SIBLING_CALL_P (insn
) = 1;
/* Run the final pass by hand to emit the thunk's assembly.  */
5949 insn
= get_insns ();
5950 shorten_branches (insn
);
5951 final_start_function (insn
, file
, 1);
5952 final (insn
, file
, 1);
5953 final_end_function ();
5955 /* Stop pretending to be a post-reload pass. */
5956 reload_completed
= 0;
/* Return true if X contains a reference to a thread-local symbol.  */
5960 aarch64_tls_referenced_p (rtx x
)
/* Without TLS support no symbol can have a TLS model.  */
5962 if (!TARGET_HAVE_TLS
)
5964 subrtx_iterator::array_type array
;
5965 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
5967 const_rtx x
= *iter
;
5968 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
5970 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5971 TLS offsets, not real symbol references. */
5972 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
5973 iter
.skip_subrtxes ();
5979 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5980 a left shift of 0 or 12 bits. */
5982 aarch64_uimm12_shift (HOST_WIDE_INT val
)
5984 return ((val
& (((HOST_WIDE_INT
) 0xfff) << 0)) == val
5985 || (val
& (((HOST_WIDE_INT
) 0xfff) << 12)) == val
5989 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
5990 that can be created with a left shift of 0 or 12. */
5991 static HOST_WIDE_INT
5992 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
)
5994 /* Check to see if the value fits in 24 bits, as that is the maximum we can
5995 handle correctly. */
5996 gcc_assert ((val
& 0xffffff) == val
);
/* If the low 12 bits alone represent VAL, it already fits unshifted.  */
5998 if (((val
& 0xfff) << 0) == val
)
/* Otherwise keep only the upper 12-bit field (the shift-by-12 form),
   discarding the low 12 bits.  */
6001 return val
& (0xfff << 12);
6004 /* Return true if val is an immediate that can be loaded into a
6005 register by a MOVZ instruction. */
6007 aarch64_movw_imm (HOST_WIDE_INT val
, scalar_int_mode mode
)
/* For modes wider than 32 bits, also accept a 16-bit chunk positioned
   at bit 32 or bit 48.  */
6009 if (GET_MODE_SIZE (mode
) > 4)
6011 if ((val
& (((HOST_WIDE_INT
) 0xffff) << 32)) == val
6012 || (val
& (((HOST_WIDE_INT
) 0xffff) << 48)) == val
)
6017 /* Ignore sign extension. */
6018 val
&= (HOST_WIDE_INT
) 0xffffffff;
/* A 16-bit chunk at bit 0 or bit 16 is encodable for all modes.  */
6020 return ((val
& (((HOST_WIDE_INT
) 0xffff) << 0)) == val
6021 || (val
& (((HOST_WIDE_INT
) 0xffff) << 16)) == val
);
6024 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6025 64-bit (DImode) integer. */
6027 static unsigned HOST_WIDE_INT
6028 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val
, machine_mode mode
)
/* Element width in bits; the value is replicated in chunks of this size.  */
6030 unsigned int size
= GET_MODE_UNIT_PRECISION (mode
);
/* Mask VAL down to the element width before replicating.  */
6033 val
&= (HOST_WIDE_INT_1U
<< size
) - 1;
6040 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6042 static const unsigned HOST_WIDE_INT bitmask_imm_mul
[] =
/* width 32: two copies.  */
6044 0x0000000100000001ull
,
/* width 16: four copies.  */
6045 0x0001000100010001ull
,
/* width 8: eight copies.  */
6046 0x0101010101010101ull
,
/* width 4: sixteen copies.  */
6047 0x1111111111111111ull
,
/* width 2: thirty-two copies.  */
6048 0x5555555555555555ull
,
6052 /* Return true if val is a valid bitmask immediate. */
6055 aarch64_bitmask_imm (HOST_WIDE_INT val_in
, machine_mode mode
)
6057 unsigned HOST_WIDE_INT val
, tmp
, mask
, first_one
, next_one
;
6060 /* Check for a single sequence of one bits and return quickly if so.
6061 The special cases of all ones and all zeroes returns false. */
6062 val
= aarch64_replicate_bitmask_imm (val_in
, mode
);
/* Adding the lowest set bit to VAL clears a single contiguous run of
   ones; if the result is a power of two (or zero) there was only one
   run.  */
6063 tmp
= val
+ (val
& -val
);
6065 if (tmp
== (tmp
& -tmp
))
6066 return (val
+ 1) > 1;
6068 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6070 val
= (val
<< 32) | (val
& 0xffffffff);
6072 /* Invert if the immediate doesn't start with a zero bit - this means we
6073 only need to search for sequences of one bits. */
6077 /* Find the first set bit and set tmp to val with the first sequence of one
6078 bits removed. Return success if there is a single sequence of ones. */
6079 first_one
= val
& -val
;
6080 tmp
= val
& (val
+ first_one
);
6085 /* Find the next set bit and compute the difference in bit position. */
6086 next_one
= tmp
& -tmp
;
/* 'bits' is the distance between the starts of two consecutive runs,
   i.e. the candidate repeat period of the pattern.  */
6087 bits
= clz_hwi (first_one
) - clz_hwi (next_one
);
6090 /* Check the bit position difference is a power of 2, and that the first
6091 sequence of one bits fits within 'bits' bits. */
6092 if ((mask
>> bits
) != 0 || bits
!= (bits
& -bits
))
6095 /* Check the sequence of one bits is repeated 64/bits times. */
6096 return val
== mask
* bitmask_imm_mul
[__builtin_clz (bits
) - 26];
6099 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6100 Assumed precondition: VAL_IN Is not zero. */
6102 unsigned HOST_WIDE_INT
6103 aarch64_and_split_imm1 (HOST_WIDE_INT val_in
)
6105 int lowest_bit_set
= ctz_hwi (val_in
);
6106 int highest_bit_set
= floor_log2 (val_in
);
6107 gcc_assert (val_in
!= 0);
6109 return ((HOST_WIDE_INT_UC (2) << highest_bit_set
) -
6110 (HOST_WIDE_INT_1U
<< lowest_bit_set
));
6113 /* Create constant where bits outside of lowest bit set to highest bit set
6116 unsigned HOST_WIDE_INT
6117 aarch64_and_split_imm2 (HOST_WIDE_INT val_in
)
6119 return val_in
| ~aarch64_and_split_imm1 (val_in
);
6122 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6125 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in
, machine_mode mode
)
6127 scalar_int_mode int_mode
;
/* Only scalar integer modes are considered.  */
6128 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
6131 if (aarch64_bitmask_imm (val_in
, int_mode
))
6134 if (aarch64_move_imm (val_in
, int_mode
))
/* Fill the bits outside the lowest..highest set-bit range with ones
   and check whether the result is itself a bitmask immediate.  */
6137 unsigned HOST_WIDE_INT imm2
= aarch64_and_split_imm2 (val_in
);
6139 return aarch64_bitmask_imm (imm2
, int_mode
);
6142 /* Return true if val is an immediate that can be loaded into a
6143 register in a single instruction. */
6145 aarch64_move_imm (HOST_WIDE_INT val
, machine_mode mode
)
6147 scalar_int_mode int_mode
;
6148 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
/* MOVZ encodes VAL directly; MOVN encodes the bitwise-NOT of VAL.  */
6151 if (aarch64_movw_imm (val
, int_mode
) || aarch64_movw_imm (~val
, int_mode
))
/* Otherwise fall back to the logical (bitmask) immediate encoding.  */
6153 return aarch64_bitmask_imm (val
, int_mode
);
/* Implement TARGET_CANNOT_FORCE_CONST_MEM: return true if constant X
   must not be placed in the constant pool.  */
6157 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
6161 if (GET_CODE (x
) == HIGH
)
6164 /* There's no way to calculate VL-based values using relocations. */
6165 subrtx_iterator::array_type array
;
6166 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
6167 if (GET_CODE (*iter
) == CONST_POLY_INT
)
/* Split X into a symbolic base and a constant offset to classify it.  */
6170 split_const (x
, &base
, &offset
);
6171 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
6173 if (aarch64_classify_symbol (base
, INTVAL (offset
))
6174 != SYMBOL_FORCE_TO_MEM
)
6177 /* Avoid generating a 64-bit relocation in ILP32; leave
6178 to aarch64_expand_mov_immediate to handle it properly. */
6179 return mode
!= ptr_mode
;
/* TLS references cannot live in the constant pool either.  */
6182 return aarch64_tls_referenced_p (x
);
6185 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6186 The expansion for a table switch is quite expensive due to the number
6187 of instructions, the table lookup and hard to predict indirect jump.
6188 When optimizing for speed, and -O3 enabled, use the per-core tuning if
6189 set, otherwise use tables for > 16 cases as a tradeoff between size and
6190 performance. When optimizing for size, use the default setting. */
6193 aarch64_case_values_threshold (void)
6195 /* Use the specified limit for the number of cases before using jump
6196 tables at higher optimization levels. */
6198 && selected_cpu
->tune
->max_case_values
!= 0)
6199 return selected_cpu
->tune
->max_case_values
;
/* Otherwise: 17 when optimizing for speed, the target-independent
   default when optimizing for size.  */
6201 return optimize_size
? default_case_values_threshold () : 17;
6204 /* Return true if register REGNO is a valid index register.
6205 STRICT_P is true if REG_OK_STRICT is in effect. */
6208 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
6210 if (!HARD_REGISTER_NUM_P (regno
))
/* Map a pseudo to its assigned hard register before classifying.  */
6218 regno
= reg_renumber
[regno
];
/* Only general-purpose registers may serve as an address index.  */
6220 return GP_REGNUM_P (regno
);
6223 /* Return true if register REGNO is a valid base register for mode MODE.
6224 STRICT_P is true if REG_OK_STRICT is in effect. */
6227 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
6229 if (!HARD_REGISTER_NUM_P (regno
))
/* Map a pseudo to its assigned hard register before classifying.  */
6237 regno
= reg_renumber
[regno
];
6240 /* The fake registers will be eliminated to either the stack or
6241 hard frame pointer, both of which are usually valid base registers.
6242 Reload deals with the cases where the eliminated form isn't valid. */
6243 return (GP_REGNUM_P (regno
)
6244 || regno
== SP_REGNUM
6245 || regno
== FRAME_POINTER_REGNUM
6246 || regno
== ARG_POINTER_REGNUM
);
6249 /* Return true if X is a valid base register for mode MODE.
6250 STRICT_P is true if REG_OK_STRICT is in effect. */
6253 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
/* Look through a SUBREG of a general register before testing.  */
6256 && GET_CODE (x
) == SUBREG
6257 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (x
))])
6260 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
6263 /* Return true if address offset is a valid index. If it is, fill in INFO
6264 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6267 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
6268 machine_mode mode
, bool strict_p
)
6270 enum aarch64_address_type type
;
/* Plain register index: (reg:P) or a SUBREG of one.  */
6275 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
6276 && GET_MODE (x
) == Pmode
)
6278 type
= ADDRESS_REG_REG
;
6282 /* (sign_extend:DI (reg:SI)) */
6283 else if ((GET_CODE (x
) == SIGN_EXTEND
6284 || GET_CODE (x
) == ZERO_EXTEND
)
6285 && GET_MODE (x
) == DImode
6286 && GET_MODE (XEXP (x
, 0)) == SImode
)
6288 type
= (GET_CODE (x
) == SIGN_EXTEND
)
6289 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6290 index
= XEXP (x
, 0);
6293 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6294 else if (GET_CODE (x
) == MULT
6295 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
6296 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
6297 && GET_MODE (XEXP (x
, 0)) == DImode
6298 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
6299 && CONST_INT_P (XEXP (x
, 1)))
6301 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
6302 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6303 index
= XEXP (XEXP (x
, 0), 0);
/* Convert the multiplicative scale into a shift amount.  */
6304 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
6306 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6307 else if (GET_CODE (x
) == ASHIFT
6308 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
6309 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
6310 && GET_MODE (XEXP (x
, 0)) == DImode
6311 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
6312 && CONST_INT_P (XEXP (x
, 1)))
6314 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
6315 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6316 index
= XEXP (XEXP (x
, 0), 0);
6317 shift
= INTVAL (XEXP (x
, 1));
6319 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6320 else if ((GET_CODE (x
) == SIGN_EXTRACT
6321 || GET_CODE (x
) == ZERO_EXTRACT
)
6322 && GET_MODE (x
) == DImode
6323 && GET_CODE (XEXP (x
, 0)) == MULT
6324 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
6325 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
6327 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
6328 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6329 index
= XEXP (XEXP (x
, 0), 0);
6330 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
/* The extract must cover exactly the low 32+shift bits from bit 0,
   otherwise it is not an extend-then-scale pattern.  */
6331 if (INTVAL (XEXP (x
, 1)) != 32 + shift
6332 || INTVAL (XEXP (x
, 2)) != 0)
6335 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6336 (const_int 0xffffffff<<shift)) */
6337 else if (GET_CODE (x
) == AND
6338 && GET_MODE (x
) == DImode
6339 && GET_CODE (XEXP (x
, 0)) == MULT
6340 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
6341 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
6342 && CONST_INT_P (XEXP (x
, 1)))
6344 type
= ADDRESS_REG_UXTW
;
6345 index
= XEXP (XEXP (x
, 0), 0);
6346 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
6347 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
6350 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6351 else if ((GET_CODE (x
) == SIGN_EXTRACT
6352 || GET_CODE (x
) == ZERO_EXTRACT
)
6353 && GET_MODE (x
) == DImode
6354 && GET_CODE (XEXP (x
, 0)) == ASHIFT
6355 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
6356 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
6358 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
6359 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6360 index
= XEXP (XEXP (x
, 0), 0);
6361 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
6362 if (INTVAL (XEXP (x
, 1)) != 32 + shift
6363 || INTVAL (XEXP (x
, 2)) != 0)
6366 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6367 (const_int 0xffffffff<<shift)) */
6368 else if (GET_CODE (x
) == AND
6369 && GET_MODE (x
) == DImode
6370 && GET_CODE (XEXP (x
, 0)) == ASHIFT
6371 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
6372 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
6373 && CONST_INT_P (XEXP (x
, 1)))
6375 type
= ADDRESS_REG_UXTW
;
6376 index
= XEXP (XEXP (x
, 0), 0);
6377 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
6378 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
6381 /* (mult:P (reg:P) (const_int scale)) */
6382 else if (GET_CODE (x
) == MULT
6383 && GET_MODE (x
) == Pmode
6384 && GET_MODE (XEXP (x
, 0)) == Pmode
6385 && CONST_INT_P (XEXP (x
, 1)))
6387 type
= ADDRESS_REG_REG
;
6388 index
= XEXP (x
, 0);
6389 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
6391 /* (ashift:P (reg:P) (const_int shift)) */
6392 else if (GET_CODE (x
) == ASHIFT
6393 && GET_MODE (x
) == Pmode
6394 && GET_MODE (XEXP (x
, 0)) == Pmode
6395 && CONST_INT_P (XEXP (x
, 1)))
6397 type
= ADDRESS_REG_REG
;
6398 index
= XEXP (x
, 0);
6399 shift
= INTVAL (XEXP (x
, 1));
/* Look through a SUBREG of a general register around the index.  */
6405 && GET_CODE (index
) == SUBREG
6406 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
6407 index
= SUBREG_REG (index
);
/* SVE data modes only allow an unscaled register index or one scaled
   by exactly the element size.  */
6409 if (aarch64_sve_data_mode_p (mode
))
6411 if (type
!= ADDRESS_REG_REG
6412 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
6418 && !(IN_RANGE (shift
, 1, 3)
6419 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
6424 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
/* Record the accepted index register and its shift for the caller.  */
6427 info
->offset
= index
;
6428 info
->shift
= shift
;
6435 /* Return true if MODE is one of the modes for which we
6436 support LDP/STP operations. */
6439 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
6441 return mode
== SImode
|| mode
== DImode
6442 || mode
== SFmode
|| mode
== DFmode
6443 || (aarch64_vector_mode_supported_p (mode
)
6444 && (known_eq (GET_MODE_SIZE (mode
), 8)
6445 || (known_eq (GET_MODE_SIZE (mode
), 16)
6446 && (aarch64_tune_params
.extra_tuning_flags
6447 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
) == 0)));
6450 /* Return true if REGNO is a virtual pointer register, or an eliminable
6451 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6452 include stack_pointer or hard_frame_pointer. */
6454 virt_or_elim_regno_p (unsigned regno
)
6456 return ((regno
>= FIRST_VIRTUAL_REGISTER
6457 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
6458 || regno
== FRAME_POINTER_REGNUM
6459 || regno
== ARG_POINTER_REGNUM
);
6462 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6463 If it is, fill in INFO appropriately. STRICT_P is true if
6464 REG_OK_STRICT is in effect. */
6467 aarch64_classify_address (struct aarch64_address_info
*info
,
6468 rtx x
, machine_mode mode
, bool strict_p
,
6469 aarch64_addr_query_type type
)
6471 enum rtx_code code
= GET_CODE (x
);
6475 HOST_WIDE_INT const_size
;
6477 /* On BE, we use load/store pair for all large int mode load/stores.
6478 TI/TFmode may also use a load/store pair. */
6479 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
6480 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
6481 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
6482 || type
== ADDR_QUERY_LDP_STP_N
6485 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
6487 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
6488 corresponds to the actual size of the memory being loaded/stored and the
6489 mode of the corresponding addressing mode is half of that. */
6490 if (type
== ADDR_QUERY_LDP_STP_N
6491 && known_eq (GET_MODE_SIZE (mode
), 16))
6494 bool allow_reg_index_p
= (!load_store_pair_p
6495 && (known_lt (GET_MODE_SIZE (mode
), 16)
6496 || vec_flags
== VEC_ADVSIMD
6497 || vec_flags
== VEC_SVE_DATA
));
6499 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6500 [Rn, #offset, MUL VL]. */
6501 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
6502 && (code
!= REG
&& code
!= PLUS
))
6505 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6507 if (advsimd_struct_p
6508 && !BYTES_BIG_ENDIAN
6509 && (code
!= POST_INC
&& code
!= REG
))
6512 gcc_checking_assert (GET_MODE (x
) == VOIDmode
6513 || SCALAR_INT_MODE_P (GET_MODE (x
)));
6519 info
->type
= ADDRESS_REG_IMM
;
6521 info
->offset
= const0_rtx
;
6522 info
->const_offset
= 0;
6523 return aarch64_base_register_rtx_p (x
, strict_p
);
6531 && virt_or_elim_regno_p (REGNO (op0
))
6532 && poly_int_rtx_p (op1
, &offset
))
6534 info
->type
= ADDRESS_REG_IMM
;
6537 info
->const_offset
= offset
;
6542 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
6543 && aarch64_base_register_rtx_p (op0
, strict_p
)
6544 && poly_int_rtx_p (op1
, &offset
))
6546 info
->type
= ADDRESS_REG_IMM
;
6549 info
->const_offset
= offset
;
6551 /* TImode and TFmode values are allowed in both pairs of X
6552 registers and individual Q registers. The available
6554 X,X: 7-bit signed scaled offset
6555 Q: 9-bit signed offset
6556 We conservatively require an offset representable in either mode.
6557 When performing the check for pairs of X registers i.e. LDP/STP
6558 pass down DImode since that is the natural size of the LDP/STP
6559 instruction memory accesses. */
6560 if (mode
== TImode
|| mode
== TFmode
)
6561 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
6562 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
6563 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
6565 /* A 7bit offset check because OImode will emit a ldp/stp
6566 instruction (only big endian will get here).
6567 For ldp/stp instructions, the offset is scaled for the size of a
6568 single element of the pair. */
6570 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
6572 /* Three 9/12 bit offsets checks because CImode will emit three
6573 ldr/str instructions (only big endian will get here). */
6575 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
6576 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode
,
6578 || offset_12bit_unsigned_scaled_p (V16QImode
,
6581 /* Two 7bit offsets checks because XImode will emit two ldp/stp
6582 instructions (only big endian will get here). */
6584 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
6585 && aarch64_offset_7bit_signed_scaled_p (TImode
,
6588 /* Make "m" use the LD1 offset range for SVE data modes, so
6589 that pre-RTL optimizers like ivopts will work to that
6590 instead of the wider LDR/STR range. */
6591 if (vec_flags
== VEC_SVE_DATA
)
6592 return (type
== ADDR_QUERY_M
6593 ? offset_4bit_signed_scaled_p (mode
, offset
)
6594 : offset_9bit_signed_scaled_p (mode
, offset
));
6596 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
6598 poly_int64 end_offset
= (offset
6599 + GET_MODE_SIZE (mode
)
6600 - BYTES_PER_SVE_VECTOR
);
6601 return (type
== ADDR_QUERY_M
6602 ? offset_4bit_signed_scaled_p (mode
, offset
)
6603 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
6604 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
6608 if (vec_flags
== VEC_SVE_PRED
)
6609 return offset_9bit_signed_scaled_p (mode
, offset
);
6611 if (load_store_pair_p
)
6612 return ((known_eq (GET_MODE_SIZE (mode
), 4)
6613 || known_eq (GET_MODE_SIZE (mode
), 8)
6614 || known_eq (GET_MODE_SIZE (mode
), 16))
6615 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
6617 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
6618 || offset_12bit_unsigned_scaled_p (mode
, offset
));
6621 if (allow_reg_index_p
)
6623 /* Look for base + (scaled/extended) index register. */
6624 if (aarch64_base_register_rtx_p (op0
, strict_p
)
6625 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
6630 if (aarch64_base_register_rtx_p (op1
, strict_p
)
6631 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
6644 info
->type
= ADDRESS_REG_WB
;
6645 info
->base
= XEXP (x
, 0);
6646 info
->offset
= NULL_RTX
;
6647 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
6651 info
->type
= ADDRESS_REG_WB
;
6652 info
->base
= XEXP (x
, 0);
6653 if (GET_CODE (XEXP (x
, 1)) == PLUS
6654 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
6655 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
6656 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
6658 info
->offset
= XEXP (XEXP (x
, 1), 1);
6659 info
->const_offset
= offset
;
6661 /* TImode and TFmode values are allowed in both pairs of X
6662 registers and individual Q registers. The available
6664 X,X: 7-bit signed scaled offset
6665 Q: 9-bit signed offset
6666 We conservatively require an offset representable in either mode.
6668 if (mode
== TImode
|| mode
== TFmode
)
6669 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
6670 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
6672 if (load_store_pair_p
)
6673 return ((known_eq (GET_MODE_SIZE (mode
), 4)
6674 || known_eq (GET_MODE_SIZE (mode
), 8)
6675 || known_eq (GET_MODE_SIZE (mode
), 16))
6676 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
6678 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
6685 /* load literal: pc-relative constant pool entry. Only supported
6686 for SI mode or larger. */
6687 info
->type
= ADDRESS_SYMBOLIC
;
6689 if (!load_store_pair_p
6690 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
6695 split_const (x
, &sym
, &addend
);
6696 return ((GET_CODE (sym
) == LABEL_REF
6697 || (GET_CODE (sym
) == SYMBOL_REF
6698 && CONSTANT_POOL_ADDRESS_P (sym
)
6699 && aarch64_pcrelative_literal_loads
)));
6704 info
->type
= ADDRESS_LO_SUM
;
6705 info
->base
= XEXP (x
, 0);
6706 info
->offset
= XEXP (x
, 1);
6707 if (allow_reg_index_p
6708 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
6711 split_const (info
->offset
, &sym
, &offs
);
6712 if (GET_CODE (sym
) == SYMBOL_REF
6713 && (aarch64_classify_symbol (sym
, INTVAL (offs
))
6714 == SYMBOL_SMALL_ABSOLUTE
))
6716 /* The symbol and offset must be aligned to the access size. */
6719 if (CONSTANT_POOL_ADDRESS_P (sym
))
6720 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
6721 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
6723 tree exp
= SYMBOL_REF_DECL (sym
);
6724 align
= TYPE_ALIGN (TREE_TYPE (exp
));
6725 align
= aarch64_constant_alignment (exp
, align
);
6727 else if (SYMBOL_REF_DECL (sym
))
6728 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
6729 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
6730 && SYMBOL_REF_BLOCK (sym
) != NULL
)
6731 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
6733 align
= BITS_PER_UNIT
;
6735 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
6736 if (known_eq (ref_size
, 0))
6737 ref_size
= GET_MODE_SIZE (DImode
);
6739 return (multiple_p (INTVAL (offs
), ref_size
)
6740 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
6750 /* Return true if the address X is valid for a PRFM instruction.
6751 STRICT_P is true if we should do strict checking with
6752 aarch64_classify_address. */
6755 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
6757 struct aarch64_address_info addr
;
6759 /* PRFM accepts the same addresses as DImode... */
6760 bool res
= aarch64_classify_address (&addr
, x
, DImode
, strict_p
);
6764 /* ... except writeback forms. */
6765 return addr
.type
!= ADDRESS_REG_WB
;
6769 aarch64_symbolic_address_p (rtx x
)
6773 split_const (x
, &x
, &offset
);
6774 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
6777 /* Classify the base of symbolic expression X. */
6779 enum aarch64_symbol_type
6780 aarch64_classify_symbolic_expression (rtx x
)
6784 split_const (x
, &x
, &offset
);
6785 return aarch64_classify_symbol (x
, INTVAL (offset
));
6789 /* Return TRUE if X is a legitimate address for accessing memory in
6792 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
6794 struct aarch64_address_info addr
;
6796 return aarch64_classify_address (&addr
, x
, mode
, strict_p
);
6799 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6800 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6802 aarch64_legitimate_address_p (machine_mode mode
, rtx x
, bool strict_p
,
6803 aarch64_addr_query_type type
)
6805 struct aarch64_address_info addr
;
6807 return aarch64_classify_address (&addr
, x
, mode
, strict_p
, type
);
6810 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6813 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
6814 poly_int64 orig_offset
,
6818 if (GET_MODE_SIZE (mode
).is_constant (&size
))
6820 HOST_WIDE_INT const_offset
, second_offset
;
6822 /* A general SVE offset is A * VQ + B. Remove the A component from
6823 coefficient 0 in order to get the constant B. */
6824 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
6826 /* Split an out-of-range address displacement into a base and
6827 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6828 range otherwise to increase opportunities for sharing the base
6829 address of different sizes. Unaligned accesses use the signed
6830 9-bit range, TImode/TFmode use the intersection of signed
6831 scaled 7-bit and signed 9-bit offset. */
6832 if (mode
== TImode
|| mode
== TFmode
)
6833 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
6834 else if ((const_offset
& (size
- 1)) != 0)
6835 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
6837 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
6839 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
6842 /* Split the offset into second_offset and the rest. */
6843 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
6844 *offset2
= gen_int_mode (second_offset
, Pmode
);
6849 /* Get the mode we should use as the basis of the range. For structure
6850 modes this is the mode of one vector. */
6851 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
6852 machine_mode step_mode
6853 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
6855 /* Get the "mul vl" multiplier we'd like to use. */
6856 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
6857 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
6858 if (vec_flags
& VEC_SVE_DATA
)
6859 /* LDR supports a 9-bit range, but the move patterns for
6860 structure modes require all vectors to be in range of the
6861 same base. The simplest way of accomodating that while still
6862 promoting reuse of anchor points between different modes is
6863 to use an 8-bit range unconditionally. */
6864 vnum
= ((vnum
+ 128) & 255) - 128;
6866 /* Predicates are only handled singly, so we might as well use
6868 vnum
= ((vnum
+ 256) & 511) - 256;
6872 /* Convert the "mul vl" multiplier into a byte offset. */
6873 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
6874 if (known_eq (second_offset
, orig_offset
))
6877 /* Split the offset into second_offset and the rest. */
6878 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
6879 *offset2
= gen_int_mode (second_offset
, Pmode
);
6884 /* Return the binary representation of floating point constant VALUE in INTVAL.
6885 If the value cannot be converted, return false without setting INTVAL.
6886 The conversion is done in the given MODE. */
6888 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
6891 /* We make a general exception for 0. */
6892 if (aarch64_float_const_zero_rtx_p (value
))
6898 scalar_float_mode mode
;
6899 if (GET_CODE (value
) != CONST_DOUBLE
6900 || !is_a
<scalar_float_mode
> (GET_MODE (value
), &mode
)
6901 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
6902 /* Only support up to DF mode. */
6903 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
6906 unsigned HOST_WIDE_INT ival
= 0;
6909 real_to_target (res
,
6910 CONST_DOUBLE_REAL_VALUE (value
),
6911 REAL_MODE_FORMAT (mode
));
6915 int order
= BYTES_BIG_ENDIAN
? 1 : 0;
6916 ival
= zext_hwi (res
[order
], 32);
6917 ival
|= (zext_hwi (res
[1 - order
], 32) << 32);
6920 ival
= zext_hwi (res
[0], 32);
6926 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6927 single MOV(+MOVK) followed by an FMOV. */
6929 aarch64_float_const_rtx_p (rtx x
)
6931 machine_mode mode
= GET_MODE (x
);
6932 if (mode
== VOIDmode
)
6935 /* Determine whether it's cheaper to write float constants as
6936 mov/movk pairs over ldr/adrp pairs. */
6937 unsigned HOST_WIDE_INT ival
;
6939 if (GET_CODE (x
) == CONST_DOUBLE
6940 && SCALAR_FLOAT_MODE_P (mode
)
6941 && aarch64_reinterpret_float_as_int (x
, &ival
))
6943 scalar_int_mode imode
= (mode
== HFmode
6945 : int_mode_for_mode (mode
).require ());
6946 int num_instr
= aarch64_internal_mov_immediate
6947 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
6948 return num_instr
< 3;
6954 /* Return TRUE if rtx X is immediate constant 0.0 */
6956 aarch64_float_const_zero_rtx_p (rtx x
)
6958 if (GET_MODE (x
) == VOIDmode
)
6961 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
6962 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
6963 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
6966 /* Return TRUE if rtx X is immediate constant that fits in a single
6967 MOVI immediate operation. */
6969 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
6975 scalar_int_mode imode
;
6976 unsigned HOST_WIDE_INT ival
;
6978 if (GET_CODE (x
) == CONST_DOUBLE
6979 && SCALAR_FLOAT_MODE_P (mode
))
6981 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
6984 /* We make a general exception for 0. */
6985 if (aarch64_float_const_zero_rtx_p (x
))
6988 imode
= int_mode_for_mode (mode
).require ();
6990 else if (GET_CODE (x
) == CONST_INT
6991 && is_a
<scalar_int_mode
> (mode
, &imode
))
6996 /* use a 64 bit mode for everything except for DI/DF mode, where we use
6997 a 128 bit vector mode. */
6998 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
7000 vmode
= aarch64_simd_container_mode (imode
, width
);
7001 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
7003 return aarch64_simd_valid_immediate (v_op
, NULL
);
7007 /* Return the fixed registers used for condition codes. */
7010 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
7013 *p2
= INVALID_REGNUM
;
7017 /* This function is used by the call expanders of the machine description.
7018 RESULT is the register in which the result is returned. It's NULL for
7019 "call" and "sibcall".
7020 MEM is the location of the function call.
7021 SIBCALL indicates whether this function call is normal call or sibling call.
7022 It will generate different pattern accordingly. */
7025 aarch64_expand_call (rtx result
, rtx mem
, bool sibcall
)
7027 rtx call
, callee
, tmp
;
7031 gcc_assert (MEM_P (mem
));
7032 callee
= XEXP (mem
, 0);
7033 mode
= GET_MODE (callee
);
7034 gcc_assert (mode
== Pmode
);
7036 /* Decide if we should generate indirect calls by loading the
7037 address of the callee into a register before performing
7038 the branch-and-link. */
7039 if (SYMBOL_REF_P (callee
)
7040 ? (aarch64_is_long_call_p (callee
)
7041 || aarch64_is_noplt_call_p (callee
))
7043 XEXP (mem
, 0) = force_reg (mode
, callee
);
7045 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
7047 if (result
!= NULL_RTX
)
7048 call
= gen_rtx_SET (result
, call
);
7053 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
7055 vec
= gen_rtvec (2, call
, tmp
);
7056 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
7058 aarch64_emit_call_insn (call
);
7061 /* Emit call insn with PAT and do aarch64-specific handling. */
7064 aarch64_emit_call_insn (rtx pat
)
7066 rtx insn
= emit_call_insn (pat
);
7068 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
7069 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
7070 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
7074 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
7076 /* All floating point compares return CCFP if it is an equality
7077 comparison, and CCFPE otherwise. */
7078 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
7105 /* Equality comparisons of short modes against zero can be performed
7106 using the TST instruction with the appropriate bitmask. */
7107 if (y
== const0_rtx
&& REG_P (x
)
7108 && (code
== EQ
|| code
== NE
)
7109 && (GET_MODE (x
) == HImode
|| GET_MODE (x
) == QImode
))
7112 /* Similarly, comparisons of zero_extends from shorter modes can
7113 be performed using an ANDS with an immediate mask. */
7114 if (y
== const0_rtx
&& GET_CODE (x
) == ZERO_EXTEND
7115 && (GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
7116 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
7117 && (code
== EQ
|| code
== NE
))
7120 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
7122 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
7123 && (GET_CODE (x
) == PLUS
|| GET_CODE (x
) == MINUS
|| GET_CODE (x
) == AND
7124 || GET_CODE (x
) == NEG
7125 || (GET_CODE (x
) == ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
7126 && CONST_INT_P (XEXP (x
, 2)))))
7129 /* A compare with a shifted operand. Because of canonicalization,
7130 the comparison will have to be swapped when we emit the assembly
7132 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
7133 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
7134 && (GET_CODE (x
) == ASHIFT
|| GET_CODE (x
) == ASHIFTRT
7135 || GET_CODE (x
) == LSHIFTRT
7136 || GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
))
7139 /* Similarly for a negated operand, but we can only do this for
7141 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
7142 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
7143 && (code
== EQ
|| code
== NE
)
7144 && GET_CODE (x
) == NEG
)
7147 /* A test for unsigned overflow. */
7148 if ((GET_MODE (x
) == DImode
|| GET_MODE (x
) == TImode
)
7150 && GET_CODE (x
) == PLUS
7151 && GET_CODE (y
) == ZERO_EXTEND
)
7154 /* A test for signed overflow. */
7155 if ((GET_MODE (x
) == DImode
|| GET_MODE (x
) == TImode
)
7157 && GET_CODE (x
) == PLUS
7158 && GET_CODE (y
) == SIGN_EXTEND
)
7161 /* For everything else, return CCmode. */
7166 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
7169 aarch64_get_condition_code (rtx x
)
7171 machine_mode mode
= GET_MODE (XEXP (x
, 0));
7172 enum rtx_code comp_code
= GET_CODE (x
);
7174 if (GET_MODE_CLASS (mode
) != MODE_CC
)
7175 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
7176 return aarch64_get_condition_code_1 (mode
, comp_code
);
7180 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
7188 case GE
: return AARCH64_GE
;
7189 case GT
: return AARCH64_GT
;
7190 case LE
: return AARCH64_LS
;
7191 case LT
: return AARCH64_MI
;
7192 case NE
: return AARCH64_NE
;
7193 case EQ
: return AARCH64_EQ
;
7194 case ORDERED
: return AARCH64_VC
;
7195 case UNORDERED
: return AARCH64_VS
;
7196 case UNLT
: return AARCH64_LT
;
7197 case UNLE
: return AARCH64_LE
;
7198 case UNGT
: return AARCH64_HI
;
7199 case UNGE
: return AARCH64_PL
;
7207 case NE
: return AARCH64_NE
;
7208 case EQ
: return AARCH64_EQ
;
7209 case GE
: return AARCH64_GE
;
7210 case GT
: return AARCH64_GT
;
7211 case LE
: return AARCH64_LE
;
7212 case LT
: return AARCH64_LT
;
7213 case GEU
: return AARCH64_CS
;
7214 case GTU
: return AARCH64_HI
;
7215 case LEU
: return AARCH64_LS
;
7216 case LTU
: return AARCH64_CC
;
7224 case NE
: return AARCH64_NE
;
7225 case EQ
: return AARCH64_EQ
;
7226 case GE
: return AARCH64_LE
;
7227 case GT
: return AARCH64_LT
;
7228 case LE
: return AARCH64_GE
;
7229 case LT
: return AARCH64_GT
;
7230 case GEU
: return AARCH64_LS
;
7231 case GTU
: return AARCH64_CC
;
7232 case LEU
: return AARCH64_CS
;
7233 case LTU
: return AARCH64_HI
;
7241 case NE
: return AARCH64_NE
;
7242 case EQ
: return AARCH64_EQ
;
7243 case GE
: return AARCH64_PL
;
7244 case LT
: return AARCH64_MI
;
7252 case NE
: return AARCH64_NE
;
7253 case EQ
: return AARCH64_EQ
;
7261 case NE
: return AARCH64_CS
;
7262 case EQ
: return AARCH64_CC
;
7270 case NE
: return AARCH64_VS
;
7271 case EQ
: return AARCH64_VC
;
7284 aarch64_const_vec_all_same_in_range_p (rtx x
,
7285 HOST_WIDE_INT minval
,
7286 HOST_WIDE_INT maxval
)
7289 return (const_vec_duplicate_p (x
, &elt
)
7290 && CONST_INT_P (elt
)
7291 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
7295 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
7297 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
7300 /* Return true if VEC is a constant in which every element is in the range
7301 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7304 aarch64_const_vec_all_in_range_p (rtx vec
,
7305 HOST_WIDE_INT minval
,
7306 HOST_WIDE_INT maxval
)
7308 if (GET_CODE (vec
) != CONST_VECTOR
7309 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
7313 if (!CONST_VECTOR_STEPPED_P (vec
))
7314 nunits
= const_vector_encoded_nelts (vec
);
7315 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
7318 for (int i
= 0; i
< nunits
; i
++)
7320 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
7321 if (!CONST_INT_P (vec_elem
)
7322 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
7329 #define AARCH64_CC_V 1
7330 #define AARCH64_CC_C (1 << 1)
7331 #define AARCH64_CC_Z (1 << 2)
7332 #define AARCH64_CC_N (1 << 3)
7334 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7335 static const int aarch64_nzcv_codes
[] =
7337 0, /* EQ, Z == 1. */
7338 AARCH64_CC_Z
, /* NE, Z == 0. */
7339 0, /* CS, C == 1. */
7340 AARCH64_CC_C
, /* CC, C == 0. */
7341 0, /* MI, N == 1. */
7342 AARCH64_CC_N
, /* PL, N == 0. */
7343 0, /* VS, V == 1. */
7344 AARCH64_CC_V
, /* VC, V == 0. */
7345 0, /* HI, C ==1 && Z == 0. */
7346 AARCH64_CC_C
, /* LS, !(C == 1 && Z == 0). */
7347 AARCH64_CC_V
, /* GE, N == V. */
7348 0, /* LT, N != V. */
7349 AARCH64_CC_Z
, /* GT, Z == 0 && N == V. */
7350 0, /* LE, !(Z == 0 && N == V). */
7355 /* Print floating-point vector immediate operand X to F, negating it
7356 first if NEGATE is true. Return true on success, false if it isn't
7357 a constant we can handle. */
7360 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
7364 if (!const_vec_duplicate_p (x
, &elt
))
7367 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
7369 r
= real_value_negate (&r
);
7371 /* We only handle the SVE single-bit immediates here. */
7372 if (real_equal (&r
, &dconst0
))
7373 asm_fprintf (f
, "0.0");
7374 else if (real_equal (&r
, &dconst1
))
7375 asm_fprintf (f
, "1.0");
7376 else if (real_equal (&r
, &dconsthalf
))
7377 asm_fprintf (f
, "0.5");
7384 /* Return the equivalent letter for size. */
7386 sizetochar (int size
)
7390 case 64: return 'd';
7391 case 32: return 's';
7392 case 16: return 'h';
7393 case 8 : return 'b';
7394 default: gcc_unreachable ();
7398 /* Print operand X to file F in a target specific manner according to CODE.
7399 The acceptable formatting commands given by CODE are:
7400 'c': An integer or symbol address without a preceding #
7402 'C': Take the duplicated element in a vector constant
7403 and print it in hex.
7404 'D': Take the duplicated element in a vector constant
7405 and print it as an unsigned integer, in decimal.
7406 'e': Print the sign/zero-extend size as a character 8->b,
7408 'p': Prints N such that 2^N == X (X must be power of 2 and
7410 'P': Print the number of non-zero bits in X (a const_int).
7411 'H': Print the higher numbered register of a pair (TImode)
7413 'm': Print a condition (eq, ne, etc).
7414 'M': Same as 'm', but invert condition.
7415 'N': Take the duplicated element in a vector constant
7416 and print the negative of it in decimal.
7417 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7418 'S/T/U/V': Print a FP/SIMD register name for a register list.
7419 The register printed is the FP/SIMD register name
7420 of X + 0/1/2/3 for S/T/U/V.
7421 'R': Print a scalar FP/SIMD register name + 1.
7422 'X': Print bottom 16 bits of integer constant in hex.
7423 'w/x': Print a general register name or the zero register
7425 '0': Print a normal operand, if it's a general register,
7426 then we assume DImode.
7427 'k': Print NZCV for conditional compare instructions.
7428 'A': Output address constant representing the first
7429 argument of X, specifying a relocation offset
7431 'L': Output constant address specified by X
7432 with a relocation offset if appropriate.
7433 'G': Prints address of X, specifying a PC relative
7434 relocation mode if appropriate.
7435 'y': Output address of LDP or STP - this is used for
7436 some LDP/STPs which don't use a PARALLEL in their
7437 pattern (so the mode needs to be adjusted).
7438 'z': Output address of a typical LDP or STP. */
7441 aarch64_print_operand (FILE *f
, rtx x
, int code
)
7447 switch (GET_CODE (x
))
7450 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
7454 output_addr_const (f
, x
);
7458 if (GET_CODE (XEXP (x
, 0)) == PLUS
7459 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
7461 output_addr_const (f
, x
);
7467 output_operand_lossage ("unsupported operand for code '%c'", code
);
7475 if (!CONST_INT_P (x
)
7476 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
7478 output_operand_lossage ("invalid operand for '%%%c'", code
);
7494 output_operand_lossage ("invalid operand for '%%%c'", code
);
7504 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
7506 output_operand_lossage ("invalid operand for '%%%c'", code
);
7510 asm_fprintf (f
, "%d", n
);
7515 if (!CONST_INT_P (x
))
7517 output_operand_lossage ("invalid operand for '%%%c'", code
);
7521 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
7525 if (x
== const0_rtx
)
7527 asm_fprintf (f
, "xzr");
7531 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
7533 output_operand_lossage ("invalid operand for '%%%c'", code
);
7537 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
7544 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7545 if (x
== const_true_rtx
)
7552 if (!COMPARISON_P (x
))
7554 output_operand_lossage ("invalid operand for '%%%c'", code
);
7558 cond_code
= aarch64_get_condition_code (x
);
7559 gcc_assert (cond_code
>= 0);
7561 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
7562 fputs (aarch64_condition_codes
[cond_code
], f
);
7567 if (!const_vec_duplicate_p (x
, &elt
))
7569 output_operand_lossage ("invalid vector constant");
7573 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
7574 asm_fprintf (f
, "%wd", -INTVAL (elt
));
7575 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
7576 && aarch64_print_vector_float_operand (f
, x
, true))
7580 output_operand_lossage ("invalid vector constant");
7590 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
7592 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
7595 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
7602 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
7604 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
7607 asm_fprintf (f
, "%c%d",
7608 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
7609 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
7613 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
7615 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
7618 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
7622 if (!CONST_INT_P (x
))
7624 output_operand_lossage ("invalid operand for '%%%c'", code
);
7627 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
7632 /* Print a replicated constant in hex. */
7633 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
7635 output_operand_lossage ("invalid operand for '%%%c'", code
);
7638 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
7639 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
7645 /* Print a replicated constant in decimal, treating it as
7647 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
7649 output_operand_lossage ("invalid operand for '%%%c'", code
);
7652 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
7653 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
7660 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
7662 asm_fprintf (f
, "%czr", code
);
7666 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
7668 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
7672 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
7674 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
7683 output_operand_lossage ("missing operand");
7687 switch (GET_CODE (x
))
7690 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
7692 if (REG_NREGS (x
) == 1)
7693 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
7697 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
7698 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
7699 REGNO (x
) - V0_REGNUM
, suffix
,
7700 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
7704 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
7708 output_address (GET_MODE (x
), XEXP (x
, 0));
7713 output_addr_const (asm_out_file
, x
);
7717 asm_fprintf (f
, "%wd", INTVAL (x
));
7721 if (!VECTOR_MODE_P (GET_MODE (x
)))
7723 output_addr_const (asm_out_file
, x
);
7729 if (!const_vec_duplicate_p (x
, &elt
))
7731 output_operand_lossage ("invalid vector constant");
7735 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
7736 asm_fprintf (f
, "%wd", INTVAL (elt
));
7737 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
7738 && aarch64_print_vector_float_operand (f
, x
, false))
7742 output_operand_lossage ("invalid vector constant");
7748 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7749 be getting CONST_DOUBLEs holding integers. */
7750 gcc_assert (GET_MODE (x
) != VOIDmode
);
7751 if (aarch64_float_const_zero_rtx_p (x
))
7756 else if (aarch64_float_const_representable_p (x
))
7759 char float_buf
[buf_size
] = {'\0'};
7760 real_to_decimal_for_mode (float_buf
,
7761 CONST_DOUBLE_REAL_VALUE (x
),
7764 asm_fprintf (asm_out_file
, "%s", float_buf
);
7768 output_operand_lossage ("invalid constant");
7771 output_operand_lossage ("invalid operand");
7777 if (GET_CODE (x
) == HIGH
)
7780 switch (aarch64_classify_symbolic_expression (x
))
7782 case SYMBOL_SMALL_GOT_4G
:
7783 asm_fprintf (asm_out_file
, ":got:");
7786 case SYMBOL_SMALL_TLSGD
:
7787 asm_fprintf (asm_out_file
, ":tlsgd:");
7790 case SYMBOL_SMALL_TLSDESC
:
7791 asm_fprintf (asm_out_file
, ":tlsdesc:");
7794 case SYMBOL_SMALL_TLSIE
:
7795 asm_fprintf (asm_out_file
, ":gottprel:");
7798 case SYMBOL_TLSLE24
:
7799 asm_fprintf (asm_out_file
, ":tprel:");
7802 case SYMBOL_TINY_GOT
:
7809 output_addr_const (asm_out_file
, x
);
7813 switch (aarch64_classify_symbolic_expression (x
))
7815 case SYMBOL_SMALL_GOT_4G
:
7816 asm_fprintf (asm_out_file
, ":lo12:");
7819 case SYMBOL_SMALL_TLSGD
:
7820 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
7823 case SYMBOL_SMALL_TLSDESC
:
7824 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
7827 case SYMBOL_SMALL_TLSIE
:
7828 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
7831 case SYMBOL_TLSLE12
:
7832 asm_fprintf (asm_out_file
, ":tprel_lo12:");
7835 case SYMBOL_TLSLE24
:
7836 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
7839 case SYMBOL_TINY_GOT
:
7840 asm_fprintf (asm_out_file
, ":got:");
7843 case SYMBOL_TINY_TLSIE
:
7844 asm_fprintf (asm_out_file
, ":gottprel:");
7850 output_addr_const (asm_out_file
, x
);
7854 switch (aarch64_classify_symbolic_expression (x
))
7856 case SYMBOL_TLSLE24
:
7857 asm_fprintf (asm_out_file
, ":tprel_hi12:");
7862 output_addr_const (asm_out_file
, x
);
7867 HOST_WIDE_INT cond_code
;
7869 if (!CONST_INT_P (x
))
7871 output_operand_lossage ("invalid operand for '%%%c'", code
);
7875 cond_code
= INTVAL (x
);
7876 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
7877 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
7884 machine_mode mode
= GET_MODE (x
);
7886 if (GET_CODE (x
) != MEM
7887 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
7889 output_operand_lossage ("invalid operand for '%%%c'", code
);
7893 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
7895 ? ADDR_QUERY_LDP_STP_N
7896 : ADDR_QUERY_LDP_STP
))
7897 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
7902 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
7907 /* Print address 'x' of a memory access with mode 'mode'.
7908 'op' is the context required by aarch64_classify_address. It can either be
7909 MEM for a normal memory access or PARALLEL for LDP/STP. */
7911 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
7912 aarch64_addr_query_type type
)
7914 struct aarch64_address_info addr
;
7917 /* Check all addresses are Pmode - including ILP32. */
7918 if (GET_MODE (x
) != Pmode
7919 && (!CONST_INT_P (x
)
7920 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
7922 output_operand_lossage ("invalid address mode");
7926 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
7929 case ADDRESS_REG_IMM
:
7930 if (known_eq (addr
.const_offset
, 0))
7931 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
7932 else if (aarch64_sve_data_mode_p (mode
))
7935 = exact_div (addr
.const_offset
,
7936 BYTES_PER_SVE_VECTOR
).to_constant ();
7937 asm_fprintf (f
, "[%s, #%wd, mul vl]",
7938 reg_names
[REGNO (addr
.base
)], vnum
);
7940 else if (aarch64_sve_pred_mode_p (mode
))
7943 = exact_div (addr
.const_offset
,
7944 BYTES_PER_SVE_PRED
).to_constant ();
7945 asm_fprintf (f
, "[%s, #%wd, mul vl]",
7946 reg_names
[REGNO (addr
.base
)], vnum
);
7949 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
7950 INTVAL (addr
.offset
));
7953 case ADDRESS_REG_REG
:
7954 if (addr
.shift
== 0)
7955 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
7956 reg_names
[REGNO (addr
.offset
)]);
7958 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
7959 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
7962 case ADDRESS_REG_UXTW
:
7963 if (addr
.shift
== 0)
7964 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
7965 REGNO (addr
.offset
) - R0_REGNUM
);
7967 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
7968 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
7971 case ADDRESS_REG_SXTW
:
7972 if (addr
.shift
== 0)
7973 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
7974 REGNO (addr
.offset
) - R0_REGNUM
);
7976 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
7977 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
7980 case ADDRESS_REG_WB
:
7981 /* Writeback is only supported for fixed-width modes. */
7982 size
= GET_MODE_SIZE (mode
).to_constant ();
7983 switch (GET_CODE (x
))
7986 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
7989 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
7992 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
7995 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
7998 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
7999 INTVAL (addr
.offset
));
8002 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
8003 INTVAL (addr
.offset
));
8010 case ADDRESS_LO_SUM
:
8011 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
8012 output_addr_const (f
, addr
.offset
);
8013 asm_fprintf (f
, "]");
8016 case ADDRESS_SYMBOLIC
:
8017 output_addr_const (f
, x
);
8024 /* Print address 'x' of a memory access with mode 'mode'. */
8026 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
8028 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
8029 output_addr_const (f
, x
);
8033 aarch64_label_mentioned_p (rtx x
)
8038 if (GET_CODE (x
) == LABEL_REF
)
8041 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8042 referencing instruction, but they are constant offsets, not
8044 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
8047 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
8048 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
8054 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
8055 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
8058 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
8065 /* Implement REGNO_REG_CLASS. */
8068 aarch64_regno_regclass (unsigned regno
)
8070 if (GP_REGNUM_P (regno
))
8071 return GENERAL_REGS
;
8073 if (regno
== SP_REGNUM
)
8076 if (regno
== FRAME_POINTER_REGNUM
8077 || regno
== ARG_POINTER_REGNUM
)
8078 return POINTER_REGS
;
8080 if (FP_REGNUM_P (regno
))
8081 return FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
;
8083 if (PR_REGNUM_P (regno
))
8084 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
8089 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8090 If OFFSET is out of range, return an offset of an anchor point
8091 that is in range. Return 0 otherwise. */
8093 static HOST_WIDE_INT
8094 aarch64_anchor_offset (HOST_WIDE_INT offset
, HOST_WIDE_INT size
,
8097 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8099 return (offset
+ 0x400) & ~0x7f0;
8101 /* For offsets that aren't a multiple of the access size, the limit is
8103 if (offset
& (size
- 1))
8105 /* BLKmode typically uses LDP of X-registers. */
8106 if (mode
== BLKmode
)
8107 return (offset
+ 512) & ~0x3ff;
8108 return (offset
+ 0x100) & ~0x1ff;
8111 /* Small negative offsets are supported. */
8112 if (IN_RANGE (offset
, -256, 0))
8115 if (mode
== TImode
|| mode
== TFmode
)
8116 return (offset
+ 0x100) & ~0x1ff;
8118 /* Use 12-bit offset by access size. */
8119 return offset
& (~0xfff * size
);
8123 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
8125 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8126 where mask is selected by alignment and size of the offset.
8127 We try to pick as large a range for the offset as possible to
8128 maximize the chance of a CSE. However, for aligned addresses
8129 we limit the range to 4k so that structures with different sized
8130 elements are likely to use the same base. We need to be careful
8131 not to split a CONST for some forms of address expression, otherwise
8132 it will generate sub-optimal code. */
8134 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
8136 rtx base
= XEXP (x
, 0);
8137 rtx offset_rtx
= XEXP (x
, 1);
8138 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
8140 if (GET_CODE (base
) == PLUS
)
8142 rtx op0
= XEXP (base
, 0);
8143 rtx op1
= XEXP (base
, 1);
8145 /* Force any scaling into a temp for CSE. */
8146 op0
= force_reg (Pmode
, op0
);
8147 op1
= force_reg (Pmode
, op1
);
8149 /* Let the pointer register be in op0. */
8150 if (REG_POINTER (op1
))
8151 std::swap (op0
, op1
);
8153 /* If the pointer is virtual or frame related, then we know that
8154 virtual register instantiation or register elimination is going
8155 to apply a second constant. We want the two constants folded
8156 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8157 if (virt_or_elim_regno_p (REGNO (op0
)))
8159 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
8160 NULL_RTX
, true, OPTAB_DIRECT
);
8161 return gen_rtx_PLUS (Pmode
, base
, op1
);
8164 /* Otherwise, in order to encourage CSE (and thence loop strength
8165 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8166 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
8167 NULL_RTX
, true, OPTAB_DIRECT
);
8168 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
8172 if (GET_MODE_SIZE (mode
).is_constant (&size
))
8174 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
8176 if (base_offset
!= 0)
8178 base
= plus_constant (Pmode
, base
, base_offset
);
8179 base
= force_operand (base
, NULL_RTX
);
8180 return plus_constant (Pmode
, base
, offset
- base_offset
);
8189 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
8192 secondary_reload_info
*sri
)
8194 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8195 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8196 comment at the head of aarch64-sve.md for more details about the
8197 big-endian handling. */
8198 if (BYTES_BIG_ENDIAN
8199 && reg_class_subset_p (rclass
, FP_REGS
)
8200 && !((REG_P (x
) && HARD_REGISTER_P (x
))
8201 || aarch64_simd_valid_immediate (x
, NULL
))
8202 && aarch64_sve_data_mode_p (mode
))
8204 sri
->icode
= CODE_FOR_aarch64_sve_reload_be
;
8208 /* If we have to disable direct literal pool loads and stores because the
8209 function is too big, then we need a scratch register. */
8210 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
8211 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
8212 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
8213 && !aarch64_pcrelative_literal_loads
)
8215 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
8219 /* Without the TARGET_SIMD instructions we cannot move a Q register
8220 to a Q register directly. We need a scratch. */
8221 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
8222 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
8223 && reg_class_subset_p (rclass
, FP_REGS
))
8225 sri
->icode
= code_for_aarch64_reload_mov (mode
);
8229 /* A TFmode or TImode memory access should be handled via an FP_REGS
8230 because AArch64 has richer addressing modes for LDR/STR instructions
8231 than LDP/STP instructions. */
8232 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
8233 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
8236 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
8237 return GENERAL_REGS
;
8243 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED
, const int to
)
8245 gcc_assert (from
== ARG_POINTER_REGNUM
|| from
== FRAME_POINTER_REGNUM
);
8247 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8248 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8249 if (frame_pointer_needed
)
8250 return to
== HARD_FRAME_POINTER_REGNUM
;
8255 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
8257 if (to
== HARD_FRAME_POINTER_REGNUM
)
8259 if (from
== ARG_POINTER_REGNUM
)
8260 return cfun
->machine
->frame
.hard_fp_offset
;
8262 if (from
== FRAME_POINTER_REGNUM
)
8263 return cfun
->machine
->frame
.hard_fp_offset
8264 - cfun
->machine
->frame
.locals_offset
;
8267 if (to
== STACK_POINTER_REGNUM
)
8269 if (from
== FRAME_POINTER_REGNUM
)
8270 return cfun
->machine
->frame
.frame_size
8271 - cfun
->machine
->frame
.locals_offset
;
8274 return cfun
->machine
->frame
.frame_size
;
8277 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8281 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
8285 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
8290 aarch64_asm_trampoline_template (FILE *f
)
8295 if (aarch64_bti_enabled ())
8297 asm_fprintf (f
, "\thint\t34 // bti c\n");
8304 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", IP1_REGNUM
- R0_REGNUM
, offset1
);
8305 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
,
8310 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[IP1_REGNUM
], offset1
);
8311 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[STATIC_CHAIN_REGNUM
],
8314 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
8316 /* The trampoline needs an extra padding instruction. In case if BTI is
8317 enabled the padding instruction is replaced by the BTI instruction at
8319 if (!aarch64_bti_enabled ())
8320 assemble_aligned_integer (4, const0_rtx
);
8322 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
8323 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
8327 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
8329 rtx fnaddr
, mem
, a_tramp
;
8330 const int tramp_code_sz
= 16;
8332 /* Don't need to copy the trailing D-words, we fill those in below. */
8333 emit_block_move (m_tramp
, assemble_trampoline_template (),
8334 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
8335 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
8336 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
8337 if (GET_MODE (fnaddr
) != ptr_mode
)
8338 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
8339 emit_move_insn (mem
, fnaddr
);
8341 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
8342 emit_move_insn (mem
, chain_value
);
8344 /* XXX We should really define a "clear_cache" pattern and use
8345 gen_clear_cache(). */
8346 a_tramp
= XEXP (m_tramp
, 0);
8347 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
8348 LCT_NORMAL
, VOIDmode
, a_tramp
, ptr_mode
,
8349 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
8353 static unsigned char
8354 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
8356 /* ??? Logically we should only need to provide a value when
8357 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8358 can hold MODE, but at the moment we need to handle all modes.
8359 Just ignore any runtime parts for registers that can't store them. */
8360 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
8364 case TAILCALL_ADDR_REGS
:
8368 case POINTER_AND_FP_REGS
:
8371 if (aarch64_sve_data_mode_p (mode
)
8372 && constant_multiple_p (GET_MODE_SIZE (mode
),
8373 BYTES_PER_SVE_VECTOR
, &nregs
))
8375 return (aarch64_vector_data_mode_p (mode
)
8376 ? CEIL (lowest_size
, UNITS_PER_VREG
)
8377 : CEIL (lowest_size
, UNITS_PER_WORD
));
8394 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
8396 if (regclass
== POINTER_REGS
)
8397 return GENERAL_REGS
;
8399 if (regclass
== STACK_REG
)
8402 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
8408 /* Register eliminiation can result in a request for
8409 SP+constant->FP_REGS. We cannot support such operations which
8410 use SP as source and an FP_REG as destination, so reject out
8412 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
8414 rtx lhs
= XEXP (x
, 0);
8416 /* Look through a possible SUBREG introduced by ILP32. */
8417 if (GET_CODE (lhs
) == SUBREG
)
8418 lhs
= SUBREG_REG (lhs
);
8420 gcc_assert (REG_P (lhs
));
8421 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
8430 aarch64_asm_output_labelref (FILE* f
, const char *name
)
8432 asm_fprintf (f
, "%U%s", name
);
8436 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
8438 if (priority
== DEFAULT_INIT_PRIORITY
)
8439 default_ctor_section_asm_out_constructor (symbol
, priority
);
8443 /* While priority is known to be in range [0, 65535], so 18 bytes
8444 would be enough, the compiler might not know that. To avoid
8445 -Wformat-truncation false positive, use a larger size. */
8447 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
8448 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
8449 switch_to_section (s
);
8450 assemble_align (POINTER_SIZE
);
8451 assemble_aligned_integer (POINTER_BYTES
, symbol
);
8456 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
8458 if (priority
== DEFAULT_INIT_PRIORITY
)
8459 default_dtor_section_asm_out_destructor (symbol
, priority
);
8463 /* While priority is known to be in range [0, 65535], so 18 bytes
8464 would be enough, the compiler might not know that. To avoid
8465 -Wformat-truncation false positive, use a larger size. */
8467 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
8468 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
8469 switch_to_section (s
);
8470 assemble_align (POINTER_SIZE
);
8471 assemble_aligned_integer (POINTER_BYTES
, symbol
);
8476 aarch64_output_casesi (rtx
*operands
)
8480 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
8482 static const char *const patterns
[4][2] =
8485 "ldrb\t%w3, [%0,%w1,uxtw]",
8486 "add\t%3, %4, %w3, sxtb #2"
8489 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8490 "add\t%3, %4, %w3, sxth #2"
8493 "ldr\t%w3, [%0,%w1,uxtw #2]",
8494 "add\t%3, %4, %w3, sxtw #2"
8496 /* We assume that DImode is only generated when not optimizing and
8497 that we don't really need 64-bit address offsets. That would
8498 imply an object file with 8GB of code in a single function! */
8500 "ldr\t%w3, [%0,%w1,uxtw #2]",
8501 "add\t%3, %4, %w3, sxtw #2"
8505 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
8507 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
8508 index
= exact_log2 (GET_MODE_SIZE (mode
));
8510 gcc_assert (index
>= 0 && index
<= 3);
8512 /* Need to implement table size reduction, by chaning the code below. */
8513 output_asm_insn (patterns
[index
][0], operands
);
8514 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
8515 snprintf (buf
, sizeof (buf
),
8516 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
8517 output_asm_insn (buf
, operands
);
8518 output_asm_insn (patterns
[index
][1], operands
);
8519 output_asm_insn ("br\t%3", operands
);
8520 assemble_label (asm_out_file
, label
);
8525 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8526 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8530 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
8532 if (shift
>= 0 && shift
<= 3)
8535 for (size
= 8; size
<= 32; size
*= 2)
8537 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
8538 if (mask
== bits
<< shift
)
8545 /* Constant pools are per function only when PC relative
8546 literal loads are true or we are in the large memory
8550 aarch64_can_use_per_function_literal_pools_p (void)
8552 return (aarch64_pcrelative_literal_loads
8553 || aarch64_cmodel
== AARCH64_CMODEL_LARGE
);
8557 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
8559 /* We can't use blocks for constants when we're using a per-function
8561 return !aarch64_can_use_per_function_literal_pools_p ();
8564 /* Select appropriate section for constants depending
8565 on where we place literal pools. */
8568 aarch64_select_rtx_section (machine_mode mode
,
8570 unsigned HOST_WIDE_INT align
)
8572 if (aarch64_can_use_per_function_literal_pools_p ())
8573 return function_section (current_function_decl
);
8575 return default_elf_select_rtx_section (mode
, x
, align
);
8578 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8580 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
8581 HOST_WIDE_INT offset
)
8583 /* When using per-function literal pools, we must ensure that any code
8584 section is aligned to the minimal instruction length, lest we get
8585 errors from the assembler re "unaligned instructions". */
8586 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
8587 ASM_OUTPUT_ALIGN (f
, 2);
8592 /* Helper function for rtx cost calculation. Strip a shift expression
8593 from X. Returns the inner operand if successful, or the original
8594 expression on failure. */
8596 aarch64_strip_shift (rtx x
)
8600 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8601 we can convert both to ROR during final output. */
8602 if ((GET_CODE (op
) == ASHIFT
8603 || GET_CODE (op
) == ASHIFTRT
8604 || GET_CODE (op
) == LSHIFTRT
8605 || GET_CODE (op
) == ROTATERT
8606 || GET_CODE (op
) == ROTATE
)
8607 && CONST_INT_P (XEXP (op
, 1)))
8608 return XEXP (op
, 0);
8610 if (GET_CODE (op
) == MULT
8611 && CONST_INT_P (XEXP (op
, 1))
8612 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
8613 return XEXP (op
, 0);
8618 /* Helper function for rtx cost calculation. Strip an extend
8619 expression from X. Returns the inner operand if successful, or the
8620 original expression on failure. We deal with a number of possible
8621 canonicalization variations here. If STRIP_SHIFT is true, then
8622 we can strip off a shift also. */
8624 aarch64_strip_extend (rtx x
, bool strip_shift
)
8626 scalar_int_mode mode
;
8629 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
8632 /* Zero and sign extraction of a widened value. */
8633 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
8634 && XEXP (op
, 2) == const0_rtx
8635 && GET_CODE (XEXP (op
, 0)) == MULT
8636 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
8638 return XEXP (XEXP (op
, 0), 0);
8640 /* It can also be represented (for zero-extend) as an AND with an
8642 if (GET_CODE (op
) == AND
8643 && GET_CODE (XEXP (op
, 0)) == MULT
8644 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
8645 && CONST_INT_P (XEXP (op
, 1))
8646 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
8647 INTVAL (XEXP (op
, 1))) != 0)
8648 return XEXP (XEXP (op
, 0), 0);
8650 /* Now handle extended register, as this may also have an optional
8651 left shift by 1..4. */
8653 && GET_CODE (op
) == ASHIFT
8654 && CONST_INT_P (XEXP (op
, 1))
8655 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
8658 if (GET_CODE (op
) == ZERO_EXTEND
8659 || GET_CODE (op
) == SIGN_EXTEND
)
8668 /* Return true iff CODE is a shift supported in combination
8669 with arithmetic instructions. */
8672 aarch64_shift_p (enum rtx_code code
)
8674 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
8678 /* Return true iff X is a cheap shift without a sign extend. */
8681 aarch64_cheap_mult_shift_p (rtx x
)
8688 if (!(aarch64_tune_params
.extra_tuning_flags
8689 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
8692 if (GET_CODE (op0
) == SIGN_EXTEND
)
8695 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
8696 && UINTVAL (op1
) <= 4)
8699 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
8702 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
8704 if (l2
> 0 && l2
<= 4)
8710 /* Helper function for rtx cost calculation. Calculate the cost of
8711 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8712 Return the calculated cost of the expression, recursing manually in to
8713 operands where needed. */
8716 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
8719 const struct cpu_cost_table
*extra_cost
8720 = aarch64_tune_params
.insn_extra_cost
;
8722 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
8723 machine_mode mode
= GET_MODE (x
);
8725 gcc_checking_assert (code
== MULT
);
8730 if (VECTOR_MODE_P (mode
))
8731 mode
= GET_MODE_INNER (mode
);
8733 /* Integer multiply/fma. */
8734 if (GET_MODE_CLASS (mode
) == MODE_INT
)
8736 /* The multiply will be canonicalized as a shift, cost it as such. */
8737 if (aarch64_shift_p (GET_CODE (x
))
8738 || (CONST_INT_P (op1
)
8739 && exact_log2 (INTVAL (op1
)) > 0))
8741 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
8742 || GET_CODE (op0
) == SIGN_EXTEND
;
8747 /* If the shift is considered cheap,
8748 then don't add any cost. */
8749 if (aarch64_cheap_mult_shift_p (x
))
8751 else if (REG_P (op1
))
8752 /* ARITH + shift-by-register. */
8753 cost
+= extra_cost
->alu
.arith_shift_reg
;
8755 /* ARITH + extended register. We don't have a cost field
8756 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8757 cost
+= extra_cost
->alu
.extend_arith
;
8759 /* ARITH + shift-by-immediate. */
8760 cost
+= extra_cost
->alu
.arith_shift
;
8763 /* LSL (immediate). */
8764 cost
+= extra_cost
->alu
.shift
;
8767 /* Strip extends as we will have costed them in the case above. */
8769 op0
= aarch64_strip_extend (op0
, true);
8771 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
8776 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8777 compound and let the below cases handle it. After all, MNEG is a
8778 special-case alias of MSUB. */
8779 if (GET_CODE (op0
) == NEG
)
8781 op0
= XEXP (op0
, 0);
8785 /* Integer multiplies or FMAs have zero/sign extending variants. */
8786 if ((GET_CODE (op0
) == ZERO_EXTEND
8787 && GET_CODE (op1
) == ZERO_EXTEND
)
8788 || (GET_CODE (op0
) == SIGN_EXTEND
8789 && GET_CODE (op1
) == SIGN_EXTEND
))
8791 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
8792 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
8797 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8798 cost
+= extra_cost
->mult
[0].extend_add
;
8800 /* MUL/SMULL/UMULL. */
8801 cost
+= extra_cost
->mult
[0].extend
;
8807 /* This is either an integer multiply or a MADD. In both cases
8808 we want to recurse and cost the operands. */
8809 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
8810 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
8816 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
8819 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
8828 /* Floating-point FMA/FMUL can also support negations of the
8829 operands, unless the rounding mode is upward or downward in
8830 which case FNMUL is different than FMUL with operand negation. */
8831 bool neg0
= GET_CODE (op0
) == NEG
;
8832 bool neg1
= GET_CODE (op1
) == NEG
;
8833 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
8836 op0
= XEXP (op0
, 0);
8838 op1
= XEXP (op1
, 0);
8842 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8843 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
8846 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
8849 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
8850 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
8856 aarch64_address_cost (rtx x
,
8858 addr_space_t as ATTRIBUTE_UNUSED
,
8861 enum rtx_code c
= GET_CODE (x
);
8862 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
8863 struct aarch64_address_info info
;
8867 if (!aarch64_classify_address (&info
, x
, mode
, false))
8869 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
8871 /* This is a CONST or SYMBOL ref which will be split
8872 in a different way depending on the code model in use.
8873 Cost it through the generic infrastructure. */
8874 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
8875 /* Divide through by the cost of one instruction to
8876 bring it to the same units as the address costs. */
8877 cost_symbol_ref
/= COSTS_N_INSNS (1);
8878 /* The cost is then the cost of preparing the address,
8879 followed by an immediate (possibly 0) offset. */
8880 return cost_symbol_ref
+ addr_cost
->imm_offset
;
8884 /* This is most likely a jump table from a case
8886 return addr_cost
->register_offset
;
8892 case ADDRESS_LO_SUM
:
8893 case ADDRESS_SYMBOLIC
:
8894 case ADDRESS_REG_IMM
:
8895 cost
+= addr_cost
->imm_offset
;
8898 case ADDRESS_REG_WB
:
8899 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
8900 cost
+= addr_cost
->pre_modify
;
8901 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
8902 cost
+= addr_cost
->post_modify
;
8908 case ADDRESS_REG_REG
:
8909 cost
+= addr_cost
->register_offset
;
8912 case ADDRESS_REG_SXTW
:
8913 cost
+= addr_cost
->register_sextend
;
8916 case ADDRESS_REG_UXTW
:
8917 cost
+= addr_cost
->register_zextend
;
8927 /* For the sake of calculating the cost of the shifted register
8928 component, we can treat same sized modes in the same way. */
8929 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
8930 cost
+= addr_cost
->addr_scale_costs
.hi
;
8931 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
8932 cost
+= addr_cost
->addr_scale_costs
.si
;
8933 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
8934 cost
+= addr_cost
->addr_scale_costs
.di
;
8936 /* We can't tell, or this is a 128-bit vector. */
8937 cost
+= addr_cost
->addr_scale_costs
.ti
;
8943 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8944 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8948 aarch64_branch_cost (bool speed_p
, bool predictable_p
)
8950 /* When optimizing for speed, use the cost of unpredictable branches. */
8951 const struct cpu_branch_cost
*branch_costs
=
8952 aarch64_tune_params
.branch_costs
;
8954 if (!speed_p
|| predictable_p
)
8955 return branch_costs
->predictable
;
8957 return branch_costs
->unpredictable
;
8960 /* Return true if the RTX X in mode MODE is a zero or sign extract
8961 usable in an ADD or SUB (extended register) instruction. */
8963 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
8965 /* Catch add with a sign extract.
8966 This is add_<optab><mode>_multp2. */
8967 if (GET_CODE (x
) == SIGN_EXTRACT
8968 || GET_CODE (x
) == ZERO_EXTRACT
)
8970 rtx op0
= XEXP (x
, 0);
8971 rtx op1
= XEXP (x
, 1);
8972 rtx op2
= XEXP (x
, 2);
8974 if (GET_CODE (op0
) == MULT
8975 && CONST_INT_P (op1
)
8976 && op2
== const0_rtx
8977 && CONST_INT_P (XEXP (op0
, 1))
8978 && aarch64_is_extend_from_extract (mode
,
8985 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8987 else if (GET_CODE (x
) == SIGN_EXTEND
8988 || GET_CODE (x
) == ZERO_EXTEND
)
8989 return REG_P (XEXP (x
, 0));
8995 aarch64_frint_unspec_p (unsigned int u
)
9013 /* Return true iff X is an rtx that will match an extr instruction
9014 i.e. as described in the *extr<mode>5_insn family of patterns.
9015 OP0 and OP1 will be set to the operands of the shifts involved
9016 on success and will be NULL_RTX otherwise. */
9019 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
9022 scalar_int_mode mode
;
9023 if (!is_a
<scalar_int_mode
> (GET_MODE (x
), &mode
))
9026 *res_op0
= NULL_RTX
;
9027 *res_op1
= NULL_RTX
;
9029 if (GET_CODE (x
) != IOR
)
9035 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
9036 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
9038 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9039 if (GET_CODE (op1
) == ASHIFT
)
9040 std::swap (op0
, op1
);
9042 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
9045 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
9046 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
9048 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
9049 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
9051 *res_op0
= XEXP (op0
, 0);
9052 *res_op1
= XEXP (op1
, 0);
9060 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9061 storing it in *COST. Result is true if the total cost of the operation
9062 has now been calculated. */
9064 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
9068 enum rtx_code cmpcode
;
9070 if (COMPARISON_P (op0
))
9072 inner
= XEXP (op0
, 0);
9073 comparator
= XEXP (op0
, 1);
9074 cmpcode
= GET_CODE (op0
);
9079 comparator
= const0_rtx
;
9083 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
9085 /* Conditional branch. */
9086 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
9090 if (cmpcode
== NE
|| cmpcode
== EQ
)
9092 if (comparator
== const0_rtx
)
9094 /* TBZ/TBNZ/CBZ/CBNZ. */
9095 if (GET_CODE (inner
) == ZERO_EXTRACT
)
9097 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
9098 ZERO_EXTRACT
, 0, speed
);
9101 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
9106 else if (cmpcode
== LT
|| cmpcode
== GE
)
9109 if (comparator
== const0_rtx
)
9114 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
9117 if (GET_CODE (op1
) == COMPARE
)
9119 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9120 if (XEXP (op1
, 1) == const0_rtx
)
9124 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
9125 const struct cpu_cost_table
*extra_cost
9126 = aarch64_tune_params
.insn_extra_cost
;
9128 if (GET_MODE_CLASS (mode
) == MODE_INT
)
9129 *cost
+= extra_cost
->alu
.arith
;
9131 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
9136 /* It's a conditional operation based on the status flags,
9137 so it must be some flavor of CSEL. */
9139 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9140 if (GET_CODE (op1
) == NEG
9141 || GET_CODE (op1
) == NOT
9142 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
9143 op1
= XEXP (op1
, 0);
9144 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
9146 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9147 op1
= XEXP (op1
, 0);
9148 op2
= XEXP (op2
, 0);
9151 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
9152 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
9156 /* We don't know what this is, cost all operands. */
9160 /* Check whether X is a bitfield operation of the form shift + extend that
9161 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9162 operand to which the bitfield operation is applied. Otherwise return
9166 aarch64_extend_bitfield_pattern_p (rtx x
)
9168 rtx_code outer_code
= GET_CODE (x
);
9169 machine_mode outer_mode
= GET_MODE (x
);
9171 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
9172 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
9175 rtx inner
= XEXP (x
, 0);
9176 rtx_code inner_code
= GET_CODE (inner
);
9177 machine_mode inner_mode
= GET_MODE (inner
);
9183 if (CONST_INT_P (XEXP (inner
, 1))
9184 && (inner_mode
== QImode
|| inner_mode
== HImode
))
9185 op
= XEXP (inner
, 0);
9188 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
9189 && (inner_mode
== QImode
|| inner_mode
== HImode
))
9190 op
= XEXP (inner
, 0);
9193 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
9194 && (inner_mode
== QImode
|| inner_mode
== HImode
))
9195 op
= XEXP (inner
, 0);
9204 /* Return true if the mask and a shift amount from an RTX of the form
9205 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9206 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
9209 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode
, rtx mask
,
9212 return CONST_INT_P (mask
) && CONST_INT_P (shft_amnt
)
9213 && INTVAL (shft_amnt
) < GET_MODE_BITSIZE (mode
)
9214 && exact_log2 ((INTVAL (mask
) >> INTVAL (shft_amnt
)) + 1) >= 0
9216 & ((HOST_WIDE_INT_1U
<< INTVAL (shft_amnt
)) - 1)) == 0;
9219 /* Calculate the cost of calculating X, storing it in *COST. Result
9220 is true if the total cost of the operation has now been calculated. */
9222 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
9223 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
9226 const struct cpu_cost_table
*extra_cost
9227 = aarch64_tune_params
.insn_extra_cost
;
9228 int code
= GET_CODE (x
);
9229 scalar_int_mode int_mode
;
9231 /* By default, assume that everything has equivalent cost to the
9232 cheapest instruction. Any additional costs are applied as a delta
9233 above this default. */
9234 *cost
= COSTS_N_INSNS (1);
9239 /* The cost depends entirely on the operands to SET. */
9244 switch (GET_CODE (op0
))
9249 rtx address
= XEXP (op0
, 0);
9250 if (VECTOR_MODE_P (mode
))
9251 *cost
+= extra_cost
->ldst
.storev
;
9252 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9253 *cost
+= extra_cost
->ldst
.store
;
9254 else if (mode
== SFmode
)
9255 *cost
+= extra_cost
->ldst
.storef
;
9256 else if (mode
== DFmode
)
9257 *cost
+= extra_cost
->ldst
.stored
;
9260 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
9264 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
9268 if (! REG_P (SUBREG_REG (op0
)))
9269 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
9273 /* The cost is one per vector-register copied. */
9274 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
9276 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
9277 *cost
= COSTS_N_INSNS (nregs
);
9279 /* const0_rtx is in general free, but we will use an
9280 instruction to set a register to 0. */
9281 else if (REG_P (op1
) || op1
== const0_rtx
)
9283 /* The cost is 1 per register copied. */
9284 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
9285 *cost
= COSTS_N_INSNS (nregs
);
9288 /* Cost is just the cost of the RHS of the set. */
9289 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
9294 /* Bit-field insertion. Strip any redundant widening of
9295 the RHS to meet the width of the target. */
9296 if (GET_CODE (op1
) == SUBREG
)
9297 op1
= SUBREG_REG (op1
);
9298 if ((GET_CODE (op1
) == ZERO_EXTEND
9299 || GET_CODE (op1
) == SIGN_EXTEND
)
9300 && CONST_INT_P (XEXP (op0
, 1))
9301 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
9302 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
9303 op1
= XEXP (op1
, 0);
9305 if (CONST_INT_P (op1
))
9307 /* MOV immediate is assumed to always be cheap. */
9308 *cost
= COSTS_N_INSNS (1);
9314 *cost
+= extra_cost
->alu
.bfi
;
9315 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
9321 /* We can't make sense of this, assume default cost. */
9322 *cost
= COSTS_N_INSNS (1);
9328 /* If an instruction can incorporate a constant within the
9329 instruction, the instruction's expression avoids calling
9330 rtx_cost() on the constant. If rtx_cost() is called on a
9331 constant, then it is usually because the constant must be
9332 moved into a register by one or more instructions.
9334 The exception is constant 0, which can be expressed
9335 as XZR/WZR and is therefore free. The exception to this is
9336 if we have (set (reg) (const0_rtx)) in which case we must cost
9337 the move. However, we can catch that when we cost the SET, so
9338 we don't need to consider that here. */
9339 if (x
== const0_rtx
)
9343 /* To an approximation, building any other constant is
9344 proportionally expensive to the number of instructions
9345 required to build that constant. This is true whether we
9346 are compiling for SPEED or otherwise. */
9347 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
9348 int_mode
= word_mode
;
9349 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
9350 (NULL_RTX
, x
, false, int_mode
));
9356 /* First determine number of instructions to do the move
9357 as an integer constant. */
9358 if (!aarch64_float_const_representable_p (x
)
9359 && !aarch64_can_const_movi_rtx_p (x
, mode
)
9360 && aarch64_float_const_rtx_p (x
))
9362 unsigned HOST_WIDE_INT ival
;
9363 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
9364 gcc_assert (succeed
);
9366 scalar_int_mode imode
= (mode
== HFmode
9368 : int_mode_for_mode (mode
).require ());
9369 int ncost
= aarch64_internal_mov_immediate
9370 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
9371 *cost
+= COSTS_N_INSNS (ncost
);
9377 /* mov[df,sf]_aarch64. */
9378 if (aarch64_float_const_representable_p (x
))
9379 /* FMOV (scalar immediate). */
9380 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
9381 else if (!aarch64_float_const_zero_rtx_p (x
))
9383 /* This will be a load from memory. */
9385 *cost
+= extra_cost
->ldst
.loadd
;
9387 *cost
+= extra_cost
->ldst
.loadf
;
9390 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9391 or MOV v0.s[0], wzr - neither of which are modeled by the
9392 cost tables. Just use the default cost. */
9402 /* For loads we want the base cost of a load, plus an
9403 approximation for the additional cost of the addressing
9405 rtx address
= XEXP (x
, 0);
9406 if (VECTOR_MODE_P (mode
))
9407 *cost
+= extra_cost
->ldst
.loadv
;
9408 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9409 *cost
+= extra_cost
->ldst
.load
;
9410 else if (mode
== SFmode
)
9411 *cost
+= extra_cost
->ldst
.loadf
;
9412 else if (mode
== DFmode
)
9413 *cost
+= extra_cost
->ldst
.loadd
;
9416 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
9425 if (VECTOR_MODE_P (mode
))
9430 *cost
+= extra_cost
->vect
.alu
;
9435 if (GET_MODE_CLASS (mode
) == MODE_INT
)
9437 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
9438 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
9441 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
9445 /* Cost this as SUB wzr, X. */
9446 op0
= CONST0_RTX (mode
);
9451 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
9453 /* Support (neg(fma...)) as a single instruction only if
9454 sign of zeros is unimportant. This matches the decision
9455 making in aarch64.md. */
9456 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
9459 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
9462 if (GET_CODE (op0
) == MULT
)
9465 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
9470 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
9480 if (VECTOR_MODE_P (mode
))
9481 *cost
+= extra_cost
->vect
.alu
;
9483 *cost
+= extra_cost
->alu
.clz
;
9492 if (op1
== const0_rtx
9493 && GET_CODE (op0
) == AND
)
9496 mode
= GET_MODE (op0
);
9500 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
9502 /* TODO: A write to the CC flags possibly costs extra, this
9503 needs encoding in the cost tables. */
9505 mode
= GET_MODE (op0
);
9507 if (GET_CODE (op0
) == AND
)
9513 if (GET_CODE (op0
) == PLUS
)
9515 /* ADDS (and CMN alias). */
9520 if (GET_CODE (op0
) == MINUS
)
9527 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
9528 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
9529 && CONST_INT_P (XEXP (op0
, 2)))
9531 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9532 Handle it here directly rather than going to cost_logic
9533 since we know the immediate generated for the TST is valid
9534 so we can avoid creating an intermediate rtx for it only
9535 for costing purposes. */
9537 *cost
+= extra_cost
->alu
.logical
;
9539 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
9540 ZERO_EXTRACT
, 0, speed
);
9544 if (GET_CODE (op1
) == NEG
)
9548 *cost
+= extra_cost
->alu
.arith
;
9550 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
9551 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
9557 Compare can freely swap the order of operands, and
9558 canonicalization puts the more complex operation first.
9559 But the integer MINUS logic expects the shift/extend
9560 operation in op1. */
9562 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
9570 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
9574 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
9576 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
9578 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
9579 /* FCMP supports constant 0.0 for no extra cost. */
9585 if (VECTOR_MODE_P (mode
))
9587 /* Vector compare. */
9589 *cost
+= extra_cost
->vect
.alu
;
9591 if (aarch64_float_const_zero_rtx_p (op1
))
9593 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9607 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
9609 /* Detect valid immediates. */
9610 if ((GET_MODE_CLASS (mode
) == MODE_INT
9611 || (GET_MODE_CLASS (mode
) == MODE_CC
9612 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
9613 && CONST_INT_P (op1
)
9614 && aarch64_uimm12_shift (INTVAL (op1
)))
9617 /* SUB(S) (immediate). */
9618 *cost
+= extra_cost
->alu
.arith
;
9622 /* Look for SUB (extended register). */
9623 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
9624 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
9627 *cost
+= extra_cost
->alu
.extend_arith
;
9629 op1
= aarch64_strip_extend (op1
, true);
9630 *cost
+= rtx_cost (op1
, VOIDmode
,
9631 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
9635 rtx new_op1
= aarch64_strip_extend (op1
, false);
9637 /* Cost this as an FMA-alike operation. */
9638 if ((GET_CODE (new_op1
) == MULT
9639 || aarch64_shift_p (GET_CODE (new_op1
)))
9642 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
9643 (enum rtx_code
) code
,
9648 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
9652 if (VECTOR_MODE_P (mode
))
9655 *cost
+= extra_cost
->vect
.alu
;
9657 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9660 *cost
+= extra_cost
->alu
.arith
;
9662 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
9665 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9679 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
9680 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
9683 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
9684 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
9688 if (GET_MODE_CLASS (mode
) == MODE_INT
9689 && ((CONST_INT_P (op1
) && aarch64_uimm12_shift (INTVAL (op1
)))
9690 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
9692 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
9695 /* ADD (immediate). */
9696 *cost
+= extra_cost
->alu
.arith
;
9700 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
9702 /* Look for ADD (extended register). */
9703 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
9704 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
9707 *cost
+= extra_cost
->alu
.extend_arith
;
9709 op0
= aarch64_strip_extend (op0
, true);
9710 *cost
+= rtx_cost (op0
, VOIDmode
,
9711 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
9715 /* Strip any extend, leave shifts behind as we will
9716 cost them through mult_cost. */
9717 new_op0
= aarch64_strip_extend (op0
, false);
9719 if (GET_CODE (new_op0
) == MULT
9720 || aarch64_shift_p (GET_CODE (new_op0
)))
9722 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
9727 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
9731 if (VECTOR_MODE_P (mode
))
9734 *cost
+= extra_cost
->vect
.alu
;
9736 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9739 *cost
+= extra_cost
->alu
.arith
;
9741 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
9744 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9751 *cost
= COSTS_N_INSNS (1);
9755 if (VECTOR_MODE_P (mode
))
9756 *cost
+= extra_cost
->vect
.alu
;
9758 *cost
+= extra_cost
->alu
.rev
;
9763 if (aarch_rev16_p (x
))
9765 *cost
= COSTS_N_INSNS (1);
9769 if (VECTOR_MODE_P (mode
))
9770 *cost
+= extra_cost
->vect
.alu
;
9772 *cost
+= extra_cost
->alu
.rev
;
9777 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
9779 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
9780 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
9782 *cost
+= extra_cost
->alu
.shift
;
9793 if (VECTOR_MODE_P (mode
))
9796 *cost
+= extra_cost
->vect
.alu
;
9801 && GET_CODE (op0
) == MULT
9802 && CONST_INT_P (XEXP (op0
, 1))
9803 && CONST_INT_P (op1
)
9804 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
9807 /* This is a UBFM/SBFM. */
9808 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
9810 *cost
+= extra_cost
->alu
.bfx
;
9814 if (is_int_mode (mode
, &int_mode
))
9816 if (CONST_INT_P (op1
))
9818 /* We have a mask + shift version of a UBFIZ
9819 i.e. the *andim_ashift<mode>_bfiz pattern. */
9820 if (GET_CODE (op0
) == ASHIFT
9821 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
9824 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
9825 (enum rtx_code
) code
, 0, speed
);
9827 *cost
+= extra_cost
->alu
.bfx
;
9831 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
9833 /* We possibly get the immediate for free, this is not
9835 *cost
+= rtx_cost (op0
, int_mode
,
9836 (enum rtx_code
) code
, 0, speed
);
9838 *cost
+= extra_cost
->alu
.logical
;
9847 /* Handle ORN, EON, or BIC. */
9848 if (GET_CODE (op0
) == NOT
)
9849 op0
= XEXP (op0
, 0);
9851 new_op0
= aarch64_strip_shift (op0
);
9853 /* If we had a shift on op0 then this is a logical-shift-
9854 by-register/immediate operation. Otherwise, this is just
9855 a logical operation. */
9860 /* Shift by immediate. */
9861 if (CONST_INT_P (XEXP (op0
, 1)))
9862 *cost
+= extra_cost
->alu
.log_shift
;
9864 *cost
+= extra_cost
->alu
.log_shift_reg
;
9867 *cost
+= extra_cost
->alu
.logical
;
9870 /* In both cases we want to cost both operands. */
9871 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
9873 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
9883 op0
= aarch64_strip_shift (x
);
9885 if (VECTOR_MODE_P (mode
))
9888 *cost
+= extra_cost
->vect
.alu
;
9892 /* MVN-shifted-reg. */
9895 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
9898 *cost
+= extra_cost
->alu
.log_shift
;
9902 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9903 Handle the second form here taking care that 'a' in the above can
9905 else if (GET_CODE (op0
) == XOR
)
9907 rtx newop0
= XEXP (op0
, 0);
9908 rtx newop1
= XEXP (op0
, 1);
9909 rtx op0_stripped
= aarch64_strip_shift (newop0
);
9911 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
9912 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
9916 if (op0_stripped
!= newop0
)
9917 *cost
+= extra_cost
->alu
.log_shift
;
9919 *cost
+= extra_cost
->alu
.logical
;
9926 *cost
+= extra_cost
->alu
.logical
;
9933 /* If a value is written in SI mode, then zero extended to DI
9934 mode, the operation will in general be free as a write to
9935 a 'w' register implicitly zeroes the upper bits of an 'x'
9936 register. However, if this is
9938 (set (reg) (zero_extend (reg)))
9940 we must cost the explicit register move. */
9942 && GET_MODE (op0
) == SImode
9945 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
9947 /* If OP_COST is non-zero, then the cost of the zero extend
9948 is effectively the cost of the inner operation. Otherwise
9949 we have a MOV instruction and we take the cost from the MOV
9950 itself. This is true independently of whether we are
9951 optimizing for space or time. */
9957 else if (MEM_P (op0
))
9959 /* All loads can zero extend to any size for free. */
9960 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
9964 op0
= aarch64_extend_bitfield_pattern_p (x
);
9967 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
9969 *cost
+= extra_cost
->alu
.bfx
;
9975 if (VECTOR_MODE_P (mode
))
9978 *cost
+= extra_cost
->vect
.alu
;
9982 /* We generate an AND instead of UXTB/UXTH. */
9983 *cost
+= extra_cost
->alu
.logical
;
9989 if (MEM_P (XEXP (x
, 0)))
9994 rtx address
= XEXP (XEXP (x
, 0), 0);
9995 *cost
+= extra_cost
->ldst
.load_sign_extend
;
9998 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10004 op0
= aarch64_extend_bitfield_pattern_p (x
);
10007 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
10009 *cost
+= extra_cost
->alu
.bfx
;
10015 if (VECTOR_MODE_P (mode
))
10016 *cost
+= extra_cost
->vect
.alu
;
10018 *cost
+= extra_cost
->alu
.extend
;
10026 if (CONST_INT_P (op1
))
10030 if (VECTOR_MODE_P (mode
))
10032 /* Vector shift (immediate). */
10033 *cost
+= extra_cost
->vect
.alu
;
10037 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
10039 *cost
+= extra_cost
->alu
.shift
;
10043 /* We can incorporate zero/sign extend for free. */
10044 if (GET_CODE (op0
) == ZERO_EXTEND
10045 || GET_CODE (op0
) == SIGN_EXTEND
)
10046 op0
= XEXP (op0
, 0);
10048 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
10053 if (VECTOR_MODE_P (mode
))
10056 /* Vector shift (register). */
10057 *cost
+= extra_cost
->vect
.alu
;
10063 *cost
+= extra_cost
->alu
.shift_reg
;
10065 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
10066 && CONST_INT_P (XEXP (op1
, 1))
10067 && known_eq (INTVAL (XEXP (op1
, 1)),
10068 GET_MODE_BITSIZE (mode
) - 1))
10070 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
10071 /* We already demanded XEXP (op1, 0) to be REG_P, so
10072 don't recurse into it. */
10076 return false; /* All arguments need to be in registers. */
10086 if (CONST_INT_P (op1
))
10088 /* ASR (immediate) and friends. */
10091 if (VECTOR_MODE_P (mode
))
10092 *cost
+= extra_cost
->vect
.alu
;
10094 *cost
+= extra_cost
->alu
.shift
;
10097 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
10102 if (VECTOR_MODE_P (mode
))
10105 /* Vector shift (register). */
10106 *cost
+= extra_cost
->vect
.alu
;
10111 /* ASR (register) and friends. */
10112 *cost
+= extra_cost
->alu
.shift_reg
;
10114 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
10115 && CONST_INT_P (XEXP (op1
, 1))
10116 && known_eq (INTVAL (XEXP (op1
, 1)),
10117 GET_MODE_BITSIZE (mode
) - 1))
10119 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
10120 /* We already demanded XEXP (op1, 0) to be REG_P, so
10121 don't recurse into it. */
10125 return false; /* All arguments need to be in registers. */
10130 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
10131 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
10135 *cost
+= extra_cost
->ldst
.load
;
10137 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
10138 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
10140 /* ADRP, followed by ADD. */
10141 *cost
+= COSTS_N_INSNS (1);
10143 *cost
+= 2 * extra_cost
->alu
.arith
;
10145 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
10146 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
10150 *cost
+= extra_cost
->alu
.arith
;
10155 /* One extra load instruction, after accessing the GOT. */
10156 *cost
+= COSTS_N_INSNS (1);
10158 *cost
+= extra_cost
->ldst
.load
;
10164 /* ADRP/ADD (immediate). */
10166 *cost
+= extra_cost
->alu
.arith
;
10174 if (VECTOR_MODE_P (mode
))
10175 *cost
+= extra_cost
->vect
.alu
;
10177 *cost
+= extra_cost
->alu
.bfx
;
10180 /* We can trust that the immediates used will be correct (there
10181 are no by-register forms), so we need only cost op0. */
10182 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
10186 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
10187 /* aarch64_rtx_mult_cost always handles recursion to its
10192 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10193 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
10194 an unconditional negate. This case should only ever be reached through
10195 the set_smod_pow2_cheap check in expmed.c. */
10196 if (CONST_INT_P (XEXP (x
, 1))
10197 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
10198 && (mode
== SImode
|| mode
== DImode
))
10200 /* We expand to 4 instructions. Reset the baseline. */
10201 *cost
= COSTS_N_INSNS (4);
10204 *cost
+= 2 * extra_cost
->alu
.logical
10205 + 2 * extra_cost
->alu
.arith
;
10210 /* Fall-through. */
10214 /* Slighly prefer UMOD over SMOD. */
10215 if (VECTOR_MODE_P (mode
))
10216 *cost
+= extra_cost
->vect
.alu
;
10217 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10218 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
10219 + extra_cost
->mult
[mode
== DImode
].idiv
10220 + (code
== MOD
? 1 : 0));
10222 return false; /* All arguments need to be in registers. */
10229 if (VECTOR_MODE_P (mode
))
10230 *cost
+= extra_cost
->vect
.alu
;
10231 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10232 /* There is no integer SQRT, so only DIV and UDIV can get
10234 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
10235 /* Slighly prefer UDIV over SDIV. */
10236 + (code
== DIV
? 1 : 0));
10238 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
10240 return false; /* All arguments need to be in registers. */
10243 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
10244 XEXP (x
, 2), cost
, speed
);
10257 return false; /* All arguments must be in registers. */
10266 if (VECTOR_MODE_P (mode
))
10267 *cost
+= extra_cost
->vect
.alu
;
10269 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
10272 /* FMSUB, FNMADD, and FNMSUB are free. */
10273 if (GET_CODE (op0
) == NEG
)
10274 op0
= XEXP (op0
, 0);
10276 if (GET_CODE (op2
) == NEG
)
10277 op2
= XEXP (op2
, 0);
10279 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10280 and the by-element operand as operand 0. */
10281 if (GET_CODE (op1
) == NEG
)
10282 op1
= XEXP (op1
, 0);
10284 /* Catch vector-by-element operations. The by-element operand can
10285 either be (vec_duplicate (vec_select (x))) or just
10286 (vec_select (x)), depending on whether we are multiplying by
10287 a vector or a scalar.
10289 Canonicalization is not very good in these cases, FMA4 will put the
10290 by-element operand as operand 0, FNMA4 will have it as operand 1. */
10291 if (GET_CODE (op0
) == VEC_DUPLICATE
)
10292 op0
= XEXP (op0
, 0);
10293 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
10294 op1
= XEXP (op1
, 0);
10296 if (GET_CODE (op0
) == VEC_SELECT
)
10297 op0
= XEXP (op0
, 0);
10298 else if (GET_CODE (op1
) == VEC_SELECT
)
10299 op1
= XEXP (op1
, 0);
10301 /* If the remaining parameters are not registers,
10302 get the cost to put them into registers. */
10303 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
10304 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
10305 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
10309 case UNSIGNED_FLOAT
:
10311 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
10317 if (VECTOR_MODE_P (mode
))
10319 /*Vector truncate. */
10320 *cost
+= extra_cost
->vect
.alu
;
10323 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
10327 case FLOAT_TRUNCATE
:
10330 if (VECTOR_MODE_P (mode
))
10332 /*Vector conversion. */
10333 *cost
+= extra_cost
->vect
.alu
;
10336 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
10343 /* Strip the rounding part. They will all be implemented
10344 by the fcvt* family of instructions anyway. */
10345 if (GET_CODE (x
) == UNSPEC
)
10347 unsigned int uns_code
= XINT (x
, 1);
10349 if (uns_code
== UNSPEC_FRINTA
10350 || uns_code
== UNSPEC_FRINTM
10351 || uns_code
== UNSPEC_FRINTN
10352 || uns_code
== UNSPEC_FRINTP
10353 || uns_code
== UNSPEC_FRINTZ
)
10354 x
= XVECEXP (x
, 0, 0);
10359 if (VECTOR_MODE_P (mode
))
10360 *cost
+= extra_cost
->vect
.alu
;
10362 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
10365 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10366 fixed-point fcvt. */
10367 if (GET_CODE (x
) == MULT
10368 && ((VECTOR_MODE_P (mode
)
10369 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
10370 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
10372 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
10377 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
10381 if (VECTOR_MODE_P (mode
))
10383 /* ABS (vector). */
10385 *cost
+= extra_cost
->vect
.alu
;
10387 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10391 /* FABD, which is analogous to FADD. */
10392 if (GET_CODE (op0
) == MINUS
)
10394 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
10395 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
10397 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10401 /* Simple FABS is analogous to FNEG. */
10403 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
10407 /* Integer ABS will either be split to
10408 two arithmetic instructions, or will be an ABS
10409 (scalar), which we don't model. */
10410 *cost
= COSTS_N_INSNS (2);
10412 *cost
+= 2 * extra_cost
->alu
.arith
;
10420 if (VECTOR_MODE_P (mode
))
10421 *cost
+= extra_cost
->vect
.alu
;
10424 /* FMAXNM/FMINNM/FMAX/FMIN.
10425 TODO: This may not be accurate for all implementations, but
10426 we do not model this in the cost tables. */
10427 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10433 /* The floating point round to integer frint* instructions. */
10434 if (aarch64_frint_unspec_p (XINT (x
, 1)))
10437 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
10442 if (XINT (x
, 1) == UNSPEC_RBIT
)
10445 *cost
+= extra_cost
->alu
.rev
;
10453 /* Decompose <su>muldi3_highpart. */
10454 if (/* (truncate:DI */
10457 && GET_MODE (XEXP (x
, 0)) == TImode
10458 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
10460 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
10461 /* (ANY_EXTEND:TI (reg:DI))
10462 (ANY_EXTEND:TI (reg:DI))) */
10463 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
10464 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
10465 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
10466 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
10467 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
10468 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
10469 /* (const_int 64) */
10470 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
10471 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
10475 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
10476 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
10477 mode
, MULT
, 0, speed
);
10478 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
10479 mode
, MULT
, 1, speed
);
10483 /* Fall through. */
10489 && flag_aarch64_verbose_cost
)
10490 fprintf (dump_file
,
10491 "\nFailed to cost RTX. Assuming default cost.\n");
10496 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10497 calculated for X. This cost is stored in *COST. Returns true
10498 if the total cost of X was calculated. */
10500 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
10501 int param
, int *cost
, bool speed
)
10503 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
10506 && flag_aarch64_verbose_cost
)
10508 print_rtl_single (dump_file
, x
);
10509 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
10510 speed
? "Hot" : "Cold",
10511 *cost
, result
? "final" : "partial");
10518 aarch64_register_move_cost (machine_mode mode
,
10519 reg_class_t from_i
, reg_class_t to_i
)
10521 enum reg_class from
= (enum reg_class
) from_i
;
10522 enum reg_class to
= (enum reg_class
) to_i
;
10523 const struct cpu_regmove_cost
*regmove_cost
10524 = aarch64_tune_params
.regmove_cost
;
10526 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10527 if (to
== TAILCALL_ADDR_REGS
|| to
== POINTER_REGS
)
10530 if (from
== TAILCALL_ADDR_REGS
|| from
== POINTER_REGS
)
10531 from
= GENERAL_REGS
;
10533 /* Moving between GPR and stack cost is the same as GP2GP. */
10534 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
10535 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
10536 return regmove_cost
->GP2GP
;
10538 /* To/From the stack register, we move via the gprs. */
10539 if (to
== STACK_REG
|| from
== STACK_REG
)
10540 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
10541 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
10543 if (known_eq (GET_MODE_SIZE (mode
), 16))
10545 /* 128-bit operations on general registers require 2 instructions. */
10546 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
10547 return regmove_cost
->GP2GP
* 2;
10548 else if (from
== GENERAL_REGS
)
10549 return regmove_cost
->GP2FP
* 2;
10550 else if (to
== GENERAL_REGS
)
10551 return regmove_cost
->FP2GP
* 2;
10553 /* When AdvSIMD instructions are disabled it is not possible to move
10554 a 128-bit value directly between Q registers. This is handled in
10555 secondary reload. A general register is used as a scratch to move
10556 the upper DI value and the lower DI value is moved directly,
10557 hence the cost is the sum of three moves. */
10559 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
10561 return regmove_cost
->FP2FP
;
10564 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
10565 return regmove_cost
->GP2GP
;
10566 else if (from
== GENERAL_REGS
)
10567 return regmove_cost
->GP2FP
;
10568 else if (to
== GENERAL_REGS
)
10569 return regmove_cost
->FP2GP
;
10571 return regmove_cost
->FP2FP
;
10575 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
10576 reg_class_t rclass ATTRIBUTE_UNUSED
,
10577 bool in ATTRIBUTE_UNUSED
)
10579 return aarch64_tune_params
.memmov_cost
;
10582 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10583 to optimize 1.0/sqrt. */
10586 use_rsqrt_p (machine_mode mode
)
10588 return (!flag_trapping_math
10589 && flag_unsafe_math_optimizations
10590 && ((aarch64_tune_params
.approx_modes
->recip_sqrt
10591 & AARCH64_APPROX_MODE (mode
))
10592 || flag_mrecip_low_precision_sqrt
));
10595 /* Function to decide when to use the approximate reciprocal square root
10599 aarch64_builtin_reciprocal (tree fndecl
)
10601 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
10603 if (!use_rsqrt_p (mode
))
10605 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl
));
10608 /* Emit instruction sequence to compute either the approximate square root
10609 or its approximate reciprocal, depending on the flag RECP, and return
10610 whether the sequence was emitted or not. */
10613 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
10615 machine_mode mode
= GET_MODE (dst
);
10617 if (GET_MODE_INNER (mode
) == HFmode
)
10619 gcc_assert (!recp
);
10625 if (!(flag_mlow_precision_sqrt
10626 || (aarch64_tune_params
.approx_modes
->sqrt
10627 & AARCH64_APPROX_MODE (mode
))))
10630 if (flag_finite_math_only
10631 || flag_trapping_math
10632 || !flag_unsafe_math_optimizations
10633 || optimize_function_for_size_p (cfun
))
10637 /* Caller assumes we cannot fail. */
10638 gcc_assert (use_rsqrt_p (mode
));
10640 machine_mode mmsk
= mode_for_int_vector (mode
).require ();
10641 rtx xmsk
= gen_reg_rtx (mmsk
);
10643 /* When calculating the approximate square root, compare the
10644 argument with 0.0 and create a mask. */
10645 emit_insn (gen_rtx_SET (xmsk
,
10647 gen_rtx_EQ (mmsk
, src
,
10648 CONST0_RTX (mode
)))));
10650 /* Estimate the approximate reciprocal square root. */
10651 rtx xdst
= gen_reg_rtx (mode
);
10652 emit_insn (gen_aarch64_rsqrte (mode
, xdst
, src
));
10654 /* Iterate over the series twice for SF and thrice for DF. */
10655 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
10657 /* Optionally iterate over the series once less for faster performance
10658 while sacrificing the accuracy. */
10659 if ((recp
&& flag_mrecip_low_precision_sqrt
)
10660 || (!recp
&& flag_mlow_precision_sqrt
))
10663 /* Iterate over the series to calculate the approximate reciprocal square
10665 rtx x1
= gen_reg_rtx (mode
);
10666 while (iterations
--)
10668 rtx x2
= gen_reg_rtx (mode
);
10669 emit_set_insn (x2
, gen_rtx_MULT (mode
, xdst
, xdst
));
10671 emit_insn (gen_aarch64_rsqrts (mode
, x1
, src
, x2
));
10673 if (iterations
> 0)
10674 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, x1
));
10679 /* Qualify the approximate reciprocal square root when the argument is
10680 0.0 by squashing the intermediary result to 0.0. */
10681 rtx xtmp
= gen_reg_rtx (mmsk
);
10682 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
10683 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
10684 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
10686 /* Calculate the approximate square root. */
10687 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, src
));
10690 /* Finalize the approximation. */
10691 emit_set_insn (dst
, gen_rtx_MULT (mode
, xdst
, x1
));
10696 /* Emit the instruction sequence to compute the approximation for the division
10697 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10700 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
10702 machine_mode mode
= GET_MODE (quo
);
10704 if (GET_MODE_INNER (mode
) == HFmode
)
10707 bool use_approx_division_p
= (flag_mlow_precision_div
10708 || (aarch64_tune_params
.approx_modes
->division
10709 & AARCH64_APPROX_MODE (mode
)));
10711 if (!flag_finite_math_only
10712 || flag_trapping_math
10713 || !flag_unsafe_math_optimizations
10714 || optimize_function_for_size_p (cfun
)
10715 || !use_approx_division_p
)
10718 if (!TARGET_SIMD
&& VECTOR_MODE_P (mode
))
10721 /* Estimate the approximate reciprocal. */
10722 rtx xrcp
= gen_reg_rtx (mode
);
10723 emit_insn (gen_aarch64_frecpe (mode
, xrcp
, den
));
10725 /* Iterate over the series twice for SF and thrice for DF. */
10726 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
10728 /* Optionally iterate over the series once less for faster performance,
10729 while sacrificing the accuracy. */
10730 if (flag_mlow_precision_div
)
10733 /* Iterate over the series to calculate the approximate reciprocal. */
10734 rtx xtmp
= gen_reg_rtx (mode
);
10735 while (iterations
--)
10737 emit_insn (gen_aarch64_frecps (mode
, xtmp
, xrcp
, den
));
10739 if (iterations
> 0)
10740 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
10743 if (num
!= CONST1_RTX (mode
))
10745 /* As the approximate reciprocal of DEN is already calculated, only
10746 calculate the approximate division when NUM is not 1.0. */
10747 rtx xnum
= force_reg (mode
, num
);
10748 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xnum
));
10751 /* Finalize the approximation. */
10752 emit_set_insn (quo
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
10756 /* Return the number of instructions that can be issued per cycle. */
10758 aarch64_sched_issue_rate (void)
10760 return aarch64_tune_params
.issue_rate
;
10764 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10766 int issue_rate
= aarch64_sched_issue_rate ();
10768 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
10772 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10773 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10774 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10777 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
10780 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
10784 /* Vectorizer cost model target hooks. */
10786 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10788 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
10790 int misalign ATTRIBUTE_UNUSED
)
10793 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
10796 if (vectype
!= NULL
)
10797 fp
= FLOAT_TYPE_P (vectype
);
10799 switch (type_of_cost
)
10802 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
10805 return costs
->scalar_load_cost
;
10808 return costs
->scalar_store_cost
;
10811 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
10814 return costs
->vec_align_load_cost
;
10817 return costs
->vec_store_cost
;
10819 case vec_to_scalar
:
10820 return costs
->vec_to_scalar_cost
;
10822 case scalar_to_vec
:
10823 return costs
->scalar_to_vec_cost
;
10825 case unaligned_load
:
10826 case vector_gather_load
:
10827 return costs
->vec_unalign_load_cost
;
10829 case unaligned_store
:
10830 case vector_scatter_store
:
10831 return costs
->vec_unalign_store_cost
;
10833 case cond_branch_taken
:
10834 return costs
->cond_taken_branch_cost
;
10836 case cond_branch_not_taken
:
10837 return costs
->cond_not_taken_branch_cost
;
10840 return costs
->vec_permute_cost
;
10842 case vec_promote_demote
:
10843 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
10845 case vec_construct
:
10846 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
10847 return elements
/ 2 + 1;
10850 gcc_unreachable ();
10854 /* Implement targetm.vectorize.add_stmt_cost. */
10856 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
10857 struct _stmt_vec_info
*stmt_info
, int misalign
,
10858 enum vect_cost_model_location where
)
10860 unsigned *cost
= (unsigned *) data
;
10861 unsigned retval
= 0;
10863 if (flag_vect_cost_model
)
10865 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
10867 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
10869 /* Statements in an inner loop relative to the loop being
10870 vectorized are weighted more heavily. The value here is
10871 arbitrary and could potentially be improved with analysis. */
10872 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
10873 count
*= 50; /* FIXME */
10875 retval
= (unsigned) (count
* stmt_cost
);
10876 cost
[where
] += retval
;
10882 static void initialize_aarch64_code_model (struct gcc_options
*);
10884 /* Parse the TO_PARSE string and put the architecture struct that it
10885 selects into RES and the architectural features into ISA_FLAGS.
10886 Return an aarch64_parse_opt_result describing the parse result.
10887 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
10888 When the TO_PARSE string contains an invalid extension,
10889 a copy of the string is created and stored to INVALID_EXTENSION. */
10891 static enum aarch64_parse_opt_result
10892 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
10893 unsigned long *isa_flags
, std::string
*invalid_extension
)
10896 const struct processor
*arch
;
10899 ext
= strchr (to_parse
, '+');
10902 len
= ext
- to_parse
;
10904 len
= strlen (to_parse
);
10907 return AARCH64_PARSE_MISSING_ARG
;
10910 /* Loop through the list of supported ARCHes to find a match. */
10911 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
10913 if (strlen (arch
->name
) == len
10914 && strncmp (arch
->name
, to_parse
, len
) == 0)
10916 unsigned long isa_temp
= arch
->flags
;
10920 /* TO_PARSE string contains at least one extension. */
10921 enum aarch64_parse_opt_result ext_res
10922 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
10924 if (ext_res
!= AARCH64_PARSE_OK
)
10927 /* Extension parsing was successful. Confirm the result
10928 arch and ISA flags. */
10930 *isa_flags
= isa_temp
;
10931 return AARCH64_PARSE_OK
;
10935 /* ARCH name not found in list. */
10936 return AARCH64_PARSE_INVALID_ARG
;
10939 /* Parse the TO_PARSE string and put the result tuning in RES and the
10940 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10941 describing the parse result. If there is an error parsing, RES and
10942 ISA_FLAGS are left unchanged.
10943 When the TO_PARSE string contains an invalid extension,
10944 a copy of the string is created and stored to INVALID_EXTENSION. */
10946 static enum aarch64_parse_opt_result
10947 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
10948 unsigned long *isa_flags
, std::string
*invalid_extension
)
10951 const struct processor
*cpu
;
10954 ext
= strchr (to_parse
, '+');
10957 len
= ext
- to_parse
;
10959 len
= strlen (to_parse
);
10962 return AARCH64_PARSE_MISSING_ARG
;
10965 /* Loop through the list of supported CPUs to find a match. */
10966 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
10968 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
10970 unsigned long isa_temp
= cpu
->flags
;
10975 /* TO_PARSE string contains at least one extension. */
10976 enum aarch64_parse_opt_result ext_res
10977 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
10979 if (ext_res
!= AARCH64_PARSE_OK
)
10982 /* Extension parsing was successfull. Confirm the result
10983 cpu and ISA flags. */
10985 *isa_flags
= isa_temp
;
10986 return AARCH64_PARSE_OK
;
10990 /* CPU name not found in list. */
10991 return AARCH64_PARSE_INVALID_ARG
;
10994 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10995 Return an aarch64_parse_opt_result describing the parse result.
10996 If the parsing fails the RES does not change. */
10998 static enum aarch64_parse_opt_result
10999 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
11001 const struct processor
*cpu
;
11003 /* Loop through the list of supported CPUs to find a match. */
11004 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
11006 if (strcmp (cpu
->name
, to_parse
) == 0)
11009 return AARCH64_PARSE_OK
;
11013 /* CPU name not found in list. */
11014 return AARCH64_PARSE_INVALID_ARG
;
11017 /* Parse TOKEN, which has length LENGTH to see if it is an option
11018 described in FLAG. If it is, return the index bit for that fusion type.
11019 If not, error (printing OPTION_NAME) and return zero. */
11021 static unsigned int
11022 aarch64_parse_one_option_token (const char *token
,
11024 const struct aarch64_flag_desc
*flag
,
11025 const char *option_name
)
11027 for (; flag
->name
!= NULL
; flag
++)
11029 if (length
== strlen (flag
->name
)
11030 && !strncmp (flag
->name
, token
, length
))
11034 error ("unknown flag passed in -moverride=%s (%s)", option_name
, token
);
11038 /* Parse OPTION which is a comma-separated list of flags to enable.
11039 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11040 default state we inherit from the CPU tuning structures. OPTION_NAME
11041 gives the top-level option we are parsing in the -moverride string,
11042 for use in error messages. */
11044 static unsigned int
11045 aarch64_parse_boolean_options (const char *option
,
11046 const struct aarch64_flag_desc
*flags
,
11047 unsigned int initial_state
,
11048 const char *option_name
)
11050 const char separator
= '.';
11051 const char* specs
= option
;
11052 const char* ntoken
= option
;
11053 unsigned int found_flags
= initial_state
;
11055 while ((ntoken
= strchr (specs
, separator
)))
11057 size_t token_length
= ntoken
- specs
;
11058 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
11062 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11063 in the token stream, reset the supported operations. So:
11065 adrp+add.cmp+branch.none.adrp+add
11067 would have the result of turning on only adrp+add fusion. */
11071 found_flags
|= token_ops
;
11075 /* We ended with a comma, print something. */
11078 error ("%s string ill-formed\n", option_name
);
11082 /* We still have one more token to parse. */
11083 size_t token_length
= strlen (specs
);
11084 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
11091 found_flags
|= token_ops
;
11092 return found_flags
;
11095 /* Support for overriding instruction fusion. */
11098 aarch64_parse_fuse_string (const char *fuse_string
,
11099 struct tune_params
*tune
)
11101 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
11102 aarch64_fusible_pairs
,
11107 /* Support for overriding other tuning flags. */
11110 aarch64_parse_tune_string (const char *tune_string
,
11111 struct tune_params
*tune
)
11113 tune
->extra_tuning_flags
11114 = aarch64_parse_boolean_options (tune_string
,
11115 aarch64_tuning_flags
,
11116 tune
->extra_tuning_flags
,
11120 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11121 Accept the valid SVE vector widths allowed by
11122 aarch64_sve_vector_bits_enum and use it to override sve_width
11126 aarch64_parse_sve_width_string (const char *tune_string
,
11127 struct tune_params
*tune
)
11131 int n
= sscanf (tune_string
, "%d", &width
);
11134 error ("invalid format for sve_width");
11146 error ("invalid sve_width value: %d", width
);
11148 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
11151 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
11152 we understand. If it is, extract the option string and handoff to
11153 the appropriate function. */
11156 aarch64_parse_one_override_token (const char* token
,
11158 struct tune_params
*tune
)
11160 const struct aarch64_tuning_override_function
*fn
11161 = aarch64_tuning_override_functions
;
11163 const char *option_part
= strchr (token
, '=');
11166 error ("tuning string missing in option (%s)", token
);
11170 /* Get the length of the option name. */
11171 length
= option_part
- token
;
11172 /* Skip the '=' to get to the option string. */
11175 for (; fn
->name
!= NULL
; fn
++)
11177 if (!strncmp (fn
->name
, token
, length
))
11179 fn
->parse_override (option_part
, tune
);
11184 error ("unknown tuning option (%s)",token
);
11188 /* A checking mechanism for the implementation of the tls size. */
11191 initialize_aarch64_tls_size (struct gcc_options
*opts
)
11193 if (aarch64_tls_size
== 0)
11194 aarch64_tls_size
= 24;
11196 switch (opts
->x_aarch64_cmodel_var
)
11198 case AARCH64_CMODEL_TINY
:
11199 /* Both the default and maximum TLS size allowed under tiny is 1M which
11200 needs two instructions to address, so we clamp the size to 24. */
11201 if (aarch64_tls_size
> 24)
11202 aarch64_tls_size
= 24;
11204 case AARCH64_CMODEL_SMALL
:
11205 /* The maximum TLS size allowed under small is 4G. */
11206 if (aarch64_tls_size
> 32)
11207 aarch64_tls_size
= 32;
11209 case AARCH64_CMODEL_LARGE
:
11210 /* The maximum TLS size allowed under large is 16E.
11211 FIXME: 16E should be 64bit, we only support 48bit offset now. */
11212 if (aarch64_tls_size
> 48)
11213 aarch64_tls_size
= 48;
11216 gcc_unreachable ();
11222 /* Parse STRING looking for options in the format:
11223 string :: option:string
11224 option :: name=substring
11226 substring :: defined by option. */
11229 aarch64_parse_override_string (const char* input_string
,
11230 struct tune_params
* tune
)
11232 const char separator
= ':';
11233 size_t string_length
= strlen (input_string
) + 1;
11234 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
11235 char *string
= string_root
;
11236 strncpy (string
, input_string
, string_length
);
11237 string
[string_length
- 1] = '\0';
11239 char* ntoken
= string
;
11241 while ((ntoken
= strchr (string
, separator
)))
11243 size_t token_length
= ntoken
- string
;
11244 /* Make this substring look like a string. */
11246 aarch64_parse_one_override_token (string
, token_length
, tune
);
11250 /* One last option to parse. */
11251 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
11252 free (string_root
);
11257 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
11259 if (accepted_branch_protection_string
)
11261 opts
->x_aarch64_branch_protection_string
11262 = xstrdup (accepted_branch_protection_string
);
11265 /* PR 70044: We have to be careful about being called multiple times for the
11266 same function. This means all changes should be repeatable. */
11268 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11269 Disable the frame pointer flag so the mid-end will not use a frame
11270 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11271 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11272 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11273 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
11274 if (opts
->x_flag_omit_frame_pointer
== 0)
11275 opts
->x_flag_omit_frame_pointer
= 2;
11277 /* If not optimizing for size, set the default
11278 alignment to what the target wants. */
11279 if (!opts
->x_optimize_size
)
11281 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
11282 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
11283 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
11284 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
11285 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
11286 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
11289 /* We default to no pc-relative literal loads. */
11291 aarch64_pcrelative_literal_loads
= false;
11293 /* If -mpc-relative-literal-loads is set on the command line, this
11294 implies that the user asked for PC relative literal loads. */
11295 if (opts
->x_pcrelative_literal_loads
== 1)
11296 aarch64_pcrelative_literal_loads
= true;
11298 /* In the tiny memory model it makes no sense to disallow PC relative
11299 literal pool loads. */
11300 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
11301 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
11302 aarch64_pcrelative_literal_loads
= true;
11304 /* When enabling the lower precision Newton series for the square root, also
11305 enable it for the reciprocal square root, since the latter is an
11306 intermediary step for the former. */
11307 if (flag_mlow_precision_sqrt
)
11308 flag_mrecip_low_precision_sqrt
= true;
11311 /* 'Unpack' up the internal tuning structs and update the options
11312 in OPTS. The caller must have set up selected_tune and selected_arch
11313 as all the other target-specific codegen decisions are
11314 derived from them. */
11317 aarch64_override_options_internal (struct gcc_options
*opts
)
11319 aarch64_tune_flags
= selected_tune
->flags
;
11320 aarch64_tune
= selected_tune
->sched_core
;
11321 /* Make a copy of the tuning parameters attached to the core, which
11322 we may later overwrite. */
11323 aarch64_tune_params
= *(selected_tune
->tune
);
11324 aarch64_architecture_version
= selected_arch
->architecture_version
;
11326 if (opts
->x_aarch64_override_tune_string
)
11327 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
11328 &aarch64_tune_params
);
11330 /* This target defaults to strict volatile bitfields. */
11331 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
11332 opts
->x_flag_strict_volatile_bitfields
= 1;
11334 initialize_aarch64_code_model (opts
);
11335 initialize_aarch64_tls_size (opts
);
11337 int queue_depth
= 0;
11338 switch (aarch64_tune_params
.autoprefetcher_model
)
11340 case tune_params::AUTOPREFETCHER_OFF
:
11343 case tune_params::AUTOPREFETCHER_WEAK
:
11346 case tune_params::AUTOPREFETCHER_STRONG
:
11347 queue_depth
= max_insn_queue_index
+ 1;
11350 gcc_unreachable ();
11353 /* We don't mind passing in global_options_set here as we don't use
11354 the *options_set structs anyway. */
11355 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
,
11357 opts
->x_param_values
,
11358 global_options_set
.x_param_values
);
11360 /* Set up parameters to be used in prefetching algorithm. Do not
11361 override the defaults unless we are tuning for a core we have
11362 researched values for. */
11363 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
11364 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES
,
11365 aarch64_tune_params
.prefetch
->num_slots
,
11366 opts
->x_param_values
,
11367 global_options_set
.x_param_values
);
11368 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
11369 maybe_set_param_value (PARAM_L1_CACHE_SIZE
,
11370 aarch64_tune_params
.prefetch
->l1_cache_size
,
11371 opts
->x_param_values
,
11372 global_options_set
.x_param_values
);
11373 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
11374 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE
,
11375 aarch64_tune_params
.prefetch
->l1_cache_line_size
,
11376 opts
->x_param_values
,
11377 global_options_set
.x_param_values
);
11378 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
11379 maybe_set_param_value (PARAM_L2_CACHE_SIZE
,
11380 aarch64_tune_params
.prefetch
->l2_cache_size
,
11381 opts
->x_param_values
,
11382 global_options_set
.x_param_values
);
11383 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
11384 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES
,
11386 opts
->x_param_values
,
11387 global_options_set
.x_param_values
);
11388 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
11389 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE
,
11390 aarch64_tune_params
.prefetch
->minimum_stride
,
11391 opts
->x_param_values
,
11392 global_options_set
.x_param_values
);
11394 /* Use the alternative scheduling-pressure algorithm by default. */
11395 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM
, SCHED_PRESSURE_MODEL
,
11396 opts
->x_param_values
,
11397 global_options_set
.x_param_values
);
11399 /* If the user hasn't changed it via configure then set the default to 64 KB
11400 for the backend. */
11401 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
,
11402 DEFAULT_STK_CLASH_GUARD_SIZE
== 0
11403 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE
,
11404 opts
->x_param_values
,
11405 global_options_set
.x_param_values
);
11407 /* Validate the guard size. */
11408 int guard_size
= PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
11410 /* Enforce that interval is the same size as size so the mid-end does the
11412 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
,
11414 opts
->x_param_values
,
11415 global_options_set
.x_param_values
);
11417 /* The maybe_set calls won't update the value if the user has explicitly set
11418 one. Which means we need to validate that probing interval and guard size
11421 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
);
11422 if (guard_size
!= probe_interval
)
11423 error ("stack clash guard size '%d' must be equal to probing interval "
11424 "'%d'", guard_size
, probe_interval
);
11426 /* Enable sw prefetching at specified optimization level for
11427 CPUS that have prefetch. Lower optimization level threshold by 1
11428 when profiling is enabled. */
11429 if (opts
->x_flag_prefetch_loop_arrays
< 0
11430 && !opts
->x_optimize_size
11431 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
11432 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
11433 opts
->x_flag_prefetch_loop_arrays
= 1;
11435 if (opts
->x_aarch64_arch_string
== NULL
)
11436 opts
->x_aarch64_arch_string
= selected_arch
->name
;
11437 if (opts
->x_aarch64_cpu_string
== NULL
)
11438 opts
->x_aarch64_cpu_string
= selected_cpu
->name
;
11439 if (opts
->x_aarch64_tune_string
== NULL
)
11440 opts
->x_aarch64_tune_string
= selected_tune
->name
;
11442 aarch64_override_options_after_change_1 (opts
);
11445 /* Print a hint with a suggestion for a core or architecture name that
11446 most closely resembles what the user passed in STR. ARCH is true if
11447 the user is asking for an architecture name. ARCH is false if the user
11448 is asking for a core name. */
11451 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
11453 auto_vec
<const char *> candidates
;
11454 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
11455 for (; entry
->name
!= NULL
; entry
++)
11456 candidates
.safe_push (entry
->name
);
11458 #ifdef HAVE_LOCAL_CPU_DETECT
11459 /* Add also "native" as possible value. */
11461 candidates
.safe_push ("native");
11465 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
11467 inform (input_location
, "valid arguments are: %s;"
11468 " did you mean %qs?", s
, hint
);
11470 inform (input_location
, "valid arguments are: %s", s
);
11475 /* Print a hint with a suggestion for a core name that most closely resembles
11476 what the user passed in STR. */
11479 aarch64_print_hint_for_core (const char *str
)
11481 aarch64_print_hint_for_core_or_arch (str
, false);
11484 /* Print a hint with a suggestion for an architecture name that most closely
11485 resembles what the user passed in STR. */
11488 aarch64_print_hint_for_arch (const char *str
)
11490 aarch64_print_hint_for_core_or_arch (str
, true);
11494 /* Print a hint with a suggestion for an extension name
11495 that most closely resembles what the user passed in STR. */
11498 aarch64_print_hint_for_extensions (const std::string
&str
)
11500 auto_vec
<const char *> candidates
;
11501 aarch64_get_all_extension_candidates (&candidates
);
11503 const char *hint
= candidates_list_and_hint (str
.c_str (), s
, candidates
);
11505 inform (input_location
, "valid arguments are: %s;"
11506 " did you mean %qs?", s
, hint
);
11508 inform (input_location
, "valid arguments are: %s;", s
);
11513 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11514 specified in STR and throw errors if appropriate. Put the results if
11515 they are valid in RES and ISA_FLAGS. Return whether the option is
11519 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
11520 unsigned long *isa_flags
)
11522 std::string invalid_extension
;
11523 enum aarch64_parse_opt_result parse_res
11524 = aarch64_parse_cpu (str
, res
, isa_flags
, &invalid_extension
);
11526 if (parse_res
== AARCH64_PARSE_OK
)
11531 case AARCH64_PARSE_MISSING_ARG
:
11532 error ("missing cpu name in %<-mcpu=%s%>", str
);
11534 case AARCH64_PARSE_INVALID_ARG
:
11535 error ("unknown value %qs for -mcpu", str
);
11536 aarch64_print_hint_for_core (str
);
11538 case AARCH64_PARSE_INVALID_FEATURE
:
11539 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11540 invalid_extension
.c_str (), str
);
11541 aarch64_print_hint_for_extensions (invalid_extension
);
11544 gcc_unreachable ();
11550 /* Parses CONST_STR for branch protection features specified in
11551 aarch64_branch_protect_types, and set any global variables required. Returns
11552 the parsing result and assigns LAST_STR to the last processed token from
11553 CONST_STR so that it can be used for error reporting. */
11556 aarch64_parse_opt_result
aarch64_parse_branch_protection (const char *const_str
,
11559 char *str_root
= xstrdup (const_str
);
11560 char* token_save
= NULL
;
11561 char *str
= strtok_r (str_root
, "+", &token_save
);
11562 enum aarch64_parse_opt_result res
= AARCH64_PARSE_OK
;
11564 res
= AARCH64_PARSE_MISSING_ARG
;
11567 char *next_str
= strtok_r (NULL
, "+", &token_save
);
11568 /* Reset the branch protection features to their defaults. */
11569 aarch64_handle_no_branch_protection (NULL
, NULL
);
11571 while (str
&& res
== AARCH64_PARSE_OK
)
11573 const aarch64_branch_protect_type
* type
= aarch64_branch_protect_types
;
11574 bool found
= false;
11575 /* Search for this type. */
11576 while (type
&& type
->name
&& !found
&& res
== AARCH64_PARSE_OK
)
11578 if (strcmp (str
, type
->name
) == 0)
11581 res
= type
->handler (str
, next_str
);
11583 next_str
= strtok_r (NULL
, "+", &token_save
);
11588 if (found
&& res
== AARCH64_PARSE_OK
)
11590 bool found_subtype
= true;
11591 /* Loop through each token until we find one that isn't a
11593 while (found_subtype
)
11595 found_subtype
= false;
11596 const aarch64_branch_protect_type
*subtype
= type
->subtypes
;
11597 /* Search for the subtype. */
11598 while (str
&& subtype
&& subtype
->name
&& !found_subtype
11599 && res
== AARCH64_PARSE_OK
)
11601 if (strcmp (str
, subtype
->name
) == 0)
11603 found_subtype
= true;
11604 res
= subtype
->handler (str
, next_str
);
11606 next_str
= strtok_r (NULL
, "+", &token_save
);
11614 res
= AARCH64_PARSE_INVALID_ARG
;
11617 /* Copy the last processed token into the argument to pass it back.
11618 Used by option and attribute validation to print the offending token. */
11621 if (str
) strcpy (*last_str
, str
);
11622 else *last_str
= NULL
;
11624 if (res
== AARCH64_PARSE_OK
)
11626 /* If needed, alloc the accepted string then copy in const_str.
11627 Used by override_option_after_change_1. */
11628 if (!accepted_branch_protection_string
)
11629 accepted_branch_protection_string
= (char *) xmalloc (
11630 BRANCH_PROTECT_STR_MAX
11632 strncpy (accepted_branch_protection_string
, const_str
,
11633 BRANCH_PROTECT_STR_MAX
+ 1);
11634 /* Forcibly null-terminate. */
11635 accepted_branch_protection_string
[BRANCH_PROTECT_STR_MAX
] = '\0';
11641 aarch64_validate_mbranch_protection (const char *const_str
)
11643 char *str
= (char *) xmalloc (strlen (const_str
));
11644 enum aarch64_parse_opt_result res
=
11645 aarch64_parse_branch_protection (const_str
, &str
);
11646 if (res
== AARCH64_PARSE_INVALID_ARG
)
11647 error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str
);
11648 else if (res
== AARCH64_PARSE_MISSING_ARG
)
11649 error ("missing arg for %<-mbranch-protection=%>");
11651 return res
== AARCH64_PARSE_OK
;
11654 /* Validate a command-line -march option. Parse the arch and extensions
11655 (if any) specified in STR and throw errors if appropriate. Put the
11656 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11657 option is valid. */
11660 aarch64_validate_march (const char *str
, const struct processor
**res
,
11661 unsigned long *isa_flags
)
11663 std::string invalid_extension
;
11664 enum aarch64_parse_opt_result parse_res
11665 = aarch64_parse_arch (str
, res
, isa_flags
, &invalid_extension
);
11667 if (parse_res
== AARCH64_PARSE_OK
)
11672 case AARCH64_PARSE_MISSING_ARG
:
11673 error ("missing arch name in %<-march=%s%>", str
);
11675 case AARCH64_PARSE_INVALID_ARG
:
11676 error ("unknown value %qs for -march", str
);
11677 aarch64_print_hint_for_arch (str
);
11679 case AARCH64_PARSE_INVALID_FEATURE
:
11680 error ("invalid feature modifier %qs in %<-march=%s%>",
11681 invalid_extension
.c_str (), str
);
11682 aarch64_print_hint_for_extensions (invalid_extension
);
11685 gcc_unreachable ();
11691 /* Validate a command-line -mtune option. Parse the cpu
11692 specified in STR and throw errors if appropriate. Put the
11693 result, if it is valid, in RES. Return whether the option is
11697 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
11699 enum aarch64_parse_opt_result parse_res
11700 = aarch64_parse_tune (str
, res
);
11702 if (parse_res
== AARCH64_PARSE_OK
)
11707 case AARCH64_PARSE_MISSING_ARG
:
11708 error ("missing cpu name in %<-mtune=%s%>", str
);
11710 case AARCH64_PARSE_INVALID_ARG
:
11711 error ("unknown value %qs for -mtune", str
);
11712 aarch64_print_hint_for_core (str
);
11715 gcc_unreachable ();
11720 /* Return the CPU corresponding to the enum CPU.
11721 If it doesn't specify a cpu, return the default. */
11723 static const struct processor
*
11724 aarch64_get_tune_cpu (enum aarch64_processor cpu
)
11726 if (cpu
!= aarch64_none
)
11727 return &all_cores
[cpu
];
11729 /* The & 0x3f is to extract the bottom 6 bits that encode the
11730 default cpu as selected by the --with-cpu GCC configure option
11732 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11733 flags mechanism should be reworked to make it more sane. */
11734 return &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
11737 /* Return the architecture corresponding to the enum ARCH.
11738 If it doesn't specify a valid architecture, return the default. */
11740 static const struct processor
*
11741 aarch64_get_arch (enum aarch64_arch arch
)
11743 if (arch
!= aarch64_no_arch
)
11744 return &all_architectures
[arch
];
11746 const struct processor
*cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
11748 return &all_architectures
[cpu
->arch
];
11751 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
11754 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value
)
11756 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
11757 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
11758 deciding which .md file patterns to use and when deciding whether
11759 something is a legitimate address or constant. */
11760 if (value
== SVE_SCALABLE
|| value
== SVE_128
)
11761 return poly_uint16 (2, 2);
11763 return (int) value
/ 64;
11766 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
11767 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
11768 tuning structs. In particular it must set selected_tune and
11769 aarch64_isa_flags that define the available ISA features and tuning
11770 decisions. It must also set selected_arch as this will be used to
11771 output the .arch asm tags for each function. */
11774 aarch64_override_options (void)
11776 unsigned long cpu_isa
= 0;
11777 unsigned long arch_isa
= 0;
11778 aarch64_isa_flags
= 0;
11780 bool valid_cpu
= true;
11781 bool valid_tune
= true;
11782 bool valid_arch
= true;
11784 selected_cpu
= NULL
;
11785 selected_arch
= NULL
;
11786 selected_tune
= NULL
;
11788 if (aarch64_branch_protection_string
)
11789 aarch64_validate_mbranch_protection (aarch64_branch_protection_string
);
11791 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
11792 If either of -march or -mtune is given, they override their
11793 respective component of -mcpu. */
11794 if (aarch64_cpu_string
)
11795 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
11798 if (aarch64_arch_string
)
11799 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
11802 if (aarch64_tune_string
)
11803 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
11805 #ifdef SUBTARGET_OVERRIDE_OPTIONS
11806 SUBTARGET_OVERRIDE_OPTIONS
;
11809 /* If the user did not specify a processor, choose the default
11810 one for them. This will be the CPU set during configuration using
11811 --with-cpu, otherwise it is "generic". */
11816 selected_cpu
= &all_cores
[selected_arch
->ident
];
11817 aarch64_isa_flags
= arch_isa
;
11818 explicit_arch
= selected_arch
->arch
;
11822 /* Get default configure-time CPU. */
11823 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
11824 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
11828 explicit_tune_core
= selected_tune
->ident
;
11830 /* If both -mcpu and -march are specified check that they are architecturally
11831 compatible, warn if they're not and prefer the -march ISA flags. */
11832 else if (selected_arch
)
11834 if (selected_arch
->arch
!= selected_cpu
->arch
)
11836 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
11837 all_architectures
[selected_cpu
->arch
].name
,
11838 selected_arch
->name
);
11840 aarch64_isa_flags
= arch_isa
;
11841 explicit_arch
= selected_arch
->arch
;
11842 explicit_tune_core
= selected_tune
? selected_tune
->ident
11843 : selected_cpu
->ident
;
11847 /* -mcpu but no -march. */
11848 aarch64_isa_flags
= cpu_isa
;
11849 explicit_tune_core
= selected_tune
? selected_tune
->ident
11850 : selected_cpu
->ident
;
11851 gcc_assert (selected_cpu
);
11852 selected_arch
= &all_architectures
[selected_cpu
->arch
];
11853 explicit_arch
= selected_arch
->arch
;
11856 /* Set the arch as well as we will need it when outputing
11857 the .arch directive in assembly. */
11858 if (!selected_arch
)
11860 gcc_assert (selected_cpu
);
11861 selected_arch
= &all_architectures
[selected_cpu
->arch
];
11864 if (!selected_tune
)
11865 selected_tune
= selected_cpu
;
11867 if (aarch64_enable_bti
== 2)
11869 #ifdef TARGET_ENABLE_BTI
11870 aarch64_enable_bti
= 1;
11872 aarch64_enable_bti
= 0;
11876 /* Return address signing is currently not supported for ILP32 targets. For
11877 LP64 targets use the configured option in the absence of a command-line
11878 option for -mbranch-protection. */
11879 if (!TARGET_ILP32
&& accepted_branch_protection_string
== NULL
)
11881 #ifdef TARGET_ENABLE_PAC_RET
11882 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
11883 aarch64_ra_sign_key
= AARCH64_KEY_A
;
11885 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
11889 #ifndef HAVE_AS_MABI_OPTION
11890 /* The compiler may have been configured with 2.23.* binutils, which does
11891 not have support for ILP32. */
11893 error ("assembler does not support -mabi=ilp32");
11896 /* Convert -msve-vector-bits to a VG count. */
11897 aarch64_sve_vg
= aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits
);
11899 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
11900 sorry ("return address signing is only supported for -mabi=lp64");
11902 /* Make sure we properly set up the explicit options. */
11903 if ((aarch64_cpu_string
&& valid_cpu
)
11904 || (aarch64_tune_string
&& valid_tune
))
11905 gcc_assert (explicit_tune_core
!= aarch64_none
);
11907 if ((aarch64_cpu_string
&& valid_cpu
)
11908 || (aarch64_arch_string
&& valid_arch
))
11909 gcc_assert (explicit_arch
!= aarch64_no_arch
);
11911 /* The pass to insert speculation tracking runs before
11912 shrink-wrapping and the latter does not know how to update the
11913 tracking status. So disable it in this case. */
11914 if (aarch64_track_speculation
)
11915 flag_shrink_wrap
= 0;
11917 aarch64_override_options_internal (&global_options
);
11919 /* Save these options as the default ones in case we push and pop them later
11920 while processing functions with potential target attributes. */
11921 target_option_default_node
= target_option_current_node
11922 = build_target_option_node (&global_options
);
11925 /* Implement targetm.override_options_after_change. */
11928 aarch64_override_options_after_change (void)
11930 aarch64_override_options_after_change_1 (&global_options
);
11933 static struct machine_function
*
11934 aarch64_init_machine_status (void)
11936 struct machine_function
*machine
;
11937 machine
= ggc_cleared_alloc
<machine_function
> ();
11942 aarch64_init_expanders (void)
11944 init_machine_status
= aarch64_init_machine_status
;
11947 /* A checking mechanism for the implementation of the various code models. */
11949 initialize_aarch64_code_model (struct gcc_options
*opts
)
11951 if (opts
->x_flag_pic
)
11953 switch (opts
->x_aarch64_cmodel_var
)
11955 case AARCH64_CMODEL_TINY
:
11956 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
11958 case AARCH64_CMODEL_SMALL
:
11959 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11960 aarch64_cmodel
= (flag_pic
== 2
11961 ? AARCH64_CMODEL_SMALL_PIC
11962 : AARCH64_CMODEL_SMALL_SPIC
);
11964 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
11967 case AARCH64_CMODEL_LARGE
:
11968 sorry ("code model %qs with -f%s", "large",
11969 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
11972 gcc_unreachable ();
11976 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
11979 /* Implement TARGET_OPTION_SAVE. */
11982 aarch64_option_save (struct cl_target_option
*ptr
, struct gcc_options
*opts
)
11984 ptr
->x_aarch64_override_tune_string
= opts
->x_aarch64_override_tune_string
;
11985 ptr
->x_aarch64_branch_protection_string
11986 = opts
->x_aarch64_branch_protection_string
;
11989 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11990 using the information saved in PTR. */
11993 aarch64_option_restore (struct gcc_options
*opts
, struct cl_target_option
*ptr
)
11995 opts
->x_explicit_tune_core
= ptr
->x_explicit_tune_core
;
11996 selected_tune
= aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
11997 opts
->x_explicit_arch
= ptr
->x_explicit_arch
;
11998 selected_arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
11999 opts
->x_aarch64_override_tune_string
= ptr
->x_aarch64_override_tune_string
;
12000 opts
->x_aarch64_branch_protection_string
12001 = ptr
->x_aarch64_branch_protection_string
;
12002 if (opts
->x_aarch64_branch_protection_string
)
12004 aarch64_parse_branch_protection (opts
->x_aarch64_branch_protection_string
,
12008 aarch64_override_options_internal (opts
);
12011 /* Implement TARGET_OPTION_PRINT. */
12014 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
12016 const struct processor
*cpu
12017 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
12018 unsigned long isa_flags
= ptr
->x_aarch64_isa_flags
;
12019 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
12020 std::string extension
12021 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
12023 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
12024 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
12025 arch
->name
, extension
.c_str ());
12028 static GTY(()) tree aarch64_previous_fndecl
;
12031 aarch64_reset_previous_fndecl (void)
12033 aarch64_previous_fndecl
= NULL
;
12036 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12037 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12038 make sure optab availability predicates are recomputed when necessary. */
12041 aarch64_save_restore_target_globals (tree new_tree
)
12043 if (TREE_TARGET_GLOBALS (new_tree
))
12044 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
12045 else if (new_tree
== target_option_default_node
)
12046 restore_target_globals (&default_target_globals
);
12048 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
12051 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12052 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12053 of the function, if such exists. This function may be called multiple
12054 times on a single function so use aarch64_previous_fndecl to avoid
12055 setting up identical state. */
12058 aarch64_set_current_function (tree fndecl
)
12060 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
12063 tree old_tree
= (aarch64_previous_fndecl
12064 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
12067 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
12069 /* If current function has no attributes but the previous one did,
12070 use the default node. */
12071 if (!new_tree
&& old_tree
)
12072 new_tree
= target_option_default_node
;
12074 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12075 the default have been handled by aarch64_save_restore_target_globals from
12076 aarch64_pragma_target_parse. */
12077 if (old_tree
== new_tree
)
12080 aarch64_previous_fndecl
= fndecl
;
12082 /* First set the target options. */
12083 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
12085 aarch64_save_restore_target_globals (new_tree
);
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};
12099 /* All the information needed to handle a target attribute.
12100 NAME is the name of the attribute.
12101 ATTR_TYPE specifies the type of behavior of the attribute as described
12102 in the definition of enum aarch64_attr_opt_type.
12103 ALLOW_NEG is true if the attribute supports a "no-" form.
12104 HANDLER is the function that takes the attribute string as an argument
12105 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12106 OPT_NUM is the enum specifying the option that the attribute modifies.
12107 This is needed for attributes that mirror the behavior of a command-line
12108 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12109 aarch64_attr_enum. */
12111 struct aarch64_attribute_info
12114 enum aarch64_attr_opt_type attr_type
;
12116 bool (*handler
) (const char *);
12117 enum opt_code opt_num
;
12120 /* Handle the ARCH_STR argument to the arch= target attribute. */
12123 aarch64_handle_attr_arch (const char *str
)
12125 const struct processor
*tmp_arch
= NULL
;
12126 std::string invalid_extension
;
12127 enum aarch64_parse_opt_result parse_res
12128 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
, &invalid_extension
);
12130 if (parse_res
== AARCH64_PARSE_OK
)
12132 gcc_assert (tmp_arch
);
12133 selected_arch
= tmp_arch
;
12134 explicit_arch
= selected_arch
->arch
;
12140 case AARCH64_PARSE_MISSING_ARG
:
12141 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12143 case AARCH64_PARSE_INVALID_ARG
:
12144 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str
);
12145 aarch64_print_hint_for_arch (str
);
12147 case AARCH64_PARSE_INVALID_FEATURE
:
12148 error ("invalid feature modifier %s of value (\"%s\") in "
12149 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
12150 aarch64_print_hint_for_extensions (invalid_extension
);
12153 gcc_unreachable ();
12159 /* Handle the argument CPU_STR to the cpu= target attribute. */
12162 aarch64_handle_attr_cpu (const char *str
)
12164 const struct processor
*tmp_cpu
= NULL
;
12165 std::string invalid_extension
;
12166 enum aarch64_parse_opt_result parse_res
12167 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
, &invalid_extension
);
12169 if (parse_res
== AARCH64_PARSE_OK
)
12171 gcc_assert (tmp_cpu
);
12172 selected_tune
= tmp_cpu
;
12173 explicit_tune_core
= selected_tune
->ident
;
12175 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
12176 explicit_arch
= selected_arch
->arch
;
12182 case AARCH64_PARSE_MISSING_ARG
:
12183 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12185 case AARCH64_PARSE_INVALID_ARG
:
12186 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str
);
12187 aarch64_print_hint_for_core (str
);
12189 case AARCH64_PARSE_INVALID_FEATURE
:
12190 error ("invalid feature modifier %s of value (\"%s\") in "
12191 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
12192 aarch64_print_hint_for_extensions (invalid_extension
);
12195 gcc_unreachable ();
12201 /* Handle the argument STR to the branch-protection= attribute. */
12204 aarch64_handle_attr_branch_protection (const char* str
)
12206 char *err_str
= (char *) xmalloc (strlen (str
));
12207 enum aarch64_parse_opt_result res
= aarch64_parse_branch_protection (str
,
12209 bool success
= false;
12212 case AARCH64_PARSE_MISSING_ARG
:
12213 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12216 case AARCH64_PARSE_INVALID_ARG
:
12217 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12218 "=\")%> pragma or attribute", err_str
);
12220 case AARCH64_PARSE_OK
:
12222 /* Fall through. */
12223 case AARCH64_PARSE_INVALID_FEATURE
:
12226 gcc_unreachable ();
12232 /* Handle the argument STR to the tune= target attribute. */
12235 aarch64_handle_attr_tune (const char *str
)
12237 const struct processor
*tmp_tune
= NULL
;
12238 enum aarch64_parse_opt_result parse_res
12239 = aarch64_parse_tune (str
, &tmp_tune
);
12241 if (parse_res
== AARCH64_PARSE_OK
)
12243 gcc_assert (tmp_tune
);
12244 selected_tune
= tmp_tune
;
12245 explicit_tune_core
= selected_tune
->ident
;
12251 case AARCH64_PARSE_INVALID_ARG
:
12252 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str
);
12253 aarch64_print_hint_for_core (str
);
12256 gcc_unreachable ();
12262 /* Parse an architecture extensions target attribute string specified in STR.
12263 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12264 if successful. Update aarch64_isa_flags to reflect the ISA features
12268 aarch64_handle_attr_isa_flags (char *str
)
12270 enum aarch64_parse_opt_result parse_res
;
12271 unsigned long isa_flags
= aarch64_isa_flags
;
12273 /* We allow "+nothing" in the beginning to clear out all architectural
12274 features if the user wants to handpick specific features. */
12275 if (strncmp ("+nothing", str
, 8) == 0)
12281 std::string invalid_extension
;
12282 parse_res
= aarch64_parse_extension (str
, &isa_flags
, &invalid_extension
);
12284 if (parse_res
== AARCH64_PARSE_OK
)
12286 aarch64_isa_flags
= isa_flags
;
12292 case AARCH64_PARSE_MISSING_ARG
:
12293 error ("missing value in %<target()%> pragma or attribute");
12296 case AARCH64_PARSE_INVALID_FEATURE
:
12297 error ("invalid feature modifier %s of value (\"%s\") in "
12298 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
12302 gcc_unreachable ();
12308 /* The target attributes that we support. On top of these we also support just
12309 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12310 handled explicitly in aarch64_process_one_target_attr. */
12312 static const struct aarch64_attribute_info aarch64_attributes
[] =
12314 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
12315 OPT_mgeneral_regs_only
},
12316 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
12317 OPT_mfix_cortex_a53_835769
},
12318 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
12319 OPT_mfix_cortex_a53_843419
},
12320 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
12321 { "strict-align", aarch64_attr_mask
, true, NULL
, OPT_mstrict_align
},
12322 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
12323 OPT_momit_leaf_frame_pointer
},
12324 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
12325 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
12327 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
12328 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
12330 { "branch-protection", aarch64_attr_custom
, false,
12331 aarch64_handle_attr_branch_protection
, OPT_mbranch_protection_
},
12332 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
12333 OPT_msign_return_address_
},
12334 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
12337 /* Parse ARG_STR which contains the definition of one target attribute.
12338 Show appropriate errors if any or return true if the attribute is valid. */
12341 aarch64_process_one_target_attr (char *arg_str
)
12343 bool invert
= false;
12345 size_t len
= strlen (arg_str
);
12349 error ("malformed %<target()%> pragma or attribute");
12353 char *str_to_check
= (char *) alloca (len
+ 1);
12354 strcpy (str_to_check
, arg_str
);
12356 /* Skip leading whitespace. */
12357 while (*str_to_check
== ' ' || *str_to_check
== '\t')
12360 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12361 It is easier to detect and handle it explicitly here rather than going
12362 through the machinery for the rest of the target attributes in this
12364 if (*str_to_check
== '+')
12365 return aarch64_handle_attr_isa_flags (str_to_check
);
12367 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
12372 char *arg
= strchr (str_to_check
, '=');
12374 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12375 and point ARG to "foo". */
12381 const struct aarch64_attribute_info
*p_attr
;
12382 bool found
= false;
12383 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
12385 /* If the names don't match up, or the user has given an argument
12386 to an attribute that doesn't accept one, or didn't give an argument
12387 to an attribute that expects one, fail to match. */
12388 if (strcmp (str_to_check
, p_attr
->name
) != 0)
12392 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
12393 || p_attr
->attr_type
== aarch64_attr_enum
;
12395 if (attr_need_arg_p
^ (arg
!= NULL
))
12397 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
12401 /* If the name matches but the attribute does not allow "no-" versions
12402 then we can't match. */
12403 if (invert
&& !p_attr
->allow_neg
)
12405 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
12409 switch (p_attr
->attr_type
)
12411 /* Has a custom handler registered.
12412 For example, cpu=, arch=, tune=. */
12413 case aarch64_attr_custom
:
12414 gcc_assert (p_attr
->handler
);
12415 if (!p_attr
->handler (arg
))
12419 /* Either set or unset a boolean option. */
12420 case aarch64_attr_bool
:
12422 struct cl_decoded_option decoded
;
12424 generate_option (p_attr
->opt_num
, NULL
, !invert
,
12425 CL_TARGET
, &decoded
);
12426 aarch64_handle_option (&global_options
, &global_options_set
,
12427 &decoded
, input_location
);
12430 /* Set or unset a bit in the target_flags. aarch64_handle_option
12431 should know what mask to apply given the option number. */
12432 case aarch64_attr_mask
:
12434 struct cl_decoded_option decoded
;
12435 /* We only need to specify the option number.
12436 aarch64_handle_option will know which mask to apply. */
12437 decoded
.opt_index
= p_attr
->opt_num
;
12438 decoded
.value
= !invert
;
12439 aarch64_handle_option (&global_options
, &global_options_set
,
12440 &decoded
, input_location
);
12443 /* Use the option setting machinery to set an option to an enum. */
12444 case aarch64_attr_enum
:
12449 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
12450 &value
, CL_TARGET
);
12453 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
12454 NULL
, DK_UNSPECIFIED
, input_location
,
12459 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
12464 gcc_unreachable ();
12468 /* If we reached here we either have found an attribute and validated
12469 it or didn't match any. If we matched an attribute but its arguments
12470 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int count = 0;

  for (; *str != '\0'; str++)
    if (*str == c)
      count++;

  return count;
}
12492 /* Parse the tree in ARGS that contains the target attribute information
12493 and update the global target options space. */
12496 aarch64_process_target_attr (tree args
)
12498 if (TREE_CODE (args
) == TREE_LIST
)
12502 tree head
= TREE_VALUE (args
);
12505 if (!aarch64_process_target_attr (head
))
12508 args
= TREE_CHAIN (args
);
12514 if (TREE_CODE (args
) != STRING_CST
)
12516 error ("attribute %<target%> argument not a string");
12520 size_t len
= strlen (TREE_STRING_POINTER (args
));
12521 char *str_to_check
= (char *) alloca (len
+ 1);
12522 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
12526 error ("malformed %<target()%> pragma or attribute");
12530 /* Used to catch empty spaces between commas i.e.
12531 attribute ((target ("attr1,,attr2"))). */
12532 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
12534 /* Handle multiple target attributes separated by ','. */
12535 char *token
= strtok_r (str_to_check
, ",", &str_to_check
);
12537 unsigned int num_attrs
= 0;
12541 if (!aarch64_process_one_target_attr (token
))
12543 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
12547 token
= strtok_r (NULL
, ",", &str_to_check
);
12550 if (num_attrs
!= num_commas
+ 1)
12552 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
12559 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12560 process attribute ((target ("..."))). */
12563 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
12565 struct cl_target_option cur_target
;
12568 tree new_target
, new_optimize
;
12569 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
12571 /* If what we're processing is the current pragma string then the
12572 target option node is already stored in target_option_current_node
12573 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12574 having to re-parse the string. This is especially useful to keep
12575 arm_neon.h compile times down since that header contains a lot
12576 of intrinsics enclosed in pragmas. */
12577 if (!existing_target
&& args
== current_target_pragma
)
12579 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
12582 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
12584 old_optimize
= build_optimization_node (&global_options
);
12585 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
12587 /* If the function changed the optimization levels as well as setting
12588 target options, start with the optimizations specified. */
12589 if (func_optimize
&& func_optimize
!= old_optimize
)
12590 cl_optimization_restore (&global_options
,
12591 TREE_OPTIMIZATION (func_optimize
));
12593 /* Save the current target options to restore at the end. */
12594 cl_target_option_save (&cur_target
, &global_options
);
12596 /* If fndecl already has some target attributes applied to it, unpack
12597 them so that we add this attribute on top of them, rather than
12598 overwriting them. */
12599 if (existing_target
)
12601 struct cl_target_option
*existing_options
12602 = TREE_TARGET_OPTION (existing_target
);
12604 if (existing_options
)
12605 cl_target_option_restore (&global_options
, existing_options
);
12608 cl_target_option_restore (&global_options
,
12609 TREE_TARGET_OPTION (target_option_current_node
));
12611 ret
= aarch64_process_target_attr (args
);
12613 /* Set up any additional state. */
12616 aarch64_override_options_internal (&global_options
);
12617 /* Initialize SIMD builtins if we haven't already.
12618 Set current_target_pragma to NULL for the duration so that
12619 the builtin initialization code doesn't try to tag the functions
12620 being built with the attributes specified by any current pragma, thus
12621 going into an infinite recursion. */
12624 tree saved_current_target_pragma
= current_target_pragma
;
12625 current_target_pragma
= NULL
;
12626 aarch64_init_simd_builtins ();
12627 current_target_pragma
= saved_current_target_pragma
;
12629 new_target
= build_target_option_node (&global_options
);
12634 new_optimize
= build_optimization_node (&global_options
);
12638 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
12640 if (old_optimize
!= new_optimize
)
12641 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
12644 cl_target_option_restore (&global_options
, &cur_target
);
12646 if (old_optimize
!= new_optimize
)
12647 cl_optimization_restore (&global_options
,
12648 TREE_OPTIMIZATION (old_optimize
));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If either side doesn't care, always allow inlining.  */
  if (callee == dont_care || caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return callee == caller || callee == def;
}
12673 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12674 to inline CALLEE into CALLER based on target-specific info.
12675 Make sure that the caller and callee have compatible architectural
12676 features. Then go through the other possible target attributes
12677 and see if they can block inlining. Try not to reject always_inline
12678 callees unless they are incompatible architecturally. */
12681 aarch64_can_inline_p (tree caller
, tree callee
)
12683 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
12684 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
12686 struct cl_target_option
*caller_opts
12687 = TREE_TARGET_OPTION (caller_tree
? caller_tree
12688 : target_option_default_node
);
12690 struct cl_target_option
*callee_opts
12691 = TREE_TARGET_OPTION (callee_tree
? callee_tree
12692 : target_option_default_node
);
12694 /* Callee's ISA flags should be a subset of the caller's. */
12695 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
12696 != callee_opts
->x_aarch64_isa_flags
)
12699 /* Allow non-strict aligned functions inlining into strict
12701 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
12702 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
12703 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
12704 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
12707 bool always_inline
= lookup_attribute ("always_inline",
12708 DECL_ATTRIBUTES (callee
));
12710 /* If the architectural features match up and the callee is always_inline
12711 then the other attributes don't matter. */
12715 if (caller_opts
->x_aarch64_cmodel_var
12716 != callee_opts
->x_aarch64_cmodel_var
)
12719 if (caller_opts
->x_aarch64_tls_dialect
12720 != callee_opts
->x_aarch64_tls_dialect
)
12723 /* Honour explicit requests to workaround errata. */
12724 if (!aarch64_tribools_ok_for_inlining_p (
12725 caller_opts
->x_aarch64_fix_a53_err835769
,
12726 callee_opts
->x_aarch64_fix_a53_err835769
,
12727 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
12730 if (!aarch64_tribools_ok_for_inlining_p (
12731 caller_opts
->x_aarch64_fix_a53_err843419
,
12732 callee_opts
->x_aarch64_fix_a53_err843419
,
12733 2, TARGET_FIX_ERR_A53_843419
))
12736 /* If the user explicitly specified -momit-leaf-frame-pointer for the
12737 caller and calle and they don't match up, reject inlining. */
12738 if (!aarch64_tribools_ok_for_inlining_p (
12739 caller_opts
->x_flag_omit_leaf_frame_pointer
,
12740 callee_opts
->x_flag_omit_leaf_frame_pointer
,
12744 /* If the callee has specific tuning overrides, respect them. */
12745 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
12746 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
12749 /* If the user specified tuning override strings for the
12750 caller and callee and they don't match up, reject inlining.
12751 We just do a string compare here, we don't analyze the meaning
12752 of the string, as it would be too costly for little gain. */
12753 if (callee_opts
->x_aarch64_override_tune_string
12754 && caller_opts
->x_aarch64_override_tune_string
12755 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
12756 caller_opts
->x_aarch64_override_tune_string
) != 0))
12762 /* Return true if SYMBOL_REF X binds locally. */
12765 aarch64_symbol_binds_local_p (const_rtx x
)
12767 return (SYMBOL_REF_DECL (x
)
12768 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
12769 : SYMBOL_REF_LOCAL_P (x
));
12772 /* Return true if SYMBOL_REF X is thread local */
12774 aarch64_tls_symbol_p (rtx x
)
12776 if (! TARGET_HAVE_TLS
)
12779 if (GET_CODE (x
) != SYMBOL_REF
)
12782 return SYMBOL_REF_TLS_MODEL (x
) != 0;
12785 /* Classify a TLS symbol into one of the TLS kinds. */
12786 enum aarch64_symbol_type
12787 aarch64_classify_tls_symbol (rtx x
)
12789 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
12793 case TLS_MODEL_GLOBAL_DYNAMIC
:
12794 case TLS_MODEL_LOCAL_DYNAMIC
:
12795 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
12797 case TLS_MODEL_INITIAL_EXEC
:
12798 switch (aarch64_cmodel
)
12800 case AARCH64_CMODEL_TINY
:
12801 case AARCH64_CMODEL_TINY_PIC
:
12802 return SYMBOL_TINY_TLSIE
;
12804 return SYMBOL_SMALL_TLSIE
;
12807 case TLS_MODEL_LOCAL_EXEC
:
12808 if (aarch64_tls_size
== 12)
12809 return SYMBOL_TLSLE12
;
12810 else if (aarch64_tls_size
== 24)
12811 return SYMBOL_TLSLE24
;
12812 else if (aarch64_tls_size
== 32)
12813 return SYMBOL_TLSLE32
;
12814 else if (aarch64_tls_size
== 48)
12815 return SYMBOL_TLSLE48
;
12817 gcc_unreachable ();
12819 case TLS_MODEL_EMULATED
:
12820 case TLS_MODEL_NONE
:
12821 return SYMBOL_FORCE_TO_MEM
;
12824 gcc_unreachable ();
12828 /* Return the correct method for accessing X + OFFSET, where X is either
12829 a SYMBOL_REF or LABEL_REF. */
12831 enum aarch64_symbol_type
12832 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
12834 if (GET_CODE (x
) == LABEL_REF
)
12836 switch (aarch64_cmodel
)
12838 case AARCH64_CMODEL_LARGE
:
12839 return SYMBOL_FORCE_TO_MEM
;
12841 case AARCH64_CMODEL_TINY_PIC
:
12842 case AARCH64_CMODEL_TINY
:
12843 return SYMBOL_TINY_ABSOLUTE
;
12845 case AARCH64_CMODEL_SMALL_SPIC
:
12846 case AARCH64_CMODEL_SMALL_PIC
:
12847 case AARCH64_CMODEL_SMALL
:
12848 return SYMBOL_SMALL_ABSOLUTE
;
12851 gcc_unreachable ();
12855 if (GET_CODE (x
) == SYMBOL_REF
)
12857 if (aarch64_tls_symbol_p (x
))
12858 return aarch64_classify_tls_symbol (x
);
12860 switch (aarch64_cmodel
)
12862 case AARCH64_CMODEL_TINY
:
12863 /* When we retrieve symbol + offset address, we have to make sure
12864 the offset does not cause overflow of the final address. But
12865 we have no way of knowing the address of symbol at compile time
12866 so we can't accurately say if the distance between the PC and
12867 symbol + offset is outside the addressible range of +/-1M in the
12868 TINY code model. So we rely on images not being greater than
12869 1M and cap the offset at 1M and anything beyond 1M will have to
12870 be loaded using an alternative mechanism. Furthermore if the
12871 symbol is a weak reference to something that isn't known to
12872 resolve to a symbol in this module, then force to memory. */
12873 if ((SYMBOL_REF_WEAK (x
)
12874 && !aarch64_symbol_binds_local_p (x
))
12875 || !IN_RANGE (offset
, -1048575, 1048575))
12876 return SYMBOL_FORCE_TO_MEM
;
12877 return SYMBOL_TINY_ABSOLUTE
;
12879 case AARCH64_CMODEL_SMALL
:
12880 /* Same reasoning as the tiny code model, but the offset cap here is
12882 if ((SYMBOL_REF_WEAK (x
)
12883 && !aarch64_symbol_binds_local_p (x
))
12884 || !IN_RANGE (offset
, HOST_WIDE_INT_C (-4294967263),
12885 HOST_WIDE_INT_C (4294967264)))
12886 return SYMBOL_FORCE_TO_MEM
;
12887 return SYMBOL_SMALL_ABSOLUTE
;
12889 case AARCH64_CMODEL_TINY_PIC
:
12890 if (!aarch64_symbol_binds_local_p (x
))
12891 return SYMBOL_TINY_GOT
;
12892 return SYMBOL_TINY_ABSOLUTE
;
12894 case AARCH64_CMODEL_SMALL_SPIC
:
12895 case AARCH64_CMODEL_SMALL_PIC
:
12896 if (!aarch64_symbol_binds_local_p (x
))
12897 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
12898 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
12899 return SYMBOL_SMALL_ABSOLUTE
;
12901 case AARCH64_CMODEL_LARGE
:
12902 /* This is alright even in PIC code as the constant
12903 pool reference is always PC relative and within
12904 the same translation unit. */
12905 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
12906 return SYMBOL_SMALL_ABSOLUTE
;
12908 return SYMBOL_FORCE_TO_MEM
;
12911 gcc_unreachable ();
12915 /* By default push everything into the constant pool. */
12916 return SYMBOL_FORCE_TO_MEM
;
12920 aarch64_constant_address_p (rtx x
)
12922 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
12926 aarch64_legitimate_pic_operand_p (rtx x
)
12928 if (GET_CODE (x
) == SYMBOL_REF
12929 || (GET_CODE (x
) == CONST
12930 && GET_CODE (XEXP (x
, 0)) == PLUS
12931 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
12937 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
12938 that should be rematerialized rather than spilled. */
12941 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
12943 /* Support CSE and rematerialization of common constants. */
12944 if (CONST_INT_P (x
)
12945 || (CONST_DOUBLE_P (x
) && GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12946 || GET_CODE (x
) == CONST_VECTOR
)
12949 /* Do not allow vector struct mode constants for Advanced SIMD.
12950 We could support 0 and -1 easily, but they need support in
12951 aarch64-simd.md. */
12952 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
12953 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
12956 /* Only accept variable-length vector constants if they can be
12959 ??? It would be possible to handle rematerialization of other
12960 constants via secondary reloads. */
12961 if (vec_flags
& VEC_ANY_SVE
)
12962 return aarch64_simd_valid_immediate (x
, NULL
);
12964 if (GET_CODE (x
) == HIGH
)
12967 /* Accept polynomial constants that can be calculated by using the
12968 destination of a move as the sole temporary. Constants that
12969 require a second temporary cannot be rematerialized (they can't be
12970 forced to memory and also aren't legitimate constants). */
12972 if (poly_int_rtx_p (x
, &offset
))
12973 return aarch64_offset_temporaries (false, offset
) <= 1;
12975 /* If an offset is being added to something else, we need to allow the
12976 base to be moved into the destination register, meaning that there
12977 are no free temporaries for the offset. */
12978 x
= strip_offset (x
, &offset
);
12979 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
12982 /* Do not allow const (plus (anchor_symbol, const_int)). */
12983 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
12986 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
12987 so spilling them is better than rematerialization. */
12988 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
12991 /* Label references are always constant. */
12992 if (GET_CODE (x
) == LABEL_REF
)
12999 aarch64_load_tp (rtx target
)
13002 || GET_MODE (target
) != Pmode
13003 || !register_operand (target
, Pmode
))
13004 target
= gen_reg_rtx (Pmode
);
13006 /* Can return in any reg. */
13007 emit_insn (gen_aarch64_load_tp_hard (target
));
13011 /* On AAPCS systems, this is the "struct __va_list". */
13012 static GTY(()) tree va_list_type
;
13014 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13015 Return the type to use as __builtin_va_list.
13017 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13029 aarch64_build_builtin_va_list (void)
13032 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
13034 /* Create the type. */
13035 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
13036 /* Give it the required name. */
13037 va_list_name
= build_decl (BUILTINS_LOCATION
,
13039 get_identifier ("__va_list"),
13041 DECL_ARTIFICIAL (va_list_name
) = 1;
13042 TYPE_NAME (va_list_type
) = va_list_name
;
13043 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
13045 /* Create the fields. */
13046 f_stack
= build_decl (BUILTINS_LOCATION
,
13047 FIELD_DECL
, get_identifier ("__stack"),
13049 f_grtop
= build_decl (BUILTINS_LOCATION
,
13050 FIELD_DECL
, get_identifier ("__gr_top"),
13052 f_vrtop
= build_decl (BUILTINS_LOCATION
,
13053 FIELD_DECL
, get_identifier ("__vr_top"),
13055 f_groff
= build_decl (BUILTINS_LOCATION
,
13056 FIELD_DECL
, get_identifier ("__gr_offs"),
13057 integer_type_node
);
13058 f_vroff
= build_decl (BUILTINS_LOCATION
,
13059 FIELD_DECL
, get_identifier ("__vr_offs"),
13060 integer_type_node
);
13062 /* Tell tree-stdarg pass about our internal offset fields.
13063 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
13064 purpose to identify whether the code is updating va_list internal
13065 offset fields through irregular way. */
13066 va_list_gpr_counter_field
= f_groff
;
13067 va_list_fpr_counter_field
= f_vroff
;
13069 DECL_ARTIFICIAL (f_stack
) = 1;
13070 DECL_ARTIFICIAL (f_grtop
) = 1;
13071 DECL_ARTIFICIAL (f_vrtop
) = 1;
13072 DECL_ARTIFICIAL (f_groff
) = 1;
13073 DECL_ARTIFICIAL (f_vroff
) = 1;
13075 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
13076 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
13077 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
13078 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
13079 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
13081 TYPE_FIELDS (va_list_type
) = f_stack
;
13082 DECL_CHAIN (f_stack
) = f_grtop
;
13083 DECL_CHAIN (f_grtop
) = f_vrtop
;
13084 DECL_CHAIN (f_vrtop
) = f_groff
;
13085 DECL_CHAIN (f_groff
) = f_vroff
;
13087 /* Compute its layout. */
13088 layout_type (va_list_type
);
13090 return va_list_type
;
13093 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13095 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
13097 const CUMULATIVE_ARGS
*cum
;
13098 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
13099 tree stack
, grtop
, vrtop
, groff
, vroff
;
13101 int gr_save_area_size
= cfun
->va_list_gpr_size
;
13102 int vr_save_area_size
= cfun
->va_list_fpr_size
;
13105 cum
= &crtl
->args
.info
;
13106 if (cfun
->va_list_gpr_size
)
13107 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
13108 cfun
->va_list_gpr_size
);
13109 if (cfun
->va_list_fpr_size
)
13110 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
13111 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
13115 gcc_assert (cum
->aapcs_nvrn
== 0);
13116 vr_save_area_size
= 0;
13119 f_stack
= TYPE_FIELDS (va_list_type_node
);
13120 f_grtop
= DECL_CHAIN (f_stack
);
13121 f_vrtop
= DECL_CHAIN (f_grtop
);
13122 f_groff
= DECL_CHAIN (f_vrtop
);
13123 f_vroff
= DECL_CHAIN (f_groff
);
13125 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
13127 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
13129 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
13131 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
13133 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
13136 /* Emit code to initialize STACK, which points to the next varargs stack
13137 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13138 by named arguments. STACK is 8-byte aligned. */
13139 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
13140 if (cum
->aapcs_stack_size
> 0)
13141 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
13142 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
13143 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13145 /* Emit code to initialize GRTOP, the top of the GR save area.
13146 virtual_incoming_args_rtx should have been 16 byte aligned. */
13147 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
13148 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
13149 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13151 /* Emit code to initialize VRTOP, the top of the VR save area.
13152 This address is gr_save_area_bytes below GRTOP, rounded
13153 down to the next 16-byte boundary. */
13154 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
13155 vr_offset
= ROUND_UP (gr_save_area_size
,
13156 STACK_BOUNDARY
/ BITS_PER_UNIT
);
13159 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
13160 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
13161 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13163 /* Emit code to initialize GROFF, the offset from GRTOP of the
13164 next GPR argument. */
13165 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
13166 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
13167 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13169 /* Likewise emit code to initialize VROFF, the offset from FTOP
13170 of the next VR argument. */
13171 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
13172 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
13173 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13176 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13179 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
13180 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
13184 bool is_ha
; /* is HFA or HVA. */
13185 bool dw_align
; /* double-word align. */
13186 machine_mode ag_mode
= VOIDmode
;
13190 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
13191 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
13192 HOST_WIDE_INT size
, rsize
, adjust
, align
;
13193 tree t
, u
, cond1
, cond2
;
13195 indirect_p
= pass_by_reference (NULL
, TYPE_MODE (type
), type
, false);
13197 type
= build_pointer_type (type
);
13199 mode
= TYPE_MODE (type
);
13201 f_stack
= TYPE_FIELDS (va_list_type_node
);
13202 f_grtop
= DECL_CHAIN (f_stack
);
13203 f_vrtop
= DECL_CHAIN (f_grtop
);
13204 f_groff
= DECL_CHAIN (f_vrtop
);
13205 f_vroff
= DECL_CHAIN (f_groff
);
13207 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
13208 f_stack
, NULL_TREE
);
13209 size
= int_size_in_bytes (type
);
13210 align
= aarch64_function_arg_alignment (mode
, type
) / BITS_PER_UNIT
;
13214 if (aarch64_vfp_is_call_or_return_candidate (mode
,
13220 /* No frontends can create types with variable-sized modes, so we
13221 shouldn't be asked to pass or return them. */
13222 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
13224 /* TYPE passed in fp/simd registers. */
13226 aarch64_err_no_fpadvsimd (mode
);
13228 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
13229 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
13230 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
13231 unshare_expr (valist
), f_vroff
, NULL_TREE
);
13233 rsize
= nregs
* UNITS_PER_VREG
;
13237 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
13238 adjust
= UNITS_PER_VREG
- ag_size
;
13240 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
13241 && size
< UNITS_PER_VREG
)
13243 adjust
= UNITS_PER_VREG
- size
;
13248 /* TYPE passed in general registers. */
13249 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
13250 unshare_expr (valist
), f_grtop
, NULL_TREE
);
13251 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
13252 unshare_expr (valist
), f_groff
, NULL_TREE
);
13253 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
13254 nregs
= rsize
/ UNITS_PER_WORD
;
13259 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
13260 && size
< UNITS_PER_WORD
)
13262 adjust
= UNITS_PER_WORD
- size
;
13266 /* Get a local temporary for the field value. */
13267 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
13269 /* Emit code to branch if off >= 0. */
13270 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
13271 build_int_cst (TREE_TYPE (off
), 0));
13272 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
13276 /* Emit: offs = (offs + 15) & -16. */
13277 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
13278 build_int_cst (TREE_TYPE (off
), 15));
13279 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
13280 build_int_cst (TREE_TYPE (off
), -16));
13281 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
13286 /* Update ap.__[g|v]r_offs */
13287 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
13288 build_int_cst (TREE_TYPE (off
), rsize
));
13289 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
13293 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
13295 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13296 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
13297 build_int_cst (TREE_TYPE (f_off
), 0));
13298 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
13300 /* String up: make sure the assignment happens before the use. */
13301 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
13302 COND_EXPR_ELSE (cond1
) = t
;
13304 /* Prepare the trees handling the argument that is passed on the stack;
13305 the top level node will store in ON_STACK. */
13306 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
13309 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13310 t
= fold_build_pointer_plus_hwi (arg
, 15);
13311 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
13312 build_int_cst (TREE_TYPE (t
), -16));
13313 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
13317 /* Advance ap.__stack */
13318 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
13319 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
13320 build_int_cst (TREE_TYPE (t
), -8));
13321 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
13322 /* String up roundup and advance. */
13324 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
13325 /* String up with arg */
13326 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
13327 /* Big-endianness related address adjustment. */
13328 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
13329 && size
< UNITS_PER_WORD
)
13331 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
13332 size_int (UNITS_PER_WORD
- size
));
13333 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
13336 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
13337 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
13339 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13342 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
13343 build_int_cst (TREE_TYPE (off
), adjust
));
13345 t
= fold_convert (sizetype
, t
);
13346 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
13350 /* type ha; // treat as "struct {ftype field[n];}"
13351 ... [computing offs]
13352 for (i = 0; i <nregs; ++i, offs += 16)
13353 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13356 tree tmp_ha
, field_t
, field_ptr_t
;
13358 /* Declare a local variable. */
13359 tmp_ha
= create_tmp_var_raw (type
, "ha");
13360 gimple_add_tmp_var (tmp_ha
);
13362 /* Establish the base type. */
13366 field_t
= float_type_node
;
13367 field_ptr_t
= float_ptr_type_node
;
13370 field_t
= double_type_node
;
13371 field_ptr_t
= double_ptr_type_node
;
13374 field_t
= long_double_type_node
;
13375 field_ptr_t
= long_double_ptr_type_node
;
13378 field_t
= aarch64_fp16_type_node
;
13379 field_ptr_t
= aarch64_fp16_ptr_type_node
;
13384 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
13385 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
13386 field_ptr_t
= build_pointer_type (field_t
);
13393 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
13394 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
13396 t
= fold_convert (field_ptr_t
, addr
);
13397 t
= build2 (MODIFY_EXPR
, field_t
,
13398 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
13399 build1 (INDIRECT_REF
, field_t
, t
));
13401 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13402 for (i
= 1; i
< nregs
; ++i
)
13404 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
13405 u
= fold_convert (field_ptr_t
, addr
);
13406 u
= build2 (MODIFY_EXPR
, field_t
,
13407 build2 (MEM_REF
, field_t
, tmp_ha
,
13408 build_int_cst (field_ptr_t
,
13410 int_size_in_bytes (field_t
)))),
13411 build1 (INDIRECT_REF
, field_t
, u
));
13412 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
13415 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
13416 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
13419 COND_EXPR_ELSE (cond2
) = t
;
13420 addr
= fold_convert (build_pointer_type (type
), cond1
);
13421 addr
= build_va_arg_indirect_ref (addr
);
13424 addr
= build_va_arg_indirect_ref (addr
);
13429 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13432 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
, machine_mode mode
,
13433 tree type
, int *pretend_size ATTRIBUTE_UNUSED
,
13436 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
13437 CUMULATIVE_ARGS local_cum
;
13438 int gr_saved
= cfun
->va_list_gpr_size
;
13439 int vr_saved
= cfun
->va_list_fpr_size
;
13441 /* The caller has advanced CUM up to, but not beyond, the last named
13442 argument. Advance a local copy of CUM past the last "real" named
13443 argument, to find out how many registers are left over. */
13445 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), mode
, type
, true);
13447 /* Found out how many registers we need to save.
13448 Honor tree-stdvar analysis results. */
13449 if (cfun
->va_list_gpr_size
)
13450 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
13451 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
13452 if (cfun
->va_list_fpr_size
)
13453 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
13454 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
13458 gcc_assert (local_cum
.aapcs_nvrn
== 0);
13468 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13469 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
13470 - gr_saved
* UNITS_PER_WORD
);
13471 mem
= gen_frame_mem (BLKmode
, ptr
);
13472 set_mem_alias_set (mem
, get_varargs_alias_set ());
13474 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
13479 /* We can't use move_block_from_reg, because it will use
13480 the wrong mode, storing D regs only. */
13481 machine_mode mode
= TImode
;
13482 int off
, i
, vr_start
;
13484 /* Set OFF to the offset from virtual_incoming_args_rtx of
13485 the first vector register. The VR save area lies below
13486 the GR one, and is aligned to 16 bytes. */
13487 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
13488 STACK_BOUNDARY
/ BITS_PER_UNIT
);
13489 off
-= vr_saved
* UNITS_PER_VREG
;
13491 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
13492 for (i
= 0; i
< vr_saved
; ++i
)
13496 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
13497 mem
= gen_frame_mem (mode
, ptr
);
13498 set_mem_alias_set (mem
, get_varargs_alias_set ());
13499 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
13500 off
+= UNITS_PER_VREG
;
13505 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13506 any complication of having crtl->args.pretend_args_size changed. */
13507 cfun
->machine
->frame
.saved_varargs_size
13508 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
13509 STACK_BOUNDARY
/ BITS_PER_UNIT
)
13510 + vr_saved
* UNITS_PER_VREG
);
13514 aarch64_conditional_register_usage (void)
13519 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
13522 call_used_regs
[i
] = 1;
13526 for (i
= P0_REGNUM
; i
<= P15_REGNUM
; i
++)
13529 call_used_regs
[i
] = 1;
13532 /* When tracking speculation, we need a couple of call-clobbered registers
13533 to track the speculation state. It would be nice to just use
13534 IP0 and IP1, but currently there are numerous places that just
13535 assume these registers are free for other uses (eg pointer
13536 authentication). */
13537 if (aarch64_track_speculation
)
13539 fixed_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
13540 call_used_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
13541 fixed_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
13542 call_used_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
13546 /* Walk down the type tree of TYPE counting consecutive base elements.
13547 If *MODEP is VOIDmode, then set it to the first valid floating point
13548 type. If a non-floating point type is found, or if a floating point
13549 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13550 otherwise return the count in the sub-tree. */
13552 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
13555 HOST_WIDE_INT size
;
13557 switch (TREE_CODE (type
))
13560 mode
= TYPE_MODE (type
);
13561 if (mode
!= DFmode
&& mode
!= SFmode
13562 && mode
!= TFmode
&& mode
!= HFmode
)
13565 if (*modep
== VOIDmode
)
13568 if (*modep
== mode
)
13574 mode
= TYPE_MODE (TREE_TYPE (type
));
13575 if (mode
!= DFmode
&& mode
!= SFmode
13576 && mode
!= TFmode
&& mode
!= HFmode
)
13579 if (*modep
== VOIDmode
)
13582 if (*modep
== mode
)
13588 /* Use V2SImode and V4SImode as representatives of all 64-bit
13589 and 128-bit vector types. */
13590 size
= int_size_in_bytes (type
);
13603 if (*modep
== VOIDmode
)
13606 /* Vector modes are considered to be opaque: two vectors are
13607 equivalent for the purposes of being homogeneous aggregates
13608 if they are the same size. */
13609 if (*modep
== mode
)
13617 tree index
= TYPE_DOMAIN (type
);
13619 /* Can't handle incomplete types nor sizes that are not
13621 if (!COMPLETE_TYPE_P (type
)
13622 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
13625 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
13628 || !TYPE_MAX_VALUE (index
)
13629 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
13630 || !TYPE_MIN_VALUE (index
)
13631 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
13635 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
13636 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
13638 /* There must be no padding. */
13639 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
13640 count
* GET_MODE_BITSIZE (*modep
)))
13652 /* Can't handle incomplete types nor sizes that are not
13654 if (!COMPLETE_TYPE_P (type
)
13655 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
13658 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
13660 if (TREE_CODE (field
) != FIELD_DECL
)
13663 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
13666 count
+= sub_count
;
13669 /* There must be no padding. */
13670 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
13671 count
* GET_MODE_BITSIZE (*modep
)))
13678 case QUAL_UNION_TYPE
:
13680 /* These aren't very interesting except in a degenerate case. */
13685 /* Can't handle incomplete types nor sizes that are not
13687 if (!COMPLETE_TYPE_P (type
)
13688 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
13691 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
13693 if (TREE_CODE (field
) != FIELD_DECL
)
13696 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
13699 count
= count
> sub_count
? count
: sub_count
;
13702 /* There must be no padding. */
13703 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
13704 count
* GET_MODE_BITSIZE (*modep
)))
13717 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13718 type as described in AAPCS64 \S 4.1.2.
13720 See the comment above aarch64_composite_type_p for the notes on MODE. */
13723 aarch64_short_vector_p (const_tree type
,
13726 poly_int64 size
= -1;
13728 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
13729 size
= int_size_in_bytes (type
);
13730 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
13731 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
13732 size
= GET_MODE_SIZE (mode
);
13734 return known_eq (size
, 8) || known_eq (size
, 16);
13737 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
13738 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
13739 array types. The C99 floating-point complex types are also considered
13740 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
13741 types, which are GCC extensions and out of the scope of AAPCS64, are
13742 treated as composite types here as well.
13744 Note that MODE itself is not sufficient in determining whether a type
13745 is such a composite type or not. This is because
13746 stor-layout.c:compute_record_mode may have already changed the MODE
13747 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
13748 structure with only one field may have its MODE set to the mode of the
13749 field. Also an integer mode whose size matches the size of the
13750 RECORD_TYPE type may be used to substitute the original mode
13751 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
13752 solely relied on. */
13755 aarch64_composite_type_p (const_tree type
,
13758 if (aarch64_short_vector_p (type
, mode
))
13761 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
13764 if (mode
== BLKmode
13765 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
13766 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
13772 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
13773 shall be passed or returned in simd/fp register(s) (providing these
13774 parameter passing registers are available).
13776 Upon successful return, *COUNT returns the number of needed registers,
13777 *BASE_MODE returns the mode of the individual register and when IS_HAF
13778 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
13779 floating-point aggregate or a homogeneous short-vector aggregate. */
13782 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
13784 machine_mode
*base_mode
,
13788 machine_mode new_mode
= VOIDmode
;
13789 bool composite_p
= aarch64_composite_type_p (type
, mode
);
13791 if (is_ha
!= NULL
) *is_ha
= false;
13793 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
13794 || aarch64_short_vector_p (type
, mode
))
13799 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
13801 if (is_ha
!= NULL
) *is_ha
= true;
13803 new_mode
= GET_MODE_INNER (mode
);
13805 else if (type
&& composite_p
)
13807 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
13809 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
13811 if (is_ha
!= NULL
) *is_ha
= true;
13820 *base_mode
= new_mode
;
13824 /* Implement TARGET_STRUCT_VALUE_RTX. */
13827 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
13828 int incoming ATTRIBUTE_UNUSED
)
13830 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
13833 /* Implements target hook vector_mode_supported_p. */
13835 aarch64_vector_mode_supported_p (machine_mode mode
)
13837 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
13838 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
13841 /* Return appropriate SIMD container
13842 for MODE within a vector of WIDTH bits. */
13843 static machine_mode
13844 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
13846 if (TARGET_SVE
&& known_eq (width
, BITS_PER_SVE_VECTOR
))
13862 return VNx16QImode
;
13867 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
13870 if (known_eq (width
, 128))
13910 /* Return 128-bit container as the preferred SIMD mode for MODE. */
13911 static machine_mode
13912 aarch64_preferred_simd_mode (scalar_mode mode
)
13914 poly_int64 bits
= TARGET_SVE
? BITS_PER_SVE_VECTOR
: 128;
13915 return aarch64_simd_container_mode (mode
, bits
);
13918 /* Return a list of possible vector sizes for the vectorizer
13919 to iterate over. */
13921 aarch64_autovectorize_vector_sizes (vector_sizes
*sizes
)
13924 sizes
->safe_push (BYTES_PER_SVE_VECTOR
);
13925 sizes
->safe_push (16);
13926 sizes
->safe_push (8);
13929 /* Implement TARGET_MANGLE_TYPE. */
13931 static const char *
13932 aarch64_mangle_type (const_tree type
)
13934 /* The AArch64 ABI documents say that "__va_list" has to be
13935 mangled as if it is in the "std" namespace. */
13936 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
13937 return "St9__va_list";
13939 /* Half-precision float. */
13940 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
13943 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
13945 if (TYPE_NAME (type
) != NULL
)
13946 return aarch64_mangle_builtin_type (type
);
13948 /* Use the default mangling. */
13952 /* Find the first rtx_insn before insn that will generate an assembly
13956 aarch64_prev_real_insn (rtx_insn
*insn
)
13963 insn
= prev_real_insn (insn
);
13965 while (insn
&& recog_memoized (insn
) < 0);
13971 is_madd_op (enum attr_type t1
)
13974 /* A number of these may be AArch32 only. */
13975 enum attr_type mlatypes
[] = {
13976 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
13977 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
13978 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
13981 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
13983 if (t1
== mlatypes
[i
])
13990 /* Check if there is a register dependency between a load and the insn
13991 for which we hold recog_data. */
13994 dep_between_memop_and_curr (rtx memop
)
13999 gcc_assert (GET_CODE (memop
) == SET
);
14001 if (!REG_P (SET_DEST (memop
)))
14004 load_reg
= SET_DEST (memop
);
14005 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
14007 rtx operand
= recog_data
.operand
[opno
];
14008 if (REG_P (operand
)
14009 && reg_overlap_mentioned_p (load_reg
, operand
))
14017 /* When working around the Cortex-A53 erratum 835769,
14018 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14019 instruction and has a preceding memory instruction such that a NOP
14020 should be inserted between them. */
14023 aarch64_madd_needs_nop (rtx_insn
* insn
)
14025 enum attr_type attr_type
;
14029 if (!TARGET_FIX_ERR_A53_835769
)
14032 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
14035 attr_type
= get_attr_type (insn
);
14036 if (!is_madd_op (attr_type
))
14039 prev
= aarch64_prev_real_insn (insn
);
14040 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14041 Restore recog state to INSN to avoid state corruption. */
14042 extract_constrain_insn_cached (insn
);
14044 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
14047 body
= single_set (prev
);
14049 /* If the previous insn is a memory op and there is no dependency between
14050 it and the DImode madd, emit a NOP between them. If body is NULL then we
14051 have a complex memory operation, probably a load/store pair.
14052 Be conservative for now and emit a NOP. */
14053 if (GET_MODE (recog_data
.operand
[0]) == DImode
14054 && (!body
|| !dep_between_memop_and_curr (body
)))
14062 /* Implement FINAL_PRESCAN_INSN. */
14065 aarch64_final_prescan_insn (rtx_insn
*insn
)
14067 if (aarch64_madd_needs_nop (insn
))
14068 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
14072 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14076 aarch64_sve_index_immediate_p (rtx base_or_step
)
14078 return (CONST_INT_P (base_or_step
)
14079 && IN_RANGE (INTVAL (base_or_step
), -16, 15));
14082 /* Return true if X is a valid immediate for the SVE ADD and SUB
14083 instructions. Negate X first if NEGATE_P is true. */
14086 aarch64_sve_arith_immediate_p (rtx x
, bool negate_p
)
14090 if (!const_vec_duplicate_p (x
, &elt
)
14091 || !CONST_INT_P (elt
))
14094 HOST_WIDE_INT val
= INTVAL (elt
);
14097 val
&= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x
)));
14100 return IN_RANGE (val
, 0, 0xff);
14101 return IN_RANGE (val
, 0, 0xff00);
14104 /* Return true if X is a valid immediate operand for an SVE logical
14105 instruction such as AND. */
14108 aarch64_sve_bitmask_immediate_p (rtx x
)
14112 return (const_vec_duplicate_p (x
, &elt
)
14113 && CONST_INT_P (elt
)
14114 && aarch64_bitmask_imm (INTVAL (elt
),
14115 GET_MODE_INNER (GET_MODE (x
))));
14118 /* Return true if X is a valid immediate for the SVE DUP and CPY
14122 aarch64_sve_dup_immediate_p (rtx x
)
14126 if (!const_vec_duplicate_p (x
, &elt
)
14127 || !CONST_INT_P (elt
))
14130 HOST_WIDE_INT val
= INTVAL (elt
);
14132 return IN_RANGE (val
, -0x80, 0x7f);
14133 return IN_RANGE (val
, -0x8000, 0x7f00);
14136 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14137 SIGNED_P says whether the operand is signed rather than unsigned. */
14140 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
14144 return (const_vec_duplicate_p (x
, &elt
)
14145 && CONST_INT_P (elt
)
14147 ? IN_RANGE (INTVAL (elt
), -16, 15)
14148 : IN_RANGE (INTVAL (elt
), 0, 127)));
14151 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14152 instruction. Negate X first if NEGATE_P is true. */
14155 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
14160 if (!const_vec_duplicate_p (x
, &elt
)
14161 || GET_CODE (elt
) != CONST_DOUBLE
)
14164 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
14167 r
= real_value_negate (&r
);
14169 if (real_equal (&r
, &dconst1
))
14171 if (real_equal (&r
, &dconsthalf
))
14176 /* Return true if X is a valid immediate operand for an SVE FMUL
14180 aarch64_sve_float_mul_immediate_p (rtx x
)
14184 /* GCC will never generate a multiply with an immediate of 2, so there is no
14185 point testing for it (even though it is a valid constant). */
14186 return (const_vec_duplicate_p (x
, &elt
)
14187 && GET_CODE (elt
) == CONST_DOUBLE
14188 && real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
));
14191 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14192 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14193 is nonnull, use it to describe valid immediates. */
14195 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
14196 simd_immediate_info
*info
,
14197 enum simd_immediate_check which
,
14198 simd_immediate_info::insn_type insn
)
14200 /* Try a 4-byte immediate with LSL. */
14201 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
14202 if ((val32
& (0xff << shift
)) == val32
)
14205 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
14206 simd_immediate_info::LSL
, shift
);
14210 /* Try a 2-byte immediate with LSL. */
14211 unsigned int imm16
= val32
& 0xffff;
14212 if (imm16
== (val32
>> 16))
14213 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
14214 if ((imm16
& (0xff << shift
)) == imm16
)
14217 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
14218 simd_immediate_info::LSL
, shift
);
14222 /* Try a 4-byte immediate with MSL, except for cases that MVN
14224 if (which
== AARCH64_CHECK_MOV
)
14225 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
14227 unsigned int low
= (1 << shift
) - 1;
14228 if (((val32
& (0xff << shift
)) | low
) == val32
)
14231 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
14232 simd_immediate_info::MSL
, shift
);
14240 /* Return true if replicating VAL64 is a valid immediate for the
14241 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14242 use it to describe valid immediates. */
14244 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
14245 simd_immediate_info
*info
,
14246 enum simd_immediate_check which
)
14248 unsigned int val32
= val64
& 0xffffffff;
14249 unsigned int val16
= val64
& 0xffff;
14250 unsigned int val8
= val64
& 0xff;
14252 if (val32
== (val64
>> 32))
14254 if ((which
& AARCH64_CHECK_ORR
) != 0
14255 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
14256 simd_immediate_info::MOV
))
14259 if ((which
& AARCH64_CHECK_BIC
) != 0
14260 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
14261 simd_immediate_info::MVN
))
14264 /* Try using a replicated byte. */
14265 if (which
== AARCH64_CHECK_MOV
14266 && val16
== (val32
>> 16)
14267 && val8
== (val16
>> 8))
14270 *info
= simd_immediate_info (QImode
, val8
);
14275 /* Try using a bit-to-bytemask. */
14276 if (which
== AARCH64_CHECK_MOV
)
14279 for (i
= 0; i
< 64; i
+= 8)
14281 unsigned char byte
= (val64
>> i
) & 0xff;
14282 if (byte
!= 0 && byte
!= 0xff)
14288 *info
= simd_immediate_info (DImode
, val64
);
14295 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14296 instruction. If INFO is nonnull, use it to describe valid immediates. */
14299 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
14300 simd_immediate_info
*info
)
14302 scalar_int_mode mode
= DImode
;
14303 unsigned int val32
= val64
& 0xffffffff;
14304 if (val32
== (val64
>> 32))
14307 unsigned int val16
= val32
& 0xffff;
14308 if (val16
== (val32
>> 16))
14311 unsigned int val8
= val16
& 0xff;
14312 if (val8
== (val16
>> 8))
14316 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
14317 if (IN_RANGE (val
, -0x80, 0x7f))
14319 /* DUP with no shift. */
14321 *info
= simd_immediate_info (mode
, val
);
14324 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
14326 /* DUP with LSL #8. */
14328 *info
= simd_immediate_info (mode
, val
);
14331 if (aarch64_bitmask_imm (val64
, mode
))
14335 *info
= simd_immediate_info (mode
, val
);
14341 /* Return true if OP is a valid SIMD immediate for the operation
14342 described by WHICH. If INFO is nonnull, use it to describe valid
14345 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
14346 enum simd_immediate_check which
)
14348 machine_mode mode
= GET_MODE (op
);
14349 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14350 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
14353 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
14355 unsigned int n_elts
;
14356 if (GET_CODE (op
) == CONST_VECTOR
14357 && CONST_VECTOR_DUPLICATE_P (op
))
14358 n_elts
= CONST_VECTOR_NPATTERNS (op
);
14359 else if ((vec_flags
& VEC_SVE_DATA
)
14360 && const_vec_series_p (op
, &base
, &step
))
14362 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
14363 if (!aarch64_sve_index_immediate_p (base
)
14364 || !aarch64_sve_index_immediate_p (step
))
14368 *info
= simd_immediate_info (elt_mode
, base
, step
);
14371 else if (GET_CODE (op
) == CONST_VECTOR
14372 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
14373 /* N_ELTS set above. */;
14377 /* Handle PFALSE and PTRUE. */
14378 if (vec_flags
& VEC_SVE_PRED
)
14379 return (op
== CONST0_RTX (mode
)
14380 || op
== CONSTM1_RTX (mode
));
14382 scalar_float_mode elt_float_mode
;
14384 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
14386 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
14387 if (aarch64_float_const_zero_rtx_p (elt
)
14388 || aarch64_float_const_representable_p (elt
))
14391 *info
= simd_immediate_info (elt_float_mode
, elt
);
14396 unsigned int elt_size
= GET_MODE_SIZE (elt_mode
);
14400 scalar_int_mode elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
14402 /* Expand the vector constant out into a byte vector, with the least
14403 significant byte of the register first. */
14404 auto_vec
<unsigned char, 16> bytes
;
14405 bytes
.reserve (n_elts
* elt_size
);
14406 for (unsigned int i
= 0; i
< n_elts
; i
++)
14408 /* The vector is provided in gcc endian-neutral fashion.
14409 For aarch64_be Advanced SIMD, it must be laid out in the vector
14410 register in reverse order. */
14411 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
14412 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
14414 if (elt_mode
!= elt_int_mode
)
14415 elt
= gen_lowpart (elt_int_mode
, elt
);
14417 if (!CONST_INT_P (elt
))
14420 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
14421 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
14423 bytes
.quick_push (elt_val
& 0xff);
14424 elt_val
>>= BITS_PER_UNIT
;
14428 /* The immediate must repeat every eight bytes. */
14429 unsigned int nbytes
= bytes
.length ();
14430 for (unsigned i
= 8; i
< nbytes
; ++i
)
14431 if (bytes
[i
] != bytes
[i
- 8])
14434 /* Get the repeating 8-byte value as an integer. No endian correction
14435 is needed here because bytes is already in lsb-first order. */
14436 unsigned HOST_WIDE_INT val64
= 0;
14437 for (unsigned int i
= 0; i
< 8; i
++)
14438 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
14439 << (i
* BITS_PER_UNIT
));
14441 if (vec_flags
& VEC_SVE_DATA
)
14442 return aarch64_sve_valid_immediate (val64
, info
);
14444 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
14447 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14448 has a step in the range of INDEX. Return the index expression if so,
14449 otherwise return null. */
14451 aarch64_check_zero_based_sve_index_immediate (rtx x
)
14454 if (const_vec_series_p (x
, &base
, &step
)
14455 && base
== const0_rtx
14456 && aarch64_sve_index_immediate_p (step
))
14461 /* Check of immediate shift constants are within range. */
14463 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
14465 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
14467 return aarch64_const_vec_all_same_in_range_p (x
, 0, bit_width
- 1);
14469 return aarch64_const_vec_all_same_in_range_p (x
, 1, bit_width
);
14472 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14473 operation of width WIDTH at bit position POS. */
14476 aarch64_mask_from_zextract_ops (rtx width
, rtx pos
)
14478 gcc_assert (CONST_INT_P (width
));
14479 gcc_assert (CONST_INT_P (pos
));
14481 unsigned HOST_WIDE_INT mask
14482 = ((unsigned HOST_WIDE_INT
) 1 << UINTVAL (width
)) - 1;
14483 return GEN_INT (mask
<< UINTVAL (pos
));
14487 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
14489 if (GET_CODE (x
) == HIGH
14490 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
14493 if (CONST_INT_P (x
))
14496 if (VECTOR_MODE_P (GET_MODE (x
)))
14497 return aarch64_simd_valid_immediate (x
, NULL
);
14499 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
14502 if (aarch64_sve_cnt_immediate_p (x
))
14505 return aarch64_classify_symbolic_expression (x
)
14506 == SYMBOL_TINY_ABSOLUTE
;
14509 /* Return a const_int vector of VAL. */
14511 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
14513 rtx c
= gen_int_mode (val
, GET_MODE_INNER (mode
));
14514 return gen_const_vec_duplicate (mode
, c
);
14517 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14520 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
14522 machine_mode vmode
;
14524 vmode
= aarch64_simd_container_mode (mode
, 64);
14525 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
14526 return aarch64_simd_valid_immediate (op_v
, NULL
);
14529 /* Construct and return a PARALLEL RTX vector with elements numbering the
14530 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14531 the vector - from the perspective of the architecture. This does not
14532 line up with GCC's perspective on lane numbers, so we end up with
14533 different masks depending on our target endian-ness. The diagram
14534 below may help. We must draw the distinction when building masks
14535 which select one half of the vector. An instruction selecting
14536 architectural low-lanes for a big-endian target, must be described using
14537 a mask selecting GCC high-lanes.
14539 Big-Endian Little-Endian
14541 GCC 0 1 2 3 3 2 1 0
14542 | x | x | x | x | | x | x | x | x |
14543 Architecture 3 2 1 0 3 2 1 0
14545 Low Mask: { 2, 3 } { 0, 1 }
14546 High Mask: { 0, 1 } { 2, 3 }
14548 MODE Is the mode of the vector and NUNITS is the number of units in it. */
14551 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
14553 rtvec v
= rtvec_alloc (nunits
/ 2);
14554 int high_base
= nunits
/ 2;
14560 if (BYTES_BIG_ENDIAN
)
14561 base
= high
? low_base
: high_base
;
14563 base
= high
? high_base
: low_base
;
14565 for (i
= 0; i
< nunits
/ 2; i
++)
14566 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
14568 t1
= gen_rtx_PARALLEL (mode
, v
);
14572 /* Check OP for validity as a PARALLEL RTX vector with elements
14573 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14574 from the perspective of the architecture. See the diagram above
14575 aarch64_simd_vect_par_cnst_half for more details. */
14578 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
14582 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
14585 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
14586 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
14587 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
14590 if (count_op
!= count_ideal
)
14593 for (i
= 0; i
< count_ideal
; i
++)
14595 rtx elt_op
= XVECEXP (op
, 0, i
);
14596 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
14598 if (!CONST_INT_P (elt_op
)
14599 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
14605 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14606 HIGH (exclusive). */
14608 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
14611 HOST_WIDE_INT lane
;
14612 gcc_assert (CONST_INT_P (operand
));
14613 lane
= INTVAL (operand
);
14615 if (lane
< low
|| lane
>= high
)
14618 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
14620 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
14624 /* Peform endian correction on lane number N, which indexes a vector
14625 of mode MODE, and return the result as an SImode rtx. */
14628 aarch64_endian_lane_rtx (machine_mode mode
, unsigned int n
)
14630 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode
), n
), SImode
);
14633 /* Return TRUE if OP is a valid vector addressing mode. */
14636 aarch64_simd_mem_operand_p (rtx op
)
14638 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
14639 || REG_P (XEXP (op
, 0)));
14642 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
14645 aarch64_sve_ld1r_operand_p (rtx op
)
14647 struct aarch64_address_info addr
;
14651 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
14652 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
14653 && addr
.type
== ADDRESS_REG_IMM
14654 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
14657 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14658 The conditions for STR are the same. */
14660 aarch64_sve_ldr_operand_p (rtx op
)
14662 struct aarch64_address_info addr
;
14665 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
14666 false, ADDR_QUERY_ANY
)
14667 && addr
.type
== ADDRESS_REG_IMM
);
14670 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14671 We need to be able to access the individual pieces, so the range
14672 is different from LD[234] and ST[234]. */
14674 aarch64_sve_struct_memory_operand_p (rtx op
)
14679 machine_mode mode
= GET_MODE (op
);
14680 struct aarch64_address_info addr
;
14681 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
14683 || addr
.type
!= ADDRESS_REG_IMM
)
14686 poly_int64 first
= addr
.const_offset
;
14687 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
14688 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
14689 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
14692 /* Emit a register copy from operand to operand, taking care not to
14693 early-clobber source registers in the process.
14695 COUNT is the number of components into which the copy needs to be
14698 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
14699 unsigned int count
)
14702 int rdest
= REGNO (operands
[0]);
14703 int rsrc
= REGNO (operands
[1]);
14705 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
14707 for (i
= 0; i
< count
; i
++)
14708 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
14709 gen_rtx_REG (mode
, rsrc
+ i
));
14711 for (i
= 0; i
< count
; i
++)
14712 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
14713 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
14716 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14717 one of VSTRUCT modes: OI, CI, or XI. */
14719 aarch64_simd_attr_length_rglist (machine_mode mode
)
14721 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
14722 return (GET_MODE_SIZE (mode
).to_constant () / UNITS_PER_VREG
) * 4;
14725 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
14726 alignment of a vector to 128 bits. SVE predicates have an alignment of
14728 static HOST_WIDE_INT
14729 aarch64_simd_vector_alignment (const_tree type
)
14731 if (TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14732 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
14733 be set for non-predicate vectors of booleans. Modes are the most
14734 direct way we have of identifying real SVE predicate types. */
14735 return GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
? 16 : 128;
14736 HOST_WIDE_INT align
= tree_to_shwi (TYPE_SIZE (type
));
14737 return MIN (align
, 128);
14740 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
14742 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
14744 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
14746 /* If the length of the vector is fixed, try to align to that length,
14747 otherwise don't try to align at all. */
14748 HOST_WIDE_INT result
;
14749 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
14750 result
= TYPE_ALIGN (TREE_TYPE (type
));
14753 return TYPE_ALIGN (type
);
14756 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
14758 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
14763 /* For fixed-length vectors, check that the vectorizer will aim for
14764 full-vector alignment. This isn't true for generic GCC vectors
14765 that are wider than the ABI maximum of 128 bits. */
14766 poly_uint64 preferred_alignment
=
14767 aarch64_vectorize_preferred_vector_alignment (type
);
14768 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
14769 && maybe_ne (wi::to_widest (TYPE_SIZE (type
)),
14770 preferred_alignment
))
14773 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
14777 /* Return true if the vector misalignment factor is supported by the
14780 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
14781 const_tree type
, int misalignment
,
14784 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
14786 /* Return if movmisalign pattern is not supported for this mode. */
14787 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
14790 /* Misalignment factor is unknown at compile time. */
14791 if (misalignment
== -1)
14794 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
14798 /* If VALS is a vector constant that can be loaded into a register
14799 using DUP, generate instructions to do so and return an RTX to
14800 assign to the register. Otherwise return NULL_RTX. */
14802 aarch64_simd_dup_constant (rtx vals
)
14804 machine_mode mode
= GET_MODE (vals
);
14805 machine_mode inner_mode
= GET_MODE_INNER (mode
);
14808 if (!const_vec_duplicate_p (vals
, &x
))
14811 /* We can load this constant by using DUP and a constant in a
14812 single ARM register. This will be cheaper than a vector
14814 x
= copy_to_mode_reg (inner_mode
, x
);
14815 return gen_vec_duplicate (mode
, x
);
14819 /* Generate code to load VALS, which is a PARALLEL containing only
14820 constants (for vec_init) or CONST_VECTOR, efficiently into a
14821 register. Returns an RTX to copy into the register, or NULL_RTX
14822 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
14824 aarch64_simd_make_constant (rtx vals
)
14826 machine_mode mode
= GET_MODE (vals
);
14828 rtx const_vec
= NULL_RTX
;
14832 if (GET_CODE (vals
) == CONST_VECTOR
)
14834 else if (GET_CODE (vals
) == PARALLEL
)
14836 /* A CONST_VECTOR must contain only CONST_INTs and
14837 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
14838 Only store valid constants in a CONST_VECTOR. */
14839 int n_elts
= XVECLEN (vals
, 0);
14840 for (i
= 0; i
< n_elts
; ++i
)
14842 rtx x
= XVECEXP (vals
, 0, i
);
14843 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
14846 if (n_const
== n_elts
)
14847 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
14850 gcc_unreachable ();
14852 if (const_vec
!= NULL_RTX
14853 && aarch64_simd_valid_immediate (const_vec
, NULL
))
14854 /* Load using MOVI/MVNI. */
14856 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
14857 /* Loaded using DUP. */
14859 else if (const_vec
!= NULL_RTX
)
14860 /* Load from constant pool. We cannot take advantage of single-cycle
14861 LD1 because we need a PC-relative addressing mode. */
14864 /* A PARALLEL containing something not valid inside CONST_VECTOR.
14865 We cannot construct an initializer. */
14869 /* Expand a vector initialisation sequence, such that TARGET is
14870 initialised to contain VALS. */
14873 aarch64_expand_vector_init (rtx target
, rtx vals
)
14875 machine_mode mode
= GET_MODE (target
);
14876 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
14877 /* The number of vector elements. */
14878 int n_elts
= XVECLEN (vals
, 0);
14879 /* The number of vector elements which are not constant. */
14881 rtx any_const
= NULL_RTX
;
14882 /* The first element of vals. */
14883 rtx v0
= XVECEXP (vals
, 0, 0);
14884 bool all_same
= true;
14886 /* Count the number of variable elements to initialise. */
14887 for (int i
= 0; i
< n_elts
; ++i
)
14889 rtx x
= XVECEXP (vals
, 0, i
);
14890 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
14895 all_same
&= rtx_equal_p (x
, v0
);
14898 /* No variable elements, hand off to aarch64_simd_make_constant which knows
14899 how best to handle this. */
14902 rtx constant
= aarch64_simd_make_constant (vals
);
14903 if (constant
!= NULL_RTX
)
14905 emit_move_insn (target
, constant
);
14910 /* Splat a single non-constant element if we can. */
14913 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
14914 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
14918 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
14919 gcc_assert (icode
!= CODE_FOR_nothing
);
14921 /* If there are only variable elements, try to optimize
14922 the insertion using dup for the most common element
14923 followed by insertions. */
14925 /* The algorithm will fill matches[*][0] with the earliest matching element,
14926 and matches[X][1] with the count of duplicate elements (if X is the
14927 earliest element which has duplicates). */
14929 if (n_var
== n_elts
&& n_elts
<= 16)
14931 int matches
[16][2] = {0};
14932 for (int i
= 0; i
< n_elts
; i
++)
14934 for (int j
= 0; j
<= i
; j
++)
14936 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
14944 int maxelement
= 0;
14946 for (int i
= 0; i
< n_elts
; i
++)
14947 if (matches
[i
][1] > maxv
)
14950 maxv
= matches
[i
][1];
14953 /* Create a duplicate of the most common element, unless all elements
14954 are equally useless to us, in which case just immediately set the
14955 vector register using the first element. */
14959 /* For vectors of two 64-bit elements, we can do even better. */
14961 && (inner_mode
== E_DImode
14962 || inner_mode
== E_DFmode
))
14965 rtx x0
= XVECEXP (vals
, 0, 0);
14966 rtx x1
= XVECEXP (vals
, 0, 1);
14967 /* Combine can pick up this case, but handling it directly
14968 here leaves clearer RTL.
14970 This is load_pair_lanes<mode>, and also gives us a clean-up
14971 for store_pair_lanes<mode>. */
14972 if (memory_operand (x0
, inner_mode
)
14973 && memory_operand (x1
, inner_mode
)
14974 && !STRICT_ALIGNMENT
14975 && rtx_equal_p (XEXP (x1
, 0),
14976 plus_constant (Pmode
,
14978 GET_MODE_SIZE (inner_mode
))))
14981 if (inner_mode
== DFmode
)
14982 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
14984 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
14989 /* The subreg-move sequence below will move into lane zero of the
14990 vector register. For big-endian we want that position to hold
14991 the last element of VALS. */
14992 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
14993 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
14994 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
14998 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
14999 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
15002 /* Insert the rest. */
15003 for (int i
= 0; i
< n_elts
; i
++)
15005 rtx x
= XVECEXP (vals
, 0, i
);
15006 if (matches
[i
][0] == maxelement
)
15008 x
= copy_to_mode_reg (inner_mode
, x
);
15009 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
15014 /* Initialise a vector which is part-variable. We want to first try
15015 to build those lanes which are constant in the most efficient way we
15017 if (n_var
!= n_elts
)
15019 rtx copy
= copy_rtx (vals
);
15021 /* Load constant part of vector. We really don't care what goes into the
15022 parts we will overwrite, but we're more likely to be able to load the
15023 constant efficiently if it has fewer, larger, repeating parts
15024 (see aarch64_simd_valid_immediate). */
15025 for (int i
= 0; i
< n_elts
; i
++)
15027 rtx x
= XVECEXP (vals
, 0, i
);
15028 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
15030 rtx subst
= any_const
;
15031 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
15033 /* Look in the copied vector, as more elements are const. */
15034 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
15035 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
15041 XVECEXP (copy
, 0, i
) = subst
;
15043 aarch64_expand_vector_init (target
, copy
);
15046 /* Insert the variable lanes directly. */
15047 for (int i
= 0; i
< n_elts
; i
++)
15049 rtx x
= XVECEXP (vals
, 0, i
);
15050 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
15052 x
= copy_to_mode_reg (inner_mode
, x
);
15053 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
15057 static unsigned HOST_WIDE_INT
15058 aarch64_shift_truncation_mask (machine_mode mode
)
15060 if (!SHIFT_COUNT_TRUNCATED
|| aarch64_vector_data_mode_p (mode
))
15062 return GET_MODE_UNIT_BITSIZE (mode
) - 1;
15065 /* Select a format to encode pointers in exception handling data. */
15067 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
15070 switch (aarch64_cmodel
)
15072 case AARCH64_CMODEL_TINY
:
15073 case AARCH64_CMODEL_TINY_PIC
:
15074 case AARCH64_CMODEL_SMALL
:
15075 case AARCH64_CMODEL_SMALL_PIC
:
15076 case AARCH64_CMODEL_SMALL_SPIC
:
15077 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15079 type
= DW_EH_PE_sdata4
;
15082 /* No assumptions here. 8-byte relocs required. */
15083 type
= DW_EH_PE_sdata8
;
15086 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
15089 /* The last .arch and .tune assembly strings that we printed. */
15090 static std::string aarch64_last_printed_arch_string
;
15091 static std::string aarch64_last_printed_tune_string
;
15093 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15094 by the function fndecl. */
15097 aarch64_declare_function_name (FILE *stream
, const char* name
,
15100 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
15102 struct cl_target_option
*targ_options
;
15104 targ_options
= TREE_TARGET_OPTION (target_parts
);
15106 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
15107 gcc_assert (targ_options
);
15109 const struct processor
*this_arch
15110 = aarch64_get_arch (targ_options
->x_explicit_arch
);
15112 unsigned long isa_flags
= targ_options
->x_aarch64_isa_flags
;
15113 std::string extension
15114 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
15116 /* Only update the assembler .arch string if it is distinct from the last
15117 such string we printed. */
15118 std::string to_print
= this_arch
->name
+ extension
;
15119 if (to_print
!= aarch64_last_printed_arch_string
)
15121 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
15122 aarch64_last_printed_arch_string
= to_print
;
15125 /* Print the cpu name we're tuning for in the comments, might be
15126 useful to readers of the generated asm. Do it only when it changes
15127 from function to function and verbose assembly is requested. */
15128 const struct processor
*this_tune
15129 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
15131 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
15133 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
15135 aarch64_last_printed_tune_string
= this_tune
->name
;
15138 /* Don't forget the type directive for ELF. */
15139 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
15140 ASM_OUTPUT_LABEL (stream
, name
);
15143 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15146 aarch64_start_file (void)
15148 struct cl_target_option
*default_options
15149 = TREE_TARGET_OPTION (target_option_default_node
);
15151 const struct processor
*default_arch
15152 = aarch64_get_arch (default_options
->x_explicit_arch
);
15153 unsigned long default_isa_flags
= default_options
->x_aarch64_isa_flags
;
15154 std::string extension
15155 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
15156 default_arch
->flags
);
15158 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
15159 aarch64_last_printed_tune_string
= "";
15160 asm_fprintf (asm_out_file
, "\t.arch %s\n",
15161 aarch64_last_printed_arch_string
.c_str ());
15163 default_file_start ();
15166 /* Emit load exclusive. */
15169 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
15170 rtx mem
, rtx model_rtx
)
15172 emit_insn (gen_aarch64_load_exclusive (mode
, rval
, mem
, model_rtx
));
15175 /* Emit store exclusive. */
15178 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
15179 rtx rval
, rtx mem
, rtx model_rtx
)
15181 emit_insn (gen_aarch64_store_exclusive (mode
, bval
, rval
, mem
, model_rtx
));
15184 /* Mark the previous jump instruction as unlikely. */
15187 aarch64_emit_unlikely_jump (rtx insn
)
15189 rtx_insn
*jump
= emit_jump_insn (insn
);
15190 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
15193 /* Expand a compare and swap pattern. */
15196 aarch64_expand_compare_and_swap (rtx operands
[])
15198 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
, cc_reg
;
15199 machine_mode mode
, r_mode
;
15201 bval
= operands
[0];
15202 rval
= operands
[1];
15204 oldval
= operands
[3];
15205 newval
= operands
[4];
15206 is_weak
= operands
[5];
15207 mod_s
= operands
[6];
15208 mod_f
= operands
[7];
15209 mode
= GET_MODE (mem
);
15211 /* Normally the succ memory model must be stronger than fail, but in the
15212 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15213 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15214 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
15215 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
15216 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
15219 if (mode
== QImode
|| mode
== HImode
)
15222 rval
= gen_reg_rtx (r_mode
);
15227 /* The CAS insn requires oldval and rval overlap, but we need to
15228 have a copy of oldval saved across the operation to tell if
15229 the operation is successful. */
15230 if (reg_overlap_mentioned_p (rval
, oldval
))
15231 rval
= copy_to_mode_reg (r_mode
, oldval
);
15233 emit_move_insn (rval
, gen_lowpart (r_mode
, oldval
));
15235 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
,
15237 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
15241 /* The oldval predicate varies by mode. Test it and force to reg. */
15242 insn_code code
= code_for_aarch64_compare_and_swap (mode
);
15243 if (!insn_data
[code
].operand
[2].predicate (oldval
, mode
))
15244 oldval
= force_reg (mode
, oldval
);
15246 emit_insn (GEN_FCN (code
) (rval
, mem
, oldval
, newval
,
15247 is_weak
, mod_s
, mod_f
));
15248 cc_reg
= gen_rtx_REG (CCmode
, CC_REGNUM
);
15251 if (r_mode
!= mode
)
15252 rval
= gen_lowpart (mode
, rval
);
15253 emit_move_insn (operands
[1], rval
);
15255 x
= gen_rtx_EQ (SImode
, cc_reg
, const0_rtx
);
15256 emit_insn (gen_rtx_SET (bval
, x
));
15259 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
15260 sequence implementing an atomic operation. */
15263 aarch64_emit_post_barrier (enum memmodel model
)
15265 const enum memmodel base_model
= memmodel_base (model
);
15267 if (is_mm_sync (model
)
15268 && (base_model
== MEMMODEL_ACQUIRE
15269 || base_model
== MEMMODEL_ACQ_REL
15270 || base_model
== MEMMODEL_SEQ_CST
))
15272 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
15276 /* Split a compare and swap pattern. */
15279 aarch64_split_compare_and_swap (rtx operands
[])
15281 rtx rval
, mem
, oldval
, newval
, scratch
;
15284 rtx_code_label
*label1
, *label2
;
15286 enum memmodel model
;
15289 rval
= operands
[0];
15291 oldval
= operands
[2];
15292 newval
= operands
[3];
15293 is_weak
= (operands
[4] != const0_rtx
);
15294 model_rtx
= operands
[5];
15295 scratch
= operands
[7];
15296 mode
= GET_MODE (mem
);
15297 model
= memmodel_from_int (INTVAL (model_rtx
));
15299 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15302 LD[A]XR rval, [mem]
15304 ST[L]XR scratch, newval, [mem]
15305 CBNZ scratch, .label1
15308 bool strong_zero_p
= !is_weak
&& oldval
== const0_rtx
;
15313 label1
= gen_label_rtx ();
15314 emit_label (label1
);
15316 label2
= gen_label_rtx ();
15318 /* The initial load can be relaxed for a __sync operation since a final
15319 barrier will be emitted to stop code hoisting. */
15320 if (is_mm_sync (model
))
15321 aarch64_emit_load_exclusive (mode
, rval
, mem
,
15322 GEN_INT (MEMMODEL_RELAXED
));
15324 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
15328 if (aarch64_track_speculation
)
15330 /* Emit an explicit compare instruction, so that we can correctly
15331 track the condition codes. */
15332 rtx cc_reg
= aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
15333 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
15336 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
15338 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
15339 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
15340 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
15344 cond
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
15345 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
15346 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
15347 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
15348 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
15351 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
15355 if (aarch64_track_speculation
)
15357 /* Emit an explicit compare instruction, so that we can correctly
15358 track the condition codes. */
15359 rtx cc_reg
= aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
15360 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
15363 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
15365 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
15366 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
15367 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
15371 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
15372 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
15373 emit_insn (gen_rtx_SET (cond
, x
));
15376 emit_label (label2
);
15377 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
15378 to set the condition flags. If this is not used it will be removed by
15382 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
15383 x
= gen_rtx_COMPARE (CCmode
, rval
, const0_rtx
);
15384 emit_insn (gen_rtx_SET (cond
, x
));
15386 /* Emit any final barrier needed for a __sync operation. */
15387 if (is_mm_sync (model
))
15388 aarch64_emit_post_barrier (model
);
15391 /* Split an atomic operation. */
15394 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
15395 rtx value
, rtx model_rtx
, rtx cond
)
15397 machine_mode mode
= GET_MODE (mem
);
15398 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
15399 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
15400 const bool is_sync
= is_mm_sync (model
);
15401 rtx_code_label
*label
;
15404 /* Split the atomic operation into a sequence. */
15405 label
= gen_label_rtx ();
15406 emit_label (label
);
15409 new_out
= gen_lowpart (wmode
, new_out
);
15411 old_out
= gen_lowpart (wmode
, old_out
);
15414 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
15416 /* The initial load can be relaxed for a __sync operation since a final
15417 barrier will be emitted to stop code hoisting. */
15419 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
15420 GEN_INT (MEMMODEL_RELAXED
));
15422 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
15431 x
= gen_rtx_AND (wmode
, old_out
, value
);
15432 emit_insn (gen_rtx_SET (new_out
, x
));
15433 x
= gen_rtx_NOT (wmode
, new_out
);
15434 emit_insn (gen_rtx_SET (new_out
, x
));
15438 if (CONST_INT_P (value
))
15440 value
= GEN_INT (-INTVAL (value
));
15443 /* Fall through. */
15446 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
15447 emit_insn (gen_rtx_SET (new_out
, x
));
15451 aarch64_emit_store_exclusive (mode
, cond
, mem
,
15452 gen_lowpart (mode
, new_out
), model_rtx
);
15454 if (aarch64_track_speculation
)
15456 /* Emit an explicit compare instruction, so that we can correctly
15457 track the condition codes. */
15458 rtx cc_reg
= aarch64_gen_compare_reg (NE
, cond
, const0_rtx
);
15459 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
15462 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
15464 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
15465 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
15466 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
15468 /* Emit any final barrier needed for a __sync operation. */
15470 aarch64_emit_post_barrier (model
);
15474 aarch64_init_libfuncs (void)
15476 /* Half-precision float operations. The compiler handles all operations
15477 with NULL libfuncs by converting to SFmode. */
15480 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
15481 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
15484 set_optab_libfunc (add_optab
, HFmode
, NULL
);
15485 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
15486 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
15487 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
15488 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
15491 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
15492 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
15493 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
15494 set_optab_libfunc (le_optab
, HFmode
, NULL
);
15495 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
15496 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
15497 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
15500 /* Target hook for c_mode_for_suffix. */
15501 static machine_mode
15502 aarch64_c_mode_for_suffix (char suffix
)
15510 /* We can only represent floating point constants which will fit in
15511 "quarter-precision" values. These values are characterised by
15512 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
15515 (-1)^s * (n/16) * 2^r
15518 's' is the sign bit.
15519 'n' is an integer in the range 16 <= n <= 31.
15520 'r' is an integer in the range -3 <= r <= 4. */
15522 /* Return true iff X can be represented by a quarter-precision
15523 floating point immediate operand X. Note, we cannot represent 0.0. */
15525 aarch64_float_const_representable_p (rtx x
)
15527 /* This represents our current view of how many bits
15528 make up the mantissa. */
15529 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
15531 unsigned HOST_WIDE_INT mantissa
, mask
;
15532 REAL_VALUE_TYPE r
, m
;
15535 if (!CONST_DOUBLE_P (x
))
15538 if (GET_MODE (x
) == VOIDmode
15539 || (GET_MODE (x
) == HFmode
&& !TARGET_FP_F16INST
))
15542 r
= *CONST_DOUBLE_REAL_VALUE (x
);
15544 /* We cannot represent infinities, NaNs or +/-zero. We won't
15545 know if we have +zero until we analyse the mantissa, but we
15546 can reject the other invalid values. */
15547 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
15548 || REAL_VALUE_MINUS_ZERO (r
))
15551 /* Extract exponent. */
15552 r
= real_value_abs (&r
);
15553 exponent
= REAL_EXP (&r
);
15555 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
15556 highest (sign) bit, with a fixed binary point at bit point_pos.
15557 m1 holds the low part of the mantissa, m2 the high part.
15558 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
15559 bits for the mantissa, this can fail (low bits will be lost). */
15560 real_ldexp (&m
, &r
, point_pos
- exponent
);
15561 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
15563 /* If the low part of the mantissa has bits set we cannot represent
15565 if (w
.ulow () != 0)
15567 /* We have rejected the lower HOST_WIDE_INT, so update our
15568 understanding of how many bits lie in the mantissa and
15569 look only at the high HOST_WIDE_INT. */
15570 mantissa
= w
.elt (1);
15571 point_pos
-= HOST_BITS_PER_WIDE_INT
;
15573 /* We can only represent values with a mantissa of the form 1.xxxx. */
15574 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
15575 if ((mantissa
& mask
) != 0)
15578 /* Having filtered unrepresentable values, we may now remove all
15579 but the highest 5 bits. */
15580 mantissa
>>= point_pos
- 5;
15582 /* We cannot represent the value 0.0, so reject it. This is handled
15587 /* Then, as bit 4 is always set, we can mask it off, leaving
15588 the mantissa in the range [0, 15]. */
15589 mantissa
&= ~(1 << 4);
15590 gcc_assert (mantissa
<= 15);
15592 /* GCC internally does not use IEEE754-like encoding (where normalized
15593 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
15594 Our mantissa values are shifted 4 places to the left relative to
15595 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
15596 by 5 places to correct for GCC's representation. */
15597 exponent
= 5 - exponent
;
15599 return (exponent
>= 0 && exponent
<= 7);
15602 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
15603 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
15604 output MOVI/MVNI, ORR or BIC immediate. */
15606 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
15607 enum simd_immediate_check which
)
15610 static char templ
[40];
15611 const char *mnemonic
;
15612 const char *shift_op
;
15613 unsigned int lane_count
= 0;
15616 struct simd_immediate_info info
;
15618 /* This will return true to show const_vector is legal for use as either
15619 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
15620 It will also update INFO to show how the immediate should be generated.
15621 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
15622 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
15623 gcc_assert (is_valid
);
15625 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
15626 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
15628 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
15630 gcc_assert (info
.shift
== 0 && info
.insn
== simd_immediate_info::MOV
);
15631 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15632 move immediate path. */
15633 if (aarch64_float_const_zero_rtx_p (info
.value
))
15634 info
.value
= GEN_INT (0);
15637 const unsigned int buf_size
= 20;
15638 char float_buf
[buf_size
] = {'\0'};
15639 real_to_decimal_for_mode (float_buf
,
15640 CONST_DOUBLE_REAL_VALUE (info
.value
),
15641 buf_size
, buf_size
, 1, info
.elt_mode
);
15643 if (lane_count
== 1)
15644 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
15646 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
15647 lane_count
, element_char
, float_buf
);
15652 gcc_assert (CONST_INT_P (info
.value
));
15654 if (which
== AARCH64_CHECK_MOV
)
15656 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
15657 shift_op
= info
.modifier
== simd_immediate_info::MSL
? "msl" : "lsl";
15658 if (lane_count
== 1)
15659 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
15660 mnemonic
, UINTVAL (info
.value
));
15661 else if (info
.shift
)
15662 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
15663 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
15664 element_char
, UINTVAL (info
.value
), shift_op
, info
.shift
);
15666 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
15667 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
15668 element_char
, UINTVAL (info
.value
));
15672 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15673 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
15675 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
15676 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
15677 element_char
, UINTVAL (info
.value
), "lsl", info
.shift
);
15679 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
15680 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
15681 element_char
, UINTVAL (info
.value
));
15687 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
15690 /* If a floating point number was passed and we desire to use it in an
15691 integer mode do the conversion to integer. */
15692 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
15694 unsigned HOST_WIDE_INT ival
;
15695 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
15696 gcc_unreachable ();
15697 immediate
= gen_int_mode (ival
, mode
);
15700 machine_mode vmode
;
15701 /* use a 64 bit mode for everything except for DI/DF mode, where we use
15702 a 128 bit vector mode. */
15703 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
15705 vmode
= aarch64_simd_container_mode (mode
, width
);
15706 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
15707 return aarch64_output_simd_mov_immediate (v_op
, width
);
15710 /* Return the output string to use for moving immediate CONST_VECTOR
15711 into an SVE register. */
15714 aarch64_output_sve_mov_immediate (rtx const_vector
)
15716 static char templ
[40];
15717 struct simd_immediate_info info
;
15720 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
15721 gcc_assert (is_valid
);
15723 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
15727 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
15728 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
15729 element_char
, INTVAL (info
.value
), INTVAL (info
.step
));
15733 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
15735 if (aarch64_float_const_zero_rtx_p (info
.value
))
15736 info
.value
= GEN_INT (0);
15739 const int buf_size
= 20;
15740 char float_buf
[buf_size
] = {};
15741 real_to_decimal_for_mode (float_buf
,
15742 CONST_DOUBLE_REAL_VALUE (info
.value
),
15743 buf_size
, buf_size
, 1, info
.elt_mode
);
15745 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
15746 element_char
, float_buf
);
15751 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
15752 element_char
, INTVAL (info
.value
));
15756 /* Return the asm format for a PTRUE instruction whose destination has
15757 mode MODE. SUFFIX is the element size suffix. */
15760 aarch64_output_ptrue (machine_mode mode
, char suffix
)
15762 unsigned int nunits
;
15763 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
15764 if (GET_MODE_NUNITS (mode
).is_constant (&nunits
))
15765 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", suffix
, nunits
);
15767 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, all", suffix
);
15771 /* Split operands into moves from op[1] + op[2] into op[0]. */
15774 aarch64_split_combinev16qi (rtx operands
[3])
15776 unsigned int dest
= REGNO (operands
[0]);
15777 unsigned int src1
= REGNO (operands
[1]);
15778 unsigned int src2
= REGNO (operands
[2]);
15779 machine_mode halfmode
= GET_MODE (operands
[1]);
15780 unsigned int halfregs
= REG_NREGS (operands
[1]);
15781 rtx destlo
, desthi
;
15783 gcc_assert (halfmode
== V16QImode
);
15785 if (src1
== dest
&& src2
== dest
+ halfregs
)
15787 /* No-op move. Can't split to nothing; emit something. */
15788 emit_note (NOTE_INSN_DELETED
);
15792 /* Preserve register attributes for variable tracking. */
15793 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
15794 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
15795 GET_MODE_SIZE (halfmode
));
15797 /* Special case of reversed high/low parts. */
15798 if (reg_overlap_mentioned_p (operands
[2], destlo
)
15799 && reg_overlap_mentioned_p (operands
[1], desthi
))
15801 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
15802 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
15803 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
15805 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
15807 /* Try to avoid unnecessary moves if part of the result
15808 is in the right place already. */
15810 emit_move_insn (destlo
, operands
[1]);
15811 if (src2
!= dest
+ halfregs
)
15812 emit_move_insn (desthi
, operands
[2]);
15816 if (src2
!= dest
+ halfregs
)
15817 emit_move_insn (desthi
, operands
[2]);
15819 emit_move_insn (destlo
, operands
[1]);
15823 /* vec_perm support. */
15825 struct expand_vec_perm_d
15827 rtx target
, op0
, op1
;
15828 vec_perm_indices perm
;
15829 machine_mode vmode
;
15830 unsigned int vec_flags
;
15835 /* Generate a variable permutation. */
15838 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
15840 machine_mode vmode
= GET_MODE (target
);
15841 bool one_vector_p
= rtx_equal_p (op0
, op1
);
15843 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
15844 gcc_checking_assert (GET_MODE (op0
) == vmode
);
15845 gcc_checking_assert (GET_MODE (op1
) == vmode
);
15846 gcc_checking_assert (GET_MODE (sel
) == vmode
);
15847 gcc_checking_assert (TARGET_SIMD
);
15851 if (vmode
== V8QImode
)
15853 /* Expand the argument to a V16QI mode by duplicating it. */
15854 rtx pair
= gen_reg_rtx (V16QImode
);
15855 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
15856 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
15860 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
15867 if (vmode
== V8QImode
)
15869 pair
= gen_reg_rtx (V16QImode
);
15870 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
15871 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
15875 pair
= gen_reg_rtx (OImode
);
15876 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
15877 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
15882 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15883 NELT is the number of elements in the vector. */
15886 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
15889 machine_mode vmode
= GET_MODE (target
);
15890 bool one_vector_p
= rtx_equal_p (op0
, op1
);
15893 /* The TBL instruction does not use a modulo index, so we must take care
15894 of that ourselves. */
15895 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
15896 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
15897 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
15899 /* For big-endian, we also need to reverse the index within the vector
15900 (but not which vector). */
15901 if (BYTES_BIG_ENDIAN
)
15903 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15905 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
15906 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
15907 NULL
, 0, OPTAB_LIB_WIDEN
);
15909 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
15912 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15915 emit_unspec2 (rtx target
, int code
, rtx op0
, rtx op1
)
15917 emit_insn (gen_rtx_SET (target
,
15918 gen_rtx_UNSPEC (GET_MODE (target
),
15919 gen_rtvec (2, op0
, op1
), code
)));
15922 /* Expand an SVE vec_perm with the given operands. */
15925 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
15927 machine_mode data_mode
= GET_MODE (target
);
15928 machine_mode sel_mode
= GET_MODE (sel
);
15929 /* Enforced by the pattern condition. */
15930 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
15932 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15933 size of the two value vectors, i.e. the upper bits of the indices
15934 are effectively ignored. SVE TBL instead produces 0 for any
15935 out-of-range indices, so we need to modulo all the vec_perm indices
15936 to ensure they are all in range. */
15937 rtx sel_reg
= force_reg (sel_mode
, sel
);
15939 /* Check if the sel only references the first values vector. */
15940 if (GET_CODE (sel
) == CONST_VECTOR
15941 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
15943 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
15947 /* Check if the two values vectors are the same. */
15948 if (rtx_equal_p (op0
, op1
))
15950 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
15951 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
15952 NULL
, 0, OPTAB_DIRECT
);
15953 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
15957 /* Run TBL on for each value vector and combine the results. */
15959 rtx res0
= gen_reg_rtx (data_mode
);
15960 rtx res1
= gen_reg_rtx (data_mode
);
15961 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
15962 if (GET_CODE (sel
) != CONST_VECTOR
15963 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
15965 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
15967 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
15968 NULL
, 0, OPTAB_DIRECT
);
15970 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
15971 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
15972 NULL
, 0, OPTAB_DIRECT
);
15973 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
15974 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
15975 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
15977 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
15980 /* Recognize patterns suitable for the TRN instructions. */
15982 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
15985 poly_uint64 nelt
= d
->perm
.length ();
15986 rtx out
, in0
, in1
, x
;
15987 machine_mode vmode
= d
->vmode
;
15989 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15992 /* Note that these are little-endian tests.
15993 We correct for big-endian later. */
15994 if (!d
->perm
[0].is_constant (&odd
)
15995 || (odd
!= 0 && odd
!= 1)
15996 || !d
->perm
.series_p (0, 2, odd
, 2)
15997 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
16006 /* We don't need a big-endian lane correction for SVE; see the comment
16007 at the head of aarch64-sve.md for details. */
16008 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
16010 x
= in0
, in0
= in1
, in1
= x
;
16015 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
16016 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
16020 /* Recognize patterns suitable for the UZP instructions. */
16022 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
16025 rtx out
, in0
, in1
, x
;
16026 machine_mode vmode
= d
->vmode
;
16028 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
16031 /* Note that these are little-endian tests.
16032 We correct for big-endian later. */
16033 if (!d
->perm
[0].is_constant (&odd
)
16034 || (odd
!= 0 && odd
!= 1)
16035 || !d
->perm
.series_p (0, 1, odd
, 2))
16044 /* We don't need a big-endian lane correction for SVE; see the comment
16045 at the head of aarch64-sve.md for details. */
16046 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
16048 x
= in0
, in0
= in1
, in1
= x
;
16053 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
16054 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
16058 /* Recognize patterns suitable for the ZIP instructions. */
16060 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
16063 poly_uint64 nelt
= d
->perm
.length ();
16064 rtx out
, in0
, in1
, x
;
16065 machine_mode vmode
= d
->vmode
;
16067 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
16070 /* Note that these are little-endian tests.
16071 We correct for big-endian later. */
16072 poly_uint64 first
= d
->perm
[0];
16073 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
16074 || !d
->perm
.series_p (0, 2, first
, 1)
16075 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
16077 high
= maybe_ne (first
, 0U);
16085 /* We don't need a big-endian lane correction for SVE; see the comment
16086 at the head of aarch64-sve.md for details. */
16087 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
16089 x
= in0
, in0
= in1
, in1
= x
;
16094 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
16095 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
16099 /* Recognize patterns for the EXT insn. */
16102 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
16104 HOST_WIDE_INT location
;
16107 /* The first element always refers to the first vector.
16108 Check if the extracted indices are increasing by one. */
16109 if (d
->vec_flags
== VEC_SVE_PRED
16110 || !d
->perm
[0].is_constant (&location
)
16111 || !d
->perm
.series_p (0, 1, location
, 1))
16118 /* The case where (location == 0) is a no-op for both big- and little-endian,
16119 and is removed by the mid-end at optimization levels -O1 and higher.
16121 We don't need a big-endian lane correction for SVE; see the comment
16122 at the head of aarch64-sve.md for details. */
16123 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
16125 /* After setup, we want the high elements of the first vector (stored
16126 at the LSB end of the register), and the low elements of the second
16127 vector (stored at the MSB end of the register). So swap. */
16128 std::swap (d
->op0
, d
->op1
);
16129 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16130 to_constant () is safe since this is restricted to Advanced SIMD
16132 location
= d
->perm
.length ().to_constant () - location
;
16135 offset
= GEN_INT (location
);
16136 emit_set_insn (d
->target
,
16137 gen_rtx_UNSPEC (d
->vmode
,
16138 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
16143 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16144 within each 64-bit, 32-bit or 16-bit granule. */
16147 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
16149 HOST_WIDE_INT diff
;
16150 unsigned int i
, size
, unspec
;
16151 machine_mode pred_mode
;
16153 if (d
->vec_flags
== VEC_SVE_PRED
16154 || !d
->one_vector_p
16155 || !d
->perm
[0].is_constant (&diff
))
16158 size
= (diff
+ 1) * GET_MODE_UNIT_SIZE (d
->vmode
);
16161 unspec
= UNSPEC_REV64
;
16162 pred_mode
= VNx2BImode
;
16164 else if (size
== 4)
16166 unspec
= UNSPEC_REV32
;
16167 pred_mode
= VNx4BImode
;
16169 else if (size
== 2)
16171 unspec
= UNSPEC_REV16
;
16172 pred_mode
= VNx8BImode
;
16177 unsigned int step
= diff
+ 1;
16178 for (i
= 0; i
< step
; ++i
)
16179 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
16186 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
16187 if (d
->vec_flags
== VEC_SVE_DATA
)
16189 rtx pred
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
16190 src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (2, pred
, src
),
16191 UNSPEC_MERGE_PTRUE
);
16193 emit_set_insn (d
->target
, src
);
16197 /* Recognize patterns for the REV insn, which reverses elements within
16201 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
16203 poly_uint64 nelt
= d
->perm
.length ();
16205 if (!d
->one_vector_p
|| d
->vec_flags
!= VEC_SVE_DATA
)
16208 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
16215 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
16216 emit_set_insn (d
->target
, src
);
16221 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
16223 rtx out
= d
->target
;
16226 machine_mode vmode
= d
->vmode
;
16229 if (d
->vec_flags
== VEC_SVE_PRED
16230 || d
->perm
.encoding ().encoded_nelts () != 1
16231 || !d
->perm
[0].is_constant (&elt
))
16234 if (d
->vec_flags
== VEC_SVE_DATA
&& elt
>= 64 * GET_MODE_UNIT_SIZE (vmode
))
16241 /* The generic preparation in aarch64_expand_vec_perm_const_1
16242 swaps the operand order and the permute indices if it finds
16243 d->perm[0] to be in the second operand. Thus, we can always
16244 use d->op0 and need not do any extra arithmetic to get the
16245 correct lane number. */
16247 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
16249 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
16250 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
16251 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
16256 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
16258 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
16259 machine_mode vmode
= d
->vmode
;
16261 /* Make sure that the indices are constant. */
16262 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
16263 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
16264 if (!d
->perm
[i
].is_constant ())
16270 /* Generic code will try constant permutation twice. Once with the
16271 original mode and again with the elements lowered to QImode.
16272 So wait and don't do the selector expansion ourselves. */
16273 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
16276 /* to_constant is safe since this routine is specific to Advanced SIMD
16278 unsigned int nelt
= d
->perm
.length ().to_constant ();
16279 for (unsigned int i
= 0; i
< nelt
; ++i
)
16280 /* If big-endian and two vectors we end up with a weird mixed-endian
16281 mode on NEON. Reverse the index within each word but not the word
16282 itself. to_constant is safe because we checked is_constant above. */
16283 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
16284 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
16285 : d
->perm
[i
].to_constant ());
16287 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
16288 sel
= force_reg (vmode
, sel
);
16290 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
16294 /* Try to implement D using an SVE TBL instruction. */
16297 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
16299 unsigned HOST_WIDE_INT nelt
;
16301 /* Permuting two variable-length vectors could overflow the
16303 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
16309 machine_mode sel_mode
= mode_for_int_vector (d
->vmode
).require ();
16310 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
16311 if (d
->one_vector_p
)
16312 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
16314 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
16319 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
16321 /* The pattern matching functions above are written to look for a small
16322 number to begin the sequence (0, 1, N/2). If we begin with an index
16323 from the second operand, we can swap the operands. */
16324 poly_int64 nelt
= d
->perm
.length ();
16325 if (known_ge (d
->perm
[0], nelt
))
16327 d
->perm
.rotate_inputs (1);
16328 std::swap (d
->op0
, d
->op1
);
16331 if ((d
->vec_flags
== VEC_ADVSIMD
16332 || d
->vec_flags
== VEC_SVE_DATA
16333 || d
->vec_flags
== VEC_SVE_PRED
)
16334 && known_gt (nelt
, 1))
16336 if (aarch64_evpc_rev_local (d
))
16338 else if (aarch64_evpc_rev_global (d
))
16340 else if (aarch64_evpc_ext (d
))
16342 else if (aarch64_evpc_dup (d
))
16344 else if (aarch64_evpc_zip (d
))
16346 else if (aarch64_evpc_uzp (d
))
16348 else if (aarch64_evpc_trn (d
))
16350 if (d
->vec_flags
== VEC_SVE_DATA
)
16351 return aarch64_evpc_sve_tbl (d
);
16352 else if (d
->vec_flags
== VEC_ADVSIMD
)
16353 return aarch64_evpc_tbl (d
);
16358 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
16361 aarch64_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
16362 rtx op1
, const vec_perm_indices
&sel
)
16364 struct expand_vec_perm_d d
;
16366 /* Check whether the mask can be applied to a single vector. */
16367 if (sel
.ninputs () == 1
16368 || (op0
&& rtx_equal_p (op0
, op1
)))
16369 d
.one_vector_p
= true;
16370 else if (sel
.all_from_input_p (0))
16372 d
.one_vector_p
= true;
16375 else if (sel
.all_from_input_p (1))
16377 d
.one_vector_p
= true;
16381 d
.one_vector_p
= false;
16383 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
16384 sel
.nelts_per_input ());
16386 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
16390 d
.testing_p
= !target
;
16393 return aarch64_expand_vec_perm_const_1 (&d
);
16395 rtx_insn
*last
= get_last_insn ();
16396 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
16397 gcc_assert (last
== get_last_insn ());
16402 /* Generate a byte permute mask for a register of mode MODE,
16403 which has NUNITS units. */
16406 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
16408 /* We have to reverse each vector because we dont have
16409 a permuted load that can reverse-load according to ABI rules. */
16411 rtvec v
= rtvec_alloc (16);
16413 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
16415 gcc_assert (BYTES_BIG_ENDIAN
);
16416 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
16418 for (i
= 0; i
< nunits
; i
++)
16419 for (j
= 0; j
< usize
; j
++)
16420 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
16421 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
16422 return force_reg (V16QImode
, mask
);
16425 /* Return true if X is a valid second operand for the SVE instruction
16426 that implements integer comparison OP_CODE. */
16429 aarch64_sve_cmp_operand_p (rtx_code op_code
, rtx x
)
16431 if (register_operand (x
, VOIDmode
))
16440 return aarch64_sve_cmp_immediate_p (x
, false);
16447 return aarch64_sve_cmp_immediate_p (x
, true);
16449 gcc_unreachable ();
16453 /* Use predicated SVE instructions to implement the equivalent of:
16457 given that PTRUE is an all-true predicate of the appropriate mode. */
16460 aarch64_emit_sve_ptrue_op (rtx target
, rtx ptrue
, rtx op
)
16462 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
16463 gen_rtvec (2, ptrue
, op
),
16464 UNSPEC_MERGE_PTRUE
);
16465 rtx_insn
*insn
= emit_set_insn (target
, unspec
);
16466 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
16469 /* Likewise, but also clobber the condition codes. */
16472 aarch64_emit_sve_ptrue_op_cc (rtx target
, rtx ptrue
, rtx op
)
16474 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
16475 gen_rtvec (2, ptrue
, op
),
16476 UNSPEC_MERGE_PTRUE
);
16477 rtx_insn
*insn
= emit_insn (gen_set_clobber_cc (target
, unspec
));
16478 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
16481 /* Return the UNSPEC_COND_* code for comparison CODE. */
16483 static unsigned int
16484 aarch64_unspec_cond_code (rtx_code code
)
16489 return UNSPEC_COND_NE
;
16491 return UNSPEC_COND_EQ
;
16493 return UNSPEC_COND_LT
;
16495 return UNSPEC_COND_GT
;
16497 return UNSPEC_COND_LE
;
16499 return UNSPEC_COND_GE
;
16501 gcc_unreachable ();
16507 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
16509 where <X> is the operation associated with comparison CODE. This form
16510 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
16511 semantics, such as when PRED might not be all-true and when comparing
16512 inactive lanes could have side effects. */
16515 aarch64_emit_sve_predicated_cond (rtx target
, rtx_code code
,
16516 rtx pred
, rtx op0
, rtx op1
)
16518 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (pred
),
16519 gen_rtvec (3, pred
, op0
, op1
),
16520 aarch64_unspec_cond_code (code
));
16521 emit_set_insn (target
, unspec
);
16524 /* Expand an SVE integer comparison using the SVE equivalent of:
16526 (set TARGET (CODE OP0 OP1)). */
16529 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
16531 machine_mode pred_mode
= GET_MODE (target
);
16532 machine_mode data_mode
= GET_MODE (op0
);
16534 if (!aarch64_sve_cmp_operand_p (code
, op1
))
16535 op1
= force_reg (data_mode
, op1
);
16537 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
16538 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
16539 aarch64_emit_sve_ptrue_op_cc (target
, ptrue
, cond
);
16542 /* Emit the SVE equivalent of:
16544 (set TMP1 (CODE1 OP0 OP1))
16545 (set TMP2 (CODE2 OP0 OP1))
16546 (set TARGET (ior:PRED_MODE TMP1 TMP2))
16548 PTRUE is an all-true predicate with the same mode as TARGET. */
16551 aarch64_emit_sve_or_conds (rtx target
, rtx_code code1
, rtx_code code2
,
16552 rtx ptrue
, rtx op0
, rtx op1
)
16554 machine_mode pred_mode
= GET_MODE (ptrue
);
16555 rtx tmp1
= gen_reg_rtx (pred_mode
);
16556 aarch64_emit_sve_ptrue_op (tmp1
, ptrue
,
16557 gen_rtx_fmt_ee (code1
, pred_mode
, op0
, op1
));
16558 rtx tmp2
= gen_reg_rtx (pred_mode
);
16559 aarch64_emit_sve_ptrue_op (tmp2
, ptrue
,
16560 gen_rtx_fmt_ee (code2
, pred_mode
, op0
, op1
));
16561 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
16564 /* Emit the SVE equivalent of:
16566 (set TMP (CODE OP0 OP1))
16567 (set TARGET (not TMP))
16569 PTRUE is an all-true predicate with the same mode as TARGET. */
16572 aarch64_emit_sve_inverted_cond (rtx target
, rtx ptrue
, rtx_code code
,
16575 machine_mode pred_mode
= GET_MODE (ptrue
);
16576 rtx tmp
= gen_reg_rtx (pred_mode
);
16577 aarch64_emit_sve_ptrue_op (tmp
, ptrue
,
16578 gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
));
16579 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
16582 /* Expand an SVE floating-point comparison using the SVE equivalent of:
16584 (set TARGET (CODE OP0 OP1))
16586 If CAN_INVERT_P is true, the caller can also handle inverted results;
16587 return true if the result is in fact inverted. */
16590 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
16591 rtx op0
, rtx op1
, bool can_invert_p
)
16593 machine_mode pred_mode
= GET_MODE (target
);
16594 machine_mode data_mode
= GET_MODE (op0
);
16596 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
16600 /* UNORDERED has no immediate form. */
16601 op1
= force_reg (data_mode
, op1
);
16610 /* There is native support for the comparison. */
16611 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
16612 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
16617 /* This is a trapping operation (LT or GT). */
16618 aarch64_emit_sve_or_conds (target
, LT
, GT
, ptrue
, op0
, op1
);
16622 if (!flag_trapping_math
)
16624 /* This would trap for signaling NaNs. */
16625 op1
= force_reg (data_mode
, op1
);
16626 aarch64_emit_sve_or_conds (target
, UNORDERED
, EQ
, ptrue
, op0
, op1
);
16634 if (flag_trapping_math
)
16636 /* Work out which elements are ordered. */
16637 rtx ordered
= gen_reg_rtx (pred_mode
);
16638 op1
= force_reg (data_mode
, op1
);
16639 aarch64_emit_sve_inverted_cond (ordered
, ptrue
, UNORDERED
, op0
, op1
);
16641 /* Test the opposite condition for the ordered elements,
16642 then invert the result. */
16646 code
= reverse_condition_maybe_unordered (code
);
16649 aarch64_emit_sve_predicated_cond (target
, code
,
16650 ordered
, op0
, op1
);
16653 rtx tmp
= gen_reg_rtx (pred_mode
);
16654 aarch64_emit_sve_predicated_cond (tmp
, code
, ordered
, op0
, op1
);
16655 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
16661 /* ORDERED has no immediate form. */
16662 op1
= force_reg (data_mode
, op1
);
16666 gcc_unreachable ();
16669 /* There is native support for the inverse comparison. */
16670 code
= reverse_condition_maybe_unordered (code
);
16673 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
16674 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
16677 aarch64_emit_sve_inverted_cond (target
, ptrue
, code
, op0
, op1
);
16681 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16682 of the data being selected and CMP_MODE is the mode of the values being
16686 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
16689 machine_mode pred_mode
16690 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode
),
16691 GET_MODE_SIZE (cmp_mode
)).require ();
16692 rtx pred
= gen_reg_rtx (pred_mode
);
16693 if (FLOAT_MODE_P (cmp_mode
))
16695 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
16696 ops
[4], ops
[5], true))
16697 std::swap (ops
[1], ops
[2]);
16700 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
16702 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
16703 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
16706 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16707 true. However due to issues with register allocation it is preferable
16708 to avoid tieing integer scalar and FP scalar modes. Executing integer
16709 operations in general registers is better than treating them as scalar
16710 vector operations. This reduces latency and avoids redundant int<->FP
16711 moves. So tie modes if they are either the same class, or vector modes
16712 with other vector modes, vector structs or any scalar mode. */
16715 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
16717 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
16720 /* We specifically want to allow elements of "structure" modes to
16721 be tieable to the structure. This more general condition allows
16722 other rarer situations too. The reason we don't extend this to
16723 predicate modes is that there are no predicate structure modes
16724 nor any specific instructions for extracting part of a predicate
16726 if (aarch64_vector_data_mode_p (mode1
)
16727 && aarch64_vector_data_mode_p (mode2
))
16730 /* Also allow any scalar modes with vectors. */
16731 if (aarch64_vector_mode_supported_p (mode1
)
16732 || aarch64_vector_mode_supported_p (mode2
))
16738 /* Return a new RTX holding the result of moving POINTER forward by
16742 aarch64_move_pointer (rtx pointer
, poly_int64 amount
)
16744 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
16746 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
16750 /* Return a new RTX holding the result of moving POINTER forward by the
16751 size of the mode it points to. */
16754 aarch64_progress_pointer (rtx pointer
)
16756 return aarch64_move_pointer (pointer
, GET_MODE_SIZE (GET_MODE (pointer
)));
16759 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16763 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
16766 rtx reg
= gen_reg_rtx (mode
);
16768 /* "Cast" the pointers to the correct mode. */
16769 *src
= adjust_address (*src
, mode
, 0);
16770 *dst
= adjust_address (*dst
, mode
, 0);
16771 /* Emit the memcpy. */
16772 emit_move_insn (reg
, *src
);
16773 emit_move_insn (*dst
, reg
);
16774 /* Move the pointers forward. */
16775 *src
= aarch64_progress_pointer (*src
);
16776 *dst
= aarch64_progress_pointer (*dst
);
16779 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16780 we succeed, otherwise return false. */
16783 aarch64_expand_movmem (rtx
*operands
)
16786 rtx dst
= operands
[0];
16787 rtx src
= operands
[1];
16789 machine_mode cur_mode
= BLKmode
, next_mode
;
16790 bool speed_p
= !optimize_function_for_size_p (cfun
);
16792 /* When optimizing for size, give a better estimate of the length of a
16793 memcpy call, but use the default otherwise. Moves larger than 8 bytes
16794 will always require an even number of instructions to do now. And each
16795 operation requires both a load+store, so devide the max number by 2. */
16796 int max_num_moves
= (speed_p
? 16 : AARCH64_CALL_RATIO
) / 2;
16798 /* We can't do anything smart if the amount to copy is not constant. */
16799 if (!CONST_INT_P (operands
[2]))
16802 n
= INTVAL (operands
[2]);
16804 /* Try to keep the number of instructions low. For all cases we will do at
16805 most two moves for the residual amount, since we'll always overlap the
16807 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_num_moves
)
16810 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
16811 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
16813 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
16814 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
16816 /* Convert n to bits to make the rest of the code simpler. */
16817 n
= n
* BITS_PER_UNIT
;
16819 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
16820 larger than TImode, but we should not use them for loads/stores here. */
16821 const int copy_limit
= GET_MODE_BITSIZE (TImode
);
16825 /* Find the largest mode in which to do the copy in without over reading
16827 opt_scalar_int_mode mode_iter
;
16828 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
16829 if (GET_MODE_BITSIZE (mode_iter
.require ()) <= MIN (n
, copy_limit
))
16830 cur_mode
= mode_iter
.require ();
16832 gcc_assert (cur_mode
!= BLKmode
);
16834 mode_bits
= GET_MODE_BITSIZE (cur_mode
).to_constant ();
16835 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, cur_mode
);
16839 /* Do certain trailing copies as overlapping if it's going to be
16840 cheaper. i.e. less instructions to do so. For instance doing a 15
16841 byte copy it's more efficient to do two overlapping 8 byte copies than
16843 if (n
> 0 && n
<= 8 * BITS_PER_UNIT
)
16845 next_mode
= smallest_mode_for_size (n
, MODE_INT
);
16846 int n_bits
= GET_MODE_BITSIZE (next_mode
).to_constant ();
16847 src
= aarch64_move_pointer (src
, (n
- n_bits
) / BITS_PER_UNIT
);
16848 dst
= aarch64_move_pointer (dst
, (n
- n_bits
) / BITS_PER_UNIT
);
16856 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16857 SImode stores. Handle the case when the constant has identical
16858 bottom and top halves. This is beneficial when the two stores can be
16859 merged into an STP and we avoid synthesising potentially expensive
16860 immediates twice. Return true if such a split is possible. */
16863 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
16865 rtx lo
= gen_lowpart (SImode
, src
);
16866 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
16868 bool size_p
= optimize_function_for_size_p (cfun
);
16870 if (!rtx_equal_p (lo
, hi
))
16873 unsigned int orig_cost
16874 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
16875 unsigned int lo_cost
16876 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
16878 /* We want to transform:
16880 MOVK x1, 0x140, lsl 16
16881 MOVK x1, 0xc0da, lsl 32
16882 MOVK x1, 0x140, lsl 48
16886 MOVK w1, 0x140, lsl 16
16888 So we want to perform this only when we save two instructions
16889 or more. When optimizing for size, however, accept any code size
16891 if (size_p
&& orig_cost
<= lo_cost
)
16895 && (orig_cost
<= lo_cost
+ 1))
16898 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
16899 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
16902 rtx tmp_reg
= gen_reg_rtx (SImode
);
16903 aarch64_expand_mov_immediate (tmp_reg
, lo
);
16904 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
16905 /* Don't emit an explicit store pair as this may not be always profitable.
16906 Let the sched-fusion logic decide whether to merge them. */
16907 emit_move_insn (mem_lo
, tmp_reg
);
16908 emit_move_insn (mem_hi
, tmp_reg
);
16913 /* Generate RTL for a conditional branch with rtx comparison CODE in
16914 mode CC_MODE. The destination of the unlikely conditional branch
16918 aarch64_gen_unlikely_cbranch (enum rtx_code code
, machine_mode cc_mode
,
16922 x
= gen_rtx_fmt_ee (code
, VOIDmode
,
16923 gen_rtx_REG (cc_mode
, CC_REGNUM
),
16926 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16927 gen_rtx_LABEL_REF (VOIDmode
, label_ref
),
16929 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
16932 /* Generate DImode scratch registers for 128-bit (TImode) addition.
16934 OP1 represents the TImode destination operand 1
16935 OP2 represents the TImode destination operand 2
16936 LOW_DEST represents the low half (DImode) of TImode operand 0
16937 LOW_IN1 represents the low half (DImode) of TImode operand 1
16938 LOW_IN2 represents the low half (DImode) of TImode operand 2
16939 HIGH_DEST represents the high half (DImode) of TImode operand 0
16940 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16941 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16944 aarch64_addti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
16945 rtx
*low_in1
, rtx
*low_in2
,
16946 rtx
*high_dest
, rtx
*high_in1
,
16949 *low_dest
= gen_reg_rtx (DImode
);
16950 *low_in1
= gen_lowpart (DImode
, op1
);
16951 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
16952 subreg_lowpart_offset (DImode
, TImode
));
16953 *high_dest
= gen_reg_rtx (DImode
);
16954 *high_in1
= gen_highpart (DImode
, op1
);
16955 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
16956 subreg_highpart_offset (DImode
, TImode
));
16959 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
16961 This function differs from 'arch64_addti_scratch_regs' in that
16962 OP1 can be an immediate constant (zero). We must call
16963 subreg_highpart_offset with DImode and TImode arguments, otherwise
16964 VOIDmode will be used for the const_int which generates an internal
16965 error from subreg_size_highpart_offset which does not expect a size of zero.
16967 OP1 represents the TImode destination operand 1
16968 OP2 represents the TImode destination operand 2
16969 LOW_DEST represents the low half (DImode) of TImode operand 0
16970 LOW_IN1 represents the low half (DImode) of TImode operand 1
16971 LOW_IN2 represents the low half (DImode) of TImode operand 2
16972 HIGH_DEST represents the high half (DImode) of TImode operand 0
16973 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16974 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16978 aarch64_subvti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
16979 rtx
*low_in1
, rtx
*low_in2
,
16980 rtx
*high_dest
, rtx
*high_in1
,
16983 *low_dest
= gen_reg_rtx (DImode
);
16984 *low_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
16985 subreg_lowpart_offset (DImode
, TImode
));
16987 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
16988 subreg_lowpart_offset (DImode
, TImode
));
16989 *high_dest
= gen_reg_rtx (DImode
);
16991 *high_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
16992 subreg_highpart_offset (DImode
, TImode
));
16993 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
16994 subreg_highpart_offset (DImode
, TImode
));
16997 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
16999 OP0 represents the TImode destination operand 0
17000 LOW_DEST represents the low half (DImode) of TImode operand 0
17001 LOW_IN1 represents the low half (DImode) of TImode operand 1
17002 LOW_IN2 represents the low half (DImode) of TImode operand 2
17003 HIGH_DEST represents the high half (DImode) of TImode operand 0
17004 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17005 HIGH_IN2 represents the high half (DImode) of TImode operand 2
17006 UNSIGNED_P is true if the operation is being performed on unsigned
17009 aarch64_expand_subvti (rtx op0
, rtx low_dest
, rtx low_in1
,
17010 rtx low_in2
, rtx high_dest
, rtx high_in1
,
17011 rtx high_in2
, bool unsigned_p
)
17013 if (low_in2
== const0_rtx
)
17015 low_dest
= low_in1
;
17016 high_in2
= force_reg (DImode
, high_in2
);
17018 emit_insn (gen_subdi3_compare1 (high_dest
, high_in1
, high_in2
));
17020 emit_insn (gen_subvdi_insn (high_dest
, high_in1
, high_in2
));
17024 if (CONST_INT_P (low_in2
))
17026 high_in2
= force_reg (DImode
, high_in2
);
17027 emit_insn (gen_subdi3_compare1_imm (low_dest
, low_in1
, low_in2
,
17028 GEN_INT (-INTVAL (low_in2
))));
17031 emit_insn (gen_subdi3_compare1 (low_dest
, low_in1
, low_in2
));
17034 emit_insn (gen_usubdi3_carryinC (high_dest
, high_in1
, high_in2
));
17036 emit_insn (gen_subdi3_carryinV (high_dest
, high_in1
, high_in2
));
17039 emit_move_insn (gen_lowpart (DImode
, op0
), low_dest
);
17040 emit_move_insn (gen_highpart (DImode
, op0
), high_dest
);
17044 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
17046 static unsigned HOST_WIDE_INT
17047 aarch64_asan_shadow_offset (void)
17049 return (HOST_WIDE_INT_1
<< 36);
17053 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
17054 int code
, tree treeop0
, tree treeop1
)
17056 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
17058 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
17060 struct expand_operand ops
[4];
17063 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
17065 op_mode
= GET_MODE (op0
);
17066 if (op_mode
== VOIDmode
)
17067 op_mode
= GET_MODE (op1
);
17075 icode
= CODE_FOR_cmpsi
;
17080 icode
= CODE_FOR_cmpdi
;
17085 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
17086 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
17091 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
17092 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
17100 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
17101 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
17107 *prep_seq
= get_insns ();
17110 create_fixed_operand (&ops
[0], op0
);
17111 create_fixed_operand (&ops
[1], op1
);
17114 if (!maybe_expand_insn (icode
, 2, ops
))
17119 *gen_seq
= get_insns ();
17122 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
17123 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
17127 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
17128 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
17130 rtx op0
, op1
, target
;
17131 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
17132 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
17134 struct expand_operand ops
[6];
17137 push_to_sequence (*prep_seq
);
17138 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
17140 op_mode
= GET_MODE (op0
);
17141 if (op_mode
== VOIDmode
)
17142 op_mode
= GET_MODE (op1
);
17150 icode
= CODE_FOR_ccmpsi
;
17155 icode
= CODE_FOR_ccmpdi
;
17160 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
17161 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
17166 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
17167 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
17175 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
17176 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
17182 *prep_seq
= get_insns ();
17185 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
17186 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
17188 if (bit_code
!= AND
)
17190 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
17191 GET_MODE (XEXP (prev
, 0))),
17192 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
17193 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
17196 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
17197 create_fixed_operand (&ops
[1], target
);
17198 create_fixed_operand (&ops
[2], op0
);
17199 create_fixed_operand (&ops
[3], op1
);
17200 create_fixed_operand (&ops
[4], prev
);
17201 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
17203 push_to_sequence (*gen_seq
);
17204 if (!maybe_expand_insn (icode
, 6, ops
))
17210 *gen_seq
= get_insns ();
17213 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
17216 #undef TARGET_GEN_CCMP_FIRST
17217 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17219 #undef TARGET_GEN_CCMP_NEXT
17220 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17222 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
17223 instruction fusion of some sort. */
17226 aarch64_macro_fusion_p (void)
17228 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
17232 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17233 should be kept together during scheduling. */
17236 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
17239 rtx prev_set
= single_set (prev
);
17240 rtx curr_set
= single_set (curr
);
17241 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
17242 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
17244 if (!aarch64_macro_fusion_p ())
17247 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
17249 /* We are trying to match:
17250 prev (mov) == (set (reg r0) (const_int imm16))
17251 curr (movk) == (set (zero_extract (reg r0)
17254 (const_int imm16_1)) */
17256 set_dest
= SET_DEST (curr_set
);
17258 if (GET_CODE (set_dest
) == ZERO_EXTRACT
17259 && CONST_INT_P (SET_SRC (curr_set
))
17260 && CONST_INT_P (SET_SRC (prev_set
))
17261 && CONST_INT_P (XEXP (set_dest
, 2))
17262 && INTVAL (XEXP (set_dest
, 2)) == 16
17263 && REG_P (XEXP (set_dest
, 0))
17264 && REG_P (SET_DEST (prev_set
))
17265 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
17271 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
17274 /* We're trying to match:
17275 prev (adrp) == (set (reg r1)
17276 (high (symbol_ref ("SYM"))))
17277 curr (add) == (set (reg r0)
17279 (symbol_ref ("SYM"))))
17280 Note that r0 need not necessarily be the same as r1, especially
17281 during pre-regalloc scheduling. */
17283 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
17284 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
17286 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
17287 && REG_P (XEXP (SET_SRC (curr_set
), 0))
17288 && REGNO (XEXP (SET_SRC (curr_set
), 0))
17289 == REGNO (SET_DEST (prev_set
))
17290 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
17291 XEXP (SET_SRC (curr_set
), 1)))
17296 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
17299 /* We're trying to match:
17300 prev (movk) == (set (zero_extract (reg r0)
17303 (const_int imm16_1))
17304 curr (movk) == (set (zero_extract (reg r0)
17307 (const_int imm16_2)) */
17309 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
17310 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
17311 && REG_P (XEXP (SET_DEST (prev_set
), 0))
17312 && REG_P (XEXP (SET_DEST (curr_set
), 0))
17313 && REGNO (XEXP (SET_DEST (prev_set
), 0))
17314 == REGNO (XEXP (SET_DEST (curr_set
), 0))
17315 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
17316 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
17317 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
17318 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
17319 && CONST_INT_P (SET_SRC (prev_set
))
17320 && CONST_INT_P (SET_SRC (curr_set
)))
17324 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
17326 /* We're trying to match:
17327 prev (adrp) == (set (reg r0)
17328 (high (symbol_ref ("SYM"))))
17329 curr (ldr) == (set (reg r1)
17330 (mem (lo_sum (reg r0)
17331 (symbol_ref ("SYM")))))
17333 curr (ldr) == (set (reg r1)
17336 (symbol_ref ("SYM")))))) */
17337 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
17338 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
17340 rtx curr_src
= SET_SRC (curr_set
);
17342 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
17343 curr_src
= XEXP (curr_src
, 0);
17345 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
17346 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
17347 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
17348 == REGNO (SET_DEST (prev_set
))
17349 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
17350 XEXP (SET_SRC (prev_set
), 0)))
17355 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC
)
17356 && aarch_crypto_can_dual_issue (prev
, curr
))
17359 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
17360 && any_condjump_p (curr
))
17362 unsigned int condreg1
, condreg2
;
17364 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
17365 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
17367 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
17369 && modified_in_p (cc_reg_1
, prev
))
17371 enum attr_type prev_type
= get_attr_type (prev
);
17373 /* FIXME: this misses some which is considered simple arthematic
17374 instructions for ThunderX. Simple shifts are missed here. */
17375 if (prev_type
== TYPE_ALUS_SREG
17376 || prev_type
== TYPE_ALUS_IMM
17377 || prev_type
== TYPE_LOGICS_REG
17378 || prev_type
== TYPE_LOGICS_IMM
)
17385 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
17386 && any_condjump_p (curr
))
17388 /* We're trying to match:
17389 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17390 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
17392 (label_ref ("SYM"))
17394 if (SET_DEST (curr_set
) == (pc_rtx
)
17395 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
17396 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
17397 && REG_P (SET_DEST (prev_set
))
17398 && REGNO (SET_DEST (prev_set
))
17399 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
17401 /* Fuse ALU operations followed by conditional branch instruction. */
17402 switch (get_attr_type (prev
))
17405 case TYPE_ALU_SREG
:
17408 case TYPE_ADCS_REG
:
17409 case TYPE_ADCS_IMM
:
17410 case TYPE_LOGIC_REG
:
17411 case TYPE_LOGIC_IMM
:
17415 case TYPE_SHIFT_REG
:
17416 case TYPE_SHIFT_IMM
:
17431 /* Return true iff the instruction fusion described by OP is enabled. */
17434 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op
)
17436 return (aarch64_tune_params
.fusible_ops
& op
) != 0;
17439 /* If MEM is in the form of [base+offset], extract the two parts
17440 of address and set to BASE and OFFSET, otherwise return false
17441 after clearing BASE and OFFSET. */
17444 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
17448 gcc_assert (MEM_P (mem
));
17450 addr
= XEXP (mem
, 0);
17455 *offset
= const0_rtx
;
17459 if (GET_CODE (addr
) == PLUS
17460 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
17462 *base
= XEXP (addr
, 0);
17463 *offset
= XEXP (addr
, 1);
17468 *offset
= NULL_RTX
;
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};
17484 /* If INSN is a load or store of address in the form of [base+offset],
17485 extract the two parts and set to BASE and OFFSET. Return scheduling
17486 fusion type this INSN is. */
17488 static enum sched_fusion_type
17489 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
17492 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
17494 gcc_assert (INSN_P (insn
));
17495 x
= PATTERN (insn
);
17496 if (GET_CODE (x
) != SET
)
17497 return SCHED_FUSION_NONE
;
17500 dest
= SET_DEST (x
);
17502 machine_mode dest_mode
= GET_MODE (dest
);
17504 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
17505 return SCHED_FUSION_NONE
;
17507 if (GET_CODE (src
) == SIGN_EXTEND
)
17509 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
17510 src
= XEXP (src
, 0);
17511 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
17512 return SCHED_FUSION_NONE
;
17514 else if (GET_CODE (src
) == ZERO_EXTEND
)
17516 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
17517 src
= XEXP (src
, 0);
17518 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
17519 return SCHED_FUSION_NONE
;
17522 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
17523 extract_base_offset_in_addr (src
, base
, offset
);
17524 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
17526 fusion
= SCHED_FUSION_ST
;
17527 extract_base_offset_in_addr (dest
, base
, offset
);
17530 return SCHED_FUSION_NONE
;
17532 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
17533 fusion
= SCHED_FUSION_NONE
;
17538 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
17540 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
17541 and PRI are only calculated for these instructions. For other instruction,
17542 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
17543 type instruction fusion can be added by returning different priorities.
17545 It's important that irrelevant instructions get the largest FUSION_PRI. */
17548 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
17549 int *fusion_pri
, int *pri
)
17553 enum sched_fusion_type fusion
;
17555 gcc_assert (INSN_P (insn
));
17558 fusion
= fusion_load_store (insn
, &base
, &offset
);
17559 if (fusion
== SCHED_FUSION_NONE
)
17566 /* Set FUSION_PRI according to fusion type and base register. */
17567 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
17569 /* Calculate PRI. */
17572 /* INSN with smaller offset goes first. */
17573 off_val
= (int)(INTVAL (offset
));
17575 tmp
-= (off_val
& 0xfffff);
17577 tmp
+= ((- off_val
) & 0xfffff);
17583 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
17584 Adjust priority of sha1h instructions so they are scheduled before
17585 other SHA1 instructions. */
17588 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
17590 rtx x
= PATTERN (insn
);
17592 if (GET_CODE (x
) == SET
)
17596 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
17597 return priority
+ 10;
17603 /* Given OPERANDS of consecutive load/store, check if we can merge
17604 them into ldp/stp. LOAD is true if they are load instructions.
17605 MODE is the mode of memory operands. */
17608 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
17611 HOST_WIDE_INT offval_1
, offval_2
, msize
;
17612 enum reg_class rclass_1
, rclass_2
;
17613 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
17617 mem_1
= operands
[1];
17618 mem_2
= operands
[3];
17619 reg_1
= operands
[0];
17620 reg_2
= operands
[2];
17621 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
17622 if (REGNO (reg_1
) == REGNO (reg_2
))
17627 mem_1
= operands
[0];
17628 mem_2
= operands
[2];
17629 reg_1
= operands
[1];
17630 reg_2
= operands
[3];
17633 /* The mems cannot be volatile. */
17634 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
17637 /* If we have SImode and slow unaligned ldp,
17638 check the alignment to be at least 8 byte. */
17640 && (aarch64_tune_params
.extra_tuning_flags
17641 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
17643 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
17646 /* Check if the addresses are in the form of [base+offset]. */
17647 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
17648 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
17650 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
17651 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
17654 /* Check if the bases are same. */
17655 if (!rtx_equal_p (base_1
, base_2
))
17658 /* The operands must be of the same size. */
17659 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
17660 GET_MODE_SIZE (GET_MODE (mem_2
))));
17662 offval_1
= INTVAL (offset_1
);
17663 offval_2
= INTVAL (offset_2
);
17664 /* We should only be trying this for fixed-sized modes. There is no
17665 SVE LDP/STP instruction. */
17666 msize
= GET_MODE_SIZE (mode
).to_constant ();
17667 /* Check if the offsets are consecutive. */
17668 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
17671 /* Check if the addresses are clobbered by load. */
17674 if (reg_mentioned_p (reg_1
, mem_1
))
17677 /* In increasing order, the last load can clobber the address. */
17678 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
17682 /* One of the memory accesses must be a mempair operand.
17683 If it is not the first one, they need to be swapped by the
17685 if (!aarch64_mem_pair_operand (mem_1
, GET_MODE (mem_1
))
17686 && !aarch64_mem_pair_operand (mem_2
, GET_MODE (mem_2
)))
17689 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
17690 rclass_1
= FP_REGS
;
17692 rclass_1
= GENERAL_REGS
;
17694 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
17695 rclass_2
= FP_REGS
;
17697 rclass_2
= GENERAL_REGS
;
17699 /* Check if the registers are of same class. */
17700 if (rclass_1
!= rclass_2
)
17706 /* Given OPERANDS of consecutive load/store that can be merged,
17707 swap them if they are not in ascending order. */
17709 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
17711 rtx mem_1
, mem_2
, base_1
, base_2
, offset_1
, offset_2
;
17712 HOST_WIDE_INT offval_1
, offval_2
;
17716 mem_1
= operands
[1];
17717 mem_2
= operands
[3];
17721 mem_1
= operands
[0];
17722 mem_2
= operands
[2];
17725 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
17726 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
17728 offval_1
= INTVAL (offset_1
);
17729 offval_2
= INTVAL (offset_2
);
17731 if (offval_1
> offval_2
)
17733 /* Irrespective of whether this is a load or a store,
17734 we do the same swap. */
17735 std::swap (operands
[0], operands
[2]);
17736 std::swap (operands
[1], operands
[3]);
17740 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17741 comparison between the two. */
17743 aarch64_host_wide_int_compare (const void *x
, const void *y
)
17745 return wi::cmps (* ((const HOST_WIDE_INT
*) x
),
17746 * ((const HOST_WIDE_INT
*) y
));
17749 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
17750 other pointing to a REG rtx containing an offset, compare the offsets
17755 1 iff offset (X) > offset (Y)
17756 0 iff offset (X) == offset (Y)
17757 -1 iff offset (X) < offset (Y) */
17759 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
17761 const rtx
* operands_1
= (const rtx
*) x
;
17762 const rtx
* operands_2
= (const rtx
*) y
;
17763 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
17765 if (MEM_P (operands_1
[0]))
17766 mem_1
= operands_1
[0];
17768 mem_1
= operands_1
[1];
17770 if (MEM_P (operands_2
[0]))
17771 mem_2
= operands_2
[0];
17773 mem_2
= operands_2
[1];
17775 /* Extract the offsets. */
17776 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
17777 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
17779 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
17781 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
17784 /* Given OPERANDS of consecutive load/store, check if we can merge
17785 them into ldp/stp by adjusting the offset. LOAD is true if they
17786 are load instructions. MODE is the mode of memory operands.
17788 Given below consecutive stores:
17790 str w1, [xb, 0x100]
17791 str w1, [xb, 0x104]
17792 str w1, [xb, 0x108]
17793 str w1, [xb, 0x10c]
17795 Though the offsets are out of the range supported by stp, we can
17796 still pair them after adjusting the offset, like:
17798 add scratch, xb, 0x100
17799 stp w1, w1, [scratch]
17800 stp w1, w1, [scratch, 0x8]
17802 The peephole patterns detecting this opportunity should guarantee
17803 the scratch register is avaliable. */
17806 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
17809 const int num_insns
= 4;
17810 enum reg_class rclass
;
17811 HOST_WIDE_INT offvals
[num_insns
], msize
;
17812 rtx mem
[num_insns
], reg
[num_insns
], base
[num_insns
], offset
[num_insns
];
17816 for (int i
= 0; i
< num_insns
; i
++)
17818 reg
[i
] = operands
[2 * i
];
17819 mem
[i
] = operands
[2 * i
+ 1];
17821 gcc_assert (REG_P (reg
[i
]));
17824 /* Do not attempt to merge the loads if the loads clobber each other. */
17825 for (int i
= 0; i
< 8; i
+= 2)
17826 for (int j
= i
+ 2; j
< 8; j
+= 2)
17827 if (reg_overlap_mentioned_p (operands
[i
], operands
[j
]))
17831 for (int i
= 0; i
< num_insns
; i
++)
17833 mem
[i
] = operands
[2 * i
];
17834 reg
[i
] = operands
[2 * i
+ 1];
17837 /* Skip if memory operand is by itself valid for ldp/stp. */
17838 if (!MEM_P (mem
[0]) || aarch64_mem_pair_operand (mem
[0], mode
))
17841 for (int i
= 0; i
< num_insns
; i
++)
17843 /* The mems cannot be volatile. */
17844 if (MEM_VOLATILE_P (mem
[i
]))
17847 /* Check if the addresses are in the form of [base+offset]. */
17848 extract_base_offset_in_addr (mem
[i
], base
+ i
, offset
+ i
);
17849 if (base
[i
] == NULL_RTX
|| offset
[i
] == NULL_RTX
)
17853 /* Check if the registers are of same class. */
17854 rclass
= REG_P (reg
[0]) && FP_REGNUM_P (REGNO (reg
[0]))
17855 ? FP_REGS
: GENERAL_REGS
;
17857 for (int i
= 1; i
< num_insns
; i
++)
17858 if (REG_P (reg
[i
]) && FP_REGNUM_P (REGNO (reg
[i
])))
17860 if (rclass
!= FP_REGS
)
17865 if (rclass
!= GENERAL_REGS
)
17869 /* Only the last register in the order in which they occur
17870 may be clobbered by the load. */
17871 if (rclass
== GENERAL_REGS
&& load
)
17872 for (int i
= 0; i
< num_insns
- 1; i
++)
17873 if (reg_mentioned_p (reg
[i
], mem
[i
]))
17876 /* Check if the bases are same. */
17877 for (int i
= 0; i
< num_insns
- 1; i
++)
17878 if (!rtx_equal_p (base
[i
], base
[i
+ 1]))
17881 for (int i
= 0; i
< num_insns
; i
++)
17882 offvals
[i
] = INTVAL (offset
[i
]);
17884 msize
= GET_MODE_SIZE (mode
);
17886 /* Check if the offsets can be put in the right order to do a ldp/stp. */
17887 qsort (offvals
, num_insns
, sizeof (HOST_WIDE_INT
),
17888 aarch64_host_wide_int_compare
);
17890 if (!(offvals
[1] == offvals
[0] + msize
17891 && offvals
[3] == offvals
[2] + msize
))
17894 /* Check that offsets are within range of each other. The ldp/stp
17895 instructions have 7 bit immediate offsets, so use 0x80. */
17896 if (offvals
[2] - offvals
[0] >= msize
* 0x80)
17899 /* The offsets must be aligned with respect to each other. */
17900 if (offvals
[0] % msize
!= offvals
[2] % msize
)
17903 /* If we have SImode and slow unaligned ldp,
17904 check the alignment to be at least 8 byte. */
17906 && (aarch64_tune_params
.extra_tuning_flags
17907 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
17909 && MEM_ALIGN (mem
[0]) < 8 * BITS_PER_UNIT
)
17915 /* Given OPERANDS of consecutive load/store, this function pairs them
17916 into LDP/STP after adjusting the offset. It depends on the fact
17917 that the operands can be sorted so the offsets are correct for STP.
17918 MODE is the mode of memory operands. CODE is the rtl operator
17919 which should be applied to all memory operands, it's SIGN_EXTEND,
17920 ZERO_EXTEND or UNKNOWN. */
17923 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
17924 scalar_mode mode
, RTX_CODE code
)
17926 rtx base
, offset_1
, offset_3
, t1
, t2
;
17927 rtx mem_1
, mem_2
, mem_3
, mem_4
;
17928 rtx temp_operands
[8];
17929 HOST_WIDE_INT off_val_1
, off_val_3
, base_off
, new_off_1
, new_off_3
,
17930 stp_off_upper_limit
, stp_off_lower_limit
, msize
;
17932 /* We make changes on a copy as we may still bail out. */
17933 for (int i
= 0; i
< 8; i
++)
17934 temp_operands
[i
] = operands
[i
];
17936 /* Sort the operands. */
17937 qsort (temp_operands
, 4, 2 * sizeof (rtx
*), aarch64_ldrstr_offset_compare
);
17941 mem_1
= temp_operands
[1];
17942 mem_2
= temp_operands
[3];
17943 mem_3
= temp_operands
[5];
17944 mem_4
= temp_operands
[7];
17948 mem_1
= temp_operands
[0];
17949 mem_2
= temp_operands
[2];
17950 mem_3
= temp_operands
[4];
17951 mem_4
= temp_operands
[6];
17952 gcc_assert (code
== UNKNOWN
);
17955 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
17956 extract_base_offset_in_addr (mem_3
, &base
, &offset_3
);
17957 gcc_assert (base
!= NULL_RTX
&& offset_1
!= NULL_RTX
17958 && offset_3
!= NULL_RTX
);
17960 /* Adjust offset so it can fit in LDP/STP instruction. */
17961 msize
= GET_MODE_SIZE (mode
);
17962 stp_off_upper_limit
= msize
* (0x40 - 1);
17963 stp_off_lower_limit
= - msize
* 0x40;
17965 off_val_1
= INTVAL (offset_1
);
17966 off_val_3
= INTVAL (offset_3
);
17968 /* The base offset is optimally half way between the two STP/LDP offsets. */
17970 base_off
= (off_val_1
+ off_val_3
) / 2;
17972 /* However, due to issues with negative LDP/STP offset generation for
17973 larger modes, for DF, DI and vector modes. we must not use negative
17974 addresses smaller than 9 signed unadjusted bits can store. This
17975 provides the most range in this case. */
17976 base_off
= off_val_1
;
17978 /* Adjust the base so that it is aligned with the addresses but still
17980 if (base_off
% msize
!= off_val_1
% msize
)
17981 /* Fix the offset, bearing in mind we want to make it bigger not
17983 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
17984 else if (msize
<= 4)
17985 /* The negative range of LDP/STP is one larger than the positive range. */
17988 /* Check if base offset is too big or too small. We can attempt to resolve
17989 this issue by setting it to the maximum value and seeing if the offsets
17991 if (base_off
>= 0x1000)
17993 base_off
= 0x1000 - 1;
17994 /* We must still make sure that the base offset is aligned with respect
17995 to the address. But it may may not be made any bigger. */
17996 base_off
-= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
17999 /* Likewise for the case where the base is too small. */
18000 if (base_off
<= -0x1000)
18002 base_off
= -0x1000 + 1;
18003 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
18006 /* Offset of the first STP/LDP. */
18007 new_off_1
= off_val_1
- base_off
;
18009 /* Offset of the second STP/LDP. */
18010 new_off_3
= off_val_3
- base_off
;
18012 /* The offsets must be within the range of the LDP/STP instructions. */
18013 if (new_off_1
> stp_off_upper_limit
|| new_off_1
< stp_off_lower_limit
18014 || new_off_3
> stp_off_upper_limit
|| new_off_3
< stp_off_lower_limit
)
18017 replace_equiv_address_nv (mem_1
, plus_constant (Pmode
, operands
[8],
18019 replace_equiv_address_nv (mem_2
, plus_constant (Pmode
, operands
[8],
18020 new_off_1
+ msize
), true);
18021 replace_equiv_address_nv (mem_3
, plus_constant (Pmode
, operands
[8],
18023 replace_equiv_address_nv (mem_4
, plus_constant (Pmode
, operands
[8],
18024 new_off_3
+ msize
), true);
18026 if (!aarch64_mem_pair_operand (mem_1
, mode
)
18027 || !aarch64_mem_pair_operand (mem_3
, mode
))
18030 if (code
== ZERO_EXTEND
)
18032 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
18033 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
18034 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
18035 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
18037 else if (code
== SIGN_EXTEND
)
18039 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
18040 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
18041 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
18042 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
18047 operands
[0] = temp_operands
[0];
18048 operands
[1] = mem_1
;
18049 operands
[2] = temp_operands
[2];
18050 operands
[3] = mem_2
;
18051 operands
[4] = temp_operands
[4];
18052 operands
[5] = mem_3
;
18053 operands
[6] = temp_operands
[6];
18054 operands
[7] = mem_4
;
18058 operands
[0] = mem_1
;
18059 operands
[1] = temp_operands
[1];
18060 operands
[2] = mem_2
;
18061 operands
[3] = temp_operands
[3];
18062 operands
[4] = mem_3
;
18063 operands
[5] = temp_operands
[5];
18064 operands
[6] = mem_4
;
18065 operands
[7] = temp_operands
[7];
18068 /* Emit adjusting instruction. */
18069 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, base_off
)));
18070 /* Emit ldp/stp instructions. */
18071 t1
= gen_rtx_SET (operands
[0], operands
[1]);
18072 t2
= gen_rtx_SET (operands
[2], operands
[3]);
18073 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
18074 t1
= gen_rtx_SET (operands
[4], operands
[5]);
18075 t2
= gen_rtx_SET (operands
[6], operands
[7]);
18076 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
   it isn't worth branching around empty masked ops (including masked
   stores).  */

static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}
18090 /* Return 1 if pseudo register should be created and used to hold
18091 GOT address for PIC code. */
18094 aarch64_use_pseudo_pic_reg (void)
18096 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
;
18099 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18102 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
18104 switch (XINT (x
, 1))
18106 case UNSPEC_GOTSMALLPIC
:
18107 case UNSPEC_GOTSMALLPIC28K
:
18108 case UNSPEC_GOTTINYPIC
:
18114 return default_unspec_may_trap_p (x
, flags
);
18118 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18119 return the log2 of that value. Otherwise return -1. */
18122 aarch64_fpconst_pow_of_2 (rtx x
)
18124 const REAL_VALUE_TYPE
*r
;
18126 if (!CONST_DOUBLE_P (x
))
18129 r
= CONST_DOUBLE_REAL_VALUE (x
);
18131 if (REAL_VALUE_NEGATIVE (*r
)
18132 || REAL_VALUE_ISNAN (*r
)
18133 || REAL_VALUE_ISINF (*r
)
18134 || !real_isinteger (r
, DFmode
))
18137 return exact_log2 (real_to_integer (r
));
18140 /* If X is a vector of equal CONST_DOUBLE values and that value is
18141 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18144 aarch64_vec_fpconst_pow_of_2 (rtx x
)
18147 if (GET_CODE (x
) != CONST_VECTOR
18148 || !CONST_VECTOR_NUNITS (x
).is_constant (&nelts
))
18151 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
18154 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
18158 for (int i
= 1; i
< nelts
; i
++)
18159 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
18165 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18168 __fp16 always promotes through this hook.
18169 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18170 through the generic excess precision logic rather than here. */
18173 aarch64_promoted_type (const_tree t
)
18175 if (SCALAR_FLOAT_TYPE_P (t
)
18176 && TYPE_MAIN_VARIANT (t
) == aarch64_fp16_type_node
)
18177 return float_type_node
;
18182 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18185 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
18186 optimization_type opt_type
)
18191 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
18198 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18200 static unsigned int
18201 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
18204 /* Polynomial invariant 1 == (VG / 2) - 1. */
18205 gcc_assert (i
== 1);
18208 return AARCH64_DWARF_VG
;
18211 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
18212 if MODE is HFmode, and punt to the generic implementation otherwise. */
18215 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
18217 return (mode
== HFmode
18219 : default_libgcc_floating_mode_supported_p (mode
));
18222 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18223 if MODE is HFmode, and punt to the generic implementation otherwise. */
18226 aarch64_scalar_mode_supported_p (scalar_mode mode
)
18228 return (mode
== HFmode
18230 : default_scalar_mode_supported_p (mode
));
18233 /* Set the value of FLT_EVAL_METHOD.
18234 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18236 0: evaluate all operations and constants, whose semantic type has at
18237 most the range and precision of type float, to the range and
18238 precision of float; evaluate all other operations and constants to
18239 the range and precision of the semantic type;
18241 N, where _FloatN is a supported interchange floating type
18242 evaluate all operations and constants, whose semantic type has at
18243 most the range and precision of _FloatN type, to the range and
18244 precision of the _FloatN type; evaluate all other operations and
18245 constants to the range and precision of the semantic type;
18247 If we have the ARMv8.2-A extensions then we support _Float16 in native
18248 precision, so we should set this to 16. Otherwise, we support the type,
18249 but want to evaluate expressions in float precision, so set this to
18252 static enum flt_eval_method
18253 aarch64_excess_precision (enum excess_precision_type type
)
18257 case EXCESS_PRECISION_TYPE_FAST
:
18258 case EXCESS_PRECISION_TYPE_STANDARD
:
18259 /* We can calculate either in 16-bit range and precision or
18260 32-bit range and precision. Make that decision based on whether
18261 we have native support for the ARMv8.2-A 16-bit floating-point
18262 instructions or not. */
18263 return (TARGET_FP_F16INST
18264 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18265 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
18266 case EXCESS_PRECISION_TYPE_IMPLICIT
:
18267 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
18269 gcc_unreachable ();
18271 return FLT_EVAL_METHOD_UNPREDICTABLE
;
18274 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18275 scheduled for speculative execution. Reject the long-running division
18276 and square-root instructions. */
18279 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
18281 switch (get_attr_type (insn
))
18289 case TYPE_NEON_FP_SQRT_S
:
18290 case TYPE_NEON_FP_SQRT_D
:
18291 case TYPE_NEON_FP_SQRT_S_Q
:
18292 case TYPE_NEON_FP_SQRT_D_Q
:
18293 case TYPE_NEON_FP_DIV_S
:
18294 case TYPE_NEON_FP_DIV_D
:
18295 case TYPE_NEON_FP_DIV_S_Q
:
18296 case TYPE_NEON_FP_DIV_D_Q
:
18303 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18306 aarch64_compute_pressure_classes (reg_class
*classes
)
18309 classes
[i
++] = GENERAL_REGS
;
18310 classes
[i
++] = FP_REGS
;
18311 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18312 registers need to go in PR_LO_REGS at some point during their
18313 lifetime. Splitting it into two halves has the effect of making
18314 all predicates count against PR_LO_REGS, so that we try whenever
18315 possible to restrict the number of live predicates to 8. This
18316 greatly reduces the amount of spilling in certain loops. */
18317 classes
[i
++] = PR_LO_REGS
;
18318 classes
[i
++] = PR_HI_REGS
;
18322 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18325 aarch64_can_change_mode_class (machine_mode from
,
18326 machine_mode to
, reg_class_t
)
18328 if (BYTES_BIG_ENDIAN
)
18330 bool from_sve_p
= aarch64_sve_data_mode_p (from
);
18331 bool to_sve_p
= aarch64_sve_data_mode_p (to
);
18333 /* Don't allow changes between SVE data modes and non-SVE modes.
18334 See the comment at the head of aarch64-sve.md for details. */
18335 if (from_sve_p
!= to_sve_p
)
18338 /* Don't allow changes in element size: lane 0 of the new vector
18339 would not then be lane 0 of the old vector. See the comment
18340 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18343 In the worst case, this forces a register to be spilled in
18344 one mode and reloaded in the other, which handles the
18345 endianness correctly. */
18346 if (from_sve_p
&& GET_MODE_UNIT_SIZE (from
) != GET_MODE_UNIT_SIZE (to
))
18352 /* Implement TARGET_EARLY_REMAT_MODES. */
18355 aarch64_select_early_remat_modes (sbitmap modes
)
18357 /* SVE values are not normally live across a call, so it should be
18358 worth doing early rematerialization even in VL-specific mode. */
18359 for (int i
= 0; i
< NUM_MACHINE_MODES
; ++i
)
18361 machine_mode mode
= (machine_mode
) i
;
18362 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
18363 if (vec_flags
& VEC_ANY_SVE
)
18364 bitmap_set_bit (modes
, i
);
18368 /* Override the default target speculation_safe_value. */
18370 aarch64_speculation_safe_value (machine_mode mode
,
18371 rtx result
, rtx val
, rtx failval
)
18373 /* Maybe we should warn if falling back to hard barriers. They are
18374 likely to be noticably more expensive than the alternative below. */
18375 if (!aarch64_track_speculation
)
18376 return default_speculation_safe_value (mode
, result
, val
, failval
);
18379 val
= copy_to_mode_reg (mode
, val
);
18381 if (!aarch64_reg_or_zero (failval
, mode
))
18382 failval
= copy_to_mode_reg (mode
, failval
);
18384 emit_insn (gen_despeculate_copy (mode
, result
, val
, failval
));
18388 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18389 Look into the tuning structure for an estimate.
18390 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18391 Advanced SIMD 128 bits. */
18393 static HOST_WIDE_INT
18394 aarch64_estimated_poly_value (poly_int64 val
)
18396 enum aarch64_sve_vector_bits_enum width_source
18397 = aarch64_tune_params
.sve_width
;
18399 /* If we still don't have an estimate, use the default. */
18400 if (width_source
== SVE_SCALABLE
)
18401 return default_estimated_poly_value (val
);
18403 HOST_WIDE_INT over_128
= width_source
- 128;
18404 return val
.coeffs
[0] + val
.coeffs
[1] * over_128
/ 128;
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */
18453 #undef TARGET_ADDRESS_COST
18454 #define TARGET_ADDRESS_COST aarch64_address_cost
18456 /* This hook will determines whether unnamed bitfields affect the alignment
18457 of the containing structure. The hook returns true if the structure
18458 should inherit the alignment requirements of an unnamed bitfield's
18460 #undef TARGET_ALIGN_ANON_BITFIELD
18461 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
18463 #undef TARGET_ASM_ALIGNED_DI_OP
18464 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
18466 #undef TARGET_ASM_ALIGNED_HI_OP
18467 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
18469 #undef TARGET_ASM_ALIGNED_SI_OP
18470 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
18472 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
18473 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
18474 hook_bool_const_tree_hwi_hwi_const_tree_true
18476 #undef TARGET_ASM_FILE_START
18477 #define TARGET_ASM_FILE_START aarch64_start_file
18479 #undef TARGET_ASM_OUTPUT_MI_THUNK
18480 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
18482 #undef TARGET_ASM_SELECT_RTX_SECTION
18483 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
18485 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
18486 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
18488 #undef TARGET_BUILD_BUILTIN_VA_LIST
18489 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
18491 #undef TARGET_CALLEE_COPIES
18492 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
18494 #undef TARGET_CAN_ELIMINATE
18495 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
18497 #undef TARGET_CAN_INLINE_P
18498 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
18500 #undef TARGET_CANNOT_FORCE_CONST_MEM
18501 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
18503 #undef TARGET_CASE_VALUES_THRESHOLD
18504 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
18506 #undef TARGET_CONDITIONAL_REGISTER_USAGE
18507 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
18509 /* Only the least significant bit is used for initialization guard
18511 #undef TARGET_CXX_GUARD_MASK_BIT
18512 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
18514 #undef TARGET_C_MODE_FOR_SUFFIX
18515 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
18517 #ifdef TARGET_BIG_ENDIAN_DEFAULT
18518 #undef TARGET_DEFAULT_TARGET_FLAGS
18519 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
18522 #undef TARGET_CLASS_MAX_NREGS
18523 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
18525 #undef TARGET_BUILTIN_DECL
18526 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
18528 #undef TARGET_BUILTIN_RECIPROCAL
18529 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
18531 #undef TARGET_C_EXCESS_PRECISION
18532 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
18534 #undef TARGET_EXPAND_BUILTIN
18535 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
18537 #undef TARGET_EXPAND_BUILTIN_VA_START
18538 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
18540 #undef TARGET_FOLD_BUILTIN
18541 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
18543 #undef TARGET_FUNCTION_ARG
18544 #define TARGET_FUNCTION_ARG aarch64_function_arg
18546 #undef TARGET_FUNCTION_ARG_ADVANCE
18547 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
18549 #undef TARGET_FUNCTION_ARG_BOUNDARY
18550 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
18552 #undef TARGET_FUNCTION_ARG_PADDING
18553 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
/* Target hook overrides for AArch64.  Each pair below #undefs the default
   definition provided by target-def.h (included above) and redirects the
   hook to the AArch64 implementation; TARGET_INITIALIZER collects the
   whole table into the targetm vector at the end of this file.  */

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

/* Shrink-wrapping: allow saving/restoring individual registers
   ("components") on only the paths that need them.  */
#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

/* Deliberately defined empty: marks that this target provides
   vectorized builtins via the hook below.  */
#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
#define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
  aarch64_remove_extra_call_preserved_regs

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
18897 #undef TARGET_RUN_TARGET_SELFTESTS
18898 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18899 #endif /* #if CHECKING_P */
18901 struct gcc_target targetm
= TARGET_INITIALIZER
;
18903 #include "gt-aarch64.h"