1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
24 #define INCLUDE_STRING
26 #include "coretypes.h"
37 #include "stringpool.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
55 #include "langhooks.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Information about a legitimate vector immediate operand. */
82 struct simd_immediate_info
84 enum insn_type
{ MOV
, MVN
};
85 enum modifier_type
{ LSL
, MSL
};
87 simd_immediate_info () {}
88 simd_immediate_info (scalar_float_mode
, rtx
);
89 simd_immediate_info (scalar_int_mode
, unsigned HOST_WIDE_INT
,
90 insn_type
= MOV
, modifier_type
= LSL
,
92 simd_immediate_info (scalar_mode
, rtx
, rtx
);
94 /* The mode of the elements. */
97 /* The value of each element if all elements are the same, or the
98 first value if the constant is a series. */
101 /* The value of the step if the constant is a series, null otherwise. */
104 /* The instruction to use to move the immediate into a vector. */
107 /* The kind of shift modifier to use, and the number of bits to shift.
108 This is (LSL, 0) if no shift is needed. */
109 modifier_type modifier
;
113 /* Construct a floating-point immediate in which each element has mode
114 ELT_MODE_IN and value VALUE_IN. */
115 inline simd_immediate_info
116 ::simd_immediate_info (scalar_float_mode elt_mode_in
, rtx value_in
)
117 : elt_mode (elt_mode_in
), value (value_in
), step (NULL_RTX
), insn (MOV
),
118 modifier (LSL
), shift (0)
121 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
122 and value VALUE_IN. The other parameters are as for the structure
124 inline simd_immediate_info
125 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
126 unsigned HOST_WIDE_INT value_in
,
127 insn_type insn_in
, modifier_type modifier_in
,
128 unsigned int shift_in
)
129 : elt_mode (elt_mode_in
), value (gen_int_mode (value_in
, elt_mode_in
)),
130 step (NULL_RTX
), insn (insn_in
), modifier (modifier_in
), shift (shift_in
)
133 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
134 and where element I is equal to VALUE_IN + I * STEP_IN. */
135 inline simd_immediate_info
136 ::simd_immediate_info (scalar_mode elt_mode_in
, rtx value_in
, rtx step_in
)
137 : elt_mode (elt_mode_in
), value (value_in
), step (step_in
), insn (MOV
),
138 modifier (LSL
), shift (0)
141 /* The current code model. */
142 enum aarch64_code_model aarch64_cmodel
;
144 /* The number of 64-bit elements in an SVE vector. */
145 poly_uint16 aarch64_sve_vg
;
148 #undef TARGET_HAVE_TLS
149 #define TARGET_HAVE_TLS 1
152 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
153 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
155 machine_mode
*, int *,
157 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
158 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
159 static void aarch64_override_options_after_change (void);
160 static bool aarch64_vector_mode_supported_p (machine_mode
);
161 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
162 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
166 static machine_mode
aarch64_simd_container_mode (scalar_mode
, poly_int64
);
167 static bool aarch64_print_address_internal (FILE*, machine_mode
, rtx
,
168 aarch64_addr_query_type
);
169 static HOST_WIDE_INT
aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
);
171 /* Major revision number of the ARM Architecture implemented by the target. */
172 unsigned aarch64_architecture_version
;
174 /* The processor for which instructions should be scheduled. */
175 enum aarch64_processor aarch64_tune
= cortexa53
;
177 /* Mask to specify which instruction scheduling options should be used. */
178 unsigned long aarch64_tune_flags
= 0;
180 /* Global flag for PC relative loads. */
181 bool aarch64_pcrelative_literal_loads
;
183 /* Global flag for whether frame pointer is enabled. */
184 bool aarch64_use_frame_pointer
;
186 #define BRANCH_PROTECT_STR_MAX 255
187 char *accepted_branch_protection_string
= NULL
;
189 static enum aarch64_parse_opt_result
190 aarch64_parse_branch_protection (const char*, char**);
192 /* Support for command line parsing of boolean flags in the tuning
194 struct aarch64_flag_desc
200 #define AARCH64_FUSION_PAIR(name, internal_name) \
201 { name, AARCH64_FUSE_##internal_name },
202 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
204 { "none", AARCH64_FUSE_NOTHING
},
205 #include "aarch64-fusion-pairs.def"
206 { "all", AARCH64_FUSE_ALL
},
207 { NULL
, AARCH64_FUSE_NOTHING
}
210 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
211 { name, AARCH64_EXTRA_TUNE_##internal_name },
212 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
214 { "none", AARCH64_EXTRA_TUNE_NONE
},
215 #include "aarch64-tuning-flags.def"
216 { "all", AARCH64_EXTRA_TUNE_ALL
},
217 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
220 /* Tuning parameters. */
222 static const struct cpu_addrcost_table generic_addrcost_table
=
232 0, /* register_offset */
233 0, /* register_sextend */
234 0, /* register_zextend */
238 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
248 1, /* register_offset */
249 1, /* register_sextend */
250 2, /* register_zextend */
254 static const struct cpu_addrcost_table xgene1_addrcost_table
=
264 0, /* register_offset */
265 1, /* register_sextend */
266 1, /* register_zextend */
270 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
280 2, /* register_offset */
281 3, /* register_sextend */
282 3, /* register_zextend */
286 static const struct cpu_addrcost_table tsv110_addrcost_table
=
296 0, /* register_offset */
297 1, /* register_sextend */
298 1, /* register_zextend */
302 static const struct cpu_addrcost_table qdf24xx_addrcost_table
=
312 3, /* register_offset */
313 3, /* register_sextend */
314 3, /* register_zextend */
318 static const struct cpu_regmove_cost generic_regmove_cost
=
321 /* Avoid the use of slow int<->fp moves for spilling by setting
322 their cost higher than memmov_cost. */
328 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
331 /* Avoid the use of slow int<->fp moves for spilling by setting
332 their cost higher than memmov_cost. */
338 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
341 /* Avoid the use of slow int<->fp moves for spilling by setting
342 their cost higher than memmov_cost. */
348 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
351 /* Avoid the use of slow int<->fp moves for spilling by setting
352 their cost higher than memmov_cost (actual, 4 and 9). */
358 static const struct cpu_regmove_cost thunderx_regmove_cost
=
366 static const struct cpu_regmove_cost xgene1_regmove_cost
=
369 /* Avoid the use of slow int<->fp moves for spilling by setting
370 their cost higher than memmov_cost. */
376 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
379 /* Avoid the use of int<->fp moves for spilling. */
385 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
388 /* Avoid the use of int<->fp moves for spilling. */
394 static const struct cpu_regmove_cost tsv110_regmove_cost
=
397 /* Avoid the use of slow int<->fp moves for spilling by setting
398 their cost higher than memmov_cost. */
404 /* Generic costs for vector insn classes. */
405 static const struct cpu_vector_cost generic_vector_cost
=
407 1, /* scalar_int_stmt_cost */
408 1, /* scalar_fp_stmt_cost */
409 1, /* scalar_load_cost */
410 1, /* scalar_store_cost */
411 1, /* vec_int_stmt_cost */
412 1, /* vec_fp_stmt_cost */
413 2, /* vec_permute_cost */
414 1, /* vec_to_scalar_cost */
415 1, /* scalar_to_vec_cost */
416 1, /* vec_align_load_cost */
417 1, /* vec_unalign_load_cost */
418 1, /* vec_unalign_store_cost */
419 1, /* vec_store_cost */
420 3, /* cond_taken_branch_cost */
421 1 /* cond_not_taken_branch_cost */
424 /* QDF24XX costs for vector insn classes. */
425 static const struct cpu_vector_cost qdf24xx_vector_cost
=
427 1, /* scalar_int_stmt_cost */
428 1, /* scalar_fp_stmt_cost */
429 1, /* scalar_load_cost */
430 1, /* scalar_store_cost */
431 1, /* vec_int_stmt_cost */
432 3, /* vec_fp_stmt_cost */
433 2, /* vec_permute_cost */
434 1, /* vec_to_scalar_cost */
435 1, /* scalar_to_vec_cost */
436 1, /* vec_align_load_cost */
437 1, /* vec_unalign_load_cost */
438 1, /* vec_unalign_store_cost */
439 1, /* vec_store_cost */
440 3, /* cond_taken_branch_cost */
441 1 /* cond_not_taken_branch_cost */
444 /* ThunderX costs for vector insn classes. */
445 static const struct cpu_vector_cost thunderx_vector_cost
=
447 1, /* scalar_int_stmt_cost */
448 1, /* scalar_fp_stmt_cost */
449 3, /* scalar_load_cost */
450 1, /* scalar_store_cost */
451 4, /* vec_int_stmt_cost */
452 1, /* vec_fp_stmt_cost */
453 4, /* vec_permute_cost */
454 2, /* vec_to_scalar_cost */
455 2, /* scalar_to_vec_cost */
456 3, /* vec_align_load_cost */
457 5, /* vec_unalign_load_cost */
458 5, /* vec_unalign_store_cost */
459 1, /* vec_store_cost */
460 3, /* cond_taken_branch_cost */
461 3 /* cond_not_taken_branch_cost */
464 static const struct cpu_vector_cost tsv110_vector_cost
=
466 1, /* scalar_int_stmt_cost */
467 1, /* scalar_fp_stmt_cost */
468 5, /* scalar_load_cost */
469 1, /* scalar_store_cost */
470 2, /* vec_int_stmt_cost */
471 2, /* vec_fp_stmt_cost */
472 2, /* vec_permute_cost */
473 3, /* vec_to_scalar_cost */
474 2, /* scalar_to_vec_cost */
475 5, /* vec_align_load_cost */
476 5, /* vec_unalign_load_cost */
477 1, /* vec_unalign_store_cost */
478 1, /* vec_store_cost */
479 1, /* cond_taken_branch_cost */
480 1 /* cond_not_taken_branch_cost */
483 /* Generic costs for vector insn classes. */
484 static const struct cpu_vector_cost cortexa57_vector_cost
=
486 1, /* scalar_int_stmt_cost */
487 1, /* scalar_fp_stmt_cost */
488 4, /* scalar_load_cost */
489 1, /* scalar_store_cost */
490 2, /* vec_int_stmt_cost */
491 2, /* vec_fp_stmt_cost */
492 3, /* vec_permute_cost */
493 8, /* vec_to_scalar_cost */
494 8, /* scalar_to_vec_cost */
495 4, /* vec_align_load_cost */
496 4, /* vec_unalign_load_cost */
497 1, /* vec_unalign_store_cost */
498 1, /* vec_store_cost */
499 1, /* cond_taken_branch_cost */
500 1 /* cond_not_taken_branch_cost */
503 static const struct cpu_vector_cost exynosm1_vector_cost
=
505 1, /* scalar_int_stmt_cost */
506 1, /* scalar_fp_stmt_cost */
507 5, /* scalar_load_cost */
508 1, /* scalar_store_cost */
509 3, /* vec_int_stmt_cost */
510 3, /* vec_fp_stmt_cost */
511 3, /* vec_permute_cost */
512 3, /* vec_to_scalar_cost */
513 3, /* scalar_to_vec_cost */
514 5, /* vec_align_load_cost */
515 5, /* vec_unalign_load_cost */
516 1, /* vec_unalign_store_cost */
517 1, /* vec_store_cost */
518 1, /* cond_taken_branch_cost */
519 1 /* cond_not_taken_branch_cost */
522 /* Generic costs for vector insn classes. */
523 static const struct cpu_vector_cost xgene1_vector_cost
=
525 1, /* scalar_int_stmt_cost */
526 1, /* scalar_fp_stmt_cost */
527 5, /* scalar_load_cost */
528 1, /* scalar_store_cost */
529 2, /* vec_int_stmt_cost */
530 2, /* vec_fp_stmt_cost */
531 2, /* vec_permute_cost */
532 4, /* vec_to_scalar_cost */
533 4, /* scalar_to_vec_cost */
534 10, /* vec_align_load_cost */
535 10, /* vec_unalign_load_cost */
536 2, /* vec_unalign_store_cost */
537 2, /* vec_store_cost */
538 2, /* cond_taken_branch_cost */
539 1 /* cond_not_taken_branch_cost */
542 /* Costs for vector insn classes for Vulcan. */
543 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
545 1, /* scalar_int_stmt_cost */
546 6, /* scalar_fp_stmt_cost */
547 4, /* scalar_load_cost */
548 1, /* scalar_store_cost */
549 5, /* vec_int_stmt_cost */
550 6, /* vec_fp_stmt_cost */
551 3, /* vec_permute_cost */
552 6, /* vec_to_scalar_cost */
553 5, /* scalar_to_vec_cost */
554 8, /* vec_align_load_cost */
555 8, /* vec_unalign_load_cost */
556 4, /* vec_unalign_store_cost */
557 4, /* vec_store_cost */
558 2, /* cond_taken_branch_cost */
559 1 /* cond_not_taken_branch_cost */
562 /* Generic costs for branch instructions. */
563 static const struct cpu_branch_cost generic_branch_cost
=
565 1, /* Predictable. */
566 3 /* Unpredictable. */
569 /* Generic approximation modes. */
570 static const cpu_approx_modes generic_approx_modes
=
572 AARCH64_APPROX_NONE
, /* division */
573 AARCH64_APPROX_NONE
, /* sqrt */
574 AARCH64_APPROX_NONE
/* recip_sqrt */
577 /* Approximation modes for Exynos M1. */
578 static const cpu_approx_modes exynosm1_approx_modes
=
580 AARCH64_APPROX_NONE
, /* division */
581 AARCH64_APPROX_ALL
, /* sqrt */
582 AARCH64_APPROX_ALL
/* recip_sqrt */
585 /* Approximation modes for X-Gene 1. */
586 static const cpu_approx_modes xgene1_approx_modes
=
588 AARCH64_APPROX_NONE
, /* division */
589 AARCH64_APPROX_NONE
, /* sqrt */
590 AARCH64_APPROX_ALL
/* recip_sqrt */
593 /* Generic prefetch settings (which disable prefetch). */
594 static const cpu_prefetch_tune generic_prefetch_tune
=
597 -1, /* l1_cache_size */
598 -1, /* l1_cache_line_size */
599 -1, /* l2_cache_size */
600 true, /* prefetch_dynamic_strides */
601 -1, /* minimum_stride */
602 -1 /* default_opt_level */
605 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
608 -1, /* l1_cache_size */
609 64, /* l1_cache_line_size */
610 -1, /* l2_cache_size */
611 true, /* prefetch_dynamic_strides */
612 -1, /* minimum_stride */
613 -1 /* default_opt_level */
616 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
619 32, /* l1_cache_size */
620 64, /* l1_cache_line_size */
621 512, /* l2_cache_size */
622 false, /* prefetch_dynamic_strides */
623 2048, /* minimum_stride */
624 3 /* default_opt_level */
627 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
630 32, /* l1_cache_size */
631 128, /* l1_cache_line_size */
632 16*1024, /* l2_cache_size */
633 true, /* prefetch_dynamic_strides */
634 -1, /* minimum_stride */
635 3 /* default_opt_level */
638 static const cpu_prefetch_tune thunderx_prefetch_tune
=
641 32, /* l1_cache_size */
642 128, /* l1_cache_line_size */
643 -1, /* l2_cache_size */
644 true, /* prefetch_dynamic_strides */
645 -1, /* minimum_stride */
646 -1 /* default_opt_level */
649 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
652 32, /* l1_cache_size */
653 64, /* l1_cache_line_size */
654 256, /* l2_cache_size */
655 true, /* prefetch_dynamic_strides */
656 -1, /* minimum_stride */
657 -1 /* default_opt_level */
660 static const cpu_prefetch_tune tsv110_prefetch_tune
=
663 64, /* l1_cache_size */
664 64, /* l1_cache_line_size */
665 512, /* l2_cache_size */
666 true, /* prefetch_dynamic_strides */
667 -1, /* minimum_stride */
668 -1 /* default_opt_level */
671 static const cpu_prefetch_tune xgene1_prefetch_tune
=
674 32, /* l1_cache_size */
675 64, /* l1_cache_line_size */
676 256, /* l2_cache_size */
677 true, /* prefetch_dynamic_strides */
678 -1, /* minimum_stride */
679 -1 /* default_opt_level */
682 static const struct tune_params generic_tunings
=
684 &cortexa57_extra_costs
,
685 &generic_addrcost_table
,
686 &generic_regmove_cost
,
687 &generic_vector_cost
,
688 &generic_branch_cost
,
689 &generic_approx_modes
,
690 SVE_NOT_IMPLEMENTED
, /* sve_width */
693 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
694 "8", /* function_align. */
695 "4", /* jump_align. */
696 "8", /* loop_align. */
697 2, /* int_reassoc_width. */
698 4, /* fp_reassoc_width. */
699 1, /* vec_reassoc_width. */
700 2, /* min_div_recip_mul_sf. */
701 2, /* min_div_recip_mul_df. */
702 0, /* max_case_values. */
703 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
704 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
705 &generic_prefetch_tune
708 static const struct tune_params cortexa35_tunings
=
710 &cortexa53_extra_costs
,
711 &generic_addrcost_table
,
712 &cortexa53_regmove_cost
,
713 &generic_vector_cost
,
714 &generic_branch_cost
,
715 &generic_approx_modes
,
716 SVE_NOT_IMPLEMENTED
, /* sve_width */
719 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
720 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
721 "16", /* function_align. */
722 "4", /* jump_align. */
723 "8", /* loop_align. */
724 2, /* int_reassoc_width. */
725 4, /* fp_reassoc_width. */
726 1, /* vec_reassoc_width. */
727 2, /* min_div_recip_mul_sf. */
728 2, /* min_div_recip_mul_df. */
729 0, /* max_case_values. */
730 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
731 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
732 &generic_prefetch_tune
735 static const struct tune_params cortexa53_tunings
=
737 &cortexa53_extra_costs
,
738 &generic_addrcost_table
,
739 &cortexa53_regmove_cost
,
740 &generic_vector_cost
,
741 &generic_branch_cost
,
742 &generic_approx_modes
,
743 SVE_NOT_IMPLEMENTED
, /* sve_width */
746 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
747 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
748 "16", /* function_align. */
749 "4", /* jump_align. */
750 "8", /* loop_align. */
751 2, /* int_reassoc_width. */
752 4, /* fp_reassoc_width. */
753 1, /* vec_reassoc_width. */
754 2, /* min_div_recip_mul_sf. */
755 2, /* min_div_recip_mul_df. */
756 0, /* max_case_values. */
757 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
758 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
759 &generic_prefetch_tune
762 static const struct tune_params cortexa57_tunings
=
764 &cortexa57_extra_costs
,
765 &generic_addrcost_table
,
766 &cortexa57_regmove_cost
,
767 &cortexa57_vector_cost
,
768 &generic_branch_cost
,
769 &generic_approx_modes
,
770 SVE_NOT_IMPLEMENTED
, /* sve_width */
773 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
774 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
775 "16", /* function_align. */
776 "4", /* jump_align. */
777 "8", /* loop_align. */
778 2, /* int_reassoc_width. */
779 4, /* fp_reassoc_width. */
780 1, /* vec_reassoc_width. */
781 2, /* min_div_recip_mul_sf. */
782 2, /* min_div_recip_mul_df. */
783 0, /* max_case_values. */
784 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
785 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
786 &generic_prefetch_tune
789 static const struct tune_params cortexa72_tunings
=
791 &cortexa57_extra_costs
,
792 &generic_addrcost_table
,
793 &cortexa57_regmove_cost
,
794 &cortexa57_vector_cost
,
795 &generic_branch_cost
,
796 &generic_approx_modes
,
797 SVE_NOT_IMPLEMENTED
, /* sve_width */
800 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
801 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
802 "16", /* function_align. */
803 "4", /* jump_align. */
804 "8", /* loop_align. */
805 2, /* int_reassoc_width. */
806 4, /* fp_reassoc_width. */
807 1, /* vec_reassoc_width. */
808 2, /* min_div_recip_mul_sf. */
809 2, /* min_div_recip_mul_df. */
810 0, /* max_case_values. */
811 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
812 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
813 &generic_prefetch_tune
816 static const struct tune_params cortexa73_tunings
=
818 &cortexa57_extra_costs
,
819 &generic_addrcost_table
,
820 &cortexa57_regmove_cost
,
821 &cortexa57_vector_cost
,
822 &generic_branch_cost
,
823 &generic_approx_modes
,
824 SVE_NOT_IMPLEMENTED
, /* sve_width */
825 4, /* memmov_cost. */
827 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
828 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
829 "16", /* function_align. */
830 "4", /* jump_align. */
831 "8", /* loop_align. */
832 2, /* int_reassoc_width. */
833 4, /* fp_reassoc_width. */
834 1, /* vec_reassoc_width. */
835 2, /* min_div_recip_mul_sf. */
836 2, /* min_div_recip_mul_df. */
837 0, /* max_case_values. */
838 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
839 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
840 &generic_prefetch_tune
845 static const struct tune_params exynosm1_tunings
=
847 &exynosm1_extra_costs
,
848 &exynosm1_addrcost_table
,
849 &exynosm1_regmove_cost
,
850 &exynosm1_vector_cost
,
851 &generic_branch_cost
,
852 &exynosm1_approx_modes
,
853 SVE_NOT_IMPLEMENTED
, /* sve_width */
856 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
857 "4", /* function_align. */
858 "4", /* jump_align. */
859 "4", /* loop_align. */
860 2, /* int_reassoc_width. */
861 4, /* fp_reassoc_width. */
862 1, /* vec_reassoc_width. */
863 2, /* min_div_recip_mul_sf. */
864 2, /* min_div_recip_mul_df. */
865 48, /* max_case_values. */
866 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
867 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
868 &exynosm1_prefetch_tune
871 static const struct tune_params thunderxt88_tunings
=
873 &thunderx_extra_costs
,
874 &generic_addrcost_table
,
875 &thunderx_regmove_cost
,
876 &thunderx_vector_cost
,
877 &generic_branch_cost
,
878 &generic_approx_modes
,
879 SVE_NOT_IMPLEMENTED
, /* sve_width */
882 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
883 "8", /* function_align. */
884 "8", /* jump_align. */
885 "8", /* loop_align. */
886 2, /* int_reassoc_width. */
887 4, /* fp_reassoc_width. */
888 1, /* vec_reassoc_width. */
889 2, /* min_div_recip_mul_sf. */
890 2, /* min_div_recip_mul_df. */
891 0, /* max_case_values. */
892 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
893 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
894 &thunderxt88_prefetch_tune
897 static const struct tune_params thunderx_tunings
=
899 &thunderx_extra_costs
,
900 &generic_addrcost_table
,
901 &thunderx_regmove_cost
,
902 &thunderx_vector_cost
,
903 &generic_branch_cost
,
904 &generic_approx_modes
,
905 SVE_NOT_IMPLEMENTED
, /* sve_width */
908 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
909 "8", /* function_align. */
910 "8", /* jump_align. */
911 "8", /* loop_align. */
912 2, /* int_reassoc_width. */
913 4, /* fp_reassoc_width. */
914 1, /* vec_reassoc_width. */
915 2, /* min_div_recip_mul_sf. */
916 2, /* min_div_recip_mul_df. */
917 0, /* max_case_values. */
918 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
919 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
920 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
921 &thunderx_prefetch_tune
924 static const struct tune_params tsv110_tunings
=
927 &tsv110_addrcost_table
,
928 &tsv110_regmove_cost
,
930 &generic_branch_cost
,
931 &generic_approx_modes
,
932 SVE_NOT_IMPLEMENTED
, /* sve_width */
935 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_CMP_BRANCH
936 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
937 "16", /* function_align. */
938 "4", /* jump_align. */
939 "8", /* loop_align. */
940 2, /* int_reassoc_width. */
941 4, /* fp_reassoc_width. */
942 1, /* vec_reassoc_width. */
943 2, /* min_div_recip_mul_sf. */
944 2, /* min_div_recip_mul_df. */
945 0, /* max_case_values. */
946 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
947 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
948 &tsv110_prefetch_tune
951 static const struct tune_params xgene1_tunings
=
954 &xgene1_addrcost_table
,
955 &xgene1_regmove_cost
,
957 &generic_branch_cost
,
958 &xgene1_approx_modes
,
959 SVE_NOT_IMPLEMENTED
, /* sve_width */
962 AARCH64_FUSE_NOTHING
, /* fusible_ops */
963 "16", /* function_align. */
964 "16", /* jump_align. */
965 "16", /* loop_align. */
966 2, /* int_reassoc_width. */
967 4, /* fp_reassoc_width. */
968 1, /* vec_reassoc_width. */
969 2, /* min_div_recip_mul_sf. */
970 2, /* min_div_recip_mul_df. */
971 17, /* max_case_values. */
972 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
973 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
974 &xgene1_prefetch_tune
977 static const struct tune_params emag_tunings
=
980 &xgene1_addrcost_table
,
981 &xgene1_regmove_cost
,
983 &generic_branch_cost
,
984 &xgene1_approx_modes
,
988 AARCH64_FUSE_NOTHING
, /* fusible_ops */
989 "16", /* function_align. */
990 "16", /* jump_align. */
991 "16", /* loop_align. */
992 2, /* int_reassoc_width. */
993 4, /* fp_reassoc_width. */
994 1, /* vec_reassoc_width. */
995 2, /* min_div_recip_mul_sf. */
996 2, /* min_div_recip_mul_df. */
997 17, /* max_case_values. */
998 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
999 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1000 &xgene1_prefetch_tune
1003 static const struct tune_params qdf24xx_tunings
=
1005 &qdf24xx_extra_costs
,
1006 &qdf24xx_addrcost_table
,
1007 &qdf24xx_regmove_cost
,
1008 &qdf24xx_vector_cost
,
1009 &generic_branch_cost
,
1010 &generic_approx_modes
,
1011 SVE_NOT_IMPLEMENTED
, /* sve_width */
1012 4, /* memmov_cost */
1014 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1015 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
1016 "16", /* function_align. */
1017 "8", /* jump_align. */
1018 "16", /* loop_align. */
1019 2, /* int_reassoc_width. */
1020 4, /* fp_reassoc_width. */
1021 1, /* vec_reassoc_width. */
1022 2, /* min_div_recip_mul_sf. */
1023 2, /* min_div_recip_mul_df. */
1024 0, /* max_case_values. */
1025 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1026 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS
, /* tune_flags. */
1027 &qdf24xx_prefetch_tune
1030 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1032 static const struct tune_params saphira_tunings
=
1034 &generic_extra_costs
,
1035 &generic_addrcost_table
,
1036 &generic_regmove_cost
,
1037 &generic_vector_cost
,
1038 &generic_branch_cost
,
1039 &generic_approx_modes
,
1040 SVE_NOT_IMPLEMENTED
, /* sve_width */
1041 4, /* memmov_cost */
1043 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1044 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
1045 "16", /* function_align. */
1046 "8", /* jump_align. */
1047 "16", /* loop_align. */
1048 2, /* int_reassoc_width. */
1049 4, /* fp_reassoc_width. */
1050 1, /* vec_reassoc_width. */
1051 2, /* min_div_recip_mul_sf. */
1052 2, /* min_div_recip_mul_df. */
1053 0, /* max_case_values. */
1054 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1055 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1056 &generic_prefetch_tune
1059 static const struct tune_params thunderx2t99_tunings
=
1061 &thunderx2t99_extra_costs
,
1062 &thunderx2t99_addrcost_table
,
1063 &thunderx2t99_regmove_cost
,
1064 &thunderx2t99_vector_cost
,
1065 &generic_branch_cost
,
1066 &generic_approx_modes
,
1067 SVE_NOT_IMPLEMENTED
, /* sve_width */
1068 4, /* memmov_cost. */
1069 4, /* issue_rate. */
1070 (AARCH64_FUSE_CMP_BRANCH
| AARCH64_FUSE_AES_AESMC
1071 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
1072 "16", /* function_align. */
1073 "8", /* jump_align. */
1074 "16", /* loop_align. */
1075 3, /* int_reassoc_width. */
1076 2, /* fp_reassoc_width. */
1077 2, /* vec_reassoc_width. */
1078 2, /* min_div_recip_mul_sf. */
1079 2, /* min_div_recip_mul_df. */
1080 0, /* max_case_values. */
1081 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1082 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1083 &thunderx2t99_prefetch_tune
1086 /* Support for fine-grained override of the tuning structures. */
1087 struct aarch64_tuning_override_function
1090 void (*parse_override
)(const char*, struct tune_params
*);
1093 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
1094 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
1095 static void aarch64_parse_sve_width_string (const char*, struct tune_params
*);
1097 static const struct aarch64_tuning_override_function
1098 aarch64_tuning_override_functions
[] =
1100 { "fuse", aarch64_parse_fuse_string
},
1101 { "tune", aarch64_parse_tune_string
},
1102 { "sve_width", aarch64_parse_sve_width_string
},
1106 /* A processor implementing AArch64. */
1109 const char *const name
;
1110 enum aarch64_processor ident
;
1111 enum aarch64_processor sched_core
;
1112 enum aarch64_arch arch
;
1113 unsigned architecture_version
;
1114 const unsigned long flags
;
1115 const struct tune_params
*const tune
;
1118 /* Architectures implementing AArch64. */
1119 static const struct processor all_architectures
[] =
1121 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1122 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1123 #include "aarch64-arches.def"
1124 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1127 /* Processor cores implementing AArch64. */
1128 static const struct processor all_cores
[] =
1130 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1131 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1132 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1133 FLAGS, &COSTS##_tunings},
1134 #include "aarch64-cores.def"
1135 {"generic", generic
, cortexa53
, AARCH64_ARCH_8A
, 8,
1136 AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
1137 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1141 /* Target specification. These are populated by the -march, -mtune, -mcpu
1142 handling code or by target attributes. */
1143 static const struct processor
*selected_arch
;
1144 static const struct processor
*selected_cpu
;
1145 static const struct processor
*selected_tune
;
1147 /* The current tuning set. */
1148 struct tune_params aarch64_tune_params
= generic_tunings
;
1150 /* Table of machine attributes. */
1151 static const struct attribute_spec aarch64_attribute_table
[] =
1153 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1154 affects_type_identity, handler, exclude } */
1155 { "aarch64_vector_pcs", 0, 0, false, true, true, false, NULL
, NULL
},
1156 { NULL
, 0, 0, false, false, false, false, NULL
, NULL
}
1159 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1161 /* An ISA extension in the co-processor and main instruction set space. */
1162 struct aarch64_option_extension
1164 const char *const name
;
1165 const unsigned long flags_on
;
1166 const unsigned long flags_off
;
1169 typedef enum aarch64_cond_code
1171 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
1172 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
1173 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
1177 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1179 struct aarch64_branch_protect_type
1181 /* The type's name that the user passes to the branch-protection option
1184 /* Function to handle the protection type and set global variables.
1185 First argument is the string token corresponding with this type and the
1186 second argument is the next token in the option string.
1188 * AARCH64_PARSE_OK: Handling was successful.
1189 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1190 should print an error.
1191 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1193 enum aarch64_parse_opt_result (*handler
)(char*, char*);
1194 /* A list of types that can follow this type in the option string. */
1195 const aarch64_branch_protect_type
* subtypes
;
1196 unsigned int num_subtypes
;
1199 static enum aarch64_parse_opt_result
1200 aarch64_handle_no_branch_protection (char* str
, char* rest
)
1202 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
1203 aarch64_enable_bti
= 0;
1206 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1207 return AARCH64_PARSE_INVALID_FEATURE
;
1209 return AARCH64_PARSE_OK
;
1212 static enum aarch64_parse_opt_result
1213 aarch64_handle_standard_branch_protection (char* str
, char* rest
)
1215 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1216 aarch64_enable_bti
= 1;
1219 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1220 return AARCH64_PARSE_INVALID_FEATURE
;
1222 return AARCH64_PARSE_OK
;
1225 static enum aarch64_parse_opt_result
1226 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED
,
1227 char* rest ATTRIBUTE_UNUSED
)
1229 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1230 return AARCH64_PARSE_OK
;
1233 static enum aarch64_parse_opt_result
1234 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED
,
1235 char* rest ATTRIBUTE_UNUSED
)
1237 aarch64_ra_sign_scope
= AARCH64_FUNCTION_ALL
;
1238 return AARCH64_PARSE_OK
;
1241 static enum aarch64_parse_opt_result
1242 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED
,
1243 char* rest ATTRIBUTE_UNUSED
)
1245 aarch64_enable_bti
= 1;
1246 return AARCH64_PARSE_OK
;
1249 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes
[] = {
1250 { "leaf", aarch64_handle_pac_ret_leaf
, NULL
, 0 },
1251 { NULL
, NULL
, NULL
, 0 }
1254 static const struct aarch64_branch_protect_type aarch64_branch_protect_types
[] = {
1255 { "none", aarch64_handle_no_branch_protection
, NULL
, 0 },
1256 { "standard", aarch64_handle_standard_branch_protection
, NULL
, 0 },
1257 { "pac-ret", aarch64_handle_pac_ret_protection
, aarch64_pac_ret_subtypes
,
1258 ARRAY_SIZE (aarch64_pac_ret_subtypes
) },
1259 { "bti", aarch64_handle_bti_protection
, NULL
, 0 },
1260 { NULL
, NULL
, NULL
, 0 }
/* The condition codes of the processor, and the inverse function.  Indexed
   by aarch64_cond_code, so adjacent pairs are inverses of each other.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
1270 /* Generate code to enable conditional branches in functions over 1 MiB. */
1272 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
1273 const char * branch_format
)
1275 rtx_code_label
* tmp_label
= gen_label_rtx ();
1276 char label_buf
[256];
1278 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
1279 CODE_LABEL_NUMBER (tmp_label
));
1280 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
1281 rtx dest_label
= operands
[pos_label
];
1282 operands
[pos_label
] = tmp_label
;
1284 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
1285 output_asm_insn (buffer
, operands
);
1287 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
1288 operands
[pos_label
] = dest_label
;
1289 output_asm_insn (buffer
, operands
);
1294 aarch64_err_no_fpadvsimd (machine_mode mode
)
1296 if (TARGET_GENERAL_REGS_ONLY
)
1297 if (FLOAT_MODE_P (mode
))
1298 error ("%qs is incompatible with the use of floating-point types",
1299 "-mgeneral-regs-only");
1301 error ("%qs is incompatible with the use of vector types",
1302 "-mgeneral-regs-only");
1304 if (FLOAT_MODE_P (mode
))
1305 error ("%qs feature modifier is incompatible with the use of"
1306 " floating-point types", "+nofp");
1308 error ("%qs feature modifier is incompatible with the use of"
1309 " vector types", "+nofp");
1312 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1313 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1314 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1315 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1316 and GENERAL_REGS is lower than the memory cost (in this case the best class
1317 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1318 cost results in bad allocations with many redundant int<->FP moves which
1319 are expensive on various cores.
1320 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1321 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1322 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1323 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1324 The result of this is that it is no longer inefficient to have a higher
1325 memory move cost than the register move cost.
1329 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
1330 reg_class_t best_class
)
1334 if (!reg_class_subset_p (GENERAL_REGS
, allocno_class
)
1335 || !reg_class_subset_p (FP_REGS
, allocno_class
))
1336 return allocno_class
;
1338 if (!reg_class_subset_p (GENERAL_REGS
, best_class
)
1339 || !reg_class_subset_p (FP_REGS
, best_class
))
1342 mode
= PSEUDO_REGNO_MODE (regno
);
1343 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
1347 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1349 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1350 return aarch64_tune_params
.min_div_recip_mul_sf
;
1351 return aarch64_tune_params
.min_div_recip_mul_df
;
1354 /* Return the reassociation width of treeop OPC with mode MODE. */
1356 aarch64_reassociation_width (unsigned opc
, machine_mode mode
)
1358 if (VECTOR_MODE_P (mode
))
1359 return aarch64_tune_params
.vec_reassoc_width
;
1360 if (INTEGRAL_MODE_P (mode
))
1361 return aarch64_tune_params
.int_reassoc_width
;
1362 /* Avoid reassociating floating point addition so we emit more FMAs. */
1363 if (FLOAT_MODE_P (mode
) && opc
!= PLUS_EXPR
)
1364 return aarch64_tune_params
.fp_reassoc_width
;
1368 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1370 aarch64_dbx_register_number (unsigned regno
)
1372 if (GP_REGNUM_P (regno
))
1373 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1374 else if (regno
== SP_REGNUM
)
1375 return AARCH64_DWARF_SP
;
1376 else if (FP_REGNUM_P (regno
))
1377 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1378 else if (PR_REGNUM_P (regno
))
1379 return AARCH64_DWARF_P0
+ regno
- P0_REGNUM
;
1380 else if (regno
== VG_REGNUM
)
1381 return AARCH64_DWARF_VG
;
1383 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1384 equivalent DWARF register. */
1385 return DWARF_FRAME_REGISTERS
;
1388 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1390 aarch64_advsimd_struct_mode_p (machine_mode mode
)
1393 && (mode
== OImode
|| mode
== CImode
|| mode
== XImode
));
1396 /* Return true if MODE is an SVE predicate mode. */
1398 aarch64_sve_pred_mode_p (machine_mode mode
)
1401 && (mode
== VNx16BImode
1402 || mode
== VNx8BImode
1403 || mode
== VNx4BImode
1404 || mode
== VNx2BImode
));
/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
   a structure of 2, 3 or 4 vectors.  */
const unsigned int VEC_STRUCT   = 8;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1418 /* Return a set of flags describing the vector properties of mode MODE.
1419 Ignore modes that are not supported by the current target. */
1421 aarch64_classify_vector_mode (machine_mode mode
)
1423 if (aarch64_advsimd_struct_mode_p (mode
))
1424 return VEC_ADVSIMD
| VEC_STRUCT
;
1426 if (aarch64_sve_pred_mode_p (mode
))
1427 return VEC_SVE_PRED
;
1429 scalar_mode inner
= GET_MODE_INNER (mode
);
1430 if (VECTOR_MODE_P (mode
)
1437 || inner
== DFmode
))
1441 if (known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
))
1442 return VEC_SVE_DATA
;
1443 if (known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 2)
1444 || known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 3)
1445 || known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 4))
1446 return VEC_SVE_DATA
| VEC_STRUCT
;
1449 /* This includes V1DF but not V1DI (which doesn't exist). */
1451 && (known_eq (GET_MODE_BITSIZE (mode
), 64)
1452 || known_eq (GET_MODE_BITSIZE (mode
), 128)))
1459 /* Return true if MODE is any of the data vector modes, including
1462 aarch64_vector_data_mode_p (machine_mode mode
)
1464 return aarch64_classify_vector_mode (mode
) & VEC_ANY_DATA
;
1467 /* Return true if MODE is an SVE data vector mode; either a single vector
1468 or a structure of vectors. */
1470 aarch64_sve_data_mode_p (machine_mode mode
)
1472 return aarch64_classify_vector_mode (mode
) & VEC_SVE_DATA
;
1475 /* Implement target hook TARGET_ARRAY_MODE. */
1476 static opt_machine_mode
1477 aarch64_array_mode (machine_mode mode
, unsigned HOST_WIDE_INT nelems
)
1479 if (aarch64_classify_vector_mode (mode
) == VEC_SVE_DATA
1480 && IN_RANGE (nelems
, 2, 4))
1481 return mode_for_vector (GET_MODE_INNER (mode
),
1482 GET_MODE_NUNITS (mode
) * nelems
);
1484 return opt_machine_mode ();
1487 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1489 aarch64_array_mode_supported_p (machine_mode mode
,
1490 unsigned HOST_WIDE_INT nelems
)
1493 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1494 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1495 && (nelems
>= 2 && nelems
<= 4))
1501 /* Return the SVE predicate mode to use for elements that have
1502 ELEM_NBYTES bytes, if such a mode exists. */
1505 aarch64_sve_pred_mode (unsigned int elem_nbytes
)
1509 if (elem_nbytes
== 1)
1511 if (elem_nbytes
== 2)
1513 if (elem_nbytes
== 4)
1515 if (elem_nbytes
== 8)
1518 return opt_machine_mode ();
1521 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1523 static opt_machine_mode
1524 aarch64_get_mask_mode (poly_uint64 nunits
, poly_uint64 nbytes
)
1526 if (TARGET_SVE
&& known_eq (nbytes
, BYTES_PER_SVE_VECTOR
))
1528 unsigned int elem_nbytes
= vector_element_size (nbytes
, nunits
);
1529 machine_mode pred_mode
;
1530 if (aarch64_sve_pred_mode (elem_nbytes
).exists (&pred_mode
))
1534 return default_get_mask_mode (nunits
, nbytes
);
1537 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1538 prefer to use the first arithmetic operand as the else value if
1539 the else value doesn't matter, since that exactly matches the SVE
1540 destructive merging form. For ternary operations we could either
1541 pick the first operand and use FMAD-like instructions or the last
1542 operand and use FMLA-like instructions; the latter seems more
1546 aarch64_preferred_else_value (unsigned, tree
, unsigned int nops
, tree
*ops
)
1548 return nops
== 3 ? ops
[2] : ops
[0];
1551 /* Implement TARGET_HARD_REGNO_NREGS. */
1554 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1556 /* ??? Logically we should only need to provide a value when
1557 HARD_REGNO_MODE_OK says that the combination is valid,
1558 but at the moment we need to handle all modes. Just ignore
1559 any runtime parts for registers that can't store them. */
1560 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
1561 switch (aarch64_regno_regclass (regno
))
1565 if (aarch64_sve_data_mode_p (mode
))
1566 return exact_div (GET_MODE_SIZE (mode
),
1567 BYTES_PER_SVE_VECTOR
).to_constant ();
1568 return CEIL (lowest_size
, UNITS_PER_VREG
);
1574 return CEIL (lowest_size
, UNITS_PER_WORD
);
1579 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1582 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
1584 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1585 return regno
== CC_REGNUM
;
1587 if (regno
== VG_REGNUM
)
1588 /* This must have the same size as _Unwind_Word. */
1589 return mode
== DImode
;
1591 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1592 if (vec_flags
& VEC_SVE_PRED
)
1593 return PR_REGNUM_P (regno
);
1595 if (PR_REGNUM_P (regno
))
1598 if (regno
== SP_REGNUM
)
1599 /* The purpose of comparing with ptr_mode is to support the
1600 global register variable associated with the stack pointer
1601 register via the syntax of asm ("wsp") in ILP32. */
1602 return mode
== Pmode
|| mode
== ptr_mode
;
1604 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
1605 return mode
== Pmode
;
1607 if (GP_REGNUM_P (regno
))
1609 if (known_le (GET_MODE_SIZE (mode
), 8))
1611 else if (known_le (GET_MODE_SIZE (mode
), 16))
1612 return (regno
& 1) == 0;
1614 else if (FP_REGNUM_P (regno
))
1616 if (vec_flags
& VEC_STRUCT
)
1617 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
1619 return !VECTOR_MODE_P (mode
) || vec_flags
!= 0;
1625 /* Return true if this is a definition of a vectorized simd function. */
1628 aarch64_simd_decl_p (tree fndecl
)
1634 fntype
= TREE_TYPE (fndecl
);
1638 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1639 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype
)) != NULL
)
1645 /* Return the mode a register save/restore should use. DImode for integer
1646 registers, DFmode for FP registers in non-SIMD functions (they only save
1647 the bottom half of a 128 bit register), or TFmode for FP registers in
1651 aarch64_reg_save_mode (tree fndecl
, unsigned regno
)
1653 return GP_REGNUM_P (regno
)
1655 : (aarch64_simd_decl_p (fndecl
) ? E_TFmode
: E_DFmode
);
1658 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1659 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1660 clobbers the top 64 bits when restoring the bottom 64 bits. */
1663 aarch64_hard_regno_call_part_clobbered (unsigned int regno
, machine_mode mode
)
1665 return FP_REGNUM_P (regno
) && maybe_gt (GET_MODE_SIZE (mode
), 8);
1668 /* Implement REGMODE_NATURAL_SIZE. */
1670 aarch64_regmode_natural_size (machine_mode mode
)
1672 /* The natural size for SVE data modes is one SVE data vector,
1673 and similarly for predicates. We can't independently modify
1674 anything smaller than that. */
1675 /* ??? For now, only do this for variable-width SVE registers.
1676 Doing it for constant-sized registers breaks lower-subreg.c. */
1677 /* ??? And once that's fixed, we should probably have similar
1678 code for Advanced SIMD. */
1679 if (!aarch64_sve_vg
.is_constant ())
1681 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1682 if (vec_flags
& VEC_SVE_PRED
)
1683 return BYTES_PER_SVE_PRED
;
1684 if (vec_flags
& VEC_SVE_DATA
)
1685 return BYTES_PER_SVE_VECTOR
;
1687 return UNITS_PER_WORD
;
1690 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1692 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned,
1695 /* The predicate mode determines which bits are significant and
1696 which are "don't care". Decreasing the number of lanes would
1697 lose data while increasing the number of lanes would make bits
1698 unnecessarily significant. */
1699 if (PR_REGNUM_P (regno
))
1701 if (known_ge (GET_MODE_SIZE (mode
), 4))
1707 /* Return true if I's bits are consecutive ones from the MSB. */
1709 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i
)
1711 return exact_log2 (-i
) != HOST_WIDE_INT_M1
;
1714 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1715 that strcpy from constants will be faster. */
1717 static HOST_WIDE_INT
1718 aarch64_constant_alignment (const_tree exp
, HOST_WIDE_INT align
)
1720 if (TREE_CODE (exp
) == STRING_CST
&& !optimize_size
)
1721 return MAX (align
, BITS_PER_WORD
);
1725 /* Return true if calls to DECL should be treated as
1726 long-calls (ie called via a register). */
1728 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
1733 /* Return true if calls to symbol-ref SYM should be treated as
1734 long-calls (ie called via a register). */
1736 aarch64_is_long_call_p (rtx sym
)
1738 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
1741 /* Return true if calls to symbol-ref SYM should not go through
1745 aarch64_is_noplt_call_p (rtx sym
)
1747 const_tree decl
= SYMBOL_REF_DECL (sym
);
1752 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
1753 && !targetm
.binds_local_p (decl
))
1759 /* Return true if the offsets to a zero/sign-extract operation
1760 represent an expression that matches an extend operation. The
1761 operands represent the paramters from
1763 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1765 aarch64_is_extend_from_extract (scalar_int_mode mode
, rtx mult_imm
,
1768 HOST_WIDE_INT mult_val
, extract_val
;
1770 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
1773 mult_val
= INTVAL (mult_imm
);
1774 extract_val
= INTVAL (extract_imm
);
1777 && extract_val
< GET_MODE_BITSIZE (mode
)
1778 && exact_log2 (extract_val
& ~7) > 0
1779 && (extract_val
& 7) <= 4
1780 && mult_val
== (1 << (extract_val
& 7)))
1786 /* Emit an insn that's a simple single-set. Both the operands must be
1787 known to be valid. */
1788 inline static rtx_insn
*
1789 emit_set_insn (rtx x
, rtx y
)
1791 return emit_insn (gen_rtx_SET (x
, y
));
1794 /* X and Y are two things to compare using CODE. Emit the compare insn and
1795 return the rtx for register 0 in the proper mode. */
1797 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
1799 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
1800 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
1802 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
1806 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1809 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code
, rtx x
, rtx y
,
1810 machine_mode y_mode
)
1812 if (y_mode
== E_QImode
|| y_mode
== E_HImode
)
1814 if (CONST_INT_P (y
))
1815 y
= GEN_INT (INTVAL (y
) & GET_MODE_MASK (y_mode
));
1819 machine_mode cc_mode
;
1821 t
= gen_rtx_ZERO_EXTEND (SImode
, y
);
1822 t
= gen_rtx_COMPARE (CC_SWPmode
, t
, x
);
1823 cc_mode
= CC_SWPmode
;
1824 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
1825 emit_set_insn (cc_reg
, t
);
1830 return aarch64_gen_compare_reg (code
, x
, y
);
1833 /* Build the SYMBOL_REF for __tls_get_addr. */
1835 static GTY(()) rtx tls_get_addr_libfunc
;
1838 aarch64_tls_get_addr (void)
1840 if (!tls_get_addr_libfunc
)
1841 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
1842 return tls_get_addr_libfunc
;
1845 /* Return the TLS model to use for ADDR. */
1847 static enum tls_model
1848 tls_symbolic_operand_type (rtx addr
)
1850 enum tls_model tls_kind
= TLS_MODEL_NONE
;
1851 if (GET_CODE (addr
) == CONST
)
1854 rtx sym
= strip_offset (addr
, &addend
);
1855 if (GET_CODE (sym
) == SYMBOL_REF
)
1856 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
1858 else if (GET_CODE (addr
) == SYMBOL_REF
)
1859 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
1864 /* We'll allow lo_sum's in addresses in our legitimate addresses
1865 so that combine would take care of combining addresses where
1866 necessary, but for generation purposes, we'll generate the address
1869 tmp = hi (symbol_ref); adrp x1, foo
1870 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1874 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1875 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1879 Load TLS symbol, depending on TLS mechanism and TLS access model.
1881 Global Dynamic - Traditional TLS:
1882 adrp tmp, :tlsgd:imm
1883 add dest, tmp, #:tlsgd_lo12:imm
1886 Global Dynamic - TLS Descriptors:
1887 adrp dest, :tlsdesc:imm
1888 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1889 add dest, dest, #:tlsdesc_lo12:imm
1896 adrp tmp, :gottprel:imm
1897 ldr dest, [tmp, #:gottprel_lo12:imm]
1902 add t0, tp, #:tprel_hi12:imm, lsl #12
1903 add t0, t0, #:tprel_lo12_nc:imm
1907 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
1908 enum aarch64_symbol_type type
)
1912 case SYMBOL_SMALL_ABSOLUTE
:
1914 /* In ILP32, the mode of dest can be either SImode or DImode. */
1916 machine_mode mode
= GET_MODE (dest
);
1918 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1920 if (can_create_pseudo_p ())
1921 tmp_reg
= gen_reg_rtx (mode
);
1923 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1924 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
1928 case SYMBOL_TINY_ABSOLUTE
:
1929 emit_insn (gen_rtx_SET (dest
, imm
));
1932 case SYMBOL_SMALL_GOT_28K
:
1934 machine_mode mode
= GET_MODE (dest
);
1935 rtx gp_rtx
= pic_offset_table_rtx
;
1939 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1940 here before rtl expand. Tree IVOPT will generate rtl pattern to
1941 decide rtx costs, in which case pic_offset_table_rtx is not
1942 initialized. For that case no need to generate the first adrp
1943 instruction as the final cost for global variable access is
1947 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1948 using the page base as GOT base, the first page may be wasted,
1949 in the worst scenario, there is only 28K space for GOT).
1951 The generate instruction sequence for accessing global variable
1954 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1956 Only one instruction needed. But we must initialize
1957 pic_offset_table_rtx properly. We generate initialize insn for
1958 every global access, and allow CSE to remove all redundant.
1960 The final instruction sequences will look like the following
1961 for multiply global variables access.
1963 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1965 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1966 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1967 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1970 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
1971 crtl
->uses_pic_offset_table
= 1;
1972 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
1974 if (mode
!= GET_MODE (gp_rtx
))
1975 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
1979 if (mode
== ptr_mode
)
1982 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
1984 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
1986 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1990 gcc_assert (mode
== Pmode
);
1992 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
1993 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1996 /* The operand is expected to be MEM. Whenever the related insn
1997 pattern changed, above code which calculate mem should be
1999 gcc_assert (GET_CODE (mem
) == MEM
);
2000 MEM_READONLY_P (mem
) = 1;
2001 MEM_NOTRAP_P (mem
) = 1;
2006 case SYMBOL_SMALL_GOT_4G
:
2008 /* In ILP32, the mode of dest can be either SImode or DImode,
2009 while the got entry is always of SImode size. The mode of
2010 dest depends on how dest is used: if dest is assigned to a
2011 pointer (e.g. in the memory), it has SImode; it may have
2012 DImode if dest is dereferenced to access the memeory.
2013 This is why we have to handle three different ldr_got_small
2014 patterns here (two patterns for ILP32). */
2019 machine_mode mode
= GET_MODE (dest
);
2021 if (can_create_pseudo_p ())
2022 tmp_reg
= gen_reg_rtx (mode
);
2024 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2025 if (mode
== ptr_mode
)
2028 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
2030 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
2032 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2036 gcc_assert (mode
== Pmode
);
2038 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
2039 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2042 gcc_assert (GET_CODE (mem
) == MEM
);
2043 MEM_READONLY_P (mem
) = 1;
2044 MEM_NOTRAP_P (mem
) = 1;
2049 case SYMBOL_SMALL_TLSGD
:
2052 machine_mode mode
= GET_MODE (dest
);
2053 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
2057 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
2059 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
2060 insns
= get_insns ();
2063 RTL_CONST_CALL_P (insns
) = 1;
2064 emit_libcall_block (insns
, dest
, result
, imm
);
2068 case SYMBOL_SMALL_TLSDESC
:
2070 machine_mode mode
= GET_MODE (dest
);
2071 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
2074 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2076 /* In ILP32, the got entry is always of SImode size. Unlike
2077 small GOT, the dest is fixed at reg 0. */
2079 emit_insn (gen_tlsdesc_small_si (imm
));
2081 emit_insn (gen_tlsdesc_small_di (imm
));
2082 tp
= aarch64_load_tp (NULL
);
2085 tp
= gen_lowpart (mode
, tp
);
2087 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
2089 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2093 case SYMBOL_SMALL_TLSIE
:
2095 /* In ILP32, the mode of dest can be either SImode or DImode,
2096 while the got entry is always of SImode size. The mode of
2097 dest depends on how dest is used: if dest is assigned to a
2098 pointer (e.g. in the memory), it has SImode; it may have
2099 DImode if dest is dereferenced to access the memeory.
2100 This is why we have to handle three different tlsie_small
2101 patterns here (two patterns for ILP32). */
2102 machine_mode mode
= GET_MODE (dest
);
2103 rtx tmp_reg
= gen_reg_rtx (mode
);
2104 rtx tp
= aarch64_load_tp (NULL
);
2106 if (mode
== ptr_mode
)
2109 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
2112 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
2113 tp
= gen_lowpart (mode
, tp
);
2118 gcc_assert (mode
== Pmode
);
2119 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
2122 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
2124 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2128 case SYMBOL_TLSLE12
:
2129 case SYMBOL_TLSLE24
:
2130 case SYMBOL_TLSLE32
:
2131 case SYMBOL_TLSLE48
:
2133 machine_mode mode
= GET_MODE (dest
);
2134 rtx tp
= aarch64_load_tp (NULL
);
2137 tp
= gen_lowpart (mode
, tp
);
2141 case SYMBOL_TLSLE12
:
2142 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
2145 case SYMBOL_TLSLE24
:
2146 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
2149 case SYMBOL_TLSLE32
:
2150 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
2152 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2155 case SYMBOL_TLSLE48
:
2156 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
2158 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2166 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2170 case SYMBOL_TINY_GOT
:
2171 emit_insn (gen_ldr_got_tiny (dest
, imm
));
2174 case SYMBOL_TINY_TLSIE
:
2176 machine_mode mode
= GET_MODE (dest
);
2177 rtx tp
= aarch64_load_tp (NULL
);
2179 if (mode
== ptr_mode
)
2182 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
2185 tp
= gen_lowpart (mode
, tp
);
2186 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
2191 gcc_assert (mode
== Pmode
);
2192 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
2196 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2205 /* Emit a move from SRC to DEST. Assume that the move expanders can
2206 handle all moves if !can_create_pseudo_p (). The distinction is
2207 important because, unlike emit_move_insn, the move expanders know
2208 how to force Pmode objects into the constant pool even when the
2209 constant pool address is not itself legitimate. */
2211 aarch64_emit_move (rtx dest
, rtx src
)
2213 return (can_create_pseudo_p ()
2214 ? emit_move_insn (dest
, src
)
2215 : emit_move_insn_1 (dest
, src
));
2218 /* Apply UNOPTAB to OP and store the result in DEST. */
2221 aarch64_emit_unop (rtx dest
, optab unoptab
, rtx op
)
2223 rtx tmp
= expand_unop (GET_MODE (dest
), unoptab
, op
, dest
, 0);
2225 emit_move_insn (dest
, tmp
);
2228 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2231 aarch64_emit_binop (rtx dest
, optab binoptab
, rtx op0
, rtx op1
)
2233 rtx tmp
= expand_binop (GET_MODE (dest
), binoptab
, op0
, op1
, dest
, 0,
2236 emit_move_insn (dest
, tmp
);
2239 /* Split a 128-bit move operation into two 64-bit move operations,
2240 taking care to handle partial overlap of register to register
2241 copies. Special cases are needed when moving between GP regs and
2242 FP regs. SRC can be a register, constant or memory; DST a register
2243 or memory. If either operand is memory it must not have any side
2246 aarch64_split_128bit_move (rtx dst
, rtx src
)
2251 machine_mode mode
= GET_MODE (dst
);
2253 gcc_assert (mode
== TImode
|| mode
== TFmode
);
2254 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
2255 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
2257 if (REG_P (dst
) && REG_P (src
))
2259 int src_regno
= REGNO (src
);
2260 int dst_regno
= REGNO (dst
);
2262 /* Handle FP <-> GP regs. */
2263 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
2265 src_lo
= gen_lowpart (word_mode
, src
);
2266 src_hi
= gen_highpart (word_mode
, src
);
2268 emit_insn (gen_aarch64_movlow_di (mode
, dst
, src_lo
));
2269 emit_insn (gen_aarch64_movhigh_di (mode
, dst
, src_hi
));
2272 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
2274 dst_lo
= gen_lowpart (word_mode
, dst
);
2275 dst_hi
= gen_highpart (word_mode
, dst
);
2277 emit_insn (gen_aarch64_movdi_low (mode
, dst_lo
, src
));
2278 emit_insn (gen_aarch64_movdi_high (mode
, dst_hi
, src
));
2283 dst_lo
= gen_lowpart (word_mode
, dst
);
2284 dst_hi
= gen_highpart (word_mode
, dst
);
2285 src_lo
= gen_lowpart (word_mode
, src
);
2286 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
2288 /* At most one pairing may overlap. */
2289 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
2291 aarch64_emit_move (dst_hi
, src_hi
);
2292 aarch64_emit_move (dst_lo
, src_lo
);
2296 aarch64_emit_move (dst_lo
, src_lo
);
2297 aarch64_emit_move (dst_hi
, src_hi
);
2302 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
2304 return (! REG_P (src
)
2305 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
2308 /* Split a complex SIMD combine. */
2311 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
2313 machine_mode src_mode
= GET_MODE (src1
);
2314 machine_mode dst_mode
= GET_MODE (dst
);
2316 gcc_assert (VECTOR_MODE_P (dst_mode
));
2317 gcc_assert (register_operand (dst
, dst_mode
)
2318 && register_operand (src1
, src_mode
)
2319 && register_operand (src2
, src_mode
));
2321 emit_insn (gen_aarch64_simd_combine (src_mode
, dst
, src1
, src2
));
2325 /* Split a complex SIMD move. */
2328 aarch64_split_simd_move (rtx dst
, rtx src
)
2330 machine_mode src_mode
= GET_MODE (src
);
2331 machine_mode dst_mode
= GET_MODE (dst
);
2333 gcc_assert (VECTOR_MODE_P (dst_mode
));
2335 if (REG_P (dst
) && REG_P (src
))
2337 gcc_assert (VECTOR_MODE_P (src_mode
));
2338 emit_insn (gen_aarch64_split_simd_mov (src_mode
, dst
, src
));
2343 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
2344 machine_mode ymode
, rtx y
)
2346 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
2347 gcc_assert (r
!= NULL
);
2348 return rtx_equal_p (x
, r
);
2353 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
2355 if (can_create_pseudo_p ())
2356 return force_reg (mode
, value
);
2360 aarch64_emit_move (x
, value
);
2365 /* Return true if we can move VALUE into a register using a single
2366 CNT[BHWD] instruction. */
2369 aarch64_sve_cnt_immediate_p (poly_int64 value
)
2371 HOST_WIDE_INT factor
= value
.coeffs
[0];
2372 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2373 return (value
.coeffs
[1] == factor
2374 && IN_RANGE (factor
, 2, 16 * 16)
2375 && (factor
& 1) == 0
2376 && factor
<= 16 * (factor
& -factor
));
2379 /* Likewise for rtx X. */
2382 aarch64_sve_cnt_immediate_p (rtx x
)
2385 return poly_int_rtx_p (x
, &value
) && aarch64_sve_cnt_immediate_p (value
);
2388 /* Return the asm string for an instruction with a CNT-like vector size
2389 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2390 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2391 first part of the operands template (the part that comes before the
2392 vector size itself). FACTOR is the number of quadwords.
2393 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2394 If it is zero, we can use any element size. */
2397 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2398 unsigned int factor
,
2399 unsigned int nelts_per_vq
)
2401 static char buffer
[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2403 if (nelts_per_vq
== 0)
2404 /* There is some overlap in the ranges of the four CNT instructions.
2405 Here we always use the smallest possible element size, so that the
2406 multiplier is 1 whereever possible. */
2407 nelts_per_vq
= factor
& -factor
;
2408 int shift
= std::min (exact_log2 (nelts_per_vq
), 4);
2409 gcc_assert (IN_RANGE (shift
, 1, 4));
2410 char suffix
= "dwhb"[shift
- 1];
2413 unsigned int written
;
2415 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s",
2416 prefix
, suffix
, operands
);
2418 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, all, mul #%d",
2419 prefix
, suffix
, operands
, factor
);
2420 gcc_assert (written
< sizeof (buffer
));
2424 /* Return the asm string for an instruction with a CNT-like vector size
2425 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2426 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2427 first part of the operands template (the part that comes before the
2428 vector size itself). X is the value of the vector size operand,
2429 as a polynomial integer rtx. */
2432 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2435 poly_int64 value
= rtx_to_poly_int64 (x
);
2436 gcc_assert (aarch64_sve_cnt_immediate_p (value
));
2437 return aarch64_output_sve_cnt_immediate (prefix
, operands
,
2438 value
.coeffs
[1], 0);
2441 /* Return true if we can add VALUE to a register using a single ADDVL
2442 or ADDPL instruction. */
2445 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value
)
2447 HOST_WIDE_INT factor
= value
.coeffs
[0];
2448 if (factor
== 0 || value
.coeffs
[1] != factor
)
2450 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2451 and a value of 16 is one vector width. */
2452 return (((factor
& 15) == 0 && IN_RANGE (factor
, -32 * 16, 31 * 16))
2453 || ((factor
& 1) == 0 && IN_RANGE (factor
, -32 * 2, 31 * 2)));
2456 /* Likewise for rtx X. */
2459 aarch64_sve_addvl_addpl_immediate_p (rtx x
)
2462 return (poly_int_rtx_p (x
, &value
)
2463 && aarch64_sve_addvl_addpl_immediate_p (value
));
2466 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2467 and storing the result in operand 0. */
2470 aarch64_output_sve_addvl_addpl (rtx dest
, rtx base
, rtx offset
)
2472 static char buffer
[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2473 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
2474 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value
));
2476 /* Use INC or DEC if possible. */
2477 if (rtx_equal_p (dest
, base
) && GP_REGNUM_P (REGNO (dest
)))
2479 if (aarch64_sve_cnt_immediate_p (offset_value
))
2480 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2481 offset_value
.coeffs
[1], 0);
2482 if (aarch64_sve_cnt_immediate_p (-offset_value
))
2483 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2484 -offset_value
.coeffs
[1], 0);
2487 int factor
= offset_value
.coeffs
[1];
2488 if ((factor
& 15) == 0)
2489 snprintf (buffer
, sizeof (buffer
), "addvl\t%%x0, %%x1, #%d", factor
/ 16);
2491 snprintf (buffer
, sizeof (buffer
), "addpl\t%%x0, %%x1, #%d", factor
/ 2);
2495 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2496 instruction. If it is, store the number of elements in each vector
2497 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2498 factor in *FACTOR_OUT (if nonnull). */
2501 aarch64_sve_inc_dec_immediate_p (rtx x
, int *factor_out
,
2502 unsigned int *nelts_per_vq_out
)
2507 if (!const_vec_duplicate_p (x
, &elt
)
2508 || !poly_int_rtx_p (elt
, &value
))
2511 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x
));
2512 if (nelts_per_vq
!= 8 && nelts_per_vq
!= 4 && nelts_per_vq
!= 2)
2513 /* There's no vector INCB. */
2516 HOST_WIDE_INT factor
= value
.coeffs
[0];
2517 if (value
.coeffs
[1] != factor
)
2520 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2521 if ((factor
% nelts_per_vq
) != 0
2522 || !IN_RANGE (abs (factor
), nelts_per_vq
, 16 * nelts_per_vq
))
2526 *factor_out
= factor
;
2527 if (nelts_per_vq_out
)
2528 *nelts_per_vq_out
= nelts_per_vq
;
2532 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2536 aarch64_sve_inc_dec_immediate_p (rtx x
)
2538 return aarch64_sve_inc_dec_immediate_p (x
, NULL
, NULL
);
2541 /* Return the asm template for an SVE vector INC or DEC instruction.
2542 OPERANDS gives the operands before the vector count and X is the
2543 value of the vector count operand itself. */
2546 aarch64_output_sve_inc_dec_immediate (const char *operands
, rtx x
)
2549 unsigned int nelts_per_vq
;
2550 if (!aarch64_sve_inc_dec_immediate_p (x
, &factor
, &nelts_per_vq
))
2553 return aarch64_output_sve_cnt_immediate ("dec", operands
, -factor
,
2556 return aarch64_output_sve_cnt_immediate ("inc", operands
, factor
,
2561 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
2562 scalar_int_mode mode
)
2565 unsigned HOST_WIDE_INT val
, val2
, mask
;
2566 int one_match
, zero_match
;
2571 if (aarch64_move_imm (val
, mode
))
2574 emit_insn (gen_rtx_SET (dest
, imm
));
2578 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2579 (with XXXX non-zero). In that case check to see if the move can be done in
2581 val2
= val
& 0xffffffff;
2583 && aarch64_move_imm (val2
, SImode
)
2584 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
2587 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
2589 /* Check if we have to emit a second instruction by checking to see
2590 if any of the upper 32 bits of the original DI mode value is set. */
2594 i
= (val
>> 48) ? 48 : 32;
2597 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2598 GEN_INT ((val
>> i
) & 0xffff)));
2603 if ((val
>> 32) == 0 || mode
== SImode
)
2607 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
2609 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
2610 GEN_INT ((val
>> 16) & 0xffff)));
2612 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
2613 GEN_INT ((val
>> 16) & 0xffff)));
2618 /* Remaining cases are all for DImode. */
2621 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
2622 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
2623 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
2624 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
2626 if (zero_match
!= 2 && one_match
!= 2)
2628 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2629 For a 64-bit bitmask try whether changing 16 bits to all ones or
2630 zeroes creates a valid bitmask. To check any repeated bitmask,
2631 try using 16 bits from the other 32-bit half of val. */
2633 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
2636 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2639 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2641 val2
= val2
& ~mask
;
2642 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
2643 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2650 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
2651 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2652 GEN_INT ((val
>> i
) & 0xffff)));
2658 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2659 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2660 otherwise skip zero bits. */
2664 val2
= one_match
> zero_match
? ~val
: val
;
2665 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
2668 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
2669 ? (val
| ~(mask
<< i
))
2670 : (val
& (mask
<< i
)))));
2671 for (i
+= 16; i
< 64; i
+= 16)
2673 if ((val2
& (mask
<< i
)) == 0)
2676 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2677 GEN_INT ((val
>> i
) & 0xffff)));
2684 /* Return whether imm is a 128-bit immediate which is simple enough to
2687 aarch64_mov128_immediate (rtx imm
)
2689 if (GET_CODE (imm
) == CONST_INT
)
2692 gcc_assert (CONST_WIDE_INT_NUNITS (imm
) == 2);
2694 rtx lo
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 0));
2695 rtx hi
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 1));
2697 return aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, DImode
)
2698 + aarch64_internal_mov_immediate (NULL_RTX
, hi
, false, DImode
) <= 4;
2702 /* Return the number of temporary registers that aarch64_add_offset_1
2703 would need to add OFFSET to a register. */
2706 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset
)
2708 return abs_hwi (offset
) < 0x1000000 ? 0 : 1;
2711 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2712 a non-polynomial OFFSET. MODE is the mode of the addition.
2713 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2714 be set and CFA adjustments added to the generated instructions.
2716 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2717 temporary if register allocation is already complete. This temporary
2718 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2719 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2720 the immediate again.
2722 Since this function may be used to adjust the stack pointer, we must
2723 ensure that it cannot cause transient stack deallocation (for example
2724 by first incrementing SP and then decrementing when adjusting by a
2725 large immediate). */
2728 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
2729 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
2730 bool frame_related_p
, bool emit_move_imm
)
2732 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
2733 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
2735 HOST_WIDE_INT moffset
= abs_hwi (offset
);
2740 if (!rtx_equal_p (dest
, src
))
2742 insn
= emit_insn (gen_rtx_SET (dest
, src
));
2743 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2748 /* Single instruction adjustment. */
2749 if (aarch64_uimm12_shift (moffset
))
2751 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
2752 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2756 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2759 a) the offset cannot be loaded by a 16-bit move or
2760 b) there is no spare register into which we can move it. */
2761 if (moffset
< 0x1000000
2762 && ((!temp1
&& !can_create_pseudo_p ())
2763 || !aarch64_move_imm (moffset
, mode
)))
2765 HOST_WIDE_INT low_off
= moffset
& 0xfff;
2767 low_off
= offset
< 0 ? -low_off
: low_off
;
2768 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
2769 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2770 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
2771 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2775 /* Emit a move immediate if required and an addition/subtraction. */
2778 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
2779 temp1
= aarch64_force_temporary (mode
, temp1
, GEN_INT (moffset
));
2781 insn
= emit_insn (offset
< 0
2782 ? gen_sub3_insn (dest
, src
, temp1
)
2783 : gen_add3_insn (dest
, src
, temp1
));
2784 if (frame_related_p
)
2786 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2787 rtx adj
= plus_constant (mode
, src
, offset
);
2788 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
2792 /* Return the number of temporary registers that aarch64_add_offset
2793 would need to move OFFSET into a register or add OFFSET to a register;
2794 ADD_P is true if we want the latter rather than the former. */
2797 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
2799 /* This follows the same structure as aarch64_add_offset. */
2800 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
2803 unsigned int count
= 0;
2804 HOST_WIDE_INT factor
= offset
.coeffs
[1];
2805 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
2806 poly_int64
poly_offset (factor
, factor
);
2807 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
2808 /* Need one register for the ADDVL/ADDPL result. */
2810 else if (factor
!= 0)
2812 factor
= abs (factor
);
2813 if (factor
> 16 * (factor
& -factor
))
2814 /* Need one register for the CNT result and one for the multiplication
2815 factor. If necessary, the second temporary can be reused for the
2816 constant part of the offset. */
2818 /* Need one register for the CNT result (which might then
2822 return count
+ aarch64_add_offset_1_temporaries (constant
);
2825 /* If X can be represented as a poly_int64, return the number
2826 of temporaries that are required to add it to a register.
2827 Return -1 otherwise. */
2830 aarch64_add_offset_temporaries (rtx x
)
2833 if (!poly_int_rtx_p (x
, &offset
))
2835 return aarch64_offset_temporaries (true, offset
);
2838 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2839 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2840 be set and CFA adjustments added to the generated instructions.
2842 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2843 temporary if register allocation is already complete. This temporary
2844 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2845 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2846 false to avoid emitting the immediate again.
2848 TEMP2, if nonnull, is a second temporary register that doesn't
2849 overlap either DEST or REG.
2851 Since this function may be used to adjust the stack pointer, we must
2852 ensure that it cannot cause transient stack deallocation (for example
2853 by first incrementing SP and then decrementing when adjusting by a
2854 large immediate). */
2857 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
2858 poly_int64 offset
, rtx temp1
, rtx temp2
,
2859 bool frame_related_p
, bool emit_move_imm
= true)
2861 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
2862 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
2863 gcc_assert (temp1
== NULL_RTX
2865 || !reg_overlap_mentioned_p (temp1
, dest
));
2866 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
2868 /* Try using ADDVL or ADDPL to add the whole value. */
2869 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
2871 rtx offset_rtx
= gen_int_mode (offset
, mode
);
2872 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2873 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2877 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2878 SVE vector register, over and above the minimum size of 128 bits.
2879 This is equivalent to half the value returned by CNTD with a
2880 vector shape of ALL. */
2881 HOST_WIDE_INT factor
= offset
.coeffs
[1];
2882 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
2884 /* Try using ADDVL or ADDPL to add the VG-based part. */
2885 poly_int64
poly_offset (factor
, factor
);
2886 if (src
!= const0_rtx
2887 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
2889 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
2890 if (frame_related_p
)
2892 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2893 RTX_FRAME_RELATED_P (insn
) = true;
2898 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
2899 src
= aarch64_force_temporary (mode
, temp1
, addr
);
2904 /* Otherwise use a CNT-based sequence. */
2905 else if (factor
!= 0)
2907 /* Use a subtraction if we have a negative factor. */
2908 rtx_code code
= PLUS
;
2915 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2916 into the multiplication. */
2920 /* Use a right shift by 1. */
2924 HOST_WIDE_INT low_bit
= factor
& -factor
;
2925 if (factor
<= 16 * low_bit
)
2927 if (factor
> 16 * 8)
2929 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2930 the value with the minimum multiplier and shift it into
2932 int extra_shift
= exact_log2 (low_bit
);
2933 shift
+= extra_shift
;
2934 factor
>>= extra_shift
;
2936 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
2940 /* Use CNTD, then multiply it by FACTOR. */
2941 val
= gen_int_mode (poly_int64 (2, 2), mode
);
2942 val
= aarch64_force_temporary (mode
, temp1
, val
);
2944 /* Go back to using a negative multiplication factor if we have
2945 no register from which to subtract. */
2946 if (code
== MINUS
&& src
== const0_rtx
)
2951 rtx coeff1
= gen_int_mode (factor
, mode
);
2952 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
2953 val
= gen_rtx_MULT (mode
, val
, coeff1
);
2958 /* Multiply by 1 << SHIFT. */
2959 val
= aarch64_force_temporary (mode
, temp1
, val
);
2960 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
2962 else if (shift
== -1)
2965 val
= aarch64_force_temporary (mode
, temp1
, val
);
2966 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
2969 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2970 if (src
!= const0_rtx
)
2972 val
= aarch64_force_temporary (mode
, temp1
, val
);
2973 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
2975 else if (code
== MINUS
)
2977 val
= aarch64_force_temporary (mode
, temp1
, val
);
2978 val
= gen_rtx_NEG (mode
, val
);
2981 if (constant
== 0 || frame_related_p
)
2983 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
2984 if (frame_related_p
)
2986 RTX_FRAME_RELATED_P (insn
) = true;
2987 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
2988 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
2997 src
= aarch64_force_temporary (mode
, temp1
, val
);
3002 emit_move_imm
= true;
3005 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
3006 frame_related_p
, emit_move_imm
);
3009 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3010 than a poly_int64. */
3013 aarch64_split_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
3014 rtx offset_rtx
, rtx temp1
, rtx temp2
)
3016 aarch64_add_offset (mode
, dest
, src
, rtx_to_poly_int64 (offset_rtx
),
3017 temp1
, temp2
, false);
3020 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3021 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3022 if TEMP1 already contains abs (DELTA). */
3025 aarch64_add_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool emit_move_imm
)
3027 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, delta
,
3028 temp1
, temp2
, true, emit_move_imm
);
3031 /* Subtract DELTA from the stack pointer, marking the instructions
3032 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3036 aarch64_sub_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool frame_related_p
,
3037 bool emit_move_imm
= true)
3039 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, -delta
,
3040 temp1
, temp2
, frame_related_p
, emit_move_imm
);
3043 /* Set DEST to (vec_series BASE STEP). */
3046 aarch64_expand_vec_series (rtx dest
, rtx base
, rtx step
)
3048 machine_mode mode
= GET_MODE (dest
);
3049 scalar_mode inner
= GET_MODE_INNER (mode
);
3051 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3052 if (!aarch64_sve_index_immediate_p (base
))
3053 base
= force_reg (inner
, base
);
3054 if (!aarch64_sve_index_immediate_p (step
))
3055 step
= force_reg (inner
, step
);
3057 emit_set_insn (dest
, gen_rtx_VEC_SERIES (mode
, base
, step
));
3060 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3061 integer of mode INT_MODE. Return true on success. */
3064 aarch64_expand_sve_widened_duplicate (rtx dest
, scalar_int_mode src_mode
,
3067 /* If the constant is smaller than 128 bits, we can do the move
3068 using a vector of SRC_MODEs. */
3069 if (src_mode
!= TImode
)
3071 poly_uint64 count
= exact_div (GET_MODE_SIZE (GET_MODE (dest
)),
3072 GET_MODE_SIZE (src_mode
));
3073 machine_mode dup_mode
= mode_for_vector (src_mode
, count
).require ();
3074 emit_move_insn (gen_lowpart (dup_mode
, dest
),
3075 gen_const_vec_duplicate (dup_mode
, src
));
3079 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3080 src
= force_const_mem (src_mode
, src
);
3084 /* Make sure that the address is legitimate. */
3085 if (!aarch64_sve_ld1r_operand_p (src
))
3087 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
3088 src
= replace_equiv_address (src
, addr
);
3091 machine_mode mode
= GET_MODE (dest
);
3092 unsigned int elem_bytes
= GET_MODE_UNIT_SIZE (mode
);
3093 machine_mode pred_mode
= aarch64_sve_pred_mode (elem_bytes
).require ();
3094 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
3095 src
= gen_rtx_UNSPEC (mode
, gen_rtvec (2, ptrue
, src
), UNSPEC_LD1RQ
);
3096 emit_insn (gen_rtx_SET (dest
, src
));
3100 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3101 isn't a simple duplicate or series. */
3104 aarch64_expand_sve_const_vector (rtx dest
, rtx src
)
3106 machine_mode mode
= GET_MODE (src
);
3107 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
3108 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
3109 gcc_assert (npatterns
> 1);
3111 if (nelts_per_pattern
== 1)
3113 /* The constant is a repeating seqeuence of at least two elements,
3114 where the repeating elements occupy no more than 128 bits.
3115 Get an integer representation of the replicated value. */
3116 scalar_int_mode int_mode
;
3117 if (BYTES_BIG_ENDIAN
)
3118 /* For now, always use LD1RQ to load the value on big-endian
3119 targets, since the handling of smaller integers includes a
3120 subreg that is semantically an element reverse. */
3124 unsigned int int_bits
= GET_MODE_UNIT_BITSIZE (mode
) * npatterns
;
3125 gcc_assert (int_bits
<= 128);
3126 int_mode
= int_mode_for_size (int_bits
, 0).require ();
3128 rtx int_value
= simplify_gen_subreg (int_mode
, src
, mode
, 0);
3130 && aarch64_expand_sve_widened_duplicate (dest
, int_mode
, int_value
))
3134 /* Expand each pattern individually. */
3135 rtx_vector_builder builder
;
3136 auto_vec
<rtx
, 16> vectors (npatterns
);
3137 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3139 builder
.new_vector (mode
, 1, nelts_per_pattern
);
3140 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
3141 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
3142 vectors
.quick_push (force_reg (mode
, builder
.build ()));
3145 /* Use permutes to interleave the separate vectors. */
3146 while (npatterns
> 1)
3149 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3151 rtx tmp
= (npatterns
== 1 ? dest
: gen_reg_rtx (mode
));
3152 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
3153 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
3157 gcc_assert (vectors
[0] == dest
);
3160 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3161 is a pattern that can be used to set DEST to a replicated scalar
3165 aarch64_expand_mov_immediate (rtx dest
, rtx imm
,
3166 rtx (*gen_vec_duplicate
) (rtx
, rtx
))
3168 machine_mode mode
= GET_MODE (dest
);
3170 /* Check on what type of symbol it is. */
3171 scalar_int_mode int_mode
;
3172 if ((GET_CODE (imm
) == SYMBOL_REF
3173 || GET_CODE (imm
) == LABEL_REF
3174 || GET_CODE (imm
) == CONST
3175 || GET_CODE (imm
) == CONST_POLY_INT
)
3176 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
3180 HOST_WIDE_INT const_offset
;
3181 enum aarch64_symbol_type sty
;
3183 /* If we have (const (plus symbol offset)), separate out the offset
3184 before we start classifying the symbol. */
3185 rtx base
= strip_offset (imm
, &offset
);
3187 /* We must always add an offset involving VL separately, rather than
3188 folding it into the relocation. */
3189 if (!offset
.is_constant (&const_offset
))
3191 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
3192 emit_insn (gen_rtx_SET (dest
, imm
));
3195 /* Do arithmetic on 32-bit values if the result is smaller
3197 if (partial_subreg_p (int_mode
, SImode
))
3199 /* It is invalid to do symbol calculations in modes
3200 narrower than SImode. */
3201 gcc_assert (base
== const0_rtx
);
3202 dest
= gen_lowpart (SImode
, dest
);
3205 if (base
!= const0_rtx
)
3207 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3208 aarch64_add_offset (int_mode
, dest
, base
, offset
,
3209 NULL_RTX
, NULL_RTX
, false);
3212 aarch64_add_offset (int_mode
, dest
, base
, offset
,
3213 dest
, NULL_RTX
, false);
3218 sty
= aarch64_classify_symbol (base
, const_offset
);
3221 case SYMBOL_FORCE_TO_MEM
:
3222 if (const_offset
!= 0
3223 && targetm
.cannot_force_const_mem (int_mode
, imm
))
3225 gcc_assert (can_create_pseudo_p ());
3226 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3227 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
3228 NULL_RTX
, NULL_RTX
, false);
3232 mem
= force_const_mem (ptr_mode
, imm
);
3235 /* If we aren't generating PC relative literals, then
3236 we need to expand the literal pool access carefully.
3237 This is something that needs to be done in a number
3238 of places, so could well live as a separate function. */
3239 if (!aarch64_pcrelative_literal_loads
)
3241 gcc_assert (can_create_pseudo_p ());
3242 base
= gen_reg_rtx (ptr_mode
);
3243 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
3244 if (ptr_mode
!= Pmode
)
3245 base
= convert_memory_address (Pmode
, base
);
3246 mem
= gen_rtx_MEM (ptr_mode
, base
);
3249 if (int_mode
!= ptr_mode
)
3250 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
3252 emit_insn (gen_rtx_SET (dest
, mem
));
3256 case SYMBOL_SMALL_TLSGD
:
3257 case SYMBOL_SMALL_TLSDESC
:
3258 case SYMBOL_SMALL_TLSIE
:
3259 case SYMBOL_SMALL_GOT_28K
:
3260 case SYMBOL_SMALL_GOT_4G
:
3261 case SYMBOL_TINY_GOT
:
3262 case SYMBOL_TINY_TLSIE
:
3263 if (const_offset
!= 0)
3265 gcc_assert(can_create_pseudo_p ());
3266 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3267 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
3268 NULL_RTX
, NULL_RTX
, false);
3273 case SYMBOL_SMALL_ABSOLUTE
:
3274 case SYMBOL_TINY_ABSOLUTE
:
3275 case SYMBOL_TLSLE12
:
3276 case SYMBOL_TLSLE24
:
3277 case SYMBOL_TLSLE32
:
3278 case SYMBOL_TLSLE48
:
3279 aarch64_load_symref_appropriately (dest
, imm
, sty
);
3287 if (!CONST_INT_P (imm
))
3289 rtx base
, step
, value
;
3290 if (GET_CODE (imm
) == HIGH
3291 || aarch64_simd_valid_immediate (imm
, NULL
))
3292 emit_insn (gen_rtx_SET (dest
, imm
));
3293 else if (const_vec_series_p (imm
, &base
, &step
))
3294 aarch64_expand_vec_series (dest
, base
, step
);
3295 else if (const_vec_duplicate_p (imm
, &value
))
3297 /* If the constant is out of range of an SVE vector move,
3298 load it from memory if we can, otherwise move it into
3299 a register and use a DUP. */
3300 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
3301 rtx op
= force_const_mem (inner_mode
, value
);
3303 op
= force_reg (inner_mode
, value
);
3304 else if (!aarch64_sve_ld1r_operand_p (op
))
3306 rtx addr
= force_reg (Pmode
, XEXP (op
, 0));
3307 op
= replace_equiv_address (op
, addr
);
3309 emit_insn (gen_vec_duplicate (dest
, op
));
3311 else if (GET_CODE (imm
) == CONST_VECTOR
3312 && !GET_MODE_NUNITS (GET_MODE (imm
)).is_constant ())
3313 aarch64_expand_sve_const_vector (dest
, imm
);
3316 rtx mem
= force_const_mem (mode
, imm
);
3318 emit_move_insn (dest
, mem
);
3324 aarch64_internal_mov_immediate (dest
, imm
, true,
3325 as_a
<scalar_int_mode
> (mode
));
3328 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3329 that is known to contain PTRUE. */
3332 aarch64_emit_sve_pred_move (rtx dest
, rtx pred
, rtx src
)
3334 emit_insn (gen_rtx_SET (dest
, gen_rtx_UNSPEC (GET_MODE (dest
),
3335 gen_rtvec (2, pred
, src
),
3336 UNSPEC_MERGE_PTRUE
)));
3339 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3340 operand is in memory. In this case we need to use the predicated LD1
3341 and ST1 instead of LDR and STR, both for correctness on big-endian
3342 targets and because LD1 and ST1 support a wider range of addressing modes.
3343 PRED_MODE is the mode of the predicate.
3345 See the comment at the head of aarch64-sve.md for details about the
3346 big-endian handling. */
3349 aarch64_expand_sve_mem_move (rtx dest
, rtx src
, machine_mode pred_mode
)
3351 machine_mode mode
= GET_MODE (dest
);
3352 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
3353 if (!register_operand (src
, mode
)
3354 && !register_operand (dest
, mode
))
3356 rtx tmp
= gen_reg_rtx (mode
);
3358 aarch64_emit_sve_pred_move (tmp
, ptrue
, src
);
3360 emit_move_insn (tmp
, src
);
3363 aarch64_emit_sve_pred_move (dest
, ptrue
, src
);
3366 /* Called only on big-endian targets. See whether an SVE vector move
3367 from SRC to DEST is effectively a REV[BHW] instruction, because at
3368 least one operand is a subreg of an SVE vector that has wider or
3369 narrower elements. Return true and emit the instruction if so.
3373 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3375 represents a VIEW_CONVERT between the following vectors, viewed
3378 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3379 R1: { [0], [1], [2], [3], ... }
3381 The high part of lane X in R2 should therefore correspond to lane X*2
3382 of R1, but the register representations are:
3385 R2: ...... [1].high [1].low [0].high [0].low
3386 R1: ...... [3] [2] [1] [0]
3388 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3389 We therefore need a reverse operation to swap the high and low values
3392 This is purely an optimization. Without it we would spill the
3393 subreg operand to the stack in one mode and reload it in the
3394 other mode, which has the same effect as the REV. */
3397 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
3399 gcc_assert (BYTES_BIG_ENDIAN
);
3400 if (GET_CODE (dest
) == SUBREG
)
3401 dest
= SUBREG_REG (dest
);
3402 if (GET_CODE (src
) == SUBREG
)
3403 src
= SUBREG_REG (src
);
3405 /* The optimization handles two single SVE REGs with different element
3409 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
3410 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
3411 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
3412 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
3415 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3416 rtx ptrue
= force_reg (VNx16BImode
, CONSTM1_RTX (VNx16BImode
));
3417 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
3419 emit_insn (gen_rtx_SET (dest
, unspec
));
3423 /* Return a copy of X with mode MODE, without changing its other
3424 attributes. Unlike gen_lowpart, this doesn't care whether the
3425 mode change is valid. */
3428 aarch64_replace_reg_mode (rtx x
, machine_mode mode
)
3430 if (GET_MODE (x
) == mode
)
3433 x
= shallow_copy_rtx (x
);
3434 set_mode_and_regno (x
, mode
, REGNO (x
));
3438 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3442 aarch64_split_sve_subreg_move (rtx dest
, rtx ptrue
, rtx src
)
3444 /* Decide which REV operation we need. The mode with narrower elements
3445 determines the mode of the operands and the mode with the wider
3446 elements determines the reverse width. */
3447 machine_mode mode_with_wider_elts
= GET_MODE (dest
);
3448 machine_mode mode_with_narrower_elts
= GET_MODE (src
);
3449 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts
)
3450 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts
))
3451 std::swap (mode_with_wider_elts
, mode_with_narrower_elts
);
3453 unsigned int wider_bytes
= GET_MODE_UNIT_SIZE (mode_with_wider_elts
);
3454 unsigned int unspec
;
3455 if (wider_bytes
== 8)
3456 unspec
= UNSPEC_REV64
;
3457 else if (wider_bytes
== 4)
3458 unspec
= UNSPEC_REV32
;
3459 else if (wider_bytes
== 2)
3460 unspec
= UNSPEC_REV16
;
3463 machine_mode pred_mode
= aarch64_sve_pred_mode (wider_bytes
).require ();
3467 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3468 UNSPEC_MERGE_PTRUE))
3470 with the appropriate modes. */
3471 ptrue
= gen_lowpart (pred_mode
, ptrue
);
3472 dest
= aarch64_replace_reg_mode (dest
, mode_with_narrower_elts
);
3473 src
= aarch64_replace_reg_mode (src
, mode_with_narrower_elts
);
3474 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (1, src
), unspec
);
3475 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (2, ptrue
, src
),
3476 UNSPEC_MERGE_PTRUE
);
3477 emit_insn (gen_rtx_SET (dest
, src
));
3481 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
3482 tree exp ATTRIBUTE_UNUSED
)
3484 if (aarch64_simd_decl_p (cfun
->decl
) != aarch64_simd_decl_p (decl
))
3490 /* Implement TARGET_PASS_BY_REFERENCE. */
3493 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
3496 bool named ATTRIBUTE_UNUSED
)
3499 machine_mode dummymode
;
3502 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3503 if (mode
== BLKmode
&& type
)
3504 size
= int_size_in_bytes (type
);
3506 /* No frontends can create types with variable-sized modes, so we
3507 shouldn't be asked to pass or return them. */
3508 size
= GET_MODE_SIZE (mode
).to_constant ();
3510 /* Aggregates are passed by reference based on their size. */
3511 if (type
&& AGGREGATE_TYPE_P (type
))
3513 size
= int_size_in_bytes (type
);
3516 /* Variable sized arguments are always returned by reference. */
3520 /* Can this be a candidate to be passed in fp/simd register(s)? */
3521 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
3526 /* Arguments which are variable sized or larger than 2 registers are
3527 passed by reference unless they are a homogenous floating point
3529 return size
> 2 * UNITS_PER_WORD
;
3532 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3534 aarch64_return_in_msb (const_tree valtype
)
3536 machine_mode dummy_mode
;
3539 /* Never happens in little-endian mode. */
3540 if (!BYTES_BIG_ENDIAN
)
3543 /* Only composite types smaller than or equal to 16 bytes can
3544 be potentially returned in registers. */
3545 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
3546 || int_size_in_bytes (valtype
) <= 0
3547 || int_size_in_bytes (valtype
) > 16)
3550 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3551 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3552 is always passed/returned in the least significant bits of fp/simd
3554 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
3555 &dummy_mode
, &dummy_int
, NULL
))
3561 /* Implement TARGET_FUNCTION_VALUE.
3562 Define how to find the value returned by a function. */
3565 aarch64_function_value (const_tree type
, const_tree func
,
3566 bool outgoing ATTRIBUTE_UNUSED
)
3571 machine_mode ag_mode
;
3573 mode
= TYPE_MODE (type
);
3574 if (INTEGRAL_TYPE_P (type
))
3575 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
3577 if (aarch64_return_in_msb (type
))
3579 HOST_WIDE_INT size
= int_size_in_bytes (type
);
3581 if (size
% UNITS_PER_WORD
!= 0)
3583 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
3584 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
3588 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
3589 &ag_mode
, &count
, NULL
))
3591 if (!aarch64_composite_type_p (type
, mode
))
3593 gcc_assert (count
== 1 && mode
== ag_mode
);
3594 return gen_rtx_REG (mode
, V0_REGNUM
);
3601 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
3602 for (i
= 0; i
< count
; i
++)
3604 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
3605 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
3606 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3607 XVECEXP (par
, 0, i
) = tmp
;
3613 return gen_rtx_REG (mode
, R0_REGNUM
);
3616 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3617 Return true if REGNO is the number of a hard register in which the values
3618 of called function may come back. */
3621 aarch64_function_value_regno_p (const unsigned int regno
)
3623 /* Maximum of 16 bytes can be returned in the general registers. Examples
3624 of 16-byte return values are: 128-bit integers and 16-byte small
3625 structures (excluding homogeneous floating-point aggregates). */
3626 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
3629 /* Up to four fp/simd registers can return a function value, e.g. a
3630 homogeneous floating-point aggregate having four members. */
3631 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
3632 return TARGET_FLOAT
;
3637 /* Implement TARGET_RETURN_IN_MEMORY.
3639 If the type T of the result of a function is such that
3641 would require that arg be passed as a value in a register (or set of
3642 registers) according to the parameter passing rules, then the result
3643 is returned in the same registers as would be used for such an
3647 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
3650 machine_mode ag_mode
;
3653 if (!AGGREGATE_TYPE_P (type
)
3654 && TREE_CODE (type
) != COMPLEX_TYPE
3655 && TREE_CODE (type
) != VECTOR_TYPE
)
3656 /* Simple scalar types always returned in registers. */
3659 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
3666 /* Types larger than 2 registers returned in memory. */
3667 size
= int_size_in_bytes (type
);
3668 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
3672 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
3673 const_tree type
, int *nregs
)
3675 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3676 return aarch64_vfp_is_call_or_return_candidate (mode
,
3678 &pcum
->aapcs_vfp_rmode
,
3683 /* Given MODE and TYPE of a function argument, return the alignment in
3684 bits. The idea is to suppress any stronger alignment requested by
3685 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3686 This is a helper function for local use only. */
3689 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
)
3692 return GET_MODE_ALIGNMENT (mode
);
3694 if (integer_zerop (TYPE_SIZE (type
)))
3697 gcc_assert (TYPE_MODE (type
) == mode
);
3699 if (!AGGREGATE_TYPE_P (type
))
3700 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
3702 if (TREE_CODE (type
) == ARRAY_TYPE
)
3703 return TYPE_ALIGN (TREE_TYPE (type
));
3705 unsigned int alignment
= 0;
3706 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
3707 if (TREE_CODE (field
) == FIELD_DECL
)
3708 alignment
= std::max (alignment
, DECL_ALIGN (field
));
3713 /* Layout a function argument according to the AAPCS64 rules. The rule
3714 numbers refer to the rule numbers in the AAPCS64. */
3717 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
3719 bool named ATTRIBUTE_UNUSED
)
3721 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3722 int ncrn
, nvrn
, nregs
;
3723 bool allocate_ncrn
, allocate_nvrn
;
3726 /* We need to do this once per argument. */
3727 if (pcum
->aapcs_arg_processed
)
3730 pcum
->aapcs_arg_processed
= true;
3732 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3734 size
= int_size_in_bytes (type
);
3736 /* No frontends can create types with variable-sized modes, so we
3737 shouldn't be asked to pass or return them. */
3738 size
= GET_MODE_SIZE (mode
).to_constant ();
3739 size
= ROUND_UP (size
, UNITS_PER_WORD
);
3741 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
3742 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
3747 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3748 The following code thus handles passing by SIMD/FP registers first. */
3750 nvrn
= pcum
->aapcs_nvrn
;
3752 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
3753 and homogenous short-vector aggregates (HVA). */
3757 aarch64_err_no_fpadvsimd (mode
);
3759 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
3761 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
3762 if (!aarch64_composite_type_p (type
, mode
))
3764 gcc_assert (nregs
== 1);
3765 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
3771 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3772 for (i
= 0; i
< nregs
; i
++)
3774 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
3775 V0_REGNUM
+ nvrn
+ i
);
3776 rtx offset
= gen_int_mode
3777 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
3778 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3779 XVECEXP (par
, 0, i
) = tmp
;
3781 pcum
->aapcs_reg
= par
;
3787 /* C.3 NSRN is set to 8. */
3788 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
3793 ncrn
= pcum
->aapcs_ncrn
;
3794 nregs
= size
/ UNITS_PER_WORD
;
3796 /* C6 - C9. though the sign and zero extension semantics are
3797 handled elsewhere. This is the case where the argument fits
3798 entirely general registers. */
3799 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
3802 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
3804 /* C.8 if the argument has an alignment of 16 then the NGRN is
3805 rounded up to the next even number. */
3808 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3809 comparison is there because for > 16 * BITS_PER_UNIT
3810 alignment nregs should be > 2 and therefore it should be
3811 passed by reference rather than value. */
3812 && aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
3815 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
3818 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3819 A reg is still generated for it, but the caller should be smart
3820 enough not to use it. */
3821 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
3822 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
3828 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3829 for (i
= 0; i
< nregs
; i
++)
3831 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
3832 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
3833 GEN_INT (i
* UNITS_PER_WORD
));
3834 XVECEXP (par
, 0, i
) = tmp
;
3836 pcum
->aapcs_reg
= par
;
3839 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
3844 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
3846 /* The argument is passed on stack; record the needed number of words for
3847 this argument and align the total size if necessary. */
3849 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
3851 if (aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
3852 pcum
->aapcs_stack_size
= ROUND_UP (pcum
->aapcs_stack_size
,
3853 16 / UNITS_PER_WORD
);
3857 /* Implement TARGET_FUNCTION_ARG. */
3860 aarch64_function_arg (cumulative_args_t pcum_v
, machine_mode mode
,
3861 const_tree type
, bool named
)
3863 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3864 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
3866 if (mode
== VOIDmode
)
3869 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
3870 return pcum
->aapcs_reg
;
3874 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
3875 const_tree fntype ATTRIBUTE_UNUSED
,
3876 rtx libname ATTRIBUTE_UNUSED
,
3877 const_tree fndecl ATTRIBUTE_UNUSED
,
3878 unsigned n_named ATTRIBUTE_UNUSED
)
3880 pcum
->aapcs_ncrn
= 0;
3881 pcum
->aapcs_nvrn
= 0;
3882 pcum
->aapcs_nextncrn
= 0;
3883 pcum
->aapcs_nextnvrn
= 0;
3884 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
3885 pcum
->aapcs_reg
= NULL_RTX
;
3886 pcum
->aapcs_arg_processed
= false;
3887 pcum
->aapcs_stack_words
= 0;
3888 pcum
->aapcs_stack_size
= 0;
3891 && fndecl
&& TREE_PUBLIC (fndecl
)
3892 && fntype
&& fntype
!= error_mark_node
)
3894 const_tree type
= TREE_TYPE (fntype
);
3895 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
3896 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
3897 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
3898 &mode
, &nregs
, NULL
))
3899 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
3905 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
3910 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3911 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
3913 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
3914 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
3915 != (pcum
->aapcs_stack_words
!= 0));
3916 pcum
->aapcs_arg_processed
= false;
3917 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
3918 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
3919 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
3920 pcum
->aapcs_stack_words
= 0;
3921 pcum
->aapcs_reg
= NULL_RTX
;
3926 aarch64_function_arg_regno_p (unsigned regno
)
3928 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
3929 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
3932 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3933 PARM_BOUNDARY bits of alignment, but will be given anything up
3934 to STACK_BOUNDARY bits if the type requires it. This makes sure
3935 that both before and after the layout of each argument, the Next
3936 Stacked Argument Address (NSAA) will have a minimum alignment of
3940 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
3942 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
);
3943 return MIN (MAX (alignment
, PARM_BOUNDARY
), STACK_BOUNDARY
);
3946 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3948 static fixed_size_mode
3949 aarch64_get_reg_raw_mode (int regno
)
3951 if (TARGET_SVE
&& FP_REGNUM_P (regno
))
3952 /* Don't use the SVE part of the register for __builtin_apply and
3953 __builtin_return. The SVE registers aren't used by the normal PCS,
3954 so using them there would be a waste of time. The PCS extensions
3955 for SVE types are fundamentally incompatible with the
3956 __builtin_return/__builtin_apply interface. */
3957 return as_a
<fixed_size_mode
> (V16QImode
);
3958 return default_get_reg_raw_mode (regno
);
3961 /* Implement TARGET_FUNCTION_ARG_PADDING.
3963 Small aggregate types are placed in the lowest memory address.
3965 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3967 static pad_direction
3968 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
3970 /* On little-endian targets, the least significant byte of every stack
3971 argument is passed at the lowest byte address of the stack slot. */
3972 if (!BYTES_BIG_ENDIAN
)
3975 /* Otherwise, integral, floating-point and pointer types are padded downward:
3976 the least significant byte of a stack argument is passed at the highest
3977 byte address of the stack slot. */
3979 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
3980 || POINTER_TYPE_P (type
))
3981 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
3982 return PAD_DOWNWARD
;
3984 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3988 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3990 It specifies padding for the last (may also be the only)
3991 element of a block move between registers and memory. If
3992 assuming the block is in the memory, padding upward means that
3993 the last element is padded after its highest significant byte,
3994 while in downward padding, the last element is padded at the
3995 its least significant byte side.
3997 Small aggregates and small complex types are always padded
4000 We don't need to worry about homogeneous floating-point or
4001 short-vector aggregates; their move is not affected by the
4002 padding direction determined here. Regardless of endianness,
4003 each element of such an aggregate is put in the least
4004 significant bits of a fp/simd register.
4006 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4007 register has useful data, and return the opposite if the most
4008 significant byte does. */
4011 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
4012 bool first ATTRIBUTE_UNUSED
)
4015 /* Small composite types are always padded upward. */
4016 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
4020 size
= int_size_in_bytes (type
);
4022 /* No frontends can create types with variable-sized modes, so we
4023 shouldn't be asked to pass or return them. */
4024 size
= GET_MODE_SIZE (mode
).to_constant ();
4025 if (size
< 2 * UNITS_PER_WORD
)
4029 /* Otherwise, use the default padding. */
4030 return !BYTES_BIG_ENDIAN
;
4033 static scalar_int_mode
4034 aarch64_libgcc_cmp_return_mode (void)
4039 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4041 /* We use the 12-bit shifted immediate arithmetic instructions so values
4042 must be multiple of (1 << 12), i.e. 4096. */
4043 #define ARITH_FACTOR 4096
4045 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4046 #error Cannot use simple address calculation for stack probing
4049 /* The pair of scratch registers used for stack probing. */
4050 #define PROBE_STACK_FIRST_REG R9_REGNUM
4051 #define PROBE_STACK_SECOND_REG R10_REGNUM
4053 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4054 inclusive. These are offsets from the current stack pointer. */
4057 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
4060 if (!poly_size
.is_constant (&size
))
4062 sorry ("stack probes for SVE frames");
4066 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
4068 /* See the same assertion on PROBE_INTERVAL above. */
4069 gcc_assert ((first
% ARITH_FACTOR
) == 0);
4071 /* See if we have a constant small number of probes to generate. If so,
4072 that's the easy case. */
4073 if (size
<= PROBE_INTERVAL
)
4075 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
4077 emit_set_insn (reg1
,
4078 plus_constant (Pmode
,
4079 stack_pointer_rtx
, -(first
+ base
)));
4080 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
4083 /* The run-time loop is made up of 8 insns in the generic case while the
4084 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
4085 else if (size
<= 4 * PROBE_INTERVAL
)
4087 HOST_WIDE_INT i
, rem
;
4089 emit_set_insn (reg1
,
4090 plus_constant (Pmode
,
4092 -(first
+ PROBE_INTERVAL
)));
4093 emit_stack_probe (reg1
);
4095 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4096 it exceeds SIZE. If only two probes are needed, this will not
4097 generate any code. Then probe at FIRST + SIZE. */
4098 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
4100 emit_set_insn (reg1
,
4101 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
4102 emit_stack_probe (reg1
);
4105 rem
= size
- (i
- PROBE_INTERVAL
);
4108 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
4110 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
4111 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
4114 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
4117 /* Otherwise, do the same as above, but in a loop. Note that we must be
4118 extra careful with variables wrapping around because we might be at
4119 the very top (or the very bottom) of the address space and we have
4120 to be able to handle this case properly; in particular, we use an
4121 equality test for the loop condition. */
4124 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
4126 /* Step 1: round SIZE to the previous multiple of the interval. */
4128 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
4131 /* Step 2: compute initial and final value of the loop counter. */
4133 /* TEST_ADDR = SP + FIRST. */
4134 emit_set_insn (reg1
,
4135 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
4137 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4138 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
4139 if (! aarch64_uimm12_shift (adjustment
))
4141 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
4143 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
4146 emit_set_insn (reg2
,
4147 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
4153 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4156 while (TEST_ADDR != LAST_ADDR)
4158 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4159 until it is equal to ROUNDED_SIZE. */
4161 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
4164 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4165 that SIZE is equal to ROUNDED_SIZE. */
4167 if (size
!= rounded_size
)
4169 HOST_WIDE_INT rem
= size
- rounded_size
;
4173 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
4175 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
4176 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
4179 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
4183 /* Make sure nothing is scheduled before we are done. */
4184 emit_insn (gen_blockage ());
4187 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4188 absolute addresses. */
4191 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
4193 static int labelno
= 0;
4197 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
4200 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
4202 HOST_WIDE_INT stack_clash_probe_interval
4203 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
4205 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4207 HOST_WIDE_INT interval
;
4208 if (flag_stack_clash_protection
)
4209 interval
= stack_clash_probe_interval
;
4211 interval
= PROBE_INTERVAL
;
4213 gcc_assert (aarch64_uimm12_shift (interval
));
4214 xops
[1] = GEN_INT (interval
);
4216 output_asm_insn ("sub\t%0, %0, %1", xops
);
4218 /* If doing stack clash protection then we probe up by the ABI specified
4219 amount. We do this because we're dropping full pages at a time in the
4220 loop. But if we're doing non-stack clash probing, probe at SP 0. */
4221 if (flag_stack_clash_protection
)
4222 xops
[1] = GEN_INT (STACK_CLASH_CALLER_GUARD
);
4224 xops
[1] = CONST0_RTX (GET_MODE (xops
[1]));
4226 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4227 by this amount for each iteration. */
4228 output_asm_insn ("str\txzr, [%0, %1]", xops
);
4230 /* Test if TEST_ADDR == LAST_ADDR. */
4232 output_asm_insn ("cmp\t%0, %1", xops
);
4235 fputs ("\tb.ne\t", asm_out_file
);
4236 assemble_name_raw (asm_out_file
, loop_lab
);
4237 fputc ('\n', asm_out_file
);
4242 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4243 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4244 of GUARD_SIZE. When a probe is emitted it is done at most
4245 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4246 at most MIN_PROBE_THRESHOLD. By the end of this function
4247 BASE = BASE - ADJUSTMENT. */
4250 aarch64_output_probe_sve_stack_clash (rtx base
, rtx adjustment
,
4251 rtx min_probe_threshold
, rtx guard_size
)
4253 /* This function is not allowed to use any instruction generation function
4254 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4255 so instead emit the code you want using output_asm_insn. */
4256 gcc_assert (flag_stack_clash_protection
);
4257 gcc_assert (CONST_INT_P (min_probe_threshold
) && CONST_INT_P (guard_size
));
4258 gcc_assert (INTVAL (guard_size
) > INTVAL (min_probe_threshold
));
4260 /* The minimum required allocation before the residual requires probing. */
4261 HOST_WIDE_INT residual_probe_guard
= INTVAL (min_probe_threshold
);
4263 /* Clamp the value down to the nearest value that can be used with a cmp. */
4264 residual_probe_guard
= aarch64_clamp_to_uimm12_shift (residual_probe_guard
);
4265 rtx probe_offset_value_rtx
= gen_int_mode (residual_probe_guard
, Pmode
);
4267 gcc_assert (INTVAL (min_probe_threshold
) >= residual_probe_guard
);
4268 gcc_assert (aarch64_uimm12_shift (residual_probe_guard
));
4270 static int labelno
= 0;
4271 char loop_start_lab
[32];
4272 char loop_end_lab
[32];
4275 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab
, "SVLPSPL", labelno
);
4276 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab
, "SVLPEND", labelno
++);
4278 /* Emit loop start label. */
4279 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_start_lab
);
4281 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4282 xops
[0] = adjustment
;
4283 xops
[1] = probe_offset_value_rtx
;
4284 output_asm_insn ("cmp\t%0, %1", xops
);
4286 /* Branch to end if not enough adjustment to probe. */
4287 fputs ("\tb.lt\t", asm_out_file
);
4288 assemble_name_raw (asm_out_file
, loop_end_lab
);
4289 fputc ('\n', asm_out_file
);
4291 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4293 xops
[1] = probe_offset_value_rtx
;
4294 output_asm_insn ("sub\t%0, %0, %1", xops
);
4296 /* Probe at BASE. */
4297 xops
[1] = const0_rtx
;
4298 output_asm_insn ("str\txzr, [%0, %1]", xops
);
4300 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4301 xops
[0] = adjustment
;
4302 xops
[1] = probe_offset_value_rtx
;
4303 output_asm_insn ("sub\t%0, %0, %1", xops
);
4305 /* Branch to start if still more bytes to allocate. */
4306 fputs ("\tb\t", asm_out_file
);
4307 assemble_name_raw (asm_out_file
, loop_start_lab
);
4308 fputc ('\n', asm_out_file
);
4310 /* No probe leave. */
4311 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_end_lab
);
4313 /* BASE = BASE - ADJUSTMENT. */
4315 xops
[1] = adjustment
;
4316 output_asm_insn ("sub\t%0, %0, %1", xops
);
4320 /* Determine whether a frame chain needs to be generated. */
4322 aarch64_needs_frame_chain (void)
4324 /* Force a frame chain for EH returns so the return address is at FP+8. */
4325 if (frame_pointer_needed
|| crtl
->calls_eh_return
)
4328 /* A leaf function cannot have calls or write LR. */
4329 bool is_leaf
= crtl
->is_leaf
&& !df_regs_ever_live_p (LR_REGNUM
);
4331 /* Don't use a frame chain in leaf functions if leaf frame pointers
4333 if (flag_omit_leaf_frame_pointer
&& is_leaf
)
4336 return aarch64_use_frame_pointer
;
4339 /* Mark the registers that need to be saved by the callee and calculate
4340 the size of the callee-saved registers area and frame record (both FP
4341 and LR may be omitted). */
4343 aarch64_layout_frame (void)
4345 HOST_WIDE_INT offset
= 0;
4346 int regno
, last_fp_reg
= INVALID_REGNUM
;
4347 bool simd_function
= aarch64_simd_decl_p (cfun
->decl
);
4349 cfun
->machine
->frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
4351 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4352 the mid-end is doing. */
4353 crtl
->outgoing_args_size
= STACK_DYNAMIC_OFFSET (cfun
);
4355 #define SLOT_NOT_REQUIRED (-2)
4356 #define SLOT_REQUIRED (-1)
4358 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
4359 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
4361 /* If this is a non-leaf simd function with calls we assume that
4362 at least one of those calls is to a non-simd function and thus
4363 we must save V8 to V23 in the prologue. */
4365 if (simd_function
&& !crtl
->is_leaf
)
4367 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4368 if (FP_SIMD_SAVED_REGNUM_P (regno
))
4369 df_set_regs_ever_live (regno
, true);
4372 /* First mark all the registers that really need to be saved... */
4373 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4374 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
4376 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4377 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
4379 /* ... that includes the eh data registers (if needed)... */
4380 if (crtl
->calls_eh_return
)
4381 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
4382 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
4385 /* ... and any callee saved register that dataflow says is live. */
4386 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4387 if (df_regs_ever_live_p (regno
)
4388 && (regno
== R30_REGNUM
4389 || !call_used_regs
[regno
]))
4390 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
4392 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4393 if (df_regs_ever_live_p (regno
)
4394 && (!call_used_regs
[regno
]
4395 || (simd_function
&& FP_SIMD_SAVED_REGNUM_P (regno
))))
4397 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
4398 last_fp_reg
= regno
;
4401 if (cfun
->machine
->frame
.emit_frame_chain
)
4403 /* FP and LR are placed in the linkage record. */
4404 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
4405 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
4406 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
4407 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
4408 offset
= 2 * UNITS_PER_WORD
;
4411 /* With stack-clash, LR must be saved in non-leaf functions. */
4412 gcc_assert (crtl
->is_leaf
4413 || (cfun
->machine
->frame
.reg_offset
[R30_REGNUM
]
4414 != SLOT_NOT_REQUIRED
));
4416 /* Now assign stack slots for them. */
4417 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4418 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
4420 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
4421 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
4422 cfun
->machine
->frame
.wb_candidate1
= regno
;
4423 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
)
4424 cfun
->machine
->frame
.wb_candidate2
= regno
;
4425 offset
+= UNITS_PER_WORD
;
4428 HOST_WIDE_INT max_int_offset
= offset
;
4429 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
4430 bool has_align_gap
= offset
!= max_int_offset
;
4432 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4433 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
4435 /* If there is an alignment gap between integer and fp callee-saves,
4436 allocate the last fp register to it if possible. */
4437 if (regno
== last_fp_reg
4440 && (offset
& 8) == 0)
4442 cfun
->machine
->frame
.reg_offset
[regno
] = max_int_offset
;
4446 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
4447 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
4448 cfun
->machine
->frame
.wb_candidate1
= regno
;
4449 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
4450 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
4451 cfun
->machine
->frame
.wb_candidate2
= regno
;
4452 offset
+= simd_function
? UNITS_PER_VREG
: UNITS_PER_WORD
;
4455 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
4457 cfun
->machine
->frame
.saved_regs_size
= offset
;
4459 HOST_WIDE_INT varargs_and_saved_regs_size
4460 = offset
+ cfun
->machine
->frame
.saved_varargs_size
;
4462 cfun
->machine
->frame
.hard_fp_offset
4463 = aligned_upper_bound (varargs_and_saved_regs_size
4464 + get_frame_size (),
4465 STACK_BOUNDARY
/ BITS_PER_UNIT
);
4467 /* Both these values are already aligned. */
4468 gcc_assert (multiple_p (crtl
->outgoing_args_size
,
4469 STACK_BOUNDARY
/ BITS_PER_UNIT
));
4470 cfun
->machine
->frame
.frame_size
4471 = (cfun
->machine
->frame
.hard_fp_offset
4472 + crtl
->outgoing_args_size
);
4474 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.saved_varargs_size
;
4476 cfun
->machine
->frame
.initial_adjust
= 0;
4477 cfun
->machine
->frame
.final_adjust
= 0;
4478 cfun
->machine
->frame
.callee_adjust
= 0;
4479 cfun
->machine
->frame
.callee_offset
= 0;
4481 HOST_WIDE_INT max_push_offset
= 0;
4482 if (cfun
->machine
->frame
.wb_candidate2
!= INVALID_REGNUM
)
4483 max_push_offset
= 512;
4484 else if (cfun
->machine
->frame
.wb_candidate1
!= INVALID_REGNUM
)
4485 max_push_offset
= 256;
4487 HOST_WIDE_INT const_size
, const_fp_offset
;
4488 if (cfun
->machine
->frame
.frame_size
.is_constant (&const_size
)
4489 && const_size
< max_push_offset
4490 && known_eq (crtl
->outgoing_args_size
, 0))
4492 /* Simple, small frame with no outgoing arguments:
4493 stp reg1, reg2, [sp, -frame_size]!
4494 stp reg3, reg4, [sp, 16] */
4495 cfun
->machine
->frame
.callee_adjust
= const_size
;
4497 else if (known_lt (crtl
->outgoing_args_size
4498 + cfun
->machine
->frame
.saved_regs_size
, 512)
4499 && !(cfun
->calls_alloca
4500 && known_lt (cfun
->machine
->frame
.hard_fp_offset
,
4503 /* Frame with small outgoing arguments:
4504 sub sp, sp, frame_size
4505 stp reg1, reg2, [sp, outgoing_args_size]
4506 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4507 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.frame_size
;
4508 cfun
->machine
->frame
.callee_offset
4509 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.hard_fp_offset
;
4511 else if (cfun
->machine
->frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
4512 && const_fp_offset
< max_push_offset
)
4514 /* Frame with large outgoing arguments but a small local area:
4515 stp reg1, reg2, [sp, -hard_fp_offset]!
4516 stp reg3, reg4, [sp, 16]
4517 sub sp, sp, outgoing_args_size */
4518 cfun
->machine
->frame
.callee_adjust
= const_fp_offset
;
4519 cfun
->machine
->frame
.final_adjust
4520 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
4524 /* Frame with large local area and outgoing arguments using frame pointer:
4525 sub sp, sp, hard_fp_offset
4526 stp x29, x30, [sp, 0]
4528 stp reg3, reg4, [sp, 16]
4529 sub sp, sp, outgoing_args_size */
4530 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
4531 cfun
->machine
->frame
.final_adjust
4532 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.initial_adjust
;
4535 cfun
->machine
->frame
.laid_out
= true;
4538 /* Return true if the register REGNO is saved on entry to
4539 the current function. */
4542 aarch64_register_saved_on_entry (int regno
)
4544 return cfun
->machine
->frame
.reg_offset
[regno
] >= 0;
4547 /* Return the next register up from REGNO up to LIMIT for the callee
4551 aarch64_next_callee_save (unsigned regno
, unsigned limit
)
4553 while (regno
<= limit
&& !aarch64_register_saved_on_entry (regno
))
4558 /* Push the register number REGNO of mode MODE to the stack with write-back
4559 adjusting the stack by ADJUSTMENT. */
4562 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
4563 HOST_WIDE_INT adjustment
)
4565 rtx base_rtx
= stack_pointer_rtx
;
4568 reg
= gen_rtx_REG (mode
, regno
);
4569 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
4570 plus_constant (Pmode
, base_rtx
, -adjustment
));
4571 mem
= gen_frame_mem (mode
, mem
);
4573 insn
= emit_move_insn (mem
, reg
);
4574 RTX_FRAME_RELATED_P (insn
) = 1;
4577 /* Generate and return an instruction to store the pair of registers
4578 REG and REG2 of mode MODE to location BASE with write-back adjusting
4579 the stack location BASE by ADJUSTMENT. */
4582 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
4583 HOST_WIDE_INT adjustment
)
4588 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
4589 GEN_INT (-adjustment
),
4590 GEN_INT (UNITS_PER_WORD
- adjustment
));
4592 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
4593 GEN_INT (-adjustment
),
4594 GEN_INT (UNITS_PER_WORD
- adjustment
));
4596 return gen_storewb_pairtf_di (base
, base
, reg
, reg2
,
4597 GEN_INT (-adjustment
),
4598 GEN_INT (UNITS_PER_VREG
- adjustment
));
4604 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4605 stack pointer by ADJUSTMENT. */
4608 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
4611 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno1
);
4613 if (regno2
== INVALID_REGNUM
)
4614 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
4616 rtx reg1
= gen_rtx_REG (mode
, regno1
);
4617 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4619 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
4621 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
4622 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
4623 RTX_FRAME_RELATED_P (insn
) = 1;
4626 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
4627 adjusting it by ADJUSTMENT afterwards. */
4630 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
4631 HOST_WIDE_INT adjustment
)
4636 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
4637 GEN_INT (UNITS_PER_WORD
));
4639 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
4640 GEN_INT (UNITS_PER_WORD
));
4642 return gen_loadwb_pairtf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
4643 GEN_INT (UNITS_PER_VREG
));
4649 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4650 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4654 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
4657 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno1
);
4658 rtx reg1
= gen_rtx_REG (mode
, regno1
);
4660 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
4662 if (regno2
== INVALID_REGNUM
)
4664 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
4665 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
4666 emit_move_insn (reg1
, gen_frame_mem (mode
, mem
));
4670 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4671 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
4672 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
4677 /* Generate and return a store pair instruction of mode MODE to store
4678 register REG1 to MEM1 and register REG2 to MEM2. */
4681 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
4687 return gen_store_pair_dw_didi (mem1
, reg1
, mem2
, reg2
);
4690 return gen_store_pair_dw_dfdf (mem1
, reg1
, mem2
, reg2
);
4693 return gen_store_pair_dw_tftf (mem1
, reg1
, mem2
, reg2
);
4700 /* Generate and regurn a load pair isntruction of mode MODE to load register
4701 REG1 from MEM1 and register REG2 from MEM2. */
4704 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
4710 return gen_load_pair_dw_didi (reg1
, mem1
, reg2
, mem2
);
4713 return gen_load_pair_dw_dfdf (reg1
, mem1
, reg2
, mem2
);
4716 return gen_load_pair_dw_tftf (reg1
, mem1
, reg2
, mem2
);
4723 /* Return TRUE if return address signing should be enabled for the current
4724 function, otherwise return FALSE. */
4727 aarch64_return_address_signing_enabled (void)
4729 /* This function should only be called after frame laid out. */
4730 gcc_assert (cfun
->machine
->frame
.laid_out
);
4732 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4733 if it's LR is pushed onto stack. */
4734 return (aarch64_ra_sign_scope
== AARCH64_FUNCTION_ALL
4735 || (aarch64_ra_sign_scope
== AARCH64_FUNCTION_NON_LEAF
4736 && cfun
->machine
->frame
.reg_offset
[LR_REGNUM
] >= 0));
4739 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
4741 aarch64_bti_enabled (void)
4743 return (aarch64_enable_bti
== 1);
4746 /* Emit code to save the callee-saved registers from register number START
4747 to LIMIT to the stack at the location starting at offset START_OFFSET,
4748 skipping any write-back candidates if SKIP_WB is true. */
4751 aarch64_save_callee_saves (machine_mode mode
, poly_int64 start_offset
,
4752 unsigned start
, unsigned limit
, bool skip_wb
)
4758 for (regno
= aarch64_next_callee_save (start
, limit
);
4760 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
4767 && (regno
== cfun
->machine
->frame
.wb_candidate1
4768 || regno
== cfun
->machine
->frame
.wb_candidate2
))
4771 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
4774 reg
= gen_rtx_REG (mode
, regno
);
4775 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
4776 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
4779 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
4780 offset_diff
= cfun
->machine
->frame
.reg_offset
[regno2
]
4781 - cfun
->machine
->frame
.reg_offset
[regno
];
4784 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
4785 && known_eq (GET_MODE_SIZE (mode
), offset_diff
))
4787 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4790 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
4791 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
4793 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
4796 /* The first part of a frame-related parallel insn is
4797 always assumed to be relevant to the frame
4798 calculations; subsequent parts, are only
4799 frame-related if explicitly marked. */
4800 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
4804 insn
= emit_move_insn (mem
, reg
);
4806 RTX_FRAME_RELATED_P (insn
) = 1;
4810 /* Emit code to restore the callee registers of mode MODE from register
4811 number START up to and including LIMIT. Restore from the stack offset
4812 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4813 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4816 aarch64_restore_callee_saves (machine_mode mode
,
4817 poly_int64 start_offset
, unsigned start
,
4818 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
4820 rtx base_rtx
= stack_pointer_rtx
;
4825 for (regno
= aarch64_next_callee_save (start
, limit
);
4827 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
4829 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
4836 && (regno
== cfun
->machine
->frame
.wb_candidate1
4837 || regno
== cfun
->machine
->frame
.wb_candidate2
))
4840 reg
= gen_rtx_REG (mode
, regno
);
4841 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
4842 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
4844 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
4845 offset_diff
= cfun
->machine
->frame
.reg_offset
[regno2
]
4846 - cfun
->machine
->frame
.reg_offset
[regno
];
4849 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
4850 && known_eq (GET_MODE_SIZE (mode
), offset_diff
))
4852 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4855 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
4856 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
4857 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
4859 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
4863 emit_move_insn (reg
, mem
);
4864 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
4868 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4872 offset_4bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
4874 HOST_WIDE_INT multiple
;
4875 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
4876 && IN_RANGE (multiple
, -8, 7));
4879 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
4883 offset_6bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
4885 HOST_WIDE_INT multiple
;
4886 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
4887 && IN_RANGE (multiple
, 0, 63));
4890 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4894 aarch64_offset_7bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
4896 HOST_WIDE_INT multiple
;
4897 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
4898 && IN_RANGE (multiple
, -64, 63));
4901 /* Return true if OFFSET is a signed 9-bit value. */
4904 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED
,
4907 HOST_WIDE_INT const_offset
;
4908 return (offset
.is_constant (&const_offset
)
4909 && IN_RANGE (const_offset
, -256, 255));
4912 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4916 offset_9bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
4918 HOST_WIDE_INT multiple
;
4919 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
4920 && IN_RANGE (multiple
, -256, 255));
4923 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4927 offset_12bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
4929 HOST_WIDE_INT multiple
;
4930 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
4931 && IN_RANGE (multiple
, 0, 4095));
4934 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4937 aarch64_get_separate_components (void)
4939 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
4940 bitmap_clear (components
);
4942 /* The registers we need saved to the frame. */
4943 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
4944 if (aarch64_register_saved_on_entry (regno
))
4946 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
4947 if (!frame_pointer_needed
)
4948 offset
+= cfun
->machine
->frame
.frame_size
4949 - cfun
->machine
->frame
.hard_fp_offset
;
4950 /* Check that we can access the stack slot of the register with one
4951 direct load with no adjustments needed. */
4952 if (offset_12bit_unsigned_scaled_p (DImode
, offset
))
4953 bitmap_set_bit (components
, regno
);
4956 /* Don't mess with the hard frame pointer. */
4957 if (frame_pointer_needed
)
4958 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
4960 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
4961 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
4962 /* If registers have been chosen to be stored/restored with
4963 writeback don't interfere with them to avoid having to output explicit
4964 stack adjustment instructions. */
4965 if (reg2
!= INVALID_REGNUM
)
4966 bitmap_clear_bit (components
, reg2
);
4967 if (reg1
!= INVALID_REGNUM
)
4968 bitmap_clear_bit (components
, reg1
);
4970 bitmap_clear_bit (components
, LR_REGNUM
);
4971 bitmap_clear_bit (components
, SP_REGNUM
);
4976 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4979 aarch64_components_for_bb (basic_block bb
)
4981 bitmap in
= DF_LIVE_IN (bb
);
4982 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
4983 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
4984 bool simd_function
= aarch64_simd_decl_p (cfun
->decl
);
4986 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
4987 bitmap_clear (components
);
4989 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4990 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
4991 if ((!call_used_regs
[regno
]
4992 || (simd_function
&& FP_SIMD_SAVED_REGNUM_P (regno
)))
4993 && (bitmap_bit_p (in
, regno
)
4994 || bitmap_bit_p (gen
, regno
)
4995 || bitmap_bit_p (kill
, regno
)))
4997 unsigned regno2
, offset
, offset2
;
4998 bitmap_set_bit (components
, regno
);
5000 /* If there is a callee-save at an adjacent offset, add it too
5001 to increase the use of LDP/STP. */
5002 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5003 regno2
= ((offset
& 8) == 0) ? regno
+ 1 : regno
- 1;
5005 if (regno2
<= LAST_SAVED_REGNUM
)
5007 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
5008 if ((offset
& ~8) == (offset2
& ~8))
5009 bitmap_set_bit (components
, regno2
);
5016 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5017 Nothing to do for aarch64. */
5020 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
5024 /* Return the next set bit in BMP from START onwards. Return the total number
5025 of bits in BMP if no set bit is found at or after START. */
5028 aarch64_get_next_set_bit (sbitmap bmp
, unsigned int start
)
5030 unsigned int nbits
= SBITMAP_SIZE (bmp
);
5034 gcc_assert (start
< nbits
);
5035 for (unsigned int i
= start
; i
< nbits
; i
++)
5036 if (bitmap_bit_p (bmp
, i
))
5042 /* Do the work for aarch64_emit_prologue_components and
5043 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5044 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5045 for these components or the epilogue sequence. That is, it determines
5046 whether we should emit stores or loads and what kind of CFA notes to attach
5047 to the insns. Otherwise the logic for the two sequences is very
5051 aarch64_process_components (sbitmap components
, bool prologue_p
)
5053 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
5054 ? HARD_FRAME_POINTER_REGNUM
5055 : STACK_POINTER_REGNUM
);
5057 unsigned last_regno
= SBITMAP_SIZE (components
);
5058 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
5059 rtx_insn
*insn
= NULL
;
5061 while (regno
!= last_regno
)
5063 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5064 so DFmode for the vector registers is enough. For simd functions
5065 we want to save the low 128 bits. */
5066 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno
);
5068 rtx reg
= gen_rtx_REG (mode
, regno
);
5069 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5070 if (!frame_pointer_needed
)
5071 offset
+= cfun
->machine
->frame
.frame_size
5072 - cfun
->machine
->frame
.hard_fp_offset
;
5073 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
5074 rtx mem
= gen_frame_mem (mode
, addr
);
5076 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
5077 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
5078 /* No more registers to handle after REGNO.
5079 Emit a single save/restore and exit. */
5080 if (regno2
== last_regno
)
5082 insn
= emit_insn (set
);
5083 RTX_FRAME_RELATED_P (insn
) = 1;
5085 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
5087 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
5091 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
5092 /* The next register is not of the same class or its offset is not
5093 mergeable with the current one into a pair. */
5094 if (!satisfies_constraint_Ump (mem
)
5095 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
5096 || (aarch64_simd_decl_p (cfun
->decl
) && FP_REGNUM_P (regno
))
5097 || maybe_ne ((offset2
- cfun
->machine
->frame
.reg_offset
[regno
]),
5098 GET_MODE_SIZE (mode
)))
5100 insn
= emit_insn (set
);
5101 RTX_FRAME_RELATED_P (insn
) = 1;
5103 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
5105 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
5111 /* REGNO2 can be saved/restored in a pair with REGNO. */
5112 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5113 if (!frame_pointer_needed
)
5114 offset2
+= cfun
->machine
->frame
.frame_size
5115 - cfun
->machine
->frame
.hard_fp_offset
;
5116 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
5117 rtx mem2
= gen_frame_mem (mode
, addr2
);
5118 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
5119 : gen_rtx_SET (reg2
, mem2
);
5122 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
, reg2
));
5124 insn
= emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
5126 RTX_FRAME_RELATED_P (insn
) = 1;
5129 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
5130 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
5134 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
5135 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
5138 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
5142 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5145 aarch64_emit_prologue_components (sbitmap components
)
5147 aarch64_process_components (components
, true);
5150 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5153 aarch64_emit_epilogue_components (sbitmap components
)
5155 aarch64_process_components (components
, false);
5158 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5161 aarch64_set_handled_components (sbitmap components
)
5163 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
5164 if (bitmap_bit_p (components
, regno
))
5165 cfun
->machine
->reg_is_wrapped_separately
[regno
] = true;
5168 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
5169 determining the probe offset for alloca. */
5171 static HOST_WIDE_INT
5172 aarch64_stack_clash_protection_alloca_probe_range (void)
5174 return STACK_CLASH_CALLER_GUARD
;
5178 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5179 registers. If POLY_SIZE is not large enough to require a probe this function
5180 will only adjust the stack. When allocating the stack space
5181 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5182 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5183 arguments. If we are then we ensure that any allocation larger than the ABI
5184 defined buffer needs a probe so that the invariant of having a 1KB buffer is
5187 We emit barriers after each stack adjustment to prevent optimizations from
5188 breaking the invariant that we never drop the stack more than a page. This
5189 invariant is needed to make it easier to correctly handle asynchronous
5190 events, e.g. if we were to allow the stack to be dropped by more than a page
5191 and then have multiple probes up and we take a signal somewhere in between
5192 then the signal handler doesn't know the state of the stack and can make no
5193 assumptions about which pages have been probed. */
5196 aarch64_allocate_and_probe_stack_space (rtx temp1
, rtx temp2
,
5197 poly_int64 poly_size
,
5198 bool frame_related_p
,
5199 bool final_adjustment_p
)
5201 HOST_WIDE_INT guard_size
5202 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
5203 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
5204 /* When doing the final adjustment for the outgoing argument size we can't
5205 assume that LR was saved at position 0. So subtract it's offset from the
5206 ABI safe buffer so that we don't accidentally allow an adjustment that
5207 would result in an allocation larger than the ABI buffer without
5209 HOST_WIDE_INT min_probe_threshold
5210 = final_adjustment_p
5211 ? guard_used_by_caller
- cfun
->machine
->frame
.reg_offset
[LR_REGNUM
]
5212 : guard_size
- guard_used_by_caller
;
5214 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
5216 /* We should always have a positive probe threshold. */
5217 gcc_assert (min_probe_threshold
> 0);
5219 if (flag_stack_clash_protection
&& !final_adjustment_p
)
5221 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
5222 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
5224 if (known_eq (frame_size
, 0))
5226 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME
, false);
5228 else if (known_lt (initial_adjust
, guard_size
- guard_used_by_caller
)
5229 && known_lt (final_adjust
, guard_used_by_caller
))
5231 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME
, true);
5235 /* If SIZE is not large enough to require probing, just adjust the stack and
5237 if (known_lt (poly_size
, min_probe_threshold
)
5238 || !flag_stack_clash_protection
)
5240 aarch64_sub_sp (temp1
, temp2
, poly_size
, frame_related_p
);
5245 /* Handle the SVE non-constant case first. */
5246 if (!poly_size
.is_constant (&size
))
5250 fprintf (dump_file
, "Stack clash SVE prologue: ");
5251 print_dec (poly_size
, dump_file
);
5252 fprintf (dump_file
, " bytes, dynamic probing will be required.\n");
5255 /* First calculate the amount of bytes we're actually spilling. */
5256 aarch64_add_offset (Pmode
, temp1
, CONST0_RTX (Pmode
),
5257 poly_size
, temp1
, temp2
, false, true);
5259 rtx_insn
*insn
= get_last_insn ();
5261 if (frame_related_p
)
5263 /* This is done to provide unwinding information for the stack
5264 adjustments we're about to do, however to prevent the optimizers
5265 from removing the R15 move and leaving the CFA note (which would be
5266 very wrong) we tie the old and new stack pointer together.
5267 The tie will expand to nothing but the optimizers will not touch
5269 rtx stack_ptr_copy
= gen_rtx_REG (Pmode
, R15_REGNUM
);
5270 emit_move_insn (stack_ptr_copy
, stack_pointer_rtx
);
5271 emit_insn (gen_stack_tie (stack_ptr_copy
, stack_pointer_rtx
));
5273 /* We want the CFA independent of the stack pointer for the
5274 duration of the loop. */
5275 add_reg_note (insn
, REG_CFA_DEF_CFA
, stack_ptr_copy
);
5276 RTX_FRAME_RELATED_P (insn
) = 1;
5279 rtx probe_const
= gen_int_mode (min_probe_threshold
, Pmode
);
5280 rtx guard_const
= gen_int_mode (guard_size
, Pmode
);
5282 insn
= emit_insn (gen_probe_sve_stack_clash (Pmode
, stack_pointer_rtx
,
5283 stack_pointer_rtx
, temp1
,
5284 probe_const
, guard_const
));
5286 /* Now reset the CFA register if needed. */
5287 if (frame_related_p
)
5289 add_reg_note (insn
, REG_CFA_DEF_CFA
,
5290 gen_rtx_PLUS (Pmode
, stack_pointer_rtx
,
5291 gen_int_mode (poly_size
, Pmode
)));
5292 RTX_FRAME_RELATED_P (insn
) = 1;
5300 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5301 " bytes, probing will be required.\n", size
);
5303 /* Round size to the nearest multiple of guard_size, and calculate the
5304 residual as the difference between the original size and the rounded
5306 HOST_WIDE_INT rounded_size
= ROUND_DOWN (size
, guard_size
);
5307 HOST_WIDE_INT residual
= size
- rounded_size
;
5309 /* We can handle a small number of allocations/probes inline. Otherwise
5311 if (rounded_size
<= STACK_CLASH_MAX_UNROLL_PAGES
* guard_size
)
5313 for (HOST_WIDE_INT i
= 0; i
< rounded_size
; i
+= guard_size
)
5315 aarch64_sub_sp (NULL
, temp2
, guard_size
, true);
5316 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
5317 guard_used_by_caller
));
5318 emit_insn (gen_blockage ());
5320 dump_stack_clash_frame_info (PROBE_INLINE
, size
!= rounded_size
);
5324 /* Compute the ending address. */
5325 aarch64_add_offset (Pmode
, temp1
, stack_pointer_rtx
, -rounded_size
,
5326 temp1
, NULL
, false, true);
5327 rtx_insn
*insn
= get_last_insn ();
5329 /* For the initial allocation, we don't have a frame pointer
5330 set up, so we always need CFI notes. If we're doing the
5331 final allocation, then we may have a frame pointer, in which
5332 case it is the CFA, otherwise we need CFI notes.
5334 We can determine which allocation we are doing by looking at
5335 the value of FRAME_RELATED_P since the final allocations are not
5337 if (frame_related_p
)
5339 /* We want the CFA independent of the stack pointer for the
5340 duration of the loop. */
5341 add_reg_note (insn
, REG_CFA_DEF_CFA
,
5342 plus_constant (Pmode
, temp1
, rounded_size
));
5343 RTX_FRAME_RELATED_P (insn
) = 1;
5346 /* This allocates and probes the stack. Note that this re-uses some of
5347 the existing Ada stack protection code. However we are guaranteed not
5348 to enter the non loop or residual branches of that code.
5350 The non-loop part won't be entered because if our allocation amount
5351 doesn't require a loop, the case above would handle it.
5353 The residual amount won't be entered because TEMP1 is a mutliple of
5354 the allocation size. The residual will always be 0. As such, the only
5355 part we are actually using from that code is the loop setup. The
5356 actual probing is done in aarch64_output_probe_stack_range. */
5357 insn
= emit_insn (gen_probe_stack_range (stack_pointer_rtx
,
5358 stack_pointer_rtx
, temp1
));
5360 /* Now reset the CFA register if needed. */
5361 if (frame_related_p
)
5363 add_reg_note (insn
, REG_CFA_DEF_CFA
,
5364 plus_constant (Pmode
, stack_pointer_rtx
, rounded_size
));
5365 RTX_FRAME_RELATED_P (insn
) = 1;
5368 emit_insn (gen_blockage ());
5369 dump_stack_clash_frame_info (PROBE_LOOP
, size
!= rounded_size
);
5372 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5373 be probed. This maintains the requirement that each page is probed at
5374 least once. For initial probing we probe only if the allocation is
5375 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5376 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5377 GUARD_SIZE. This works that for any allocation that is large enough to
5378 trigger a probe here, we'll have at least one, and if they're not large
5379 enough for this code to emit anything for them, The page would have been
5380 probed by the saving of FP/LR either by this function or any callees. If
5381 we don't have any callees then we won't have more stack adjustments and so
5385 HOST_WIDE_INT residual_probe_offset
= guard_used_by_caller
;
5386 /* If we're doing final adjustments, and we've done any full page
5387 allocations then any residual needs to be probed. */
5388 if (final_adjustment_p
&& rounded_size
!= 0)
5389 min_probe_threshold
= 0;
5390 /* If doing a small final adjustment, we always probe at offset 0.
5391 This is done to avoid issues when LR is not at position 0 or when
5392 the final adjustment is smaller than the probing offset. */
5393 else if (final_adjustment_p
&& rounded_size
== 0)
5394 residual_probe_offset
= 0;
5396 aarch64_sub_sp (temp1
, temp2
, residual
, frame_related_p
);
5397 if (residual
>= min_probe_threshold
)
5401 "Stack clash AArch64 prologue residuals: "
5402 HOST_WIDE_INT_PRINT_DEC
" bytes, probing will be required."
5405 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
5406 residual_probe_offset
));
5407 emit_insn (gen_blockage ());
5412 /* Return 1 if the register is used by the epilogue. We need to say the
5413 return register is used, but only after epilogue generation is complete.
5414 Note that in the case of sibcalls, the values "used by the epilogue" are
5415 considered live at the start of the called function.
5417 For SIMD functions we need to return 1 for FP registers that are saved and
5418 restored by a function but are not zero in call_used_regs. If we do not do
5419 this optimizations may remove the restore of the register. */
5422 aarch64_epilogue_uses (int regno
)
5424 if (epilogue_completed
)
5426 if (regno
== LR_REGNUM
)
5428 if (aarch64_simd_decl_p (cfun
->decl
) && FP_SIMD_SAVED_REGNUM_P (regno
))
5434 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5435 is saved at BASE + OFFSET. */
5438 aarch64_add_cfa_expression (rtx_insn
*insn
, unsigned int reg
,
5439 rtx base
, poly_int64 offset
)
5441 rtx mem
= gen_frame_mem (DImode
, plus_constant (Pmode
, base
, offset
));
5442 add_reg_note (insn
, REG_CFA_EXPRESSION
,
5443 gen_rtx_SET (mem
, regno_reg_rtx
[reg
]));
/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|                               |
	|  incoming stack arguments     |
	|                               |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	|                               |
	+-------------------------------+
	|  local variables              | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+ \
	|  callee-saved registers       | | frame.saved_regs_size
	+-------------------------------+ |
	|  LR'                          | |
	+-------------------------------+ |
	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  padding                      |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	|                               |
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx
   unchanged.

   By default for stack-clash we assume the guard is at least 64KB, but this
   value is configurable to either 4KB or 64KB.  We also force the guard size to
   be the same as the probing interval and both values are kept in sync.

   With those assumptions the callee can allocate up to 63KB (or 3KB depending
   on the guard size) of stack space without probing.

   When probing is needed, we emit a probe at the start of the prologue
   and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.

   We have to track how much space has been allocated and the only stores
   to the stack we track as implicit probes are the FP/LR stores.

   For outgoing arguments we probe if the size is larger than 1KB, such that
   the ABI specified buffer is maintained for the next callee.  */
5498 /* Generate the prologue instructions for entry into a function.
5499 Establish the stack frame by decreasing the stack pointer with a
5500 properly calculated size and, if necessary, create a frame record
5501 filled with the values of LR and previous frame pointer. The
5502 current FP is also set up if it is in use. */
5505 aarch64_expand_prologue (void)
5507 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
5508 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
5509 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
5510 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
5511 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
5512 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
5513 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
5514 bool emit_frame_chain
= cfun
->machine
->frame
.emit_frame_chain
;
5517 /* Sign return address for functions. */
5518 if (aarch64_return_address_signing_enabled ())
5520 insn
= emit_insn (gen_pacisp ());
5521 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
5522 RTX_FRAME_RELATED_P (insn
) = 1;
5525 if (flag_stack_usage_info
)
5526 current_function_static_stack_size
= constant_lower_bound (frame_size
);
5528 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
5530 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
5532 if (maybe_gt (frame_size
, PROBE_INTERVAL
)
5533 && maybe_gt (frame_size
, get_stack_check_protect ()))
5534 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5536 - get_stack_check_protect ()));
5538 else if (maybe_gt (frame_size
, 0))
5539 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size
);
5542 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
5543 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
5545 /* In theory we should never have both an initial adjustment
5546 and a callee save adjustment. Verify that is the case since the
5547 code below does not handle it for -fstack-clash-protection. */
5548 gcc_assert (known_eq (initial_adjust
, 0) || callee_adjust
== 0);
5550 /* Will only probe if the initial adjustment is larger than the guard
5551 less the amount of the guard reserved for use by the caller's
5553 aarch64_allocate_and_probe_stack_space (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
5556 if (callee_adjust
!= 0)
5557 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
5559 if (emit_frame_chain
)
5561 poly_int64 reg_offset
= callee_adjust
;
5562 if (callee_adjust
== 0)
5566 reg_offset
= callee_offset
;
5567 aarch64_save_callee_saves (DImode
, reg_offset
, reg1
, reg2
, false);
5569 aarch64_add_offset (Pmode
, hard_frame_pointer_rtx
,
5570 stack_pointer_rtx
, callee_offset
,
5571 tmp1_rtx
, tmp0_rtx
, frame_pointer_needed
);
5572 if (frame_pointer_needed
&& !frame_size
.is_constant ())
5574 /* Variable-sized frames need to describe the save slot
5575 address using DW_CFA_expression rather than DW_CFA_offset.
5576 This means that, without taking further action, the
5577 locations of the registers that we've already saved would
5578 remain based on the stack pointer even after we redefine
5579 the CFA based on the frame pointer. We therefore need new
5580 DW_CFA_expressions to re-express the save slots with addresses
5581 based on the frame pointer. */
5582 rtx_insn
*insn
= get_last_insn ();
5583 gcc_assert (RTX_FRAME_RELATED_P (insn
));
5585 /* Add an explicit CFA definition if this was previously
5587 if (!find_reg_note (insn
, REG_CFA_ADJUST_CFA
, NULL_RTX
))
5589 rtx src
= plus_constant (Pmode
, stack_pointer_rtx
,
5591 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
5592 gen_rtx_SET (hard_frame_pointer_rtx
, src
));
5595 /* Change the save slot expressions for the registers that
5596 we've already saved. */
5597 reg_offset
-= callee_offset
;
5598 aarch64_add_cfa_expression (insn
, reg2
, hard_frame_pointer_rtx
,
5599 reg_offset
+ UNITS_PER_WORD
);
5600 aarch64_add_cfa_expression (insn
, reg1
, hard_frame_pointer_rtx
,
5603 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
5606 aarch64_save_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
5607 callee_adjust
!= 0 || emit_frame_chain
);
5608 if (aarch64_simd_decl_p (cfun
->decl
))
5609 aarch64_save_callee_saves (TFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
5610 callee_adjust
!= 0 || emit_frame_chain
);
5612 aarch64_save_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
5613 callee_adjust
!= 0 || emit_frame_chain
);
5615 /* We may need to probe the final adjustment if it is larger than the guard
5616 that is assumed by the called. */
5617 aarch64_allocate_and_probe_stack_space (tmp1_rtx
, tmp0_rtx
, final_adjust
,
5618 !frame_pointer_needed
, true);
5621 /* Return TRUE if we can use a simple_return insn.
5623 This function checks whether the callee saved stack is empty, which
5624 means no restore actions are need. The pro_and_epilogue will use
5625 this to check whether shrink-wrapping opt is feasible. */
5628 aarch64_use_return_insn_p (void)
5630 if (!reload_completed
)
5636 return known_eq (cfun
->machine
->frame
.frame_size
, 0);
5639 /* Return false for non-leaf SIMD functions in order to avoid
5640 shrink-wrapping them. Doing this will lose the necessary
5641 save/restore of FP registers. */
5644 aarch64_use_simple_return_insn_p (void)
5646 if (aarch64_simd_decl_p (cfun
->decl
) && !crtl
->is_leaf
)
5652 /* Generate the epilogue instructions for returning from a function.
5653 This is almost exactly the reverse of the prolog sequence, except
5654 that we need to insert barriers to avoid scheduling loads that read
5655 from a deallocated stack, and we optimize the unwind records by
5656 emitting them all together if possible. */
5658 aarch64_expand_epilogue (bool for_sibcall
)
5660 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
5661 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
5662 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
5663 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
5664 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
5665 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
5668 /* A stack clash protection prologue may not have left EP0_REGNUM or
5669 EP1_REGNUM in a usable state. The same is true for allocations
5670 with an SVE component, since we then need both temporary registers
5671 for each allocation. For stack clash we are in a usable state if
5672 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5673 HOST_WIDE_INT guard_size
5674 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
5675 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
5677 /* We can re-use the registers when the allocation amount is smaller than
5678 guard_size - guard_used_by_caller because we won't be doing any probes
5679 then. In such situations the register should remain live with the correct
5681 bool can_inherit_p
= (initial_adjust
.is_constant ()
5682 && final_adjust
.is_constant ())
5683 && (!flag_stack_clash_protection
5684 || known_lt (initial_adjust
,
5685 guard_size
- guard_used_by_caller
));
5687 /* We need to add memory barrier to prevent read from deallocated stack. */
5689 = maybe_ne (get_frame_size ()
5690 + cfun
->machine
->frame
.saved_varargs_size
, 0);
5692 /* Emit a barrier to prevent loads from a deallocated stack. */
5693 if (maybe_gt (final_adjust
, crtl
->outgoing_args_size
)
5694 || cfun
->calls_alloca
5695 || crtl
->calls_eh_return
)
5697 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
5698 need_barrier_p
= false;
5701 /* Restore the stack pointer from the frame pointer if it may not
5702 be the same as the stack pointer. */
5703 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
5704 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
5705 if (frame_pointer_needed
5706 && (maybe_ne (final_adjust
, 0) || cfun
->calls_alloca
))
5707 /* If writeback is used when restoring callee-saves, the CFA
5708 is restored on the instruction doing the writeback. */
5709 aarch64_add_offset (Pmode
, stack_pointer_rtx
,
5710 hard_frame_pointer_rtx
, -callee_offset
,
5711 tmp1_rtx
, tmp0_rtx
, callee_adjust
== 0);
5713 /* The case where we need to re-use the register here is very rare, so
5714 avoid the complicated condition and just always emit a move if the
5715 immediate doesn't fit. */
5716 aarch64_add_sp (tmp1_rtx
, tmp0_rtx
, final_adjust
, true);
5718 aarch64_restore_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
5719 callee_adjust
!= 0, &cfi_ops
);
5720 if (aarch64_simd_decl_p (cfun
->decl
))
5721 aarch64_restore_callee_saves (TFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
5722 callee_adjust
!= 0, &cfi_ops
);
5724 aarch64_restore_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
5725 callee_adjust
!= 0, &cfi_ops
);
5728 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
5730 if (callee_adjust
!= 0)
5731 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
5733 if (callee_adjust
!= 0 || maybe_gt (initial_adjust
, 65536))
5735 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5736 insn
= get_last_insn ();
5737 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
5738 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
5739 RTX_FRAME_RELATED_P (insn
) = 1;
5743 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
5744 add restriction on emit_move optimization to leaf functions. */
5745 aarch64_add_sp (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
5746 (!can_inherit_p
|| !crtl
->is_leaf
5747 || df_regs_ever_live_p (EP0_REGNUM
)));
5751 /* Emit delayed restores and reset the CFA to be SP. */
5752 insn
= get_last_insn ();
5753 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
5754 REG_NOTES (insn
) = cfi_ops
;
5755 RTX_FRAME_RELATED_P (insn
) = 1;
5758 /* We prefer to emit the combined return/authenticate instruction RETAA,
5759 however there are three cases in which we must instead emit an explicit
5760 authentication instruction.
5762 1) Sibcalls don't return in a normal way, so if we're about to call one
5763 we must authenticate.
5765 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5766 generating code for !TARGET_ARMV8_3 we can't use it and must
5767 explicitly authenticate.
5769 3) On an eh_return path we make extra stack adjustments to update the
5770 canonical frame address to be the exception handler's CFA. We want
5771 to authenticate using the CFA of the function which calls eh_return.
5773 if (aarch64_return_address_signing_enabled ()
5774 && (for_sibcall
|| !TARGET_ARMV8_3
|| crtl
->calls_eh_return
))
5776 insn
= emit_insn (gen_autisp ());
5777 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
5778 RTX_FRAME_RELATED_P (insn
) = 1;
5781 /* Stack adjustment for exception handler. */
5782 if (crtl
->calls_eh_return
)
5784 /* We need to unwind the stack by the offset computed by
5785 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5786 to be SP; letting the CFA move during this adjustment
5787 is just as correct as retaining the CFA from the body
5788 of the function. Therefore, do nothing special. */
5789 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
5792 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
5794 emit_jump_insn (ret_rtx
);
5797 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5798 normally or return to a previous frame after unwinding.
5800 An EH return uses a single shared return sequence. The epilogue is
5801 exactly like a normal epilogue except that it has an extra input
5802 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5803 that must be applied after the frame has been destroyed. An extra label
5804 is inserted before the epilogue which initializes this register to zero,
5805 and this is the entry point for a normal return.
5807 An actual EH return updates the return address, initializes the stack
5808 adjustment and jumps directly into the epilogue (bypassing the zeroing
5809 of the adjustment). Since the return address is typically saved on the
5810 stack when a function makes a call, the saved LR must be updated outside
5813 This poses problems as the store is generated well before the epilogue,
5814 so the offset of LR is not known yet. Also optimizations will remove the
5815 store as it appears dead, even after the epilogue is generated (as the
5816 base or offset for loading LR is different in many cases).
5818 To avoid these problems this implementation forces the frame pointer
5819 in eh_return functions so that the location of LR is fixed and known early.
5820 It also marks the store volatile, so no optimization is permitted to
5821 remove the store. */
5823 aarch64_eh_return_handler_rtx (void)
5825 rtx tmp
= gen_frame_mem (Pmode
,
5826 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
5828 /* Mark the store volatile, so no optimization is permitted to remove it. */
5829 MEM_VOLATILE_P (tmp
) = true;
5833 /* Output code to add DELTA to the first argument, and then jump
5834 to FUNCTION. Used for C++ multiple inheritance. */
5836 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
5837 HOST_WIDE_INT delta
,
5838 HOST_WIDE_INT vcall_offset
,
5841 /* The this pointer is always in x0. Note that this differs from
5842 Arm where the this pointer maybe bumped to r1 if r0 is required
5843 to return a pointer to an aggregate. On AArch64 a result value
5844 pointer will be in x8. */
5845 int this_regno
= R0_REGNUM
;
5846 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
5849 reload_completed
= 1;
5850 emit_note (NOTE_INSN_PROLOGUE_END
);
5852 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
5853 temp0
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
5854 temp1
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
5856 if (vcall_offset
== 0)
5857 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
, temp1
, temp0
, false);
5860 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
5865 if (delta
>= -256 && delta
< 256)
5866 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
5867 plus_constant (Pmode
, this_rtx
, delta
));
5869 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
,
5870 temp1
, temp0
, false);
5873 if (Pmode
== ptr_mode
)
5874 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
5876 aarch64_emit_move (temp0
,
5877 gen_rtx_ZERO_EXTEND (Pmode
,
5878 gen_rtx_MEM (ptr_mode
, addr
)));
5880 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
5881 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
5884 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
5886 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
5889 if (Pmode
== ptr_mode
)
5890 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
5892 aarch64_emit_move (temp1
,
5893 gen_rtx_SIGN_EXTEND (Pmode
,
5894 gen_rtx_MEM (ptr_mode
, addr
)));
5896 emit_insn (gen_add2_insn (this_rtx
, temp1
));
5899 /* Generate a tail call to the target function. */
5900 if (!TREE_USED (function
))
5902 assemble_external (function
);
5903 TREE_USED (function
) = 1;
5905 funexp
= XEXP (DECL_RTL (function
), 0);
5906 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
5907 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
5908 SIBLING_CALL_P (insn
) = 1;
5910 insn
= get_insns ();
5911 shorten_branches (insn
);
5912 final_start_function (insn
, file
, 1);
5913 final (insn
, file
, 1);
5914 final_end_function ();
5916 /* Stop pretending to be a post-reload pass. */
5917 reload_completed
= 0;
5921 aarch64_tls_referenced_p (rtx x
)
5923 if (!TARGET_HAVE_TLS
)
5925 subrtx_iterator::array_type array
;
5926 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
5928 const_rtx x
= *iter
;
5929 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
5931 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5932 TLS offsets, not real symbol references. */
5933 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
5934 iter
.skip_subrtxes ();
5940 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5941 a left shift of 0 or 12 bits. */
5943 aarch64_uimm12_shift (HOST_WIDE_INT val
)
5945 return ((val
& (((HOST_WIDE_INT
) 0xfff) << 0)) == val
5946 || (val
& (((HOST_WIDE_INT
) 0xfff) << 12)) == val
5950 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
5951 that can be created with a left shift of 0 or 12. */
5952 static HOST_WIDE_INT
5953 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
)
5955 /* Check to see if the value fits in 24 bits, as that is the maximum we can
5956 handle correctly. */
5957 gcc_assert ((val
& 0xffffff) == val
);
5959 if (((val
& 0xfff) << 0) == val
)
5962 return val
& (0xfff << 12);
5965 /* Return true if val is an immediate that can be loaded into a
5966 register by a MOVZ instruction. */
5968 aarch64_movw_imm (HOST_WIDE_INT val
, scalar_int_mode mode
)
5970 if (GET_MODE_SIZE (mode
) > 4)
5972 if ((val
& (((HOST_WIDE_INT
) 0xffff) << 32)) == val
5973 || (val
& (((HOST_WIDE_INT
) 0xffff) << 48)) == val
)
5978 /* Ignore sign extension. */
5979 val
&= (HOST_WIDE_INT
) 0xffffffff;
5981 return ((val
& (((HOST_WIDE_INT
) 0xffff) << 0)) == val
5982 || (val
& (((HOST_WIDE_INT
) 0xffff) << 16)) == val
);
5985 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5986 64-bit (DImode) integer. */
5988 static unsigned HOST_WIDE_INT
5989 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val
, machine_mode mode
)
5991 unsigned int size
= GET_MODE_UNIT_PRECISION (mode
);
5994 val
&= (HOST_WIDE_INT_1U
<< size
) - 1;
6001 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6003 static const unsigned HOST_WIDE_INT bitmask_imm_mul
[] =
6005 0x0000000100000001ull
,
6006 0x0001000100010001ull
,
6007 0x0101010101010101ull
,
6008 0x1111111111111111ull
,
6009 0x5555555555555555ull
,
6013 /* Return true if val is a valid bitmask immediate. */
6016 aarch64_bitmask_imm (HOST_WIDE_INT val_in
, machine_mode mode
)
6018 unsigned HOST_WIDE_INT val
, tmp
, mask
, first_one
, next_one
;
6021 /* Check for a single sequence of one bits and return quickly if so.
6022 The special cases of all ones and all zeroes returns false. */
6023 val
= aarch64_replicate_bitmask_imm (val_in
, mode
);
6024 tmp
= val
+ (val
& -val
);
6026 if (tmp
== (tmp
& -tmp
))
6027 return (val
+ 1) > 1;
6029 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6031 val
= (val
<< 32) | (val
& 0xffffffff);
6033 /* Invert if the immediate doesn't start with a zero bit - this means we
6034 only need to search for sequences of one bits. */
6038 /* Find the first set bit and set tmp to val with the first sequence of one
6039 bits removed. Return success if there is a single sequence of ones. */
6040 first_one
= val
& -val
;
6041 tmp
= val
& (val
+ first_one
);
6046 /* Find the next set bit and compute the difference in bit position. */
6047 next_one
= tmp
& -tmp
;
6048 bits
= clz_hwi (first_one
) - clz_hwi (next_one
);
6051 /* Check the bit position difference is a power of 2, and that the first
6052 sequence of one bits fits within 'bits' bits. */
6053 if ((mask
>> bits
) != 0 || bits
!= (bits
& -bits
))
6056 /* Check the sequence of one bits is repeated 64/bits times. */
6057 return val
== mask
* bitmask_imm_mul
[__builtin_clz (bits
) - 26];
6060 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6061 Assumed precondition: VAL_IN Is not zero. */
6063 unsigned HOST_WIDE_INT
6064 aarch64_and_split_imm1 (HOST_WIDE_INT val_in
)
6066 int lowest_bit_set
= ctz_hwi (val_in
);
6067 int highest_bit_set
= floor_log2 (val_in
);
6068 gcc_assert (val_in
!= 0);
6070 return ((HOST_WIDE_INT_UC (2) << highest_bit_set
) -
6071 (HOST_WIDE_INT_1U
<< lowest_bit_set
));
6074 /* Create constant where bits outside of lowest bit set to highest bit set
6077 unsigned HOST_WIDE_INT
6078 aarch64_and_split_imm2 (HOST_WIDE_INT val_in
)
6080 return val_in
| ~aarch64_and_split_imm1 (val_in
);
6083 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6086 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in
, machine_mode mode
)
6088 scalar_int_mode int_mode
;
6089 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
6092 if (aarch64_bitmask_imm (val_in
, int_mode
))
6095 if (aarch64_move_imm (val_in
, int_mode
))
6098 unsigned HOST_WIDE_INT imm2
= aarch64_and_split_imm2 (val_in
);
6100 return aarch64_bitmask_imm (imm2
, int_mode
);
6103 /* Return true if val is an immediate that can be loaded into a
6104 register in a single instruction. */
6106 aarch64_move_imm (HOST_WIDE_INT val
, machine_mode mode
)
6108 scalar_int_mode int_mode
;
6109 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
6112 if (aarch64_movw_imm (val
, int_mode
) || aarch64_movw_imm (~val
, int_mode
))
6114 return aarch64_bitmask_imm (val
, int_mode
);
6118 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
6122 if (GET_CODE (x
) == HIGH
)
6125 /* There's no way to calculate VL-based values using relocations. */
6126 subrtx_iterator::array_type array
;
6127 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
6128 if (GET_CODE (*iter
) == CONST_POLY_INT
)
6131 split_const (x
, &base
, &offset
);
6132 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
6134 if (aarch64_classify_symbol (base
, INTVAL (offset
))
6135 != SYMBOL_FORCE_TO_MEM
)
6138 /* Avoid generating a 64-bit relocation in ILP32; leave
6139 to aarch64_expand_mov_immediate to handle it properly. */
6140 return mode
!= ptr_mode
;
6143 return aarch64_tls_referenced_p (x
);
6146 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6147 The expansion for a table switch is quite expensive due to the number
6148 of instructions, the table lookup and hard to predict indirect jump.
6149 When optimizing for speed, and -O3 enabled, use the per-core tuning if
6150 set, otherwise use tables for > 16 cases as a tradeoff between size and
6151 performance. When optimizing for size, use the default setting. */
6154 aarch64_case_values_threshold (void)
6156 /* Use the specified limit for the number of cases before using jump
6157 tables at higher optimization levels. */
6159 && selected_cpu
->tune
->max_case_values
!= 0)
6160 return selected_cpu
->tune
->max_case_values
;
6162 return optimize_size
? default_case_values_threshold () : 17;
6165 /* Return true if register REGNO is a valid index register.
6166 STRICT_P is true if REG_OK_STRICT is in effect. */
6169 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
6171 if (!HARD_REGISTER_NUM_P (regno
))
6179 regno
= reg_renumber
[regno
];
6181 return GP_REGNUM_P (regno
);
6184 /* Return true if register REGNO is a valid base register for mode MODE.
6185 STRICT_P is true if REG_OK_STRICT is in effect. */
6188 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
6190 if (!HARD_REGISTER_NUM_P (regno
))
6198 regno
= reg_renumber
[regno
];
6201 /* The fake registers will be eliminated to either the stack or
6202 hard frame pointer, both of which are usually valid base registers.
6203 Reload deals with the cases where the eliminated form isn't valid. */
6204 return (GP_REGNUM_P (regno
)
6205 || regno
== SP_REGNUM
6206 || regno
== FRAME_POINTER_REGNUM
6207 || regno
== ARG_POINTER_REGNUM
);
6210 /* Return true if X is a valid base register for mode MODE.
6211 STRICT_P is true if REG_OK_STRICT is in effect. */
6214 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
6217 && GET_CODE (x
) == SUBREG
6218 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (x
))])
6221 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
6224 /* Return true if address offset is a valid index. If it is, fill in INFO
6225 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6228 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
6229 machine_mode mode
, bool strict_p
)
6231 enum aarch64_address_type type
;
6236 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
6237 && GET_MODE (x
) == Pmode
)
6239 type
= ADDRESS_REG_REG
;
6243 /* (sign_extend:DI (reg:SI)) */
6244 else if ((GET_CODE (x
) == SIGN_EXTEND
6245 || GET_CODE (x
) == ZERO_EXTEND
)
6246 && GET_MODE (x
) == DImode
6247 && GET_MODE (XEXP (x
, 0)) == SImode
)
6249 type
= (GET_CODE (x
) == SIGN_EXTEND
)
6250 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6251 index
= XEXP (x
, 0);
6254 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6255 else if (GET_CODE (x
) == MULT
6256 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
6257 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
6258 && GET_MODE (XEXP (x
, 0)) == DImode
6259 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
6260 && CONST_INT_P (XEXP (x
, 1)))
6262 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
6263 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6264 index
= XEXP (XEXP (x
, 0), 0);
6265 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
6267 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6268 else if (GET_CODE (x
) == ASHIFT
6269 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
6270 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
6271 && GET_MODE (XEXP (x
, 0)) == DImode
6272 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
6273 && CONST_INT_P (XEXP (x
, 1)))
6275 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
6276 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6277 index
= XEXP (XEXP (x
, 0), 0);
6278 shift
= INTVAL (XEXP (x
, 1));
6280 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6281 else if ((GET_CODE (x
) == SIGN_EXTRACT
6282 || GET_CODE (x
) == ZERO_EXTRACT
)
6283 && GET_MODE (x
) == DImode
6284 && GET_CODE (XEXP (x
, 0)) == MULT
6285 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
6286 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
6288 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
6289 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6290 index
= XEXP (XEXP (x
, 0), 0);
6291 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
6292 if (INTVAL (XEXP (x
, 1)) != 32 + shift
6293 || INTVAL (XEXP (x
, 2)) != 0)
6296 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6297 (const_int 0xffffffff<<shift)) */
6298 else if (GET_CODE (x
) == AND
6299 && GET_MODE (x
) == DImode
6300 && GET_CODE (XEXP (x
, 0)) == MULT
6301 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
6302 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
6303 && CONST_INT_P (XEXP (x
, 1)))
6305 type
= ADDRESS_REG_UXTW
;
6306 index
= XEXP (XEXP (x
, 0), 0);
6307 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
6308 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
6311 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6312 else if ((GET_CODE (x
) == SIGN_EXTRACT
6313 || GET_CODE (x
) == ZERO_EXTRACT
)
6314 && GET_MODE (x
) == DImode
6315 && GET_CODE (XEXP (x
, 0)) == ASHIFT
6316 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
6317 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
6319 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
6320 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6321 index
= XEXP (XEXP (x
, 0), 0);
6322 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
6323 if (INTVAL (XEXP (x
, 1)) != 32 + shift
6324 || INTVAL (XEXP (x
, 2)) != 0)
6327 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6328 (const_int 0xffffffff<<shift)) */
6329 else if (GET_CODE (x
) == AND
6330 && GET_MODE (x
) == DImode
6331 && GET_CODE (XEXP (x
, 0)) == ASHIFT
6332 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
6333 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
6334 && CONST_INT_P (XEXP (x
, 1)))
6336 type
= ADDRESS_REG_UXTW
;
6337 index
= XEXP (XEXP (x
, 0), 0);
6338 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
6339 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
6342 /* (mult:P (reg:P) (const_int scale)) */
6343 else if (GET_CODE (x
) == MULT
6344 && GET_MODE (x
) == Pmode
6345 && GET_MODE (XEXP (x
, 0)) == Pmode
6346 && CONST_INT_P (XEXP (x
, 1)))
6348 type
= ADDRESS_REG_REG
;
6349 index
= XEXP (x
, 0);
6350 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
6352 /* (ashift:P (reg:P) (const_int shift)) */
6353 else if (GET_CODE (x
) == ASHIFT
6354 && GET_MODE (x
) == Pmode
6355 && GET_MODE (XEXP (x
, 0)) == Pmode
6356 && CONST_INT_P (XEXP (x
, 1)))
6358 type
= ADDRESS_REG_REG
;
6359 index
= XEXP (x
, 0);
6360 shift
= INTVAL (XEXP (x
, 1));
6366 && GET_CODE (index
) == SUBREG
6367 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
6368 index
= SUBREG_REG (index
);
6370 if (aarch64_sve_data_mode_p (mode
))
6372 if (type
!= ADDRESS_REG_REG
6373 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
6379 && !(IN_RANGE (shift
, 1, 3)
6380 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
6385 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
6388 info
->offset
= index
;
6389 info
->shift
= shift
;
6396 /* Return true if MODE is one of the modes for which we
6397 support LDP/STP operations. */
6400 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
6402 return mode
== SImode
|| mode
== DImode
6403 || mode
== SFmode
|| mode
== DFmode
6404 || (aarch64_vector_mode_supported_p (mode
)
6405 && (known_eq (GET_MODE_SIZE (mode
), 8)
6406 || (known_eq (GET_MODE_SIZE (mode
), 16)
6407 && (aarch64_tune_params
.extra_tuning_flags
6408 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
) == 0)));
6411 /* Return true if REGNO is a virtual pointer register, or an eliminable
6412 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6413 include stack_pointer or hard_frame_pointer. */
6415 virt_or_elim_regno_p (unsigned regno
)
6417 return ((regno
>= FIRST_VIRTUAL_REGISTER
6418 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
6419 || regno
== FRAME_POINTER_REGNUM
6420 || regno
== ARG_POINTER_REGNUM
);
6423 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6424 If it is, fill in INFO appropriately. STRICT_P is true if
6425 REG_OK_STRICT is in effect. */
6428 aarch64_classify_address (struct aarch64_address_info
*info
,
6429 rtx x
, machine_mode mode
, bool strict_p
,
6430 aarch64_addr_query_type type
)
6432 enum rtx_code code
= GET_CODE (x
);
6436 HOST_WIDE_INT const_size
;
6438 /* On BE, we use load/store pair for all large int mode load/stores.
6439 TI/TFmode may also use a load/store pair. */
6440 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
6441 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
6442 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
6443 || type
== ADDR_QUERY_LDP_STP_N
6446 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
6448 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
6449 corresponds to the actual size of the memory being loaded/stored and the
6450 mode of the corresponding addressing mode is half of that. */
6451 if (type
== ADDR_QUERY_LDP_STP_N
6452 && known_eq (GET_MODE_SIZE (mode
), 16))
6455 bool allow_reg_index_p
= (!load_store_pair_p
6456 && (known_lt (GET_MODE_SIZE (mode
), 16)
6457 || vec_flags
== VEC_ADVSIMD
6458 || vec_flags
== VEC_SVE_DATA
));
6460 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6461 [Rn, #offset, MUL VL]. */
6462 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
6463 && (code
!= REG
&& code
!= PLUS
))
6466 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6468 if (advsimd_struct_p
6469 && !BYTES_BIG_ENDIAN
6470 && (code
!= POST_INC
&& code
!= REG
))
6473 gcc_checking_assert (GET_MODE (x
) == VOIDmode
6474 || SCALAR_INT_MODE_P (GET_MODE (x
)));
6480 info
->type
= ADDRESS_REG_IMM
;
6482 info
->offset
= const0_rtx
;
6483 info
->const_offset
= 0;
6484 return aarch64_base_register_rtx_p (x
, strict_p
);
6492 && virt_or_elim_regno_p (REGNO (op0
))
6493 && poly_int_rtx_p (op1
, &offset
))
6495 info
->type
= ADDRESS_REG_IMM
;
6498 info
->const_offset
= offset
;
6503 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
6504 && aarch64_base_register_rtx_p (op0
, strict_p
)
6505 && poly_int_rtx_p (op1
, &offset
))
6507 info
->type
= ADDRESS_REG_IMM
;
6510 info
->const_offset
= offset
;
6512 /* TImode and TFmode values are allowed in both pairs of X
6513 registers and individual Q registers. The available
6515 X,X: 7-bit signed scaled offset
6516 Q: 9-bit signed offset
6517 We conservatively require an offset representable in either mode.
6518 When performing the check for pairs of X registers i.e. LDP/STP
6519 pass down DImode since that is the natural size of the LDP/STP
6520 instruction memory accesses. */
6521 if (mode
== TImode
|| mode
== TFmode
)
6522 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
6523 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
6524 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
6526 /* A 7bit offset check because OImode will emit a ldp/stp
6527 instruction (only big endian will get here).
6528 For ldp/stp instructions, the offset is scaled for the size of a
6529 single element of the pair. */
6531 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
6533 /* Three 9/12 bit offsets checks because CImode will emit three
6534 ldr/str instructions (only big endian will get here). */
6536 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
6537 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode
,
6539 || offset_12bit_unsigned_scaled_p (V16QImode
,
6542 /* Two 7bit offsets checks because XImode will emit two ldp/stp
6543 instructions (only big endian will get here). */
6545 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
6546 && aarch64_offset_7bit_signed_scaled_p (TImode
,
6549 /* Make "m" use the LD1 offset range for SVE data modes, so
6550 that pre-RTL optimizers like ivopts will work to that
6551 instead of the wider LDR/STR range. */
6552 if (vec_flags
== VEC_SVE_DATA
)
6553 return (type
== ADDR_QUERY_M
6554 ? offset_4bit_signed_scaled_p (mode
, offset
)
6555 : offset_9bit_signed_scaled_p (mode
, offset
));
6557 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
6559 poly_int64 end_offset
= (offset
6560 + GET_MODE_SIZE (mode
)
6561 - BYTES_PER_SVE_VECTOR
);
6562 return (type
== ADDR_QUERY_M
6563 ? offset_4bit_signed_scaled_p (mode
, offset
)
6564 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
6565 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
6569 if (vec_flags
== VEC_SVE_PRED
)
6570 return offset_9bit_signed_scaled_p (mode
, offset
);
6572 if (load_store_pair_p
)
6573 return ((known_eq (GET_MODE_SIZE (mode
), 4)
6574 || known_eq (GET_MODE_SIZE (mode
), 8)
6575 || known_eq (GET_MODE_SIZE (mode
), 16))
6576 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
6578 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
6579 || offset_12bit_unsigned_scaled_p (mode
, offset
));
6582 if (allow_reg_index_p
)
6584 /* Look for base + (scaled/extended) index register. */
6585 if (aarch64_base_register_rtx_p (op0
, strict_p
)
6586 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
6591 if (aarch64_base_register_rtx_p (op1
, strict_p
)
6592 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
6605 info
->type
= ADDRESS_REG_WB
;
6606 info
->base
= XEXP (x
, 0);
6607 info
->offset
= NULL_RTX
;
6608 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
6612 info
->type
= ADDRESS_REG_WB
;
6613 info
->base
= XEXP (x
, 0);
6614 if (GET_CODE (XEXP (x
, 1)) == PLUS
6615 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
6616 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
6617 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
6619 info
->offset
= XEXP (XEXP (x
, 1), 1);
6620 info
->const_offset
= offset
;
6622 /* TImode and TFmode values are allowed in both pairs of X
6623 registers and individual Q registers. The available
6625 X,X: 7-bit signed scaled offset
6626 Q: 9-bit signed offset
6627 We conservatively require an offset representable in either mode.
6629 if (mode
== TImode
|| mode
== TFmode
)
6630 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
6631 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
6633 if (load_store_pair_p
)
6634 return ((known_eq (GET_MODE_SIZE (mode
), 4)
6635 || known_eq (GET_MODE_SIZE (mode
), 8)
6636 || known_eq (GET_MODE_SIZE (mode
), 16))
6637 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
6639 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
6646 /* load literal: pc-relative constant pool entry. Only supported
6647 for SI mode or larger. */
6648 info
->type
= ADDRESS_SYMBOLIC
;
6650 if (!load_store_pair_p
6651 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
6656 split_const (x
, &sym
, &addend
);
6657 return ((GET_CODE (sym
) == LABEL_REF
6658 || (GET_CODE (sym
) == SYMBOL_REF
6659 && CONSTANT_POOL_ADDRESS_P (sym
)
6660 && aarch64_pcrelative_literal_loads
)));
6665 info
->type
= ADDRESS_LO_SUM
;
6666 info
->base
= XEXP (x
, 0);
6667 info
->offset
= XEXP (x
, 1);
6668 if (allow_reg_index_p
6669 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
6672 split_const (info
->offset
, &sym
, &offs
);
6673 if (GET_CODE (sym
) == SYMBOL_REF
6674 && (aarch64_classify_symbol (sym
, INTVAL (offs
))
6675 == SYMBOL_SMALL_ABSOLUTE
))
6677 /* The symbol and offset must be aligned to the access size. */
6680 if (CONSTANT_POOL_ADDRESS_P (sym
))
6681 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
6682 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
6684 tree exp
= SYMBOL_REF_DECL (sym
);
6685 align
= TYPE_ALIGN (TREE_TYPE (exp
));
6686 align
= aarch64_constant_alignment (exp
, align
);
6688 else if (SYMBOL_REF_DECL (sym
))
6689 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
6690 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
6691 && SYMBOL_REF_BLOCK (sym
) != NULL
)
6692 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
6694 align
= BITS_PER_UNIT
;
6696 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
6697 if (known_eq (ref_size
, 0))
6698 ref_size
= GET_MODE_SIZE (DImode
);
6700 return (multiple_p (INTVAL (offs
), ref_size
)
6701 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
6711 /* Return true if the address X is valid for a PRFM instruction.
6712 STRICT_P is true if we should do strict checking with
6713 aarch64_classify_address. */
6716 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
6718 struct aarch64_address_info addr
;
6720 /* PRFM accepts the same addresses as DImode... */
6721 bool res
= aarch64_classify_address (&addr
, x
, DImode
, strict_p
);
6725 /* ... except writeback forms. */
6726 return addr
.type
!= ADDRESS_REG_WB
;
6730 aarch64_symbolic_address_p (rtx x
)
6734 split_const (x
, &x
, &offset
);
6735 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
6738 /* Classify the base of symbolic expression X. */
6740 enum aarch64_symbol_type
6741 aarch64_classify_symbolic_expression (rtx x
)
6745 split_const (x
, &x
, &offset
);
6746 return aarch64_classify_symbol (x
, INTVAL (offset
));
6750 /* Return TRUE if X is a legitimate address for accessing memory in
6753 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
6755 struct aarch64_address_info addr
;
6757 return aarch64_classify_address (&addr
, x
, mode
, strict_p
);
6760 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6761 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6763 aarch64_legitimate_address_p (machine_mode mode
, rtx x
, bool strict_p
,
6764 aarch64_addr_query_type type
)
6766 struct aarch64_address_info addr
;
6768 return aarch64_classify_address (&addr
, x
, mode
, strict_p
, type
);
6771 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6774 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
6775 poly_int64 orig_offset
,
6779 if (GET_MODE_SIZE (mode
).is_constant (&size
))
6781 HOST_WIDE_INT const_offset
, second_offset
;
6783 /* A general SVE offset is A * VQ + B. Remove the A component from
6784 coefficient 0 in order to get the constant B. */
6785 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
6787 /* Split an out-of-range address displacement into a base and
6788 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6789 range otherwise to increase opportunities for sharing the base
6790 address of different sizes. Unaligned accesses use the signed
6791 9-bit range, TImode/TFmode use the intersection of signed
6792 scaled 7-bit and signed 9-bit offset. */
6793 if (mode
== TImode
|| mode
== TFmode
)
6794 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
6795 else if ((const_offset
& (size
- 1)) != 0)
6796 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
6798 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
6800 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
6803 /* Split the offset into second_offset and the rest. */
6804 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
6805 *offset2
= gen_int_mode (second_offset
, Pmode
);
6810 /* Get the mode we should use as the basis of the range. For structure
6811 modes this is the mode of one vector. */
6812 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
6813 machine_mode step_mode
6814 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
6816 /* Get the "mul vl" multiplier we'd like to use. */
6817 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
6818 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
6819 if (vec_flags
& VEC_SVE_DATA
)
6820 /* LDR supports a 9-bit range, but the move patterns for
6821 structure modes require all vectors to be in range of the
6822 same base. The simplest way of accomodating that while still
6823 promoting reuse of anchor points between different modes is
6824 to use an 8-bit range unconditionally. */
6825 vnum
= ((vnum
+ 128) & 255) - 128;
6827 /* Predicates are only handled singly, so we might as well use
6829 vnum
= ((vnum
+ 256) & 511) - 256;
6833 /* Convert the "mul vl" multiplier into a byte offset. */
6834 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
6835 if (known_eq (second_offset
, orig_offset
))
6838 /* Split the offset into second_offset and the rest. */
6839 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
6840 *offset2
= gen_int_mode (second_offset
, Pmode
);
6845 /* Return the binary representation of floating point constant VALUE in INTVAL.
6846 If the value cannot be converted, return false without setting INTVAL.
6847 The conversion is done in the given MODE. */
6849 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
6852 /* We make a general exception for 0. */
6853 if (aarch64_float_const_zero_rtx_p (value
))
6859 scalar_float_mode mode
;
6860 if (GET_CODE (value
) != CONST_DOUBLE
6861 || !is_a
<scalar_float_mode
> (GET_MODE (value
), &mode
)
6862 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
6863 /* Only support up to DF mode. */
6864 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
6867 unsigned HOST_WIDE_INT ival
= 0;
6870 real_to_target (res
,
6871 CONST_DOUBLE_REAL_VALUE (value
),
6872 REAL_MODE_FORMAT (mode
));
6876 int order
= BYTES_BIG_ENDIAN
? 1 : 0;
6877 ival
= zext_hwi (res
[order
], 32);
6878 ival
|= (zext_hwi (res
[1 - order
], 32) << 32);
6881 ival
= zext_hwi (res
[0], 32);
6887 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6888 single MOV(+MOVK) followed by an FMOV. */
6890 aarch64_float_const_rtx_p (rtx x
)
6892 machine_mode mode
= GET_MODE (x
);
6893 if (mode
== VOIDmode
)
6896 /* Determine whether it's cheaper to write float constants as
6897 mov/movk pairs over ldr/adrp pairs. */
6898 unsigned HOST_WIDE_INT ival
;
6900 if (GET_CODE (x
) == CONST_DOUBLE
6901 && SCALAR_FLOAT_MODE_P (mode
)
6902 && aarch64_reinterpret_float_as_int (x
, &ival
))
6904 scalar_int_mode imode
= (mode
== HFmode
6906 : int_mode_for_mode (mode
).require ());
6907 int num_instr
= aarch64_internal_mov_immediate
6908 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
6909 return num_instr
< 3;
6915 /* Return TRUE if rtx X is immediate constant 0.0 */
6917 aarch64_float_const_zero_rtx_p (rtx x
)
6919 if (GET_MODE (x
) == VOIDmode
)
6922 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
6923 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
6924 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
6927 /* Return TRUE if rtx X is immediate constant that fits in a single
6928 MOVI immediate operation. */
6930 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
6936 scalar_int_mode imode
;
6937 unsigned HOST_WIDE_INT ival
;
6939 if (GET_CODE (x
) == CONST_DOUBLE
6940 && SCALAR_FLOAT_MODE_P (mode
))
6942 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
6945 /* We make a general exception for 0. */
6946 if (aarch64_float_const_zero_rtx_p (x
))
6949 imode
= int_mode_for_mode (mode
).require ();
6951 else if (GET_CODE (x
) == CONST_INT
6952 && is_a
<scalar_int_mode
> (mode
, &imode
))
6957 /* use a 64 bit mode for everything except for DI/DF mode, where we use
6958 a 128 bit vector mode. */
6959 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
6961 vmode
= aarch64_simd_container_mode (imode
, width
);
6962 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
6964 return aarch64_simd_valid_immediate (v_op
, NULL
);
6968 /* Return the fixed registers used for condition codes. */
6971 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
6974 *p2
= INVALID_REGNUM
;
6978 /* This function is used by the call expanders of the machine description.
6979 RESULT is the register in which the result is returned. It's NULL for
6980 "call" and "sibcall".
6981 MEM is the location of the function call.
6982 SIBCALL indicates whether this function call is normal call or sibling call.
6983 It will generate different pattern accordingly. */
6986 aarch64_expand_call (rtx result
, rtx mem
, bool sibcall
)
6988 rtx call
, callee
, tmp
;
6992 gcc_assert (MEM_P (mem
));
6993 callee
= XEXP (mem
, 0);
6994 mode
= GET_MODE (callee
);
6995 gcc_assert (mode
== Pmode
);
6997 /* Decide if we should generate indirect calls by loading the
6998 address of the callee into a register before performing
6999 the branch-and-link. */
7000 if (SYMBOL_REF_P (callee
)
7001 ? (aarch64_is_long_call_p (callee
)
7002 || aarch64_is_noplt_call_p (callee
))
7004 XEXP (mem
, 0) = force_reg (mode
, callee
);
7006 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
7008 if (result
!= NULL_RTX
)
7009 call
= gen_rtx_SET (result
, call
);
7014 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
7016 vec
= gen_rtvec (2, call
, tmp
);
7017 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
7019 aarch64_emit_call_insn (call
);
7022 /* Emit call insn with PAT and do aarch64-specific handling. */
7025 aarch64_emit_call_insn (rtx pat
)
7027 rtx insn
= emit_call_insn (pat
);
7029 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
7030 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
7031 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
7035 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
7037 /* All floating point compares return CCFP if it is an equality
7038 comparison, and CCFPE otherwise. */
7039 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
7066 /* Equality comparisons of short modes against zero can be performed
7067 using the TST instruction with the appropriate bitmask. */
7068 if (y
== const0_rtx
&& REG_P (x
)
7069 && (code
== EQ
|| code
== NE
)
7070 && (GET_MODE (x
) == HImode
|| GET_MODE (x
) == QImode
))
7073 /* Similarly, comparisons of zero_extends from shorter modes can
7074 be performed using an ANDS with an immediate mask. */
7075 if (y
== const0_rtx
&& GET_CODE (x
) == ZERO_EXTEND
7076 && (GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
7077 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
7078 && (code
== EQ
|| code
== NE
))
7081 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
7083 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
7084 && (GET_CODE (x
) == PLUS
|| GET_CODE (x
) == MINUS
|| GET_CODE (x
) == AND
7085 || GET_CODE (x
) == NEG
7086 || (GET_CODE (x
) == ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
7087 && CONST_INT_P (XEXP (x
, 2)))))
7090 /* A compare with a shifted operand. Because of canonicalization,
7091 the comparison will have to be swapped when we emit the assembly
7093 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
7094 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
7095 && (GET_CODE (x
) == ASHIFT
|| GET_CODE (x
) == ASHIFTRT
7096 || GET_CODE (x
) == LSHIFTRT
7097 || GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
))
7100 /* Similarly for a negated operand, but we can only do this for
7102 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
7103 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
7104 && (code
== EQ
|| code
== NE
)
7105 && GET_CODE (x
) == NEG
)
7108 /* A test for unsigned overflow. */
7109 if ((GET_MODE (x
) == DImode
|| GET_MODE (x
) == TImode
)
7111 && GET_CODE (x
) == PLUS
7112 && GET_CODE (y
) == ZERO_EXTEND
)
7115 /* A test for signed overflow. */
7116 if ((GET_MODE (x
) == DImode
|| GET_MODE (x
) == TImode
)
7118 && GET_CODE (x
) == PLUS
7119 && GET_CODE (y
) == SIGN_EXTEND
)
7122 /* For everything else, return CCmode. */
7127 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
7130 aarch64_get_condition_code (rtx x
)
7132 machine_mode mode
= GET_MODE (XEXP (x
, 0));
7133 enum rtx_code comp_code
= GET_CODE (x
);
7135 if (GET_MODE_CLASS (mode
) != MODE_CC
)
7136 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
7137 return aarch64_get_condition_code_1 (mode
, comp_code
);
7141 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
7149 case GE
: return AARCH64_GE
;
7150 case GT
: return AARCH64_GT
;
7151 case LE
: return AARCH64_LS
;
7152 case LT
: return AARCH64_MI
;
7153 case NE
: return AARCH64_NE
;
7154 case EQ
: return AARCH64_EQ
;
7155 case ORDERED
: return AARCH64_VC
;
7156 case UNORDERED
: return AARCH64_VS
;
7157 case UNLT
: return AARCH64_LT
;
7158 case UNLE
: return AARCH64_LE
;
7159 case UNGT
: return AARCH64_HI
;
7160 case UNGE
: return AARCH64_PL
;
7168 case NE
: return AARCH64_NE
;
7169 case EQ
: return AARCH64_EQ
;
7170 case GE
: return AARCH64_GE
;
7171 case GT
: return AARCH64_GT
;
7172 case LE
: return AARCH64_LE
;
7173 case LT
: return AARCH64_LT
;
7174 case GEU
: return AARCH64_CS
;
7175 case GTU
: return AARCH64_HI
;
7176 case LEU
: return AARCH64_LS
;
7177 case LTU
: return AARCH64_CC
;
7185 case NE
: return AARCH64_NE
;
7186 case EQ
: return AARCH64_EQ
;
7187 case GE
: return AARCH64_LE
;
7188 case GT
: return AARCH64_LT
;
7189 case LE
: return AARCH64_GE
;
7190 case LT
: return AARCH64_GT
;
7191 case GEU
: return AARCH64_LS
;
7192 case GTU
: return AARCH64_CC
;
7193 case LEU
: return AARCH64_CS
;
7194 case LTU
: return AARCH64_HI
;
7202 case NE
: return AARCH64_NE
;
7203 case EQ
: return AARCH64_EQ
;
7204 case GE
: return AARCH64_PL
;
7205 case LT
: return AARCH64_MI
;
7213 case NE
: return AARCH64_NE
;
7214 case EQ
: return AARCH64_EQ
;
7222 case NE
: return AARCH64_CS
;
7223 case EQ
: return AARCH64_CC
;
7231 case NE
: return AARCH64_VS
;
7232 case EQ
: return AARCH64_VC
;
7245 aarch64_const_vec_all_same_in_range_p (rtx x
,
7246 HOST_WIDE_INT minval
,
7247 HOST_WIDE_INT maxval
)
7250 return (const_vec_duplicate_p (x
, &elt
)
7251 && CONST_INT_P (elt
)
7252 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
7256 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
7258 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
7261 /* Return true if VEC is a constant in which every element is in the range
7262 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7265 aarch64_const_vec_all_in_range_p (rtx vec
,
7266 HOST_WIDE_INT minval
,
7267 HOST_WIDE_INT maxval
)
7269 if (GET_CODE (vec
) != CONST_VECTOR
7270 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
7274 if (!CONST_VECTOR_STEPPED_P (vec
))
7275 nunits
= const_vector_encoded_nelts (vec
);
7276 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
7279 for (int i
= 0; i
< nunits
; i
++)
7281 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
7282 if (!CONST_INT_P (vec_elem
)
7283 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
7290 #define AARCH64_CC_V 1
7291 #define AARCH64_CC_C (1 << 1)
7292 #define AARCH64_CC_Z (1 << 2)
7293 #define AARCH64_CC_N (1 << 3)
7295 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7296 static const int aarch64_nzcv_codes
[] =
7298 0, /* EQ, Z == 1. */
7299 AARCH64_CC_Z
, /* NE, Z == 0. */
7300 0, /* CS, C == 1. */
7301 AARCH64_CC_C
, /* CC, C == 0. */
7302 0, /* MI, N == 1. */
7303 AARCH64_CC_N
, /* PL, N == 0. */
7304 0, /* VS, V == 1. */
7305 AARCH64_CC_V
, /* VC, V == 0. */
7306 0, /* HI, C ==1 && Z == 0. */
7307 AARCH64_CC_C
, /* LS, !(C == 1 && Z == 0). */
7308 AARCH64_CC_V
, /* GE, N == V. */
7309 0, /* LT, N != V. */
7310 AARCH64_CC_Z
, /* GT, Z == 0 && N == V. */
7311 0, /* LE, !(Z == 0 && N == V). */
7316 /* Print floating-point vector immediate operand X to F, negating it
7317 first if NEGATE is true. Return true on success, false if it isn't
7318 a constant we can handle. */
7321 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
7325 if (!const_vec_duplicate_p (x
, &elt
))
7328 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
7330 r
= real_value_negate (&r
);
7332 /* We only handle the SVE single-bit immediates here. */
7333 if (real_equal (&r
, &dconst0
))
7334 asm_fprintf (f
, "0.0");
7335 else if (real_equal (&r
, &dconst1
))
7336 asm_fprintf (f
, "1.0");
7337 else if (real_equal (&r
, &dconsthalf
))
7338 asm_fprintf (f
, "0.5");
7345 /* Return the equivalent letter for size. */
7347 sizetochar (int size
)
7351 case 64: return 'd';
7352 case 32: return 's';
7353 case 16: return 'h';
7354 case 8 : return 'b';
7355 default: gcc_unreachable ();
7359 /* Print operand X to file F in a target specific manner according to CODE.
7360 The acceptable formatting commands given by CODE are:
7361 'c': An integer or symbol address without a preceding #
7363 'C': Take the duplicated element in a vector constant
7364 and print it in hex.
7365 'D': Take the duplicated element in a vector constant
7366 and print it as an unsigned integer, in decimal.
7367 'e': Print the sign/zero-extend size as a character 8->b,
7369 'p': Prints N such that 2^N == X (X must be power of 2 and
7371 'P': Print the number of non-zero bits in X (a const_int).
7372 'H': Print the higher numbered register of a pair (TImode)
7374 'm': Print a condition (eq, ne, etc).
7375 'M': Same as 'm', but invert condition.
7376 'N': Take the duplicated element in a vector constant
7377 and print the negative of it in decimal.
7378 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7379 'S/T/U/V': Print a FP/SIMD register name for a register list.
7380 The register printed is the FP/SIMD register name
7381 of X + 0/1/2/3 for S/T/U/V.
7382 'R': Print a scalar FP/SIMD register name + 1.
7383 'X': Print bottom 16 bits of integer constant in hex.
7384 'w/x': Print a general register name or the zero register
7386 '0': Print a normal operand, if it's a general register,
7387 then we assume DImode.
7388 'k': Print NZCV for conditional compare instructions.
7389 'A': Output address constant representing the first
7390 argument of X, specifying a relocation offset
7392 'L': Output constant address specified by X
7393 with a relocation offset if appropriate.
7394 'G': Prints address of X, specifying a PC relative
7395 relocation mode if appropriate.
7396 'y': Output address of LDP or STP - this is used for
7397 some LDP/STPs which don't use a PARALLEL in their
7398 pattern (so the mode needs to be adjusted).
7399 'z': Output address of a typical LDP or STP. */
7402 aarch64_print_operand (FILE *f
, rtx x
, int code
)
7408 switch (GET_CODE (x
))
7411 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
7415 output_addr_const (f
, x
);
7419 if (GET_CODE (XEXP (x
, 0)) == PLUS
7420 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
7422 output_addr_const (f
, x
);
7428 output_operand_lossage ("unsupported operand for code '%c'", code
);
7436 if (!CONST_INT_P (x
)
7437 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
7439 output_operand_lossage ("invalid operand for '%%%c'", code
);
7455 output_operand_lossage ("invalid operand for '%%%c'", code
);
7465 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
7467 output_operand_lossage ("invalid operand for '%%%c'", code
);
7471 asm_fprintf (f
, "%d", n
);
7476 if (!CONST_INT_P (x
))
7478 output_operand_lossage ("invalid operand for '%%%c'", code
);
7482 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
7486 if (x
== const0_rtx
)
7488 asm_fprintf (f
, "xzr");
7492 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
7494 output_operand_lossage ("invalid operand for '%%%c'", code
);
7498 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
7505 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7506 if (x
== const_true_rtx
)
7513 if (!COMPARISON_P (x
))
7515 output_operand_lossage ("invalid operand for '%%%c'", code
);
7519 cond_code
= aarch64_get_condition_code (x
);
7520 gcc_assert (cond_code
>= 0);
7522 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
7523 fputs (aarch64_condition_codes
[cond_code
], f
);
7528 if (!const_vec_duplicate_p (x
, &elt
))
7530 output_operand_lossage ("invalid vector constant");
7534 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
7535 asm_fprintf (f
, "%wd", -INTVAL (elt
));
7536 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
7537 && aarch64_print_vector_float_operand (f
, x
, true))
7541 output_operand_lossage ("invalid vector constant");
7551 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
7553 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
7556 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
7563 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
7565 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
7568 asm_fprintf (f
, "%c%d",
7569 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
7570 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
7574 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
7576 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
7579 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
7583 if (!CONST_INT_P (x
))
7585 output_operand_lossage ("invalid operand for '%%%c'", code
);
7588 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
7593 /* Print a replicated constant in hex. */
7594 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
7596 output_operand_lossage ("invalid operand for '%%%c'", code
);
7599 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
7600 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
7606 /* Print a replicated constant in decimal, treating it as
7608 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
7610 output_operand_lossage ("invalid operand for '%%%c'", code
);
7613 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
7614 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
7621 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
7623 asm_fprintf (f
, "%czr", code
);
7627 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
7629 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
7633 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
7635 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
7644 output_operand_lossage ("missing operand");
7648 switch (GET_CODE (x
))
7651 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
7653 if (REG_NREGS (x
) == 1)
7654 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
7658 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
7659 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
7660 REGNO (x
) - V0_REGNUM
, suffix
,
7661 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
7665 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
7669 output_address (GET_MODE (x
), XEXP (x
, 0));
7674 output_addr_const (asm_out_file
, x
);
7678 asm_fprintf (f
, "%wd", INTVAL (x
));
7682 if (!VECTOR_MODE_P (GET_MODE (x
)))
7684 output_addr_const (asm_out_file
, x
);
7690 if (!const_vec_duplicate_p (x
, &elt
))
7692 output_operand_lossage ("invalid vector constant");
7696 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
7697 asm_fprintf (f
, "%wd", INTVAL (elt
));
7698 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
7699 && aarch64_print_vector_float_operand (f
, x
, false))
7703 output_operand_lossage ("invalid vector constant");
7709 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7710 be getting CONST_DOUBLEs holding integers. */
7711 gcc_assert (GET_MODE (x
) != VOIDmode
);
7712 if (aarch64_float_const_zero_rtx_p (x
))
7717 else if (aarch64_float_const_representable_p (x
))
7720 char float_buf
[buf_size
] = {'\0'};
7721 real_to_decimal_for_mode (float_buf
,
7722 CONST_DOUBLE_REAL_VALUE (x
),
7725 asm_fprintf (asm_out_file
, "%s", float_buf
);
7729 output_operand_lossage ("invalid constant");
7732 output_operand_lossage ("invalid operand");
7738 if (GET_CODE (x
) == HIGH
)
7741 switch (aarch64_classify_symbolic_expression (x
))
7743 case SYMBOL_SMALL_GOT_4G
:
7744 asm_fprintf (asm_out_file
, ":got:");
7747 case SYMBOL_SMALL_TLSGD
:
7748 asm_fprintf (asm_out_file
, ":tlsgd:");
7751 case SYMBOL_SMALL_TLSDESC
:
7752 asm_fprintf (asm_out_file
, ":tlsdesc:");
7755 case SYMBOL_SMALL_TLSIE
:
7756 asm_fprintf (asm_out_file
, ":gottprel:");
7759 case SYMBOL_TLSLE24
:
7760 asm_fprintf (asm_out_file
, ":tprel:");
7763 case SYMBOL_TINY_GOT
:
7770 output_addr_const (asm_out_file
, x
);
7774 switch (aarch64_classify_symbolic_expression (x
))
7776 case SYMBOL_SMALL_GOT_4G
:
7777 asm_fprintf (asm_out_file
, ":lo12:");
7780 case SYMBOL_SMALL_TLSGD
:
7781 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
7784 case SYMBOL_SMALL_TLSDESC
:
7785 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
7788 case SYMBOL_SMALL_TLSIE
:
7789 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
7792 case SYMBOL_TLSLE12
:
7793 asm_fprintf (asm_out_file
, ":tprel_lo12:");
7796 case SYMBOL_TLSLE24
:
7797 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
7800 case SYMBOL_TINY_GOT
:
7801 asm_fprintf (asm_out_file
, ":got:");
7804 case SYMBOL_TINY_TLSIE
:
7805 asm_fprintf (asm_out_file
, ":gottprel:");
7811 output_addr_const (asm_out_file
, x
);
7815 switch (aarch64_classify_symbolic_expression (x
))
7817 case SYMBOL_TLSLE24
:
7818 asm_fprintf (asm_out_file
, ":tprel_hi12:");
7823 output_addr_const (asm_out_file
, x
);
7828 HOST_WIDE_INT cond_code
;
7830 if (!CONST_INT_P (x
))
7832 output_operand_lossage ("invalid operand for '%%%c'", code
);
7836 cond_code
= INTVAL (x
);
7837 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
7838 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
7845 machine_mode mode
= GET_MODE (x
);
7847 if (GET_CODE (x
) != MEM
7848 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
7850 output_operand_lossage ("invalid operand for '%%%c'", code
);
7854 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
7856 ? ADDR_QUERY_LDP_STP_N
7857 : ADDR_QUERY_LDP_STP
))
7858 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
7863 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
7868 /* Print address 'x' of a memory access with mode 'mode'.
7869 'op' is the context required by aarch64_classify_address. It can either be
7870 MEM for a normal memory access or PARALLEL for LDP/STP. */
7872 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
7873 aarch64_addr_query_type type
)
7875 struct aarch64_address_info addr
;
7878 /* Check all addresses are Pmode - including ILP32. */
7879 if (GET_MODE (x
) != Pmode
7880 && (!CONST_INT_P (x
)
7881 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
7883 output_operand_lossage ("invalid address mode");
7887 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
7890 case ADDRESS_REG_IMM
:
7891 if (known_eq (addr
.const_offset
, 0))
7892 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
7893 else if (aarch64_sve_data_mode_p (mode
))
7896 = exact_div (addr
.const_offset
,
7897 BYTES_PER_SVE_VECTOR
).to_constant ();
7898 asm_fprintf (f
, "[%s, #%wd, mul vl]",
7899 reg_names
[REGNO (addr
.base
)], vnum
);
7901 else if (aarch64_sve_pred_mode_p (mode
))
7904 = exact_div (addr
.const_offset
,
7905 BYTES_PER_SVE_PRED
).to_constant ();
7906 asm_fprintf (f
, "[%s, #%wd, mul vl]",
7907 reg_names
[REGNO (addr
.base
)], vnum
);
7910 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
7911 INTVAL (addr
.offset
));
7914 case ADDRESS_REG_REG
:
7915 if (addr
.shift
== 0)
7916 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
7917 reg_names
[REGNO (addr
.offset
)]);
7919 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
7920 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
7923 case ADDRESS_REG_UXTW
:
7924 if (addr
.shift
== 0)
7925 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
7926 REGNO (addr
.offset
) - R0_REGNUM
);
7928 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
7929 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
7932 case ADDRESS_REG_SXTW
:
7933 if (addr
.shift
== 0)
7934 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
7935 REGNO (addr
.offset
) - R0_REGNUM
);
7937 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
7938 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
7941 case ADDRESS_REG_WB
:
7942 /* Writeback is only supported for fixed-width modes. */
7943 size
= GET_MODE_SIZE (mode
).to_constant ();
7944 switch (GET_CODE (x
))
7947 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
7950 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
7953 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
7956 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
7959 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
7960 INTVAL (addr
.offset
));
7963 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
7964 INTVAL (addr
.offset
));
7971 case ADDRESS_LO_SUM
:
7972 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
7973 output_addr_const (f
, addr
.offset
);
7974 asm_fprintf (f
, "]");
7977 case ADDRESS_SYMBOLIC
:
7978 output_addr_const (f
, x
);
7985 /* Print address 'x' of a memory access with mode 'mode'. */
7987 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
7989 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
7990 output_addr_const (f
, x
);
7994 aarch64_label_mentioned_p (rtx x
)
7999 if (GET_CODE (x
) == LABEL_REF
)
8002 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8003 referencing instruction, but they are constant offsets, not
8005 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
8008 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
8009 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
8015 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
8016 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
8019 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
8026 /* Implement REGNO_REG_CLASS. */
8029 aarch64_regno_regclass (unsigned regno
)
8031 if (GP_REGNUM_P (regno
))
8032 return GENERAL_REGS
;
8034 if (regno
== SP_REGNUM
)
8037 if (regno
== FRAME_POINTER_REGNUM
8038 || regno
== ARG_POINTER_REGNUM
)
8039 return POINTER_REGS
;
8041 if (FP_REGNUM_P (regno
))
8042 return FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
;
8044 if (PR_REGNUM_P (regno
))
8045 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
8050 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8051 If OFFSET is out of range, return an offset of an anchor point
8052 that is in range. Return 0 otherwise. */
8054 static HOST_WIDE_INT
8055 aarch64_anchor_offset (HOST_WIDE_INT offset
, HOST_WIDE_INT size
,
8058 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8060 return (offset
+ 0x400) & ~0x7f0;
8062 /* For offsets that aren't a multiple of the access size, the limit is
8064 if (offset
& (size
- 1))
8066 /* BLKmode typically uses LDP of X-registers. */
8067 if (mode
== BLKmode
)
8068 return (offset
+ 512) & ~0x3ff;
8069 return (offset
+ 0x100) & ~0x1ff;
8072 /* Small negative offsets are supported. */
8073 if (IN_RANGE (offset
, -256, 0))
8076 if (mode
== TImode
|| mode
== TFmode
)
8077 return (offset
+ 0x100) & ~0x1ff;
8079 /* Use 12-bit offset by access size. */
8080 return offset
& (~0xfff * size
);
8084 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
8086 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8087 where mask is selected by alignment and size of the offset.
8088 We try to pick as large a range for the offset as possible to
8089 maximize the chance of a CSE. However, for aligned addresses
8090 we limit the range to 4k so that structures with different sized
8091 elements are likely to use the same base. We need to be careful
8092 not to split a CONST for some forms of address expression, otherwise
8093 it will generate sub-optimal code. */
8095 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
8097 rtx base
= XEXP (x
, 0);
8098 rtx offset_rtx
= XEXP (x
, 1);
8099 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
8101 if (GET_CODE (base
) == PLUS
)
8103 rtx op0
= XEXP (base
, 0);
8104 rtx op1
= XEXP (base
, 1);
8106 /* Force any scaling into a temp for CSE. */
8107 op0
= force_reg (Pmode
, op0
);
8108 op1
= force_reg (Pmode
, op1
);
8110 /* Let the pointer register be in op0. */
8111 if (REG_POINTER (op1
))
8112 std::swap (op0
, op1
);
8114 /* If the pointer is virtual or frame related, then we know that
8115 virtual register instantiation or register elimination is going
8116 to apply a second constant. We want the two constants folded
8117 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8118 if (virt_or_elim_regno_p (REGNO (op0
)))
8120 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
8121 NULL_RTX
, true, OPTAB_DIRECT
);
8122 return gen_rtx_PLUS (Pmode
, base
, op1
);
8125 /* Otherwise, in order to encourage CSE (and thence loop strength
8126 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8127 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
8128 NULL_RTX
, true, OPTAB_DIRECT
);
8129 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
8133 if (GET_MODE_SIZE (mode
).is_constant (&size
))
8135 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
8137 if (base_offset
!= 0)
8139 base
= plus_constant (Pmode
, base
, base_offset
);
8140 base
= force_operand (base
, NULL_RTX
);
8141 return plus_constant (Pmode
, base
, offset
- base_offset
);
8150 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
8153 secondary_reload_info
*sri
)
8155 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8156 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8157 comment at the head of aarch64-sve.md for more details about the
8158 big-endian handling. */
8159 if (BYTES_BIG_ENDIAN
8160 && reg_class_subset_p (rclass
, FP_REGS
)
8161 && !((REG_P (x
) && HARD_REGISTER_P (x
))
8162 || aarch64_simd_valid_immediate (x
, NULL
))
8163 && aarch64_sve_data_mode_p (mode
))
8165 sri
->icode
= CODE_FOR_aarch64_sve_reload_be
;
8169 /* If we have to disable direct literal pool loads and stores because the
8170 function is too big, then we need a scratch register. */
8171 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
8172 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
8173 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
8174 && !aarch64_pcrelative_literal_loads
)
8176 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
8180 /* Without the TARGET_SIMD instructions we cannot move a Q register
8181 to a Q register directly. We need a scratch. */
8182 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
8183 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
8184 && reg_class_subset_p (rclass
, FP_REGS
))
8186 sri
->icode
= code_for_aarch64_reload_mov (mode
);
8190 /* A TFmode or TImode memory access should be handled via an FP_REGS
8191 because AArch64 has richer addressing modes for LDR/STR instructions
8192 than LDP/STP instructions. */
8193 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
8194 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
8197 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
8198 return GENERAL_REGS
;
8204 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED
, const int to
)
8206 gcc_assert (from
== ARG_POINTER_REGNUM
|| from
== FRAME_POINTER_REGNUM
);
8208 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8209 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8210 if (frame_pointer_needed
)
8211 return to
== HARD_FRAME_POINTER_REGNUM
;
8216 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
8218 if (to
== HARD_FRAME_POINTER_REGNUM
)
8220 if (from
== ARG_POINTER_REGNUM
)
8221 return cfun
->machine
->frame
.hard_fp_offset
;
8223 if (from
== FRAME_POINTER_REGNUM
)
8224 return cfun
->machine
->frame
.hard_fp_offset
8225 - cfun
->machine
->frame
.locals_offset
;
8228 if (to
== STACK_POINTER_REGNUM
)
8230 if (from
== FRAME_POINTER_REGNUM
)
8231 return cfun
->machine
->frame
.frame_size
8232 - cfun
->machine
->frame
.locals_offset
;
8235 return cfun
->machine
->frame
.frame_size
;
8238 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8242 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
8246 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
8251 aarch64_asm_trampoline_template (FILE *f
)
8255 asm_fprintf (f
, "\tldr\tw%d, .+16\n", IP1_REGNUM
- R0_REGNUM
);
8256 asm_fprintf (f
, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
);
8260 asm_fprintf (f
, "\tldr\t%s, .+16\n", reg_names
[IP1_REGNUM
]);
8261 asm_fprintf (f
, "\tldr\t%s, .+20\n", reg_names
[STATIC_CHAIN_REGNUM
]);
8263 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
8264 assemble_aligned_integer (4, const0_rtx
);
8265 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
8266 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
8270 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
8272 rtx fnaddr
, mem
, a_tramp
;
8273 const int tramp_code_sz
= 16;
8275 /* Don't need to copy the trailing D-words, we fill those in below. */
8276 emit_block_move (m_tramp
, assemble_trampoline_template (),
8277 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
8278 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
8279 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
8280 if (GET_MODE (fnaddr
) != ptr_mode
)
8281 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
8282 emit_move_insn (mem
, fnaddr
);
8284 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
8285 emit_move_insn (mem
, chain_value
);
8287 /* XXX We should really define a "clear_cache" pattern and use
8288 gen_clear_cache(). */
8289 a_tramp
= XEXP (m_tramp
, 0);
8290 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
8291 LCT_NORMAL
, VOIDmode
, a_tramp
, ptr_mode
,
8292 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
8296 static unsigned char
8297 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
8299 /* ??? Logically we should only need to provide a value when
8300 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8301 can hold MODE, but at the moment we need to handle all modes.
8302 Just ignore any runtime parts for registers that can't store them. */
8303 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
8307 case TAILCALL_ADDR_REGS
:
8311 case POINTER_AND_FP_REGS
:
8314 if (aarch64_sve_data_mode_p (mode
)
8315 && constant_multiple_p (GET_MODE_SIZE (mode
),
8316 BYTES_PER_SVE_VECTOR
, &nregs
))
8318 return (aarch64_vector_data_mode_p (mode
)
8319 ? CEIL (lowest_size
, UNITS_PER_VREG
)
8320 : CEIL (lowest_size
, UNITS_PER_WORD
));
8337 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
8339 if (regclass
== POINTER_REGS
)
8340 return GENERAL_REGS
;
8342 if (regclass
== STACK_REG
)
8345 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
8351 /* Register eliminiation can result in a request for
8352 SP+constant->FP_REGS. We cannot support such operations which
8353 use SP as source and an FP_REG as destination, so reject out
8355 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
8357 rtx lhs
= XEXP (x
, 0);
8359 /* Look through a possible SUBREG introduced by ILP32. */
8360 if (GET_CODE (lhs
) == SUBREG
)
8361 lhs
= SUBREG_REG (lhs
);
8363 gcc_assert (REG_P (lhs
));
8364 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
8373 aarch64_asm_output_labelref (FILE* f
, const char *name
)
8375 asm_fprintf (f
, "%U%s", name
);
8379 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
8381 if (priority
== DEFAULT_INIT_PRIORITY
)
8382 default_ctor_section_asm_out_constructor (symbol
, priority
);
8386 /* While priority is known to be in range [0, 65535], so 18 bytes
8387 would be enough, the compiler might not know that. To avoid
8388 -Wformat-truncation false positive, use a larger size. */
8390 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
8391 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
8392 switch_to_section (s
);
8393 assemble_align (POINTER_SIZE
);
8394 assemble_aligned_integer (POINTER_BYTES
, symbol
);
8399 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
8401 if (priority
== DEFAULT_INIT_PRIORITY
)
8402 default_dtor_section_asm_out_destructor (symbol
, priority
);
8406 /* While priority is known to be in range [0, 65535], so 18 bytes
8407 would be enough, the compiler might not know that. To avoid
8408 -Wformat-truncation false positive, use a larger size. */
8410 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
8411 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
8412 switch_to_section (s
);
8413 assemble_align (POINTER_SIZE
);
8414 assemble_aligned_integer (POINTER_BYTES
, symbol
);
8419 aarch64_output_casesi (rtx
*operands
)
8423 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
8425 static const char *const patterns
[4][2] =
8428 "ldrb\t%w3, [%0,%w1,uxtw]",
8429 "add\t%3, %4, %w3, sxtb #2"
8432 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8433 "add\t%3, %4, %w3, sxth #2"
8436 "ldr\t%w3, [%0,%w1,uxtw #2]",
8437 "add\t%3, %4, %w3, sxtw #2"
8439 /* We assume that DImode is only generated when not optimizing and
8440 that we don't really need 64-bit address offsets. That would
8441 imply an object file with 8GB of code in a single function! */
8443 "ldr\t%w3, [%0,%w1,uxtw #2]",
8444 "add\t%3, %4, %w3, sxtw #2"
8448 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
8450 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
8451 index
= exact_log2 (GET_MODE_SIZE (mode
));
8453 gcc_assert (index
>= 0 && index
<= 3);
8455 /* Need to implement table size reduction, by chaning the code below. */
8456 output_asm_insn (patterns
[index
][0], operands
);
8457 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
8458 snprintf (buf
, sizeof (buf
),
8459 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
8460 output_asm_insn (buf
, operands
);
8461 output_asm_insn (patterns
[index
][1], operands
);
8462 output_asm_insn ("br\t%3", operands
);
8463 assemble_label (asm_out_file
, label
);
8468 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8469 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8473 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
8475 if (shift
>= 0 && shift
<= 3)
8478 for (size
= 8; size
<= 32; size
*= 2)
8480 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
8481 if (mask
== bits
<< shift
)
8488 /* Constant pools are per function only when PC relative
8489 literal loads are true or we are in the large memory
8493 aarch64_can_use_per_function_literal_pools_p (void)
8495 return (aarch64_pcrelative_literal_loads
8496 || aarch64_cmodel
== AARCH64_CMODEL_LARGE
);
8500 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
8502 /* We can't use blocks for constants when we're using a per-function
8504 return !aarch64_can_use_per_function_literal_pools_p ();
8507 /* Select appropriate section for constants depending
8508 on where we place literal pools. */
8511 aarch64_select_rtx_section (machine_mode mode
,
8513 unsigned HOST_WIDE_INT align
)
8515 if (aarch64_can_use_per_function_literal_pools_p ())
8516 return function_section (current_function_decl
);
8518 return default_elf_select_rtx_section (mode
, x
, align
);
8521 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8523 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
8524 HOST_WIDE_INT offset
)
8526 /* When using per-function literal pools, we must ensure that any code
8527 section is aligned to the minimal instruction length, lest we get
8528 errors from the assembler re "unaligned instructions". */
8529 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
8530 ASM_OUTPUT_ALIGN (f
, 2);
8535 /* Helper function for rtx cost calculation. Strip a shift expression
8536 from X. Returns the inner operand if successful, or the original
8537 expression on failure. */
8539 aarch64_strip_shift (rtx x
)
8543 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8544 we can convert both to ROR during final output. */
8545 if ((GET_CODE (op
) == ASHIFT
8546 || GET_CODE (op
) == ASHIFTRT
8547 || GET_CODE (op
) == LSHIFTRT
8548 || GET_CODE (op
) == ROTATERT
8549 || GET_CODE (op
) == ROTATE
)
8550 && CONST_INT_P (XEXP (op
, 1)))
8551 return XEXP (op
, 0);
8553 if (GET_CODE (op
) == MULT
8554 && CONST_INT_P (XEXP (op
, 1))
8555 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
8556 return XEXP (op
, 0);
8561 /* Helper function for rtx cost calculation. Strip an extend
8562 expression from X. Returns the inner operand if successful, or the
8563 original expression on failure. We deal with a number of possible
8564 canonicalization variations here. If STRIP_SHIFT is true, then
8565 we can strip off a shift also. */
8567 aarch64_strip_extend (rtx x
, bool strip_shift
)
8569 scalar_int_mode mode
;
8572 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
8575 /* Zero and sign extraction of a widened value. */
8576 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
8577 && XEXP (op
, 2) == const0_rtx
8578 && GET_CODE (XEXP (op
, 0)) == MULT
8579 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
8581 return XEXP (XEXP (op
, 0), 0);
8583 /* It can also be represented (for zero-extend) as an AND with an
8585 if (GET_CODE (op
) == AND
8586 && GET_CODE (XEXP (op
, 0)) == MULT
8587 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
8588 && CONST_INT_P (XEXP (op
, 1))
8589 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
8590 INTVAL (XEXP (op
, 1))) != 0)
8591 return XEXP (XEXP (op
, 0), 0);
8593 /* Now handle extended register, as this may also have an optional
8594 left shift by 1..4. */
8596 && GET_CODE (op
) == ASHIFT
8597 && CONST_INT_P (XEXP (op
, 1))
8598 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
8601 if (GET_CODE (op
) == ZERO_EXTEND
8602 || GET_CODE (op
) == SIGN_EXTEND
)
8611 /* Return true iff CODE is a shift supported in combination
8612 with arithmetic instructions. */
8615 aarch64_shift_p (enum rtx_code code
)
8617 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
8621 /* Return true iff X is a cheap shift without a sign extend. */
8624 aarch64_cheap_mult_shift_p (rtx x
)
8631 if (!(aarch64_tune_params
.extra_tuning_flags
8632 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
8635 if (GET_CODE (op0
) == SIGN_EXTEND
)
8638 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
8639 && UINTVAL (op1
) <= 4)
8642 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
8645 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
8647 if (l2
> 0 && l2
<= 4)
8653 /* Helper function for rtx cost calculation. Calculate the cost of
8654 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8655 Return the calculated cost of the expression, recursing manually in to
8656 operands where needed. */
8659 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
8662 const struct cpu_cost_table
*extra_cost
8663 = aarch64_tune_params
.insn_extra_cost
;
8665 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
8666 machine_mode mode
= GET_MODE (x
);
8668 gcc_checking_assert (code
== MULT
);
8673 if (VECTOR_MODE_P (mode
))
8674 mode
= GET_MODE_INNER (mode
);
8676 /* Integer multiply/fma. */
8677 if (GET_MODE_CLASS (mode
) == MODE_INT
)
8679 /* The multiply will be canonicalized as a shift, cost it as such. */
8680 if (aarch64_shift_p (GET_CODE (x
))
8681 || (CONST_INT_P (op1
)
8682 && exact_log2 (INTVAL (op1
)) > 0))
8684 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
8685 || GET_CODE (op0
) == SIGN_EXTEND
;
8690 /* If the shift is considered cheap,
8691 then don't add any cost. */
8692 if (aarch64_cheap_mult_shift_p (x
))
8694 else if (REG_P (op1
))
8695 /* ARITH + shift-by-register. */
8696 cost
+= extra_cost
->alu
.arith_shift_reg
;
8698 /* ARITH + extended register. We don't have a cost field
8699 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8700 cost
+= extra_cost
->alu
.extend_arith
;
8702 /* ARITH + shift-by-immediate. */
8703 cost
+= extra_cost
->alu
.arith_shift
;
8706 /* LSL (immediate). */
8707 cost
+= extra_cost
->alu
.shift
;
8710 /* Strip extends as we will have costed them in the case above. */
8712 op0
= aarch64_strip_extend (op0
, true);
8714 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
8719 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8720 compound and let the below cases handle it. After all, MNEG is a
8721 special-case alias of MSUB. */
8722 if (GET_CODE (op0
) == NEG
)
8724 op0
= XEXP (op0
, 0);
8728 /* Integer multiplies or FMAs have zero/sign extending variants. */
8729 if ((GET_CODE (op0
) == ZERO_EXTEND
8730 && GET_CODE (op1
) == ZERO_EXTEND
)
8731 || (GET_CODE (op0
) == SIGN_EXTEND
8732 && GET_CODE (op1
) == SIGN_EXTEND
))
8734 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
8735 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
8740 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8741 cost
+= extra_cost
->mult
[0].extend_add
;
8743 /* MUL/SMULL/UMULL. */
8744 cost
+= extra_cost
->mult
[0].extend
;
8750 /* This is either an integer multiply or a MADD. In both cases
8751 we want to recurse and cost the operands. */
8752 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
8753 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
8759 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
8762 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
8771 /* Floating-point FMA/FMUL can also support negations of the
8772 operands, unless the rounding mode is upward or downward in
8773 which case FNMUL is different than FMUL with operand negation. */
8774 bool neg0
= GET_CODE (op0
) == NEG
;
8775 bool neg1
= GET_CODE (op1
) == NEG
;
8776 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
8779 op0
= XEXP (op0
, 0);
8781 op1
= XEXP (op1
, 0);
8785 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8786 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
8789 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
8792 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
8793 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
8799 aarch64_address_cost (rtx x
,
8801 addr_space_t as ATTRIBUTE_UNUSED
,
8804 enum rtx_code c
= GET_CODE (x
);
8805 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
8806 struct aarch64_address_info info
;
8810 if (!aarch64_classify_address (&info
, x
, mode
, false))
8812 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
8814 /* This is a CONST or SYMBOL ref which will be split
8815 in a different way depending on the code model in use.
8816 Cost it through the generic infrastructure. */
8817 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
8818 /* Divide through by the cost of one instruction to
8819 bring it to the same units as the address costs. */
8820 cost_symbol_ref
/= COSTS_N_INSNS (1);
8821 /* The cost is then the cost of preparing the address,
8822 followed by an immediate (possibly 0) offset. */
8823 return cost_symbol_ref
+ addr_cost
->imm_offset
;
8827 /* This is most likely a jump table from a case
8829 return addr_cost
->register_offset
;
8835 case ADDRESS_LO_SUM
:
8836 case ADDRESS_SYMBOLIC
:
8837 case ADDRESS_REG_IMM
:
8838 cost
+= addr_cost
->imm_offset
;
8841 case ADDRESS_REG_WB
:
8842 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
8843 cost
+= addr_cost
->pre_modify
;
8844 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
8845 cost
+= addr_cost
->post_modify
;
8851 case ADDRESS_REG_REG
:
8852 cost
+= addr_cost
->register_offset
;
8855 case ADDRESS_REG_SXTW
:
8856 cost
+= addr_cost
->register_sextend
;
8859 case ADDRESS_REG_UXTW
:
8860 cost
+= addr_cost
->register_zextend
;
8870 /* For the sake of calculating the cost of the shifted register
8871 component, we can treat same sized modes in the same way. */
8872 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
8873 cost
+= addr_cost
->addr_scale_costs
.hi
;
8874 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
8875 cost
+= addr_cost
->addr_scale_costs
.si
;
8876 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
8877 cost
+= addr_cost
->addr_scale_costs
.di
;
8879 /* We can't tell, or this is a 128-bit vector. */
8880 cost
+= addr_cost
->addr_scale_costs
.ti
;
8886 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8887 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8891 aarch64_branch_cost (bool speed_p
, bool predictable_p
)
8893 /* When optimizing for speed, use the cost of unpredictable branches. */
8894 const struct cpu_branch_cost
*branch_costs
=
8895 aarch64_tune_params
.branch_costs
;
8897 if (!speed_p
|| predictable_p
)
8898 return branch_costs
->predictable
;
8900 return branch_costs
->unpredictable
;
8903 /* Return true if the RTX X in mode MODE is a zero or sign extract
8904 usable in an ADD or SUB (extended register) instruction. */
8906 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
8908 /* Catch add with a sign extract.
8909 This is add_<optab><mode>_multp2. */
8910 if (GET_CODE (x
) == SIGN_EXTRACT
8911 || GET_CODE (x
) == ZERO_EXTRACT
)
8913 rtx op0
= XEXP (x
, 0);
8914 rtx op1
= XEXP (x
, 1);
8915 rtx op2
= XEXP (x
, 2);
8917 if (GET_CODE (op0
) == MULT
8918 && CONST_INT_P (op1
)
8919 && op2
== const0_rtx
8920 && CONST_INT_P (XEXP (op0
, 1))
8921 && aarch64_is_extend_from_extract (mode
,
8928 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8930 else if (GET_CODE (x
) == SIGN_EXTEND
8931 || GET_CODE (x
) == ZERO_EXTEND
)
8932 return REG_P (XEXP (x
, 0));
8938 aarch64_frint_unspec_p (unsigned int u
)
8956 /* Return true iff X is an rtx that will match an extr instruction
8957 i.e. as described in the *extr<mode>5_insn family of patterns.
8958 OP0 and OP1 will be set to the operands of the shifts involved
8959 on success and will be NULL_RTX otherwise. */
8962 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
8965 scalar_int_mode mode
;
8966 if (!is_a
<scalar_int_mode
> (GET_MODE (x
), &mode
))
8969 *res_op0
= NULL_RTX
;
8970 *res_op1
= NULL_RTX
;
8972 if (GET_CODE (x
) != IOR
)
8978 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
8979 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
8981 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8982 if (GET_CODE (op1
) == ASHIFT
)
8983 std::swap (op0
, op1
);
8985 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
8988 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
8989 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
8991 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
8992 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
8994 *res_op0
= XEXP (op0
, 0);
8995 *res_op1
= XEXP (op1
, 0);
9003 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9004 storing it in *COST. Result is true if the total cost of the operation
9005 has now been calculated. */
9007 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
9011 enum rtx_code cmpcode
;
9013 if (COMPARISON_P (op0
))
9015 inner
= XEXP (op0
, 0);
9016 comparator
= XEXP (op0
, 1);
9017 cmpcode
= GET_CODE (op0
);
9022 comparator
= const0_rtx
;
9026 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
9028 /* Conditional branch. */
9029 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
9033 if (cmpcode
== NE
|| cmpcode
== EQ
)
9035 if (comparator
== const0_rtx
)
9037 /* TBZ/TBNZ/CBZ/CBNZ. */
9038 if (GET_CODE (inner
) == ZERO_EXTRACT
)
9040 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
9041 ZERO_EXTRACT
, 0, speed
);
9044 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
9049 else if (cmpcode
== LT
|| cmpcode
== GE
)
9052 if (comparator
== const0_rtx
)
9057 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
9060 if (GET_CODE (op1
) == COMPARE
)
9062 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9063 if (XEXP (op1
, 1) == const0_rtx
)
9067 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
9068 const struct cpu_cost_table
*extra_cost
9069 = aarch64_tune_params
.insn_extra_cost
;
9071 if (GET_MODE_CLASS (mode
) == MODE_INT
)
9072 *cost
+= extra_cost
->alu
.arith
;
9074 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
9079 /* It's a conditional operation based on the status flags,
9080 so it must be some flavor of CSEL. */
9082 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9083 if (GET_CODE (op1
) == NEG
9084 || GET_CODE (op1
) == NOT
9085 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
9086 op1
= XEXP (op1
, 0);
9087 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
9089 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9090 op1
= XEXP (op1
, 0);
9091 op2
= XEXP (op2
, 0);
9094 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
9095 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
9099 /* We don't know what this is, cost all operands. */
9103 /* Check whether X is a bitfield operation of the form shift + extend that
9104 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9105 operand to which the bitfield operation is applied. Otherwise return
9109 aarch64_extend_bitfield_pattern_p (rtx x
)
9111 rtx_code outer_code
= GET_CODE (x
);
9112 machine_mode outer_mode
= GET_MODE (x
);
9114 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
9115 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
9118 rtx inner
= XEXP (x
, 0);
9119 rtx_code inner_code
= GET_CODE (inner
);
9120 machine_mode inner_mode
= GET_MODE (inner
);
9126 if (CONST_INT_P (XEXP (inner
, 1))
9127 && (inner_mode
== QImode
|| inner_mode
== HImode
))
9128 op
= XEXP (inner
, 0);
9131 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
9132 && (inner_mode
== QImode
|| inner_mode
== HImode
))
9133 op
= XEXP (inner
, 0);
9136 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
9137 && (inner_mode
== QImode
|| inner_mode
== HImode
))
9138 op
= XEXP (inner
, 0);
9147 /* Return true if the mask and a shift amount from an RTX of the form
9148 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9149 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
9152 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode
, rtx mask
,
9155 return CONST_INT_P (mask
) && CONST_INT_P (shft_amnt
)
9156 && INTVAL (shft_amnt
) < GET_MODE_BITSIZE (mode
)
9157 && exact_log2 ((INTVAL (mask
) >> INTVAL (shft_amnt
)) + 1) >= 0
9159 & ((HOST_WIDE_INT_1U
<< INTVAL (shft_amnt
)) - 1)) == 0;
9162 /* Calculate the cost of calculating X, storing it in *COST. Result
9163 is true if the total cost of the operation has now been calculated. */
9165 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
9166 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
9169 const struct cpu_cost_table
*extra_cost
9170 = aarch64_tune_params
.insn_extra_cost
;
9171 int code
= GET_CODE (x
);
9172 scalar_int_mode int_mode
;
9174 /* By default, assume that everything has equivalent cost to the
9175 cheapest instruction. Any additional costs are applied as a delta
9176 above this default. */
9177 *cost
= COSTS_N_INSNS (1);
9182 /* The cost depends entirely on the operands to SET. */
9187 switch (GET_CODE (op0
))
9192 rtx address
= XEXP (op0
, 0);
9193 if (VECTOR_MODE_P (mode
))
9194 *cost
+= extra_cost
->ldst
.storev
;
9195 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9196 *cost
+= extra_cost
->ldst
.store
;
9197 else if (mode
== SFmode
)
9198 *cost
+= extra_cost
->ldst
.storef
;
9199 else if (mode
== DFmode
)
9200 *cost
+= extra_cost
->ldst
.stored
;
9203 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
9207 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
9211 if (! REG_P (SUBREG_REG (op0
)))
9212 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
9216 /* The cost is one per vector-register copied. */
9217 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
9219 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
9220 *cost
= COSTS_N_INSNS (nregs
);
9222 /* const0_rtx is in general free, but we will use an
9223 instruction to set a register to 0. */
9224 else if (REG_P (op1
) || op1
== const0_rtx
)
9226 /* The cost is 1 per register copied. */
9227 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
9228 *cost
= COSTS_N_INSNS (nregs
);
9231 /* Cost is just the cost of the RHS of the set. */
9232 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
9237 /* Bit-field insertion. Strip any redundant widening of
9238 the RHS to meet the width of the target. */
9239 if (GET_CODE (op1
) == SUBREG
)
9240 op1
= SUBREG_REG (op1
);
9241 if ((GET_CODE (op1
) == ZERO_EXTEND
9242 || GET_CODE (op1
) == SIGN_EXTEND
)
9243 && CONST_INT_P (XEXP (op0
, 1))
9244 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
9245 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
9246 op1
= XEXP (op1
, 0);
9248 if (CONST_INT_P (op1
))
9250 /* MOV immediate is assumed to always be cheap. */
9251 *cost
= COSTS_N_INSNS (1);
9257 *cost
+= extra_cost
->alu
.bfi
;
9258 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
9264 /* We can't make sense of this, assume default cost. */
9265 *cost
= COSTS_N_INSNS (1);
9271 /* If an instruction can incorporate a constant within the
9272 instruction, the instruction's expression avoids calling
9273 rtx_cost() on the constant. If rtx_cost() is called on a
9274 constant, then it is usually because the constant must be
9275 moved into a register by one or more instructions.
9277 The exception is constant 0, which can be expressed
9278 as XZR/WZR and is therefore free. The exception to this is
9279 if we have (set (reg) (const0_rtx)) in which case we must cost
9280 the move. However, we can catch that when we cost the SET, so
9281 we don't need to consider that here. */
9282 if (x
== const0_rtx
)
9286 /* To an approximation, building any other constant is
9287 proportionally expensive to the number of instructions
9288 required to build that constant. This is true whether we
9289 are compiling for SPEED or otherwise. */
9290 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
9291 int_mode
= word_mode
;
9292 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
9293 (NULL_RTX
, x
, false, int_mode
));
9299 /* First determine number of instructions to do the move
9300 as an integer constant. */
9301 if (!aarch64_float_const_representable_p (x
)
9302 && !aarch64_can_const_movi_rtx_p (x
, mode
)
9303 && aarch64_float_const_rtx_p (x
))
9305 unsigned HOST_WIDE_INT ival
;
9306 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
9307 gcc_assert (succeed
);
9309 scalar_int_mode imode
= (mode
== HFmode
9311 : int_mode_for_mode (mode
).require ());
9312 int ncost
= aarch64_internal_mov_immediate
9313 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
9314 *cost
+= COSTS_N_INSNS (ncost
);
9320 /* mov[df,sf]_aarch64. */
9321 if (aarch64_float_const_representable_p (x
))
9322 /* FMOV (scalar immediate). */
9323 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
9324 else if (!aarch64_float_const_zero_rtx_p (x
))
9326 /* This will be a load from memory. */
9328 *cost
+= extra_cost
->ldst
.loadd
;
9330 *cost
+= extra_cost
->ldst
.loadf
;
9333 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9334 or MOV v0.s[0], wzr - neither of which are modeled by the
9335 cost tables. Just use the default cost. */
9345 /* For loads we want the base cost of a load, plus an
9346 approximation for the additional cost of the addressing
9348 rtx address
= XEXP (x
, 0);
9349 if (VECTOR_MODE_P (mode
))
9350 *cost
+= extra_cost
->ldst
.loadv
;
9351 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9352 *cost
+= extra_cost
->ldst
.load
;
9353 else if (mode
== SFmode
)
9354 *cost
+= extra_cost
->ldst
.loadf
;
9355 else if (mode
== DFmode
)
9356 *cost
+= extra_cost
->ldst
.loadd
;
9359 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
9368 if (VECTOR_MODE_P (mode
))
9373 *cost
+= extra_cost
->vect
.alu
;
9378 if (GET_MODE_CLASS (mode
) == MODE_INT
)
9380 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
9381 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
9384 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
9388 /* Cost this as SUB wzr, X. */
9389 op0
= CONST0_RTX (mode
);
9394 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
9396 /* Support (neg(fma...)) as a single instruction only if
9397 sign of zeros is unimportant. This matches the decision
9398 making in aarch64.md. */
9399 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
9402 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
9405 if (GET_CODE (op0
) == MULT
)
9408 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
9413 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
9423 if (VECTOR_MODE_P (mode
))
9424 *cost
+= extra_cost
->vect
.alu
;
9426 *cost
+= extra_cost
->alu
.clz
;
9435 if (op1
== const0_rtx
9436 && GET_CODE (op0
) == AND
)
9439 mode
= GET_MODE (op0
);
9443 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
9445 /* TODO: A write to the CC flags possibly costs extra, this
9446 needs encoding in the cost tables. */
9448 mode
= GET_MODE (op0
);
9450 if (GET_CODE (op0
) == AND
)
9456 if (GET_CODE (op0
) == PLUS
)
9458 /* ADDS (and CMN alias). */
9463 if (GET_CODE (op0
) == MINUS
)
9470 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
9471 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
9472 && CONST_INT_P (XEXP (op0
, 2)))
9474 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9475 Handle it here directly rather than going to cost_logic
9476 since we know the immediate generated for the TST is valid
9477 so we can avoid creating an intermediate rtx for it only
9478 for costing purposes. */
9480 *cost
+= extra_cost
->alu
.logical
;
9482 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
9483 ZERO_EXTRACT
, 0, speed
);
9487 if (GET_CODE (op1
) == NEG
)
9491 *cost
+= extra_cost
->alu
.arith
;
9493 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
9494 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
9500 Compare can freely swap the order of operands, and
9501 canonicalization puts the more complex operation first.
9502 But the integer MINUS logic expects the shift/extend
9503 operation in op1. */
9505 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
9513 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
9517 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
9519 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
9521 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
9522 /* FCMP supports constant 0.0 for no extra cost. */
9528 if (VECTOR_MODE_P (mode
))
9530 /* Vector compare. */
9532 *cost
+= extra_cost
->vect
.alu
;
9534 if (aarch64_float_const_zero_rtx_p (op1
))
9536 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9550 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
9552 /* Detect valid immediates. */
9553 if ((GET_MODE_CLASS (mode
) == MODE_INT
9554 || (GET_MODE_CLASS (mode
) == MODE_CC
9555 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
9556 && CONST_INT_P (op1
)
9557 && aarch64_uimm12_shift (INTVAL (op1
)))
9560 /* SUB(S) (immediate). */
9561 *cost
+= extra_cost
->alu
.arith
;
9565 /* Look for SUB (extended register). */
9566 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
9567 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
9570 *cost
+= extra_cost
->alu
.extend_arith
;
9572 op1
= aarch64_strip_extend (op1
, true);
9573 *cost
+= rtx_cost (op1
, VOIDmode
,
9574 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
9578 rtx new_op1
= aarch64_strip_extend (op1
, false);
9580 /* Cost this as an FMA-alike operation. */
9581 if ((GET_CODE (new_op1
) == MULT
9582 || aarch64_shift_p (GET_CODE (new_op1
)))
9585 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
9586 (enum rtx_code
) code
,
9591 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
9595 if (VECTOR_MODE_P (mode
))
9598 *cost
+= extra_cost
->vect
.alu
;
9600 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9603 *cost
+= extra_cost
->alu
.arith
;
9605 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
9608 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9622 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
9623 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
9626 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
9627 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
9631 if (GET_MODE_CLASS (mode
) == MODE_INT
9632 && ((CONST_INT_P (op1
) && aarch64_uimm12_shift (INTVAL (op1
)))
9633 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
9635 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
9638 /* ADD (immediate). */
9639 *cost
+= extra_cost
->alu
.arith
;
9643 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
9645 /* Look for ADD (extended register). */
9646 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
9647 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
9650 *cost
+= extra_cost
->alu
.extend_arith
;
9652 op0
= aarch64_strip_extend (op0
, true);
9653 *cost
+= rtx_cost (op0
, VOIDmode
,
9654 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
9658 /* Strip any extend, leave shifts behind as we will
9659 cost them through mult_cost. */
9660 new_op0
= aarch64_strip_extend (op0
, false);
9662 if (GET_CODE (new_op0
) == MULT
9663 || aarch64_shift_p (GET_CODE (new_op0
)))
9665 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
9670 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
9674 if (VECTOR_MODE_P (mode
))
9677 *cost
+= extra_cost
->vect
.alu
;
9679 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9682 *cost
+= extra_cost
->alu
.arith
;
9684 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
9687 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9694 *cost
= COSTS_N_INSNS (1);
9698 if (VECTOR_MODE_P (mode
))
9699 *cost
+= extra_cost
->vect
.alu
;
9701 *cost
+= extra_cost
->alu
.rev
;
9706 if (aarch_rev16_p (x
))
9708 *cost
= COSTS_N_INSNS (1);
9712 if (VECTOR_MODE_P (mode
))
9713 *cost
+= extra_cost
->vect
.alu
;
9715 *cost
+= extra_cost
->alu
.rev
;
9720 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
9722 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
9723 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
9725 *cost
+= extra_cost
->alu
.shift
;
9736 if (VECTOR_MODE_P (mode
))
9739 *cost
+= extra_cost
->vect
.alu
;
9744 && GET_CODE (op0
) == MULT
9745 && CONST_INT_P (XEXP (op0
, 1))
9746 && CONST_INT_P (op1
)
9747 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
9750 /* This is a UBFM/SBFM. */
9751 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
9753 *cost
+= extra_cost
->alu
.bfx
;
9757 if (is_int_mode (mode
, &int_mode
))
9759 if (CONST_INT_P (op1
))
9761 /* We have a mask + shift version of a UBFIZ
9762 i.e. the *andim_ashift<mode>_bfiz pattern. */
9763 if (GET_CODE (op0
) == ASHIFT
9764 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
9767 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
9768 (enum rtx_code
) code
, 0, speed
);
9770 *cost
+= extra_cost
->alu
.bfx
;
9774 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
9776 /* We possibly get the immediate for free, this is not
9778 *cost
+= rtx_cost (op0
, int_mode
,
9779 (enum rtx_code
) code
, 0, speed
);
9781 *cost
+= extra_cost
->alu
.logical
;
9790 /* Handle ORN, EON, or BIC. */
9791 if (GET_CODE (op0
) == NOT
)
9792 op0
= XEXP (op0
, 0);
9794 new_op0
= aarch64_strip_shift (op0
);
9796 /* If we had a shift on op0 then this is a logical-shift-
9797 by-register/immediate operation. Otherwise, this is just
9798 a logical operation. */
9803 /* Shift by immediate. */
9804 if (CONST_INT_P (XEXP (op0
, 1)))
9805 *cost
+= extra_cost
->alu
.log_shift
;
9807 *cost
+= extra_cost
->alu
.log_shift_reg
;
9810 *cost
+= extra_cost
->alu
.logical
;
9813 /* In both cases we want to cost both operands. */
9814 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
9816 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
9826 op0
= aarch64_strip_shift (x
);
9828 if (VECTOR_MODE_P (mode
))
9831 *cost
+= extra_cost
->vect
.alu
;
9835 /* MVN-shifted-reg. */
9838 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
9841 *cost
+= extra_cost
->alu
.log_shift
;
9845 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9846 Handle the second form here taking care that 'a' in the above can
9848 else if (GET_CODE (op0
) == XOR
)
9850 rtx newop0
= XEXP (op0
, 0);
9851 rtx newop1
= XEXP (op0
, 1);
9852 rtx op0_stripped
= aarch64_strip_shift (newop0
);
9854 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
9855 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
9859 if (op0_stripped
!= newop0
)
9860 *cost
+= extra_cost
->alu
.log_shift
;
9862 *cost
+= extra_cost
->alu
.logical
;
9869 *cost
+= extra_cost
->alu
.logical
;
9876 /* If a value is written in SI mode, then zero extended to DI
9877 mode, the operation will in general be free as a write to
9878 a 'w' register implicitly zeroes the upper bits of an 'x'
9879 register. However, if this is
9881 (set (reg) (zero_extend (reg)))
9883 we must cost the explicit register move. */
9885 && GET_MODE (op0
) == SImode
9888 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
9890 /* If OP_COST is non-zero, then the cost of the zero extend
9891 is effectively the cost of the inner operation. Otherwise
9892 we have a MOV instruction and we take the cost from the MOV
9893 itself. This is true independently of whether we are
9894 optimizing for space or time. */
9900 else if (MEM_P (op0
))
9902 /* All loads can zero extend to any size for free. */
9903 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
9907 op0
= aarch64_extend_bitfield_pattern_p (x
);
9910 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
9912 *cost
+= extra_cost
->alu
.bfx
;
9918 if (VECTOR_MODE_P (mode
))
9921 *cost
+= extra_cost
->vect
.alu
;
9925 /* We generate an AND instead of UXTB/UXTH. */
9926 *cost
+= extra_cost
->alu
.logical
;
9932 if (MEM_P (XEXP (x
, 0)))
9937 rtx address
= XEXP (XEXP (x
, 0), 0);
9938 *cost
+= extra_cost
->ldst
.load_sign_extend
;
9941 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
9947 op0
= aarch64_extend_bitfield_pattern_p (x
);
9950 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
9952 *cost
+= extra_cost
->alu
.bfx
;
9958 if (VECTOR_MODE_P (mode
))
9959 *cost
+= extra_cost
->vect
.alu
;
9961 *cost
+= extra_cost
->alu
.extend
;
9969 if (CONST_INT_P (op1
))
9973 if (VECTOR_MODE_P (mode
))
9975 /* Vector shift (immediate). */
9976 *cost
+= extra_cost
->vect
.alu
;
9980 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
9982 *cost
+= extra_cost
->alu
.shift
;
9986 /* We can incorporate zero/sign extend for free. */
9987 if (GET_CODE (op0
) == ZERO_EXTEND
9988 || GET_CODE (op0
) == SIGN_EXTEND
)
9989 op0
= XEXP (op0
, 0);
9991 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
9996 if (VECTOR_MODE_P (mode
))
9999 /* Vector shift (register). */
10000 *cost
+= extra_cost
->vect
.alu
;
10006 *cost
+= extra_cost
->alu
.shift_reg
;
10008 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
10009 && CONST_INT_P (XEXP (op1
, 1))
10010 && known_eq (INTVAL (XEXP (op1
, 1)),
10011 GET_MODE_BITSIZE (mode
) - 1))
10013 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
10014 /* We already demanded XEXP (op1, 0) to be REG_P, so
10015 don't recurse into it. */
10019 return false; /* All arguments need to be in registers. */
10029 if (CONST_INT_P (op1
))
10031 /* ASR (immediate) and friends. */
10034 if (VECTOR_MODE_P (mode
))
10035 *cost
+= extra_cost
->vect
.alu
;
10037 *cost
+= extra_cost
->alu
.shift
;
10040 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
10045 if (VECTOR_MODE_P (mode
))
10048 /* Vector shift (register). */
10049 *cost
+= extra_cost
->vect
.alu
;
10054 /* ASR (register) and friends. */
10055 *cost
+= extra_cost
->alu
.shift_reg
;
10057 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
10058 && CONST_INT_P (XEXP (op1
, 1))
10059 && known_eq (INTVAL (XEXP (op1
, 1)),
10060 GET_MODE_BITSIZE (mode
) - 1))
10062 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
10063 /* We already demanded XEXP (op1, 0) to be REG_P, so
10064 don't recurse into it. */
10068 return false; /* All arguments need to be in registers. */
10073 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
10074 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
10078 *cost
+= extra_cost
->ldst
.load
;
10080 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
10081 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
10083 /* ADRP, followed by ADD. */
10084 *cost
+= COSTS_N_INSNS (1);
10086 *cost
+= 2 * extra_cost
->alu
.arith
;
10088 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
10089 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
10093 *cost
+= extra_cost
->alu
.arith
;
10098 /* One extra load instruction, after accessing the GOT. */
10099 *cost
+= COSTS_N_INSNS (1);
10101 *cost
+= extra_cost
->ldst
.load
;
10107 /* ADRP/ADD (immediate). */
10109 *cost
+= extra_cost
->alu
.arith
;
10117 if (VECTOR_MODE_P (mode
))
10118 *cost
+= extra_cost
->vect
.alu
;
10120 *cost
+= extra_cost
->alu
.bfx
;
10123 /* We can trust that the immediates used will be correct (there
10124 are no by-register forms), so we need only cost op0. */
10125 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
10129 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
10130 /* aarch64_rtx_mult_cost always handles recursion to its
10135 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10136 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
10137 an unconditional negate. This case should only ever be reached through
10138 the set_smod_pow2_cheap check in expmed.c. */
10139 if (CONST_INT_P (XEXP (x
, 1))
10140 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
10141 && (mode
== SImode
|| mode
== DImode
))
10143 /* We expand to 4 instructions. Reset the baseline. */
10144 *cost
= COSTS_N_INSNS (4);
10147 *cost
+= 2 * extra_cost
->alu
.logical
10148 + 2 * extra_cost
->alu
.arith
;
10153 /* Fall-through. */
10157 /* Slighly prefer UMOD over SMOD. */
10158 if (VECTOR_MODE_P (mode
))
10159 *cost
+= extra_cost
->vect
.alu
;
10160 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10161 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
10162 + extra_cost
->mult
[mode
== DImode
].idiv
10163 + (code
== MOD
? 1 : 0));
10165 return false; /* All arguments need to be in registers. */
10172 if (VECTOR_MODE_P (mode
))
10173 *cost
+= extra_cost
->vect
.alu
;
10174 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10175 /* There is no integer SQRT, so only DIV and UDIV can get
10177 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
10178 /* Slighly prefer UDIV over SDIV. */
10179 + (code
== DIV
? 1 : 0));
10181 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
10183 return false; /* All arguments need to be in registers. */
10186 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
10187 XEXP (x
, 2), cost
, speed
);
10200 return false; /* All arguments must be in registers. */
10209 if (VECTOR_MODE_P (mode
))
10210 *cost
+= extra_cost
->vect
.alu
;
10212 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
10215 /* FMSUB, FNMADD, and FNMSUB are free. */
10216 if (GET_CODE (op0
) == NEG
)
10217 op0
= XEXP (op0
, 0);
10219 if (GET_CODE (op2
) == NEG
)
10220 op2
= XEXP (op2
, 0);
10222 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10223 and the by-element operand as operand 0. */
10224 if (GET_CODE (op1
) == NEG
)
10225 op1
= XEXP (op1
, 0);
10227 /* Catch vector-by-element operations. The by-element operand can
10228 either be (vec_duplicate (vec_select (x))) or just
10229 (vec_select (x)), depending on whether we are multiplying by
10230 a vector or a scalar.
10232 Canonicalization is not very good in these cases, FMA4 will put the
10233 by-element operand as operand 0, FNMA4 will have it as operand 1. */
10234 if (GET_CODE (op0
) == VEC_DUPLICATE
)
10235 op0
= XEXP (op0
, 0);
10236 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
10237 op1
= XEXP (op1
, 0);
10239 if (GET_CODE (op0
) == VEC_SELECT
)
10240 op0
= XEXP (op0
, 0);
10241 else if (GET_CODE (op1
) == VEC_SELECT
)
10242 op1
= XEXP (op1
, 0);
10244 /* If the remaining parameters are not registers,
10245 get the cost to put them into registers. */
10246 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
10247 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
10248 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
10252 case UNSIGNED_FLOAT
:
10254 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
10260 if (VECTOR_MODE_P (mode
))
10262 /*Vector truncate. */
10263 *cost
+= extra_cost
->vect
.alu
;
10266 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
10270 case FLOAT_TRUNCATE
:
10273 if (VECTOR_MODE_P (mode
))
10275 /*Vector conversion. */
10276 *cost
+= extra_cost
->vect
.alu
;
10279 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
10286 /* Strip the rounding part. They will all be implemented
10287 by the fcvt* family of instructions anyway. */
10288 if (GET_CODE (x
) == UNSPEC
)
10290 unsigned int uns_code
= XINT (x
, 1);
10292 if (uns_code
== UNSPEC_FRINTA
10293 || uns_code
== UNSPEC_FRINTM
10294 || uns_code
== UNSPEC_FRINTN
10295 || uns_code
== UNSPEC_FRINTP
10296 || uns_code
== UNSPEC_FRINTZ
)
10297 x
= XVECEXP (x
, 0, 0);
10302 if (VECTOR_MODE_P (mode
))
10303 *cost
+= extra_cost
->vect
.alu
;
10305 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
10308 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10309 fixed-point fcvt. */
10310 if (GET_CODE (x
) == MULT
10311 && ((VECTOR_MODE_P (mode
)
10312 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
10313 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
10315 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
10320 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
10324 if (VECTOR_MODE_P (mode
))
10326 /* ABS (vector). */
10328 *cost
+= extra_cost
->vect
.alu
;
10330 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10334 /* FABD, which is analogous to FADD. */
10335 if (GET_CODE (op0
) == MINUS
)
10337 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
10338 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
10340 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10344 /* Simple FABS is analogous to FNEG. */
10346 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
10350 /* Integer ABS will either be split to
10351 two arithmetic instructions, or will be an ABS
10352 (scalar), which we don't model. */
10353 *cost
= COSTS_N_INSNS (2);
10355 *cost
+= 2 * extra_cost
->alu
.arith
;
10363 if (VECTOR_MODE_P (mode
))
10364 *cost
+= extra_cost
->vect
.alu
;
10367 /* FMAXNM/FMINNM/FMAX/FMIN.
10368 TODO: This may not be accurate for all implementations, but
10369 we do not model this in the cost tables. */
10370 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10376 /* The floating point round to integer frint* instructions. */
10377 if (aarch64_frint_unspec_p (XINT (x
, 1)))
10380 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
10385 if (XINT (x
, 1) == UNSPEC_RBIT
)
10388 *cost
+= extra_cost
->alu
.rev
;
10396 /* Decompose <su>muldi3_highpart. */
10397 if (/* (truncate:DI */
10400 && GET_MODE (XEXP (x
, 0)) == TImode
10401 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
10403 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
10404 /* (ANY_EXTEND:TI (reg:DI))
10405 (ANY_EXTEND:TI (reg:DI))) */
10406 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
10407 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
10408 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
10409 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
10410 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
10411 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
10412 /* (const_int 64) */
10413 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
10414 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
10418 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
10419 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
10420 mode
, MULT
, 0, speed
);
10421 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
10422 mode
, MULT
, 1, speed
);
10426 /* Fall through. */
10432 && flag_aarch64_verbose_cost
)
10433 fprintf (dump_file
,
10434 "\nFailed to cost RTX. Assuming default cost.\n");
10439 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10440 calculated for X. This cost is stored in *COST. Returns true
10441 if the total cost of X was calculated. */
10443 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
10444 int param
, int *cost
, bool speed
)
10446 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
10449 && flag_aarch64_verbose_cost
)
10451 print_rtl_single (dump_file
, x
);
10452 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
10453 speed
? "Hot" : "Cold",
10454 *cost
, result
? "final" : "partial");
10461 aarch64_register_move_cost (machine_mode mode
,
10462 reg_class_t from_i
, reg_class_t to_i
)
10464 enum reg_class from
= (enum reg_class
) from_i
;
10465 enum reg_class to
= (enum reg_class
) to_i
;
10466 const struct cpu_regmove_cost
*regmove_cost
10467 = aarch64_tune_params
.regmove_cost
;
10469 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10470 if (to
== TAILCALL_ADDR_REGS
|| to
== POINTER_REGS
)
10473 if (from
== TAILCALL_ADDR_REGS
|| from
== POINTER_REGS
)
10474 from
= GENERAL_REGS
;
10476 /* Moving between GPR and stack cost is the same as GP2GP. */
10477 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
10478 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
10479 return regmove_cost
->GP2GP
;
10481 /* To/From the stack register, we move via the gprs. */
10482 if (to
== STACK_REG
|| from
== STACK_REG
)
10483 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
10484 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
10486 if (known_eq (GET_MODE_SIZE (mode
), 16))
10488 /* 128-bit operations on general registers require 2 instructions. */
10489 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
10490 return regmove_cost
->GP2GP
* 2;
10491 else if (from
== GENERAL_REGS
)
10492 return regmove_cost
->GP2FP
* 2;
10493 else if (to
== GENERAL_REGS
)
10494 return regmove_cost
->FP2GP
* 2;
10496 /* When AdvSIMD instructions are disabled it is not possible to move
10497 a 128-bit value directly between Q registers. This is handled in
10498 secondary reload. A general register is used as a scratch to move
10499 the upper DI value and the lower DI value is moved directly,
10500 hence the cost is the sum of three moves. */
10502 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
10504 return regmove_cost
->FP2FP
;
10507 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
10508 return regmove_cost
->GP2GP
;
10509 else if (from
== GENERAL_REGS
)
10510 return regmove_cost
->GP2FP
;
10511 else if (to
== GENERAL_REGS
)
10512 return regmove_cost
->FP2GP
;
10514 return regmove_cost
->FP2FP
;
10518 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
10519 reg_class_t rclass ATTRIBUTE_UNUSED
,
10520 bool in ATTRIBUTE_UNUSED
)
10522 return aarch64_tune_params
.memmov_cost
;
10525 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10526 to optimize 1.0/sqrt. */
10529 use_rsqrt_p (machine_mode mode
)
10531 return (!flag_trapping_math
10532 && flag_unsafe_math_optimizations
10533 && ((aarch64_tune_params
.approx_modes
->recip_sqrt
10534 & AARCH64_APPROX_MODE (mode
))
10535 || flag_mrecip_low_precision_sqrt
));
10538 /* Function to decide when to use the approximate reciprocal square root
10542 aarch64_builtin_reciprocal (tree fndecl
)
10544 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
10546 if (!use_rsqrt_p (mode
))
10548 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl
));
10551 /* Emit instruction sequence to compute either the approximate square root
10552 or its approximate reciprocal, depending on the flag RECP, and return
10553 whether the sequence was emitted or not. */
10556 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
10558 machine_mode mode
= GET_MODE (dst
);
10560 if (GET_MODE_INNER (mode
) == HFmode
)
10562 gcc_assert (!recp
);
10568 if (!(flag_mlow_precision_sqrt
10569 || (aarch64_tune_params
.approx_modes
->sqrt
10570 & AARCH64_APPROX_MODE (mode
))))
10573 if (flag_finite_math_only
10574 || flag_trapping_math
10575 || !flag_unsafe_math_optimizations
10576 || optimize_function_for_size_p (cfun
))
10580 /* Caller assumes we cannot fail. */
10581 gcc_assert (use_rsqrt_p (mode
));
10583 machine_mode mmsk
= mode_for_int_vector (mode
).require ();
10584 rtx xmsk
= gen_reg_rtx (mmsk
);
10586 /* When calculating the approximate square root, compare the
10587 argument with 0.0 and create a mask. */
10588 emit_insn (gen_rtx_SET (xmsk
,
10590 gen_rtx_EQ (mmsk
, src
,
10591 CONST0_RTX (mode
)))));
10593 /* Estimate the approximate reciprocal square root. */
10594 rtx xdst
= gen_reg_rtx (mode
);
10595 emit_insn (gen_aarch64_rsqrte (mode
, xdst
, src
));
10597 /* Iterate over the series twice for SF and thrice for DF. */
10598 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
10600 /* Optionally iterate over the series once less for faster performance
10601 while sacrificing the accuracy. */
10602 if ((recp
&& flag_mrecip_low_precision_sqrt
)
10603 || (!recp
&& flag_mlow_precision_sqrt
))
10606 /* Iterate over the series to calculate the approximate reciprocal square
10608 rtx x1
= gen_reg_rtx (mode
);
10609 while (iterations
--)
10611 rtx x2
= gen_reg_rtx (mode
);
10612 emit_set_insn (x2
, gen_rtx_MULT (mode
, xdst
, xdst
));
10614 emit_insn (gen_aarch64_rsqrts (mode
, x1
, src
, x2
));
10616 if (iterations
> 0)
10617 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, x1
));
10622 /* Qualify the approximate reciprocal square root when the argument is
10623 0.0 by squashing the intermediary result to 0.0. */
10624 rtx xtmp
= gen_reg_rtx (mmsk
);
10625 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
10626 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
10627 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
10629 /* Calculate the approximate square root. */
10630 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, src
));
10633 /* Finalize the approximation. */
10634 emit_set_insn (dst
, gen_rtx_MULT (mode
, xdst
, x1
));
10639 /* Emit the instruction sequence to compute the approximation for the division
10640 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10643 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
10645 machine_mode mode
= GET_MODE (quo
);
10647 if (GET_MODE_INNER (mode
) == HFmode
)
10650 bool use_approx_division_p
= (flag_mlow_precision_div
10651 || (aarch64_tune_params
.approx_modes
->division
10652 & AARCH64_APPROX_MODE (mode
)));
10654 if (!flag_finite_math_only
10655 || flag_trapping_math
10656 || !flag_unsafe_math_optimizations
10657 || optimize_function_for_size_p (cfun
)
10658 || !use_approx_division_p
)
10661 if (!TARGET_SIMD
&& VECTOR_MODE_P (mode
))
10664 /* Estimate the approximate reciprocal. */
10665 rtx xrcp
= gen_reg_rtx (mode
);
10666 emit_insn (gen_aarch64_frecpe (mode
, xrcp
, den
));
10668 /* Iterate over the series twice for SF and thrice for DF. */
10669 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
10671 /* Optionally iterate over the series once less for faster performance,
10672 while sacrificing the accuracy. */
10673 if (flag_mlow_precision_div
)
10676 /* Iterate over the series to calculate the approximate reciprocal. */
10677 rtx xtmp
= gen_reg_rtx (mode
);
10678 while (iterations
--)
10680 emit_insn (gen_aarch64_frecps (mode
, xtmp
, xrcp
, den
));
10682 if (iterations
> 0)
10683 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
10686 if (num
!= CONST1_RTX (mode
))
10688 /* As the approximate reciprocal of DEN is already calculated, only
10689 calculate the approximate division when NUM is not 1.0. */
10690 rtx xnum
= force_reg (mode
, num
);
10691 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xnum
));
10694 /* Finalize the approximation. */
10695 emit_set_insn (quo
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
10699 /* Return the number of instructions that can be issued per cycle. */
10701 aarch64_sched_issue_rate (void)
10703 return aarch64_tune_params
.issue_rate
;
10707 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10709 int issue_rate
= aarch64_sched_issue_rate ();
10711 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
10715 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10716 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10717 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10720 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
10723 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
10727 /* Vectorizer cost model target hooks. */
10729 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10731 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
10733 int misalign ATTRIBUTE_UNUSED
)
10736 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
10739 if (vectype
!= NULL
)
10740 fp
= FLOAT_TYPE_P (vectype
);
10742 switch (type_of_cost
)
10745 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
10748 return costs
->scalar_load_cost
;
10751 return costs
->scalar_store_cost
;
10754 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
10757 return costs
->vec_align_load_cost
;
10760 return costs
->vec_store_cost
;
10762 case vec_to_scalar
:
10763 return costs
->vec_to_scalar_cost
;
10765 case scalar_to_vec
:
10766 return costs
->scalar_to_vec_cost
;
10768 case unaligned_load
:
10769 case vector_gather_load
:
10770 return costs
->vec_unalign_load_cost
;
10772 case unaligned_store
:
10773 case vector_scatter_store
:
10774 return costs
->vec_unalign_store_cost
;
10776 case cond_branch_taken
:
10777 return costs
->cond_taken_branch_cost
;
10779 case cond_branch_not_taken
:
10780 return costs
->cond_not_taken_branch_cost
;
10783 return costs
->vec_permute_cost
;
10785 case vec_promote_demote
:
10786 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
10788 case vec_construct
:
10789 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
10790 return elements
/ 2 + 1;
10793 gcc_unreachable ();
10797 /* Implement targetm.vectorize.add_stmt_cost. */
10799 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
10800 struct _stmt_vec_info
*stmt_info
, int misalign
,
10801 enum vect_cost_model_location where
)
10803 unsigned *cost
= (unsigned *) data
;
10804 unsigned retval
= 0;
10806 if (flag_vect_cost_model
)
10808 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
10810 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
10812 /* Statements in an inner loop relative to the loop being
10813 vectorized are weighted more heavily. The value here is
10814 arbitrary and could potentially be improved with analysis. */
10815 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
10816 count
*= 50; /* FIXME */
10818 retval
= (unsigned) (count
* stmt_cost
);
10819 cost
[where
] += retval
;
10825 static void initialize_aarch64_code_model (struct gcc_options
*);
10827 /* Parse the TO_PARSE string and put the architecture struct that it
10828 selects into RES and the architectural features into ISA_FLAGS.
10829 Return an aarch64_parse_opt_result describing the parse result.
10830 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
10831 When the TO_PARSE string contains an invalid extension,
10832 a copy of the string is created and stored to INVALID_EXTENSION. */
10834 static enum aarch64_parse_opt_result
10835 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
10836 unsigned long *isa_flags
, std::string
*invalid_extension
)
10839 const struct processor
*arch
;
10842 ext
= strchr (to_parse
, '+');
10845 len
= ext
- to_parse
;
10847 len
= strlen (to_parse
);
10850 return AARCH64_PARSE_MISSING_ARG
;
10853 /* Loop through the list of supported ARCHes to find a match. */
10854 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
10856 if (strlen (arch
->name
) == len
10857 && strncmp (arch
->name
, to_parse
, len
) == 0)
10859 unsigned long isa_temp
= arch
->flags
;
10863 /* TO_PARSE string contains at least one extension. */
10864 enum aarch64_parse_opt_result ext_res
10865 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
10867 if (ext_res
!= AARCH64_PARSE_OK
)
10870 /* Extension parsing was successful. Confirm the result
10871 arch and ISA flags. */
10873 *isa_flags
= isa_temp
;
10874 return AARCH64_PARSE_OK
;
10878 /* ARCH name not found in list. */
10879 return AARCH64_PARSE_INVALID_ARG
;
10882 /* Parse the TO_PARSE string and put the result tuning in RES and the
10883 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10884 describing the parse result. If there is an error parsing, RES and
10885 ISA_FLAGS are left unchanged.
10886 When the TO_PARSE string contains an invalid extension,
10887 a copy of the string is created and stored to INVALID_EXTENSION. */
10889 static enum aarch64_parse_opt_result
10890 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
10891 unsigned long *isa_flags
, std::string
*invalid_extension
)
10894 const struct processor
*cpu
;
10897 ext
= strchr (to_parse
, '+');
10900 len
= ext
- to_parse
;
10902 len
= strlen (to_parse
);
10905 return AARCH64_PARSE_MISSING_ARG
;
10908 /* Loop through the list of supported CPUs to find a match. */
10909 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
10911 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
10913 unsigned long isa_temp
= cpu
->flags
;
10918 /* TO_PARSE string contains at least one extension. */
10919 enum aarch64_parse_opt_result ext_res
10920 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
10922 if (ext_res
!= AARCH64_PARSE_OK
)
10925 /* Extension parsing was successfull. Confirm the result
10926 cpu and ISA flags. */
10928 *isa_flags
= isa_temp
;
10929 return AARCH64_PARSE_OK
;
10933 /* CPU name not found in list. */
10934 return AARCH64_PARSE_INVALID_ARG
;
10937 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10938 Return an aarch64_parse_opt_result describing the parse result.
10939 If the parsing fails the RES does not change. */
10941 static enum aarch64_parse_opt_result
10942 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
10944 const struct processor
*cpu
;
10946 /* Loop through the list of supported CPUs to find a match. */
10947 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
10949 if (strcmp (cpu
->name
, to_parse
) == 0)
10952 return AARCH64_PARSE_OK
;
10956 /* CPU name not found in list. */
10957 return AARCH64_PARSE_INVALID_ARG
;
10960 /* Parse TOKEN, which has length LENGTH to see if it is an option
10961 described in FLAG. If it is, return the index bit for that fusion type.
10962 If not, error (printing OPTION_NAME) and return zero. */
10964 static unsigned int
10965 aarch64_parse_one_option_token (const char *token
,
10967 const struct aarch64_flag_desc
*flag
,
10968 const char *option_name
)
10970 for (; flag
->name
!= NULL
; flag
++)
10972 if (length
== strlen (flag
->name
)
10973 && !strncmp (flag
->name
, token
, length
))
10977 error ("unknown flag passed in -moverride=%s (%s)", option_name
, token
);
10981 /* Parse OPTION which is a comma-separated list of flags to enable.
10982 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10983 default state we inherit from the CPU tuning structures. OPTION_NAME
10984 gives the top-level option we are parsing in the -moverride string,
10985 for use in error messages. */
10987 static unsigned int
10988 aarch64_parse_boolean_options (const char *option
,
10989 const struct aarch64_flag_desc
*flags
,
10990 unsigned int initial_state
,
10991 const char *option_name
)
10993 const char separator
= '.';
10994 const char* specs
= option
;
10995 const char* ntoken
= option
;
10996 unsigned int found_flags
= initial_state
;
10998 while ((ntoken
= strchr (specs
, separator
)))
11000 size_t token_length
= ntoken
- specs
;
11001 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
11005 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11006 in the token stream, reset the supported operations. So:
11008 adrp+add.cmp+branch.none.adrp+add
11010 would have the result of turning on only adrp+add fusion. */
11014 found_flags
|= token_ops
;
11018 /* We ended with a comma, print something. */
11021 error ("%s string ill-formed\n", option_name
);
11025 /* We still have one more token to parse. */
11026 size_t token_length
= strlen (specs
);
11027 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
11034 found_flags
|= token_ops
;
11035 return found_flags
;
11038 /* Support for overriding instruction fusion. */
11041 aarch64_parse_fuse_string (const char *fuse_string
,
11042 struct tune_params
*tune
)
11044 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
11045 aarch64_fusible_pairs
,
11050 /* Support for overriding other tuning flags. */
11053 aarch64_parse_tune_string (const char *tune_string
,
11054 struct tune_params
*tune
)
11056 tune
->extra_tuning_flags
11057 = aarch64_parse_boolean_options (tune_string
,
11058 aarch64_tuning_flags
,
11059 tune
->extra_tuning_flags
,
11063 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11064 Accept the valid SVE vector widths allowed by
11065 aarch64_sve_vector_bits_enum and use it to override sve_width
11069 aarch64_parse_sve_width_string (const char *tune_string
,
11070 struct tune_params
*tune
)
11074 int n
= sscanf (tune_string
, "%d", &width
);
11077 error ("invalid format for sve_width");
11089 error ("invalid sve_width value: %d", width
);
11091 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
11094 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
11095 we understand. If it is, extract the option string and handoff to
11096 the appropriate function. */
11099 aarch64_parse_one_override_token (const char* token
,
11101 struct tune_params
*tune
)
11103 const struct aarch64_tuning_override_function
*fn
11104 = aarch64_tuning_override_functions
;
11106 const char *option_part
= strchr (token
, '=');
11109 error ("tuning string missing in option (%s)", token
);
11113 /* Get the length of the option name. */
11114 length
= option_part
- token
;
11115 /* Skip the '=' to get to the option string. */
11118 for (; fn
->name
!= NULL
; fn
++)
11120 if (!strncmp (fn
->name
, token
, length
))
11122 fn
->parse_override (option_part
, tune
);
11127 error ("unknown tuning option (%s)",token
);
11131 /* A checking mechanism for the implementation of the tls size. */
11134 initialize_aarch64_tls_size (struct gcc_options
*opts
)
11136 if (aarch64_tls_size
== 0)
11137 aarch64_tls_size
= 24;
11139 switch (opts
->x_aarch64_cmodel_var
)
11141 case AARCH64_CMODEL_TINY
:
11142 /* Both the default and maximum TLS size allowed under tiny is 1M which
11143 needs two instructions to address, so we clamp the size to 24. */
11144 if (aarch64_tls_size
> 24)
11145 aarch64_tls_size
= 24;
11147 case AARCH64_CMODEL_SMALL
:
11148 /* The maximum TLS size allowed under small is 4G. */
11149 if (aarch64_tls_size
> 32)
11150 aarch64_tls_size
= 32;
11152 case AARCH64_CMODEL_LARGE
:
11153 /* The maximum TLS size allowed under large is 16E.
11154 FIXME: 16E should be 64bit, we only support 48bit offset now. */
11155 if (aarch64_tls_size
> 48)
11156 aarch64_tls_size
= 48;
11159 gcc_unreachable ();
11165 /* Parse STRING looking for options in the format:
11166 string :: option:string
11167 option :: name=substring
11169 substring :: defined by option. */
11172 aarch64_parse_override_string (const char* input_string
,
11173 struct tune_params
* tune
)
11175 const char separator
= ':';
11176 size_t string_length
= strlen (input_string
) + 1;
11177 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
11178 char *string
= string_root
;
11179 strncpy (string
, input_string
, string_length
);
11180 string
[string_length
- 1] = '\0';
11182 char* ntoken
= string
;
11184 while ((ntoken
= strchr (string
, separator
)))
11186 size_t token_length
= ntoken
- string
;
11187 /* Make this substring look like a string. */
11189 aarch64_parse_one_override_token (string
, token_length
, tune
);
11193 /* One last option to parse. */
11194 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
11195 free (string_root
);
11200 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
11202 if (accepted_branch_protection_string
)
11204 opts
->x_aarch64_branch_protection_string
11205 = xstrdup (accepted_branch_protection_string
);
11208 /* PR 70044: We have to be careful about being called multiple times for the
11209 same function. This means all changes should be repeatable. */
11211 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11212 Disable the frame pointer flag so the mid-end will not use a frame
11213 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11214 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11215 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11216 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
11217 if (opts
->x_flag_omit_frame_pointer
== 0)
11218 opts
->x_flag_omit_frame_pointer
= 2;
11220 /* If not optimizing for size, set the default
11221 alignment to what the target wants. */
11222 if (!opts
->x_optimize_size
)
11224 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
11225 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
11226 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
11227 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
11228 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
11229 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
11232 /* We default to no pc-relative literal loads. */
11234 aarch64_pcrelative_literal_loads
= false;
11236 /* If -mpc-relative-literal-loads is set on the command line, this
11237 implies that the user asked for PC relative literal loads. */
11238 if (opts
->x_pcrelative_literal_loads
== 1)
11239 aarch64_pcrelative_literal_loads
= true;
11241 /* In the tiny memory model it makes no sense to disallow PC relative
11242 literal pool loads. */
11243 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
11244 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
11245 aarch64_pcrelative_literal_loads
= true;
11247 /* When enabling the lower precision Newton series for the square root, also
11248 enable it for the reciprocal square root, since the latter is an
11249 intermediary step for the former. */
11250 if (flag_mlow_precision_sqrt
)
11251 flag_mrecip_low_precision_sqrt
= true;
11254 /* 'Unpack' up the internal tuning structs and update the options
11255 in OPTS. The caller must have set up selected_tune and selected_arch
11256 as all the other target-specific codegen decisions are
11257 derived from them. */
11260 aarch64_override_options_internal (struct gcc_options
*opts
)
11262 aarch64_tune_flags
= selected_tune
->flags
;
11263 aarch64_tune
= selected_tune
->sched_core
;
11264 /* Make a copy of the tuning parameters attached to the core, which
11265 we may later overwrite. */
11266 aarch64_tune_params
= *(selected_tune
->tune
);
11267 aarch64_architecture_version
= selected_arch
->architecture_version
;
11269 if (opts
->x_aarch64_override_tune_string
)
11270 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
11271 &aarch64_tune_params
);
11273 /* This target defaults to strict volatile bitfields. */
11274 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
11275 opts
->x_flag_strict_volatile_bitfields
= 1;
11277 initialize_aarch64_code_model (opts
);
11278 initialize_aarch64_tls_size (opts
);
11280 int queue_depth
= 0;
11281 switch (aarch64_tune_params
.autoprefetcher_model
)
11283 case tune_params::AUTOPREFETCHER_OFF
:
11286 case tune_params::AUTOPREFETCHER_WEAK
:
11289 case tune_params::AUTOPREFETCHER_STRONG
:
11290 queue_depth
= max_insn_queue_index
+ 1;
11293 gcc_unreachable ();
11296 /* We don't mind passing in global_options_set here as we don't use
11297 the *options_set structs anyway. */
11298 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
,
11300 opts
->x_param_values
,
11301 global_options_set
.x_param_values
);
11303 /* Set up parameters to be used in prefetching algorithm. Do not
11304 override the defaults unless we are tuning for a core we have
11305 researched values for. */
11306 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
11307 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES
,
11308 aarch64_tune_params
.prefetch
->num_slots
,
11309 opts
->x_param_values
,
11310 global_options_set
.x_param_values
);
11311 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
11312 maybe_set_param_value (PARAM_L1_CACHE_SIZE
,
11313 aarch64_tune_params
.prefetch
->l1_cache_size
,
11314 opts
->x_param_values
,
11315 global_options_set
.x_param_values
);
11316 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
11317 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE
,
11318 aarch64_tune_params
.prefetch
->l1_cache_line_size
,
11319 opts
->x_param_values
,
11320 global_options_set
.x_param_values
);
11321 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
11322 maybe_set_param_value (PARAM_L2_CACHE_SIZE
,
11323 aarch64_tune_params
.prefetch
->l2_cache_size
,
11324 opts
->x_param_values
,
11325 global_options_set
.x_param_values
);
11326 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
11327 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES
,
11329 opts
->x_param_values
,
11330 global_options_set
.x_param_values
);
11331 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
11332 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE
,
11333 aarch64_tune_params
.prefetch
->minimum_stride
,
11334 opts
->x_param_values
,
11335 global_options_set
.x_param_values
);
11337 /* Use the alternative scheduling-pressure algorithm by default. */
11338 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM
, SCHED_PRESSURE_MODEL
,
11339 opts
->x_param_values
,
11340 global_options_set
.x_param_values
);
11342 /* If the user hasn't changed it via configure then set the default to 64 KB
11343 for the backend. */
11344 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
,
11345 DEFAULT_STK_CLASH_GUARD_SIZE
== 0
11346 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE
,
11347 opts
->x_param_values
,
11348 global_options_set
.x_param_values
);
11350 /* Validate the guard size. */
11351 int guard_size
= PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
11353 /* Enforce that interval is the same size as size so the mid-end does the
11355 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
,
11357 opts
->x_param_values
,
11358 global_options_set
.x_param_values
);
11360 /* The maybe_set calls won't update the value if the user has explicitly set
11361 one. Which means we need to validate that probing interval and guard size
11364 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
);
11365 if (guard_size
!= probe_interval
)
11366 error ("stack clash guard size '%d' must be equal to probing interval "
11367 "'%d'", guard_size
, probe_interval
);
11369 /* Enable sw prefetching at specified optimization level for
11370 CPUS that have prefetch. Lower optimization level threshold by 1
11371 when profiling is enabled. */
11372 if (opts
->x_flag_prefetch_loop_arrays
< 0
11373 && !opts
->x_optimize_size
11374 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
11375 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
11376 opts
->x_flag_prefetch_loop_arrays
= 1;
11378 if (opts
->x_aarch64_arch_string
== NULL
)
11379 opts
->x_aarch64_arch_string
= selected_arch
->name
;
11380 if (opts
->x_aarch64_cpu_string
== NULL
)
11381 opts
->x_aarch64_cpu_string
= selected_cpu
->name
;
11382 if (opts
->x_aarch64_tune_string
== NULL
)
11383 opts
->x_aarch64_tune_string
= selected_tune
->name
;
11385 aarch64_override_options_after_change_1 (opts
);
11388 /* Print a hint with a suggestion for a core or architecture name that
11389 most closely resembles what the user passed in STR. ARCH is true if
11390 the user is asking for an architecture name. ARCH is false if the user
11391 is asking for a core name. */
11394 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
11396 auto_vec
<const char *> candidates
;
11397 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
11398 for (; entry
->name
!= NULL
; entry
++)
11399 candidates
.safe_push (entry
->name
);
11401 #ifdef HAVE_LOCAL_CPU_DETECT
11402 /* Add also "native" as possible value. */
11404 candidates
.safe_push ("native");
11408 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
11410 inform (input_location
, "valid arguments are: %s;"
11411 " did you mean %qs?", s
, hint
);
11413 inform (input_location
, "valid arguments are: %s", s
);
11418 /* Print a hint with a suggestion for a core name that most closely resembles
11419 what the user passed in STR. */
11422 aarch64_print_hint_for_core (const char *str
)
11424 aarch64_print_hint_for_core_or_arch (str
, false);
11427 /* Print a hint with a suggestion for an architecture name that most closely
11428 resembles what the user passed in STR. */
11431 aarch64_print_hint_for_arch (const char *str
)
11433 aarch64_print_hint_for_core_or_arch (str
, true);
11437 /* Print a hint with a suggestion for an extension name
11438 that most closely resembles what the user passed in STR. */
11441 aarch64_print_hint_for_extensions (const std::string
&str
)
11443 auto_vec
<const char *> candidates
;
11444 aarch64_get_all_extension_candidates (&candidates
);
11446 const char *hint
= candidates_list_and_hint (str
.c_str (), s
, candidates
);
11448 inform (input_location
, "valid arguments are: %s;"
11449 " did you mean %qs?", s
, hint
);
11451 inform (input_location
, "valid arguments are: %s;", s
);
11456 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11457 specified in STR and throw errors if appropriate. Put the results if
11458 they are valid in RES and ISA_FLAGS. Return whether the option is
11462 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
11463 unsigned long *isa_flags
)
11465 std::string invalid_extension
;
11466 enum aarch64_parse_opt_result parse_res
11467 = aarch64_parse_cpu (str
, res
, isa_flags
, &invalid_extension
);
11469 if (parse_res
== AARCH64_PARSE_OK
)
11474 case AARCH64_PARSE_MISSING_ARG
:
11475 error ("missing cpu name in %<-mcpu=%s%>", str
);
11477 case AARCH64_PARSE_INVALID_ARG
:
11478 error ("unknown value %qs for -mcpu", str
);
11479 aarch64_print_hint_for_core (str
);
11481 case AARCH64_PARSE_INVALID_FEATURE
:
11482 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11483 invalid_extension
.c_str (), str
);
11484 aarch64_print_hint_for_extensions (invalid_extension
);
11487 gcc_unreachable ();
11493 /* Parses CONST_STR for branch protection features specified in
11494 aarch64_branch_protect_types, and set any global variables required. Returns
11495 the parsing result and assigns LAST_STR to the last processed token from
11496 CONST_STR so that it can be used for error reporting. */
11499 aarch64_parse_opt_result
aarch64_parse_branch_protection (const char *const_str
,
11502 char *str_root
= xstrdup (const_str
);
11503 char* token_save
= NULL
;
11504 char *str
= strtok_r (str_root
, "+", &token_save
);
11505 enum aarch64_parse_opt_result res
= AARCH64_PARSE_OK
;
11507 res
= AARCH64_PARSE_MISSING_ARG
;
11510 char *next_str
= strtok_r (NULL
, "+", &token_save
);
11511 /* Reset the branch protection features to their defaults. */
11512 aarch64_handle_no_branch_protection (NULL
, NULL
);
11514 while (str
&& res
== AARCH64_PARSE_OK
)
11516 const aarch64_branch_protect_type
* type
= aarch64_branch_protect_types
;
11517 bool found
= false;
11518 /* Search for this type. */
11519 while (type
&& type
->name
&& !found
&& res
== AARCH64_PARSE_OK
)
11521 if (strcmp (str
, type
->name
) == 0)
11524 res
= type
->handler (str
, next_str
);
11526 next_str
= strtok_r (NULL
, "+", &token_save
);
11531 if (found
&& res
== AARCH64_PARSE_OK
)
11533 bool found_subtype
= true;
11534 /* Loop through each token until we find one that isn't a
11536 while (found_subtype
)
11538 found_subtype
= false;
11539 const aarch64_branch_protect_type
*subtype
= type
->subtypes
;
11540 /* Search for the subtype. */
11541 while (str
&& subtype
&& subtype
->name
&& !found_subtype
11542 && res
== AARCH64_PARSE_OK
)
11544 if (strcmp (str
, subtype
->name
) == 0)
11546 found_subtype
= true;
11547 res
= subtype
->handler (str
, next_str
);
11549 next_str
= strtok_r (NULL
, "+", &token_save
);
11557 res
= AARCH64_PARSE_INVALID_ARG
;
11560 /* Copy the last processed token into the argument to pass it back.
11561 Used by option and attribute validation to print the offending token. */
11564 if (str
) strcpy (*last_str
, str
);
11565 else *last_str
= NULL
;
11567 if (res
== AARCH64_PARSE_OK
)
11569 /* If needed, alloc the accepted string then copy in const_str.
11570 Used by override_option_after_change_1. */
11571 if (!accepted_branch_protection_string
)
11572 accepted_branch_protection_string
= (char *) xmalloc (
11573 BRANCH_PROTECT_STR_MAX
11575 strncpy (accepted_branch_protection_string
, const_str
,
11576 BRANCH_PROTECT_STR_MAX
+ 1);
11577 /* Forcibly null-terminate. */
11578 accepted_branch_protection_string
[BRANCH_PROTECT_STR_MAX
] = '\0';
11584 aarch64_validate_mbranch_protection (const char *const_str
)
11586 char *str
= (char *) xmalloc (strlen (const_str
));
11587 enum aarch64_parse_opt_result res
=
11588 aarch64_parse_branch_protection (const_str
, &str
);
11589 if (res
== AARCH64_PARSE_INVALID_ARG
)
11590 error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str
);
11591 else if (res
== AARCH64_PARSE_MISSING_ARG
)
11592 error ("missing arg for %<-mbranch-protection=%>");
11594 return res
== AARCH64_PARSE_OK
;
11597 /* Validate a command-line -march option. Parse the arch and extensions
11598 (if any) specified in STR and throw errors if appropriate. Put the
11599 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11600 option is valid. */
11603 aarch64_validate_march (const char *str
, const struct processor
**res
,
11604 unsigned long *isa_flags
)
11606 std::string invalid_extension
;
11607 enum aarch64_parse_opt_result parse_res
11608 = aarch64_parse_arch (str
, res
, isa_flags
, &invalid_extension
);
11610 if (parse_res
== AARCH64_PARSE_OK
)
11615 case AARCH64_PARSE_MISSING_ARG
:
11616 error ("missing arch name in %<-march=%s%>", str
);
11618 case AARCH64_PARSE_INVALID_ARG
:
11619 error ("unknown value %qs for -march", str
);
11620 aarch64_print_hint_for_arch (str
);
11622 case AARCH64_PARSE_INVALID_FEATURE
:
11623 error ("invalid feature modifier %qs in %<-march=%s%>",
11624 invalid_extension
.c_str (), str
);
11625 aarch64_print_hint_for_extensions (invalid_extension
);
11628 gcc_unreachable ();
11634 /* Validate a command-line -mtune option. Parse the cpu
11635 specified in STR and throw errors if appropriate. Put the
11636 result, if it is valid, in RES. Return whether the option is
11640 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
11642 enum aarch64_parse_opt_result parse_res
11643 = aarch64_parse_tune (str
, res
);
11645 if (parse_res
== AARCH64_PARSE_OK
)
11650 case AARCH64_PARSE_MISSING_ARG
:
11651 error ("missing cpu name in %<-mtune=%s%>", str
);
11653 case AARCH64_PARSE_INVALID_ARG
:
11654 error ("unknown value %qs for -mtune", str
);
11655 aarch64_print_hint_for_core (str
);
11658 gcc_unreachable ();
11663 /* Return the CPU corresponding to the enum CPU.
11664 If it doesn't specify a cpu, return the default. */
11666 static const struct processor
*
11667 aarch64_get_tune_cpu (enum aarch64_processor cpu
)
11669 if (cpu
!= aarch64_none
)
11670 return &all_cores
[cpu
];
11672 /* The & 0x3f is to extract the bottom 6 bits that encode the
11673 default cpu as selected by the --with-cpu GCC configure option
11675 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11676 flags mechanism should be reworked to make it more sane. */
11677 return &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
11680 /* Return the architecture corresponding to the enum ARCH.
11681 If it doesn't specify a valid architecture, return the default. */
11683 static const struct processor
*
11684 aarch64_get_arch (enum aarch64_arch arch
)
11686 if (arch
!= aarch64_no_arch
)
11687 return &all_architectures
[arch
];
11689 const struct processor
*cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
11691 return &all_architectures
[cpu
->arch
];
11694 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
11697 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value
)
11699 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
11700 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
11701 deciding which .md file patterns to use and when deciding whether
11702 something is a legitimate address or constant. */
11703 if (value
== SVE_SCALABLE
|| value
== SVE_128
)
11704 return poly_uint16 (2, 2);
11706 return (int) value
/ 64;
11709 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
11710 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
11711 tuning structs. In particular it must set selected_tune and
11712 aarch64_isa_flags that define the available ISA features and tuning
11713 decisions. It must also set selected_arch as this will be used to
11714 output the .arch asm tags for each function. */
11717 aarch64_override_options (void)
11719 unsigned long cpu_isa
= 0;
11720 unsigned long arch_isa
= 0;
11721 aarch64_isa_flags
= 0;
11723 bool valid_cpu
= true;
11724 bool valid_tune
= true;
11725 bool valid_arch
= true;
11727 selected_cpu
= NULL
;
11728 selected_arch
= NULL
;
11729 selected_tune
= NULL
;
11731 if (aarch64_branch_protection_string
)
11732 aarch64_validate_mbranch_protection (aarch64_branch_protection_string
);
11734 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
11735 If either of -march or -mtune is given, they override their
11736 respective component of -mcpu. */
11737 if (aarch64_cpu_string
)
11738 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
11741 if (aarch64_arch_string
)
11742 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
11745 if (aarch64_tune_string
)
11746 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
11748 #ifdef SUBTARGET_OVERRIDE_OPTIONS
11749 SUBTARGET_OVERRIDE_OPTIONS
;
11752 /* If the user did not specify a processor, choose the default
11753 one for them. This will be the CPU set during configuration using
11754 --with-cpu, otherwise it is "generic". */
11759 selected_cpu
= &all_cores
[selected_arch
->ident
];
11760 aarch64_isa_flags
= arch_isa
;
11761 explicit_arch
= selected_arch
->arch
;
11765 /* Get default configure-time CPU. */
11766 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
11767 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
11771 explicit_tune_core
= selected_tune
->ident
;
11773 /* If both -mcpu and -march are specified check that they are architecturally
11774 compatible, warn if they're not and prefer the -march ISA flags. */
11775 else if (selected_arch
)
11777 if (selected_arch
->arch
!= selected_cpu
->arch
)
11779 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
11780 all_architectures
[selected_cpu
->arch
].name
,
11781 selected_arch
->name
);
11783 aarch64_isa_flags
= arch_isa
;
11784 explicit_arch
= selected_arch
->arch
;
11785 explicit_tune_core
= selected_tune
? selected_tune
->ident
11786 : selected_cpu
->ident
;
11790 /* -mcpu but no -march. */
11791 aarch64_isa_flags
= cpu_isa
;
11792 explicit_tune_core
= selected_tune
? selected_tune
->ident
11793 : selected_cpu
->ident
;
11794 gcc_assert (selected_cpu
);
11795 selected_arch
= &all_architectures
[selected_cpu
->arch
];
11796 explicit_arch
= selected_arch
->arch
;
11799 /* Set the arch as well as we will need it when outputing
11800 the .arch directive in assembly. */
11801 if (!selected_arch
)
11803 gcc_assert (selected_cpu
);
11804 selected_arch
= &all_architectures
[selected_cpu
->arch
];
11807 if (!selected_tune
)
11808 selected_tune
= selected_cpu
;
11810 #ifndef HAVE_AS_MABI_OPTION
11811 /* The compiler may have been configured with 2.23.* binutils, which does
11812 not have support for ILP32. */
11814 error ("assembler does not support -mabi=ilp32");
11817 /* Convert -msve-vector-bits to a VG count. */
11818 aarch64_sve_vg
= aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits
);
11820 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
11821 sorry ("return address signing is only supported for -mabi=lp64");
11823 /* Make sure we properly set up the explicit options. */
11824 if ((aarch64_cpu_string
&& valid_cpu
)
11825 || (aarch64_tune_string
&& valid_tune
))
11826 gcc_assert (explicit_tune_core
!= aarch64_none
);
11828 if ((aarch64_cpu_string
&& valid_cpu
)
11829 || (aarch64_arch_string
&& valid_arch
))
11830 gcc_assert (explicit_arch
!= aarch64_no_arch
);
11832 /* The pass to insert speculation tracking runs before
11833 shrink-wrapping and the latter does not know how to update the
11834 tracking status. So disable it in this case. */
11835 if (aarch64_track_speculation
)
11836 flag_shrink_wrap
= 0;
11838 aarch64_override_options_internal (&global_options
);
11840 /* Save these options as the default ones in case we push and pop them later
11841 while processing functions with potential target attributes. */
11842 target_option_default_node
= target_option_current_node
11843 = build_target_option_node (&global_options
);
11846 /* Implement targetm.override_options_after_change. */
11849 aarch64_override_options_after_change (void)
11851 aarch64_override_options_after_change_1 (&global_options
);
11854 static struct machine_function
*
11855 aarch64_init_machine_status (void)
11857 struct machine_function
*machine
;
11858 machine
= ggc_cleared_alloc
<machine_function
> ();
11863 aarch64_init_expanders (void)
11865 init_machine_status
= aarch64_init_machine_status
;
11868 /* A checking mechanism for the implementation of the various code models. */
11870 initialize_aarch64_code_model (struct gcc_options
*opts
)
11872 if (opts
->x_flag_pic
)
11874 switch (opts
->x_aarch64_cmodel_var
)
11876 case AARCH64_CMODEL_TINY
:
11877 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
11879 case AARCH64_CMODEL_SMALL
:
11880 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11881 aarch64_cmodel
= (flag_pic
== 2
11882 ? AARCH64_CMODEL_SMALL_PIC
11883 : AARCH64_CMODEL_SMALL_SPIC
);
11885 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
11888 case AARCH64_CMODEL_LARGE
:
11889 sorry ("code model %qs with -f%s", "large",
11890 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
11893 gcc_unreachable ();
11897 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
11900 /* Implement TARGET_OPTION_SAVE. */
11903 aarch64_option_save (struct cl_target_option
*ptr
, struct gcc_options
*opts
)
11905 ptr
->x_aarch64_override_tune_string
= opts
->x_aarch64_override_tune_string
;
11906 ptr
->x_aarch64_branch_protection_string
11907 = opts
->x_aarch64_branch_protection_string
;
11910 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11911 using the information saved in PTR. */
11914 aarch64_option_restore (struct gcc_options
*opts
, struct cl_target_option
*ptr
)
11916 opts
->x_explicit_tune_core
= ptr
->x_explicit_tune_core
;
11917 selected_tune
= aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
11918 opts
->x_explicit_arch
= ptr
->x_explicit_arch
;
11919 selected_arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
11920 opts
->x_aarch64_override_tune_string
= ptr
->x_aarch64_override_tune_string
;
11921 opts
->x_aarch64_branch_protection_string
11922 = ptr
->x_aarch64_branch_protection_string
;
11923 if (opts
->x_aarch64_branch_protection_string
)
11925 aarch64_parse_branch_protection (opts
->x_aarch64_branch_protection_string
,
11929 aarch64_override_options_internal (opts
);
11932 /* Implement TARGET_OPTION_PRINT. */
11935 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
11937 const struct processor
*cpu
11938 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
11939 unsigned long isa_flags
= ptr
->x_aarch64_isa_flags
;
11940 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
11941 std::string extension
11942 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
11944 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
11945 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
11946 arch
->name
, extension
.c_str ());
11949 static GTY(()) tree aarch64_previous_fndecl
;
11952 aarch64_reset_previous_fndecl (void)
11954 aarch64_previous_fndecl
= NULL
;
11957 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11958 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11959 make sure optab availability predicates are recomputed when necessary. */
11962 aarch64_save_restore_target_globals (tree new_tree
)
11964 if (TREE_TARGET_GLOBALS (new_tree
))
11965 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
11966 else if (new_tree
== target_option_default_node
)
11967 restore_target_globals (&default_target_globals
);
11969 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
11972 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11973 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11974 of the function, if such exists. This function may be called multiple
11975 times on a single function so use aarch64_previous_fndecl to avoid
11976 setting up identical state. */
11979 aarch64_set_current_function (tree fndecl
)
11981 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
11984 tree old_tree
= (aarch64_previous_fndecl
11985 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
11988 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
11990 /* If current function has no attributes but the previous one did,
11991 use the default node. */
11992 if (!new_tree
&& old_tree
)
11993 new_tree
= target_option_default_node
;
11995 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11996 the default have been handled by aarch64_save_restore_target_globals from
11997 aarch64_pragma_target_parse. */
11998 if (old_tree
== new_tree
)
12001 aarch64_previous_fndecl
= fndecl
;
12003 /* First set the target options. */
12004 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
12006 aarch64_save_restore_target_globals (new_tree
);
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};
12020 /* All the information needed to handle a target attribute.
12021 NAME is the name of the attribute.
12022 ATTR_TYPE specifies the type of behavior of the attribute as described
12023 in the definition of enum aarch64_attr_opt_type.
12024 ALLOW_NEG is true if the attribute supports a "no-" form.
12025 HANDLER is the function that takes the attribute string as an argument
12026 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12027 OPT_NUM is the enum specifying the option that the attribute modifies.
12028 This is needed for attributes that mirror the behavior of a command-line
12029 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12030 aarch64_attr_enum. */
12032 struct aarch64_attribute_info
12035 enum aarch64_attr_opt_type attr_type
;
12037 bool (*handler
) (const char *);
12038 enum opt_code opt_num
;
12041 /* Handle the ARCH_STR argument to the arch= target attribute. */
12044 aarch64_handle_attr_arch (const char *str
)
12046 const struct processor
*tmp_arch
= NULL
;
12047 std::string invalid_extension
;
12048 enum aarch64_parse_opt_result parse_res
12049 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
, &invalid_extension
);
12051 if (parse_res
== AARCH64_PARSE_OK
)
12053 gcc_assert (tmp_arch
);
12054 selected_arch
= tmp_arch
;
12055 explicit_arch
= selected_arch
->arch
;
12061 case AARCH64_PARSE_MISSING_ARG
:
12062 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12064 case AARCH64_PARSE_INVALID_ARG
:
12065 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str
);
12066 aarch64_print_hint_for_arch (str
);
12068 case AARCH64_PARSE_INVALID_FEATURE
:
12069 error ("invalid feature modifier %s of value (\"%s\") in "
12070 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
12071 aarch64_print_hint_for_extensions (invalid_extension
);
12074 gcc_unreachable ();
12080 /* Handle the argument CPU_STR to the cpu= target attribute. */
12083 aarch64_handle_attr_cpu (const char *str
)
12085 const struct processor
*tmp_cpu
= NULL
;
12086 std::string invalid_extension
;
12087 enum aarch64_parse_opt_result parse_res
12088 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
, &invalid_extension
);
12090 if (parse_res
== AARCH64_PARSE_OK
)
12092 gcc_assert (tmp_cpu
);
12093 selected_tune
= tmp_cpu
;
12094 explicit_tune_core
= selected_tune
->ident
;
12096 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
12097 explicit_arch
= selected_arch
->arch
;
12103 case AARCH64_PARSE_MISSING_ARG
:
12104 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12106 case AARCH64_PARSE_INVALID_ARG
:
12107 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str
);
12108 aarch64_print_hint_for_core (str
);
12110 case AARCH64_PARSE_INVALID_FEATURE
:
12111 error ("invalid feature modifier %s of value (\"%s\") in "
12112 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
12113 aarch64_print_hint_for_extensions (invalid_extension
);
12116 gcc_unreachable ();
12122 /* Handle the argument STR to the branch-protection= attribute. */
12125 aarch64_handle_attr_branch_protection (const char* str
)
12127 char *err_str
= (char *) xmalloc (strlen (str
));
12128 enum aarch64_parse_opt_result res
= aarch64_parse_branch_protection (str
,
12130 bool success
= false;
12133 case AARCH64_PARSE_MISSING_ARG
:
12134 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12137 case AARCH64_PARSE_INVALID_ARG
:
12138 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12139 "=\")%> pragma or attribute", err_str
);
12141 case AARCH64_PARSE_OK
:
12143 /* Fall through. */
12144 case AARCH64_PARSE_INVALID_FEATURE
:
12147 gcc_unreachable ();
12153 /* Handle the argument STR to the tune= target attribute. */
12156 aarch64_handle_attr_tune (const char *str
)
12158 const struct processor
*tmp_tune
= NULL
;
12159 enum aarch64_parse_opt_result parse_res
12160 = aarch64_parse_tune (str
, &tmp_tune
);
12162 if (parse_res
== AARCH64_PARSE_OK
)
12164 gcc_assert (tmp_tune
);
12165 selected_tune
= tmp_tune
;
12166 explicit_tune_core
= selected_tune
->ident
;
12172 case AARCH64_PARSE_INVALID_ARG
:
12173 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str
);
12174 aarch64_print_hint_for_core (str
);
12177 gcc_unreachable ();
12183 /* Parse an architecture extensions target attribute string specified in STR.
12184 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12185 if successful. Update aarch64_isa_flags to reflect the ISA features
12189 aarch64_handle_attr_isa_flags (char *str
)
12191 enum aarch64_parse_opt_result parse_res
;
12192 unsigned long isa_flags
= aarch64_isa_flags
;
12194 /* We allow "+nothing" in the beginning to clear out all architectural
12195 features if the user wants to handpick specific features. */
12196 if (strncmp ("+nothing", str
, 8) == 0)
12202 std::string invalid_extension
;
12203 parse_res
= aarch64_parse_extension (str
, &isa_flags
, &invalid_extension
);
12205 if (parse_res
== AARCH64_PARSE_OK
)
12207 aarch64_isa_flags
= isa_flags
;
12213 case AARCH64_PARSE_MISSING_ARG
:
12214 error ("missing value in %<target()%> pragma or attribute");
12217 case AARCH64_PARSE_INVALID_FEATURE
:
12218 error ("invalid feature modifier %s of value (\"%s\") in "
12219 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
12223 gcc_unreachable ();
12229 /* The target attributes that we support. On top of these we also support just
12230 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12231 handled explicitly in aarch64_process_one_target_attr. */
12233 static const struct aarch64_attribute_info aarch64_attributes
[] =
12235 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
12236 OPT_mgeneral_regs_only
},
12237 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
12238 OPT_mfix_cortex_a53_835769
},
12239 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
12240 OPT_mfix_cortex_a53_843419
},
12241 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
12242 { "strict-align", aarch64_attr_mask
, true, NULL
, OPT_mstrict_align
},
12243 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
12244 OPT_momit_leaf_frame_pointer
},
12245 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
12246 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
12248 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
12249 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
12251 { "branch-protection", aarch64_attr_custom
, false,
12252 aarch64_handle_attr_branch_protection
, OPT_mbranch_protection_
},
12253 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
12254 OPT_msign_return_address_
},
12255 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
12258 /* Parse ARG_STR which contains the definition of one target attribute.
12259 Show appropriate errors if any or return true if the attribute is valid. */
12262 aarch64_process_one_target_attr (char *arg_str
)
12264 bool invert
= false;
12266 size_t len
= strlen (arg_str
);
12270 error ("malformed %<target()%> pragma or attribute");
12274 char *str_to_check
= (char *) alloca (len
+ 1);
12275 strcpy (str_to_check
, arg_str
);
12277 /* Skip leading whitespace. */
12278 while (*str_to_check
== ' ' || *str_to_check
== '\t')
12281 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12282 It is easier to detect and handle it explicitly here rather than going
12283 through the machinery for the rest of the target attributes in this
12285 if (*str_to_check
== '+')
12286 return aarch64_handle_attr_isa_flags (str_to_check
);
12288 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
12293 char *arg
= strchr (str_to_check
, '=');
12295 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12296 and point ARG to "foo". */
12302 const struct aarch64_attribute_info
*p_attr
;
12303 bool found
= false;
12304 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
12306 /* If the names don't match up, or the user has given an argument
12307 to an attribute that doesn't accept one, or didn't give an argument
12308 to an attribute that expects one, fail to match. */
12309 if (strcmp (str_to_check
, p_attr
->name
) != 0)
12313 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
12314 || p_attr
->attr_type
== aarch64_attr_enum
;
12316 if (attr_need_arg_p
^ (arg
!= NULL
))
12318 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
12322 /* If the name matches but the attribute does not allow "no-" versions
12323 then we can't match. */
12324 if (invert
&& !p_attr
->allow_neg
)
12326 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
12330 switch (p_attr
->attr_type
)
12332 /* Has a custom handler registered.
12333 For example, cpu=, arch=, tune=. */
12334 case aarch64_attr_custom
:
12335 gcc_assert (p_attr
->handler
);
12336 if (!p_attr
->handler (arg
))
12340 /* Either set or unset a boolean option. */
12341 case aarch64_attr_bool
:
12343 struct cl_decoded_option decoded
;
12345 generate_option (p_attr
->opt_num
, NULL
, !invert
,
12346 CL_TARGET
, &decoded
);
12347 aarch64_handle_option (&global_options
, &global_options_set
,
12348 &decoded
, input_location
);
12351 /* Set or unset a bit in the target_flags. aarch64_handle_option
12352 should know what mask to apply given the option number. */
12353 case aarch64_attr_mask
:
12355 struct cl_decoded_option decoded
;
12356 /* We only need to specify the option number.
12357 aarch64_handle_option will know which mask to apply. */
12358 decoded
.opt_index
= p_attr
->opt_num
;
12359 decoded
.value
= !invert
;
12360 aarch64_handle_option (&global_options
, &global_options_set
,
12361 &decoded
, input_location
);
12364 /* Use the option setting machinery to set an option to an enum. */
12365 case aarch64_attr_enum
:
12370 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
12371 &value
, CL_TARGET
);
12374 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
12375 NULL
, DK_UNSPECIFIED
, input_location
,
12380 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
12385 gcc_unreachable ();
12389 /* If we reached here we either have found an attribute and validated
12390 it or didn't match any. If we matched an attribute but its arguments
12391 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
12413 /* Parse the tree in ARGS that contains the target attribute information
12414 and update the global target options space. */
12417 aarch64_process_target_attr (tree args
)
12419 if (TREE_CODE (args
) == TREE_LIST
)
12423 tree head
= TREE_VALUE (args
);
12426 if (!aarch64_process_target_attr (head
))
12429 args
= TREE_CHAIN (args
);
12435 if (TREE_CODE (args
) != STRING_CST
)
12437 error ("attribute %<target%> argument not a string");
12441 size_t len
= strlen (TREE_STRING_POINTER (args
));
12442 char *str_to_check
= (char *) alloca (len
+ 1);
12443 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
12447 error ("malformed %<target()%> pragma or attribute");
12451 /* Used to catch empty spaces between commas i.e.
12452 attribute ((target ("attr1,,attr2"))). */
12453 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
12455 /* Handle multiple target attributes separated by ','. */
12456 char *token
= strtok_r (str_to_check
, ",", &str_to_check
);
12458 unsigned int num_attrs
= 0;
12462 if (!aarch64_process_one_target_attr (token
))
12464 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
12468 token
= strtok_r (NULL
, ",", &str_to_check
);
12471 if (num_attrs
!= num_commas
+ 1)
12473 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
12480 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12481 process attribute ((target ("..."))). */
12484 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
12486 struct cl_target_option cur_target
;
12489 tree new_target
, new_optimize
;
12490 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
12492 /* If what we're processing is the current pragma string then the
12493 target option node is already stored in target_option_current_node
12494 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12495 having to re-parse the string. This is especially useful to keep
12496 arm_neon.h compile times down since that header contains a lot
12497 of intrinsics enclosed in pragmas. */
12498 if (!existing_target
&& args
== current_target_pragma
)
12500 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
12503 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
12505 old_optimize
= build_optimization_node (&global_options
);
12506 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
12508 /* If the function changed the optimization levels as well as setting
12509 target options, start with the optimizations specified. */
12510 if (func_optimize
&& func_optimize
!= old_optimize
)
12511 cl_optimization_restore (&global_options
,
12512 TREE_OPTIMIZATION (func_optimize
));
12514 /* Save the current target options to restore at the end. */
12515 cl_target_option_save (&cur_target
, &global_options
);
12517 /* If fndecl already has some target attributes applied to it, unpack
12518 them so that we add this attribute on top of them, rather than
12519 overwriting them. */
12520 if (existing_target
)
12522 struct cl_target_option
*existing_options
12523 = TREE_TARGET_OPTION (existing_target
);
12525 if (existing_options
)
12526 cl_target_option_restore (&global_options
, existing_options
);
12529 cl_target_option_restore (&global_options
,
12530 TREE_TARGET_OPTION (target_option_current_node
));
12532 ret
= aarch64_process_target_attr (args
);
12534 /* Set up any additional state. */
12537 aarch64_override_options_internal (&global_options
);
12538 /* Initialize SIMD builtins if we haven't already.
12539 Set current_target_pragma to NULL for the duration so that
12540 the builtin initialization code doesn't try to tag the functions
12541 being built with the attributes specified by any current pragma, thus
12542 going into an infinite recursion. */
12545 tree saved_current_target_pragma
= current_target_pragma
;
12546 current_target_pragma
= NULL
;
12547 aarch64_init_simd_builtins ();
12548 current_target_pragma
= saved_current_target_pragma
;
12550 new_target
= build_target_option_node (&global_options
);
12555 new_optimize
= build_optimization_node (&global_options
);
12559 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
12561 if (old_optimize
!= new_optimize
)
12562 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
12565 cl_target_option_restore (&global_options
, &cur_target
);
12567 if (old_optimize
!= new_optimize
)
12568 cl_optimization_restore (&global_options
,
12569 TREE_OPTIMIZATION (old_optimize
));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
12594 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12595 to inline CALLEE into CALLER based on target-specific info.
12596 Make sure that the caller and callee have compatible architectural
12597 features. Then go through the other possible target attributes
12598 and see if they can block inlining. Try not to reject always_inline
12599 callees unless they are incompatible architecturally. */
12602 aarch64_can_inline_p (tree caller
, tree callee
)
12604 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
12605 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
12607 struct cl_target_option
*caller_opts
12608 = TREE_TARGET_OPTION (caller_tree
? caller_tree
12609 : target_option_default_node
);
12611 struct cl_target_option
*callee_opts
12612 = TREE_TARGET_OPTION (callee_tree
? callee_tree
12613 : target_option_default_node
);
12615 /* Callee's ISA flags should be a subset of the caller's. */
12616 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
12617 != callee_opts
->x_aarch64_isa_flags
)
12620 /* Allow non-strict aligned functions inlining into strict
12622 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
12623 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
12624 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
12625 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
12628 bool always_inline
= lookup_attribute ("always_inline",
12629 DECL_ATTRIBUTES (callee
));
12631 /* If the architectural features match up and the callee is always_inline
12632 then the other attributes don't matter. */
12636 if (caller_opts
->x_aarch64_cmodel_var
12637 != callee_opts
->x_aarch64_cmodel_var
)
12640 if (caller_opts
->x_aarch64_tls_dialect
12641 != callee_opts
->x_aarch64_tls_dialect
)
12644 /* Honour explicit requests to workaround errata. */
12645 if (!aarch64_tribools_ok_for_inlining_p (
12646 caller_opts
->x_aarch64_fix_a53_err835769
,
12647 callee_opts
->x_aarch64_fix_a53_err835769
,
12648 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
12651 if (!aarch64_tribools_ok_for_inlining_p (
12652 caller_opts
->x_aarch64_fix_a53_err843419
,
12653 callee_opts
->x_aarch64_fix_a53_err843419
,
12654 2, TARGET_FIX_ERR_A53_843419
))
12657 /* If the user explicitly specified -momit-leaf-frame-pointer for the
12658 caller and calle and they don't match up, reject inlining. */
12659 if (!aarch64_tribools_ok_for_inlining_p (
12660 caller_opts
->x_flag_omit_leaf_frame_pointer
,
12661 callee_opts
->x_flag_omit_leaf_frame_pointer
,
12665 /* If the callee has specific tuning overrides, respect them. */
12666 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
12667 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
12670 /* If the user specified tuning override strings for the
12671 caller and callee and they don't match up, reject inlining.
12672 We just do a string compare here, we don't analyze the meaning
12673 of the string, as it would be too costly for little gain. */
12674 if (callee_opts
->x_aarch64_override_tune_string
12675 && caller_opts
->x_aarch64_override_tune_string
12676 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
12677 caller_opts
->x_aarch64_override_tune_string
) != 0))
12683 /* Return true if SYMBOL_REF X binds locally. */
12686 aarch64_symbol_binds_local_p (const_rtx x
)
12688 return (SYMBOL_REF_DECL (x
)
12689 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
12690 : SYMBOL_REF_LOCAL_P (x
));
12693 /* Return true if SYMBOL_REF X is thread local */
12695 aarch64_tls_symbol_p (rtx x
)
12697 if (! TARGET_HAVE_TLS
)
12700 if (GET_CODE (x
) != SYMBOL_REF
)
12703 return SYMBOL_REF_TLS_MODEL (x
) != 0;
12706 /* Classify a TLS symbol into one of the TLS kinds. */
12707 enum aarch64_symbol_type
12708 aarch64_classify_tls_symbol (rtx x
)
12710 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
12714 case TLS_MODEL_GLOBAL_DYNAMIC
:
12715 case TLS_MODEL_LOCAL_DYNAMIC
:
12716 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
12718 case TLS_MODEL_INITIAL_EXEC
:
12719 switch (aarch64_cmodel
)
12721 case AARCH64_CMODEL_TINY
:
12722 case AARCH64_CMODEL_TINY_PIC
:
12723 return SYMBOL_TINY_TLSIE
;
12725 return SYMBOL_SMALL_TLSIE
;
12728 case TLS_MODEL_LOCAL_EXEC
:
12729 if (aarch64_tls_size
== 12)
12730 return SYMBOL_TLSLE12
;
12731 else if (aarch64_tls_size
== 24)
12732 return SYMBOL_TLSLE24
;
12733 else if (aarch64_tls_size
== 32)
12734 return SYMBOL_TLSLE32
;
12735 else if (aarch64_tls_size
== 48)
12736 return SYMBOL_TLSLE48
;
12738 gcc_unreachable ();
12740 case TLS_MODEL_EMULATED
:
12741 case TLS_MODEL_NONE
:
12742 return SYMBOL_FORCE_TO_MEM
;
12745 gcc_unreachable ();
12749 /* Return the correct method for accessing X + OFFSET, where X is either
12750 a SYMBOL_REF or LABEL_REF. */
12752 enum aarch64_symbol_type
12753 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
12755 if (GET_CODE (x
) == LABEL_REF
)
12757 switch (aarch64_cmodel
)
12759 case AARCH64_CMODEL_LARGE
:
12760 return SYMBOL_FORCE_TO_MEM
;
12762 case AARCH64_CMODEL_TINY_PIC
:
12763 case AARCH64_CMODEL_TINY
:
12764 return SYMBOL_TINY_ABSOLUTE
;
12766 case AARCH64_CMODEL_SMALL_SPIC
:
12767 case AARCH64_CMODEL_SMALL_PIC
:
12768 case AARCH64_CMODEL_SMALL
:
12769 return SYMBOL_SMALL_ABSOLUTE
;
12772 gcc_unreachable ();
12776 if (GET_CODE (x
) == SYMBOL_REF
)
12778 if (aarch64_tls_symbol_p (x
))
12779 return aarch64_classify_tls_symbol (x
);
12781 switch (aarch64_cmodel
)
12783 case AARCH64_CMODEL_TINY
:
12784 /* When we retrieve symbol + offset address, we have to make sure
12785 the offset does not cause overflow of the final address. But
12786 we have no way of knowing the address of symbol at compile time
12787 so we can't accurately say if the distance between the PC and
12788 symbol + offset is outside the addressible range of +/-1M in the
12789 TINY code model. So we rely on images not being greater than
12790 1M and cap the offset at 1M and anything beyond 1M will have to
12791 be loaded using an alternative mechanism. Furthermore if the
12792 symbol is a weak reference to something that isn't known to
12793 resolve to a symbol in this module, then force to memory. */
12794 if ((SYMBOL_REF_WEAK (x
)
12795 && !aarch64_symbol_binds_local_p (x
))
12796 || !IN_RANGE (offset
, -1048575, 1048575))
12797 return SYMBOL_FORCE_TO_MEM
;
12798 return SYMBOL_TINY_ABSOLUTE
;
12800 case AARCH64_CMODEL_SMALL
:
12801 /* Same reasoning as the tiny code model, but the offset cap here is
12803 if ((SYMBOL_REF_WEAK (x
)
12804 && !aarch64_symbol_binds_local_p (x
))
12805 || !IN_RANGE (offset
, HOST_WIDE_INT_C (-4294967263),
12806 HOST_WIDE_INT_C (4294967264)))
12807 return SYMBOL_FORCE_TO_MEM
;
12808 return SYMBOL_SMALL_ABSOLUTE
;
12810 case AARCH64_CMODEL_TINY_PIC
:
12811 if (!aarch64_symbol_binds_local_p (x
))
12812 return SYMBOL_TINY_GOT
;
12813 return SYMBOL_TINY_ABSOLUTE
;
12815 case AARCH64_CMODEL_SMALL_SPIC
:
12816 case AARCH64_CMODEL_SMALL_PIC
:
12817 if (!aarch64_symbol_binds_local_p (x
))
12818 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
12819 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
12820 return SYMBOL_SMALL_ABSOLUTE
;
12822 case AARCH64_CMODEL_LARGE
:
12823 /* This is alright even in PIC code as the constant
12824 pool reference is always PC relative and within
12825 the same translation unit. */
12826 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
12827 return SYMBOL_SMALL_ABSOLUTE
;
12829 return SYMBOL_FORCE_TO_MEM
;
12832 gcc_unreachable ();
12836 /* By default push everything into the constant pool. */
12837 return SYMBOL_FORCE_TO_MEM
;
12841 aarch64_constant_address_p (rtx x
)
12843 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
12847 aarch64_legitimate_pic_operand_p (rtx x
)
12849 if (GET_CODE (x
) == SYMBOL_REF
12850 || (GET_CODE (x
) == CONST
12851 && GET_CODE (XEXP (x
, 0)) == PLUS
12852 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
12858 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
12859 that should be rematerialized rather than spilled. */
12862 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
12864 /* Support CSE and rematerialization of common constants. */
12865 if (CONST_INT_P (x
)
12866 || (CONST_DOUBLE_P (x
) && GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12867 || GET_CODE (x
) == CONST_VECTOR
)
12870 /* Do not allow vector struct mode constants for Advanced SIMD.
12871 We could support 0 and -1 easily, but they need support in
12872 aarch64-simd.md. */
12873 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
12874 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
12877 /* Only accept variable-length vector constants if they can be
12880 ??? It would be possible to handle rematerialization of other
12881 constants via secondary reloads. */
12882 if (vec_flags
& VEC_ANY_SVE
)
12883 return aarch64_simd_valid_immediate (x
, NULL
);
12885 if (GET_CODE (x
) == HIGH
)
12888 /* Accept polynomial constants that can be calculated by using the
12889 destination of a move as the sole temporary. Constants that
12890 require a second temporary cannot be rematerialized (they can't be
12891 forced to memory and also aren't legitimate constants). */
12893 if (poly_int_rtx_p (x
, &offset
))
12894 return aarch64_offset_temporaries (false, offset
) <= 1;
12896 /* If an offset is being added to something else, we need to allow the
12897 base to be moved into the destination register, meaning that there
12898 are no free temporaries for the offset. */
12899 x
= strip_offset (x
, &offset
);
12900 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
12903 /* Do not allow const (plus (anchor_symbol, const_int)). */
12904 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
12907 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
12908 so spilling them is better than rematerialization. */
12909 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
12912 /* Label references are always constant. */
12913 if (GET_CODE (x
) == LABEL_REF
)
12920 aarch64_load_tp (rtx target
)
12923 || GET_MODE (target
) != Pmode
12924 || !register_operand (target
, Pmode
))
12925 target
= gen_reg_rtx (Pmode
);
12927 /* Can return in any reg. */
12928 emit_insn (gen_aarch64_load_tp_hard (target
));
12932 /* On AAPCS systems, this is the "struct __va_list". */
12933 static GTY(()) tree va_list_type
;
12935 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12936 Return the type to use as __builtin_va_list.
12938 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12950 aarch64_build_builtin_va_list (void)
12953 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
12955 /* Create the type. */
12956 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
12957 /* Give it the required name. */
12958 va_list_name
= build_decl (BUILTINS_LOCATION
,
12960 get_identifier ("__va_list"),
12962 DECL_ARTIFICIAL (va_list_name
) = 1;
12963 TYPE_NAME (va_list_type
) = va_list_name
;
12964 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
12966 /* Create the fields. */
12967 f_stack
= build_decl (BUILTINS_LOCATION
,
12968 FIELD_DECL
, get_identifier ("__stack"),
12970 f_grtop
= build_decl (BUILTINS_LOCATION
,
12971 FIELD_DECL
, get_identifier ("__gr_top"),
12973 f_vrtop
= build_decl (BUILTINS_LOCATION
,
12974 FIELD_DECL
, get_identifier ("__vr_top"),
12976 f_groff
= build_decl (BUILTINS_LOCATION
,
12977 FIELD_DECL
, get_identifier ("__gr_offs"),
12978 integer_type_node
);
12979 f_vroff
= build_decl (BUILTINS_LOCATION
,
12980 FIELD_DECL
, get_identifier ("__vr_offs"),
12981 integer_type_node
);
12983 /* Tell tree-stdarg pass about our internal offset fields.
12984 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
12985 purpose to identify whether the code is updating va_list internal
12986 offset fields through irregular way. */
12987 va_list_gpr_counter_field
= f_groff
;
12988 va_list_fpr_counter_field
= f_vroff
;
12990 DECL_ARTIFICIAL (f_stack
) = 1;
12991 DECL_ARTIFICIAL (f_grtop
) = 1;
12992 DECL_ARTIFICIAL (f_vrtop
) = 1;
12993 DECL_ARTIFICIAL (f_groff
) = 1;
12994 DECL_ARTIFICIAL (f_vroff
) = 1;
12996 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
12997 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
12998 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
12999 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
13000 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
13002 TYPE_FIELDS (va_list_type
) = f_stack
;
13003 DECL_CHAIN (f_stack
) = f_grtop
;
13004 DECL_CHAIN (f_grtop
) = f_vrtop
;
13005 DECL_CHAIN (f_vrtop
) = f_groff
;
13006 DECL_CHAIN (f_groff
) = f_vroff
;
13008 /* Compute its layout. */
13009 layout_type (va_list_type
);
13011 return va_list_type
;
13014 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13016 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
13018 const CUMULATIVE_ARGS
*cum
;
13019 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
13020 tree stack
, grtop
, vrtop
, groff
, vroff
;
13022 int gr_save_area_size
= cfun
->va_list_gpr_size
;
13023 int vr_save_area_size
= cfun
->va_list_fpr_size
;
13026 cum
= &crtl
->args
.info
;
13027 if (cfun
->va_list_gpr_size
)
13028 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
13029 cfun
->va_list_gpr_size
);
13030 if (cfun
->va_list_fpr_size
)
13031 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
13032 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
13036 gcc_assert (cum
->aapcs_nvrn
== 0);
13037 vr_save_area_size
= 0;
13040 f_stack
= TYPE_FIELDS (va_list_type_node
);
13041 f_grtop
= DECL_CHAIN (f_stack
);
13042 f_vrtop
= DECL_CHAIN (f_grtop
);
13043 f_groff
= DECL_CHAIN (f_vrtop
);
13044 f_vroff
= DECL_CHAIN (f_groff
);
13046 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
13048 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
13050 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
13052 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
13054 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
13057 /* Emit code to initialize STACK, which points to the next varargs stack
13058 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13059 by named arguments. STACK is 8-byte aligned. */
13060 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
13061 if (cum
->aapcs_stack_size
> 0)
13062 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
13063 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
13064 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13066 /* Emit code to initialize GRTOP, the top of the GR save area.
13067 virtual_incoming_args_rtx should have been 16 byte aligned. */
13068 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
13069 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
13070 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13072 /* Emit code to initialize VRTOP, the top of the VR save area.
13073 This address is gr_save_area_bytes below GRTOP, rounded
13074 down to the next 16-byte boundary. */
13075 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
13076 vr_offset
= ROUND_UP (gr_save_area_size
,
13077 STACK_BOUNDARY
/ BITS_PER_UNIT
);
13080 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
13081 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
13082 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13084 /* Emit code to initialize GROFF, the offset from GRTOP of the
13085 next GPR argument. */
13086 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
13087 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
13088 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13090 /* Likewise emit code to initialize VROFF, the offset from FTOP
13091 of the next VR argument. */
13092 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
13093 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
13094 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13097 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13100 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
13101 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
13105 bool is_ha
; /* is HFA or HVA. */
13106 bool dw_align
; /* double-word align. */
13107 machine_mode ag_mode
= VOIDmode
;
13111 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
13112 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
13113 HOST_WIDE_INT size
, rsize
, adjust
, align
;
13114 tree t
, u
, cond1
, cond2
;
13116 indirect_p
= pass_by_reference (NULL
, TYPE_MODE (type
), type
, false);
13118 type
= build_pointer_type (type
);
13120 mode
= TYPE_MODE (type
);
13122 f_stack
= TYPE_FIELDS (va_list_type_node
);
13123 f_grtop
= DECL_CHAIN (f_stack
);
13124 f_vrtop
= DECL_CHAIN (f_grtop
);
13125 f_groff
= DECL_CHAIN (f_vrtop
);
13126 f_vroff
= DECL_CHAIN (f_groff
);
13128 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
13129 f_stack
, NULL_TREE
);
13130 size
= int_size_in_bytes (type
);
13131 align
= aarch64_function_arg_alignment (mode
, type
) / BITS_PER_UNIT
;
13135 if (aarch64_vfp_is_call_or_return_candidate (mode
,
13141 /* No frontends can create types with variable-sized modes, so we
13142 shouldn't be asked to pass or return them. */
13143 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
13145 /* TYPE passed in fp/simd registers. */
13147 aarch64_err_no_fpadvsimd (mode
);
13149 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
13150 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
13151 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
13152 unshare_expr (valist
), f_vroff
, NULL_TREE
);
13154 rsize
= nregs
* UNITS_PER_VREG
;
13158 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
13159 adjust
= UNITS_PER_VREG
- ag_size
;
13161 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
13162 && size
< UNITS_PER_VREG
)
13164 adjust
= UNITS_PER_VREG
- size
;
13169 /* TYPE passed in general registers. */
13170 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
13171 unshare_expr (valist
), f_grtop
, NULL_TREE
);
13172 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
13173 unshare_expr (valist
), f_groff
, NULL_TREE
);
13174 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
13175 nregs
= rsize
/ UNITS_PER_WORD
;
13180 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
13181 && size
< UNITS_PER_WORD
)
13183 adjust
= UNITS_PER_WORD
- size
;
13187 /* Get a local temporary for the field value. */
13188 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
13190 /* Emit code to branch if off >= 0. */
13191 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
13192 build_int_cst (TREE_TYPE (off
), 0));
13193 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
13197 /* Emit: offs = (offs + 15) & -16. */
13198 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
13199 build_int_cst (TREE_TYPE (off
), 15));
13200 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
13201 build_int_cst (TREE_TYPE (off
), -16));
13202 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
13207 /* Update ap.__[g|v]r_offs */
13208 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
13209 build_int_cst (TREE_TYPE (off
), rsize
));
13210 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
13214 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
13216 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13217 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
13218 build_int_cst (TREE_TYPE (f_off
), 0));
13219 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
13221 /* String up: make sure the assignment happens before the use. */
13222 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
13223 COND_EXPR_ELSE (cond1
) = t
;
13225 /* Prepare the trees handling the argument that is passed on the stack;
13226 the top level node will store in ON_STACK. */
13227 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
13230 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13231 t
= fold_build_pointer_plus_hwi (arg
, 15);
13232 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
13233 build_int_cst (TREE_TYPE (t
), -16));
13234 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
13238 /* Advance ap.__stack */
13239 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
13240 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
13241 build_int_cst (TREE_TYPE (t
), -8));
13242 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
13243 /* String up roundup and advance. */
13245 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
13246 /* String up with arg */
13247 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
13248 /* Big-endianness related address adjustment. */
13249 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
13250 && size
< UNITS_PER_WORD
)
13252 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
13253 size_int (UNITS_PER_WORD
- size
));
13254 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
13257 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
13258 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
13260 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13263 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
13264 build_int_cst (TREE_TYPE (off
), adjust
));
13266 t
= fold_convert (sizetype
, t
);
13267 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
13271 /* type ha; // treat as "struct {ftype field[n];}"
13272 ... [computing offs]
13273 for (i = 0; i <nregs; ++i, offs += 16)
13274 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13277 tree tmp_ha
, field_t
, field_ptr_t
;
13279 /* Declare a local variable. */
13280 tmp_ha
= create_tmp_var_raw (type
, "ha");
13281 gimple_add_tmp_var (tmp_ha
);
13283 /* Establish the base type. */
13287 field_t
= float_type_node
;
13288 field_ptr_t
= float_ptr_type_node
;
13291 field_t
= double_type_node
;
13292 field_ptr_t
= double_ptr_type_node
;
13295 field_t
= long_double_type_node
;
13296 field_ptr_t
= long_double_ptr_type_node
;
13299 field_t
= aarch64_fp16_type_node
;
13300 field_ptr_t
= aarch64_fp16_ptr_type_node
;
13305 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
13306 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
13307 field_ptr_t
= build_pointer_type (field_t
);
13314 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
13315 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
13317 t
= fold_convert (field_ptr_t
, addr
);
13318 t
= build2 (MODIFY_EXPR
, field_t
,
13319 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
13320 build1 (INDIRECT_REF
, field_t
, t
));
13322 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13323 for (i
= 1; i
< nregs
; ++i
)
13325 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
13326 u
= fold_convert (field_ptr_t
, addr
);
13327 u
= build2 (MODIFY_EXPR
, field_t
,
13328 build2 (MEM_REF
, field_t
, tmp_ha
,
13329 build_int_cst (field_ptr_t
,
13331 int_size_in_bytes (field_t
)))),
13332 build1 (INDIRECT_REF
, field_t
, u
));
13333 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
13336 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
13337 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
13340 COND_EXPR_ELSE (cond2
) = t
;
13341 addr
= fold_convert (build_pointer_type (type
), cond1
);
13342 addr
= build_va_arg_indirect_ref (addr
);
13345 addr
= build_va_arg_indirect_ref (addr
);
13350 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13353 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
, machine_mode mode
,
13354 tree type
, int *pretend_size ATTRIBUTE_UNUSED
,
13357 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
13358 CUMULATIVE_ARGS local_cum
;
13359 int gr_saved
= cfun
->va_list_gpr_size
;
13360 int vr_saved
= cfun
->va_list_fpr_size
;
13362 /* The caller has advanced CUM up to, but not beyond, the last named
13363 argument. Advance a local copy of CUM past the last "real" named
13364 argument, to find out how many registers are left over. */
13366 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), mode
, type
, true);
13368 /* Found out how many registers we need to save.
13369 Honor tree-stdvar analysis results. */
13370 if (cfun
->va_list_gpr_size
)
13371 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
13372 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
13373 if (cfun
->va_list_fpr_size
)
13374 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
13375 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
13379 gcc_assert (local_cum
.aapcs_nvrn
== 0);
13389 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13390 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
13391 - gr_saved
* UNITS_PER_WORD
);
13392 mem
= gen_frame_mem (BLKmode
, ptr
);
13393 set_mem_alias_set (mem
, get_varargs_alias_set ());
13395 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
13400 /* We can't use move_block_from_reg, because it will use
13401 the wrong mode, storing D regs only. */
13402 machine_mode mode
= TImode
;
13403 int off
, i
, vr_start
;
13405 /* Set OFF to the offset from virtual_incoming_args_rtx of
13406 the first vector register. The VR save area lies below
13407 the GR one, and is aligned to 16 bytes. */
13408 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
13409 STACK_BOUNDARY
/ BITS_PER_UNIT
);
13410 off
-= vr_saved
* UNITS_PER_VREG
;
13412 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
13413 for (i
= 0; i
< vr_saved
; ++i
)
13417 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
13418 mem
= gen_frame_mem (mode
, ptr
);
13419 set_mem_alias_set (mem
, get_varargs_alias_set ());
13420 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
13421 off
+= UNITS_PER_VREG
;
13426 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13427 any complication of having crtl->args.pretend_args_size changed. */
13428 cfun
->machine
->frame
.saved_varargs_size
13429 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
13430 STACK_BOUNDARY
/ BITS_PER_UNIT
)
13431 + vr_saved
* UNITS_PER_VREG
);
13435 aarch64_conditional_register_usage (void)
13440 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
13443 call_used_regs
[i
] = 1;
13447 for (i
= P0_REGNUM
; i
<= P15_REGNUM
; i
++)
13450 call_used_regs
[i
] = 1;
13453 /* When tracking speculation, we need a couple of call-clobbered registers
13454 to track the speculation state. It would be nice to just use
13455 IP0 and IP1, but currently there are numerous places that just
13456 assume these registers are free for other uses (eg pointer
13457 authentication). */
13458 if (aarch64_track_speculation
)
13460 fixed_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
13461 call_used_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
13462 fixed_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
13463 call_used_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
13467 /* Walk down the type tree of TYPE counting consecutive base elements.
13468 If *MODEP is VOIDmode, then set it to the first valid floating point
13469 type. If a non-floating point type is found, or if a floating point
13470 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13471 otherwise return the count in the sub-tree. */
13473 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
13476 HOST_WIDE_INT size
;
13478 switch (TREE_CODE (type
))
13481 mode
= TYPE_MODE (type
);
13482 if (mode
!= DFmode
&& mode
!= SFmode
13483 && mode
!= TFmode
&& mode
!= HFmode
)
13486 if (*modep
== VOIDmode
)
13489 if (*modep
== mode
)
13495 mode
= TYPE_MODE (TREE_TYPE (type
));
13496 if (mode
!= DFmode
&& mode
!= SFmode
13497 && mode
!= TFmode
&& mode
!= HFmode
)
13500 if (*modep
== VOIDmode
)
13503 if (*modep
== mode
)
13509 /* Use V2SImode and V4SImode as representatives of all 64-bit
13510 and 128-bit vector types. */
13511 size
= int_size_in_bytes (type
);
13524 if (*modep
== VOIDmode
)
13527 /* Vector modes are considered to be opaque: two vectors are
13528 equivalent for the purposes of being homogeneous aggregates
13529 if they are the same size. */
13530 if (*modep
== mode
)
13538 tree index
= TYPE_DOMAIN (type
);
13540 /* Can't handle incomplete types nor sizes that are not
13542 if (!COMPLETE_TYPE_P (type
)
13543 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
13546 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
13549 || !TYPE_MAX_VALUE (index
)
13550 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
13551 || !TYPE_MIN_VALUE (index
)
13552 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
13556 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
13557 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
13559 /* There must be no padding. */
13560 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
13561 count
* GET_MODE_BITSIZE (*modep
)))
13573 /* Can't handle incomplete types nor sizes that are not
13575 if (!COMPLETE_TYPE_P (type
)
13576 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
13579 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
13581 if (TREE_CODE (field
) != FIELD_DECL
)
13584 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
13587 count
+= sub_count
;
13590 /* There must be no padding. */
13591 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
13592 count
* GET_MODE_BITSIZE (*modep
)))
13599 case QUAL_UNION_TYPE
:
13601 /* These aren't very interesting except in a degenerate case. */
13606 /* Can't handle incomplete types nor sizes that are not
13608 if (!COMPLETE_TYPE_P (type
)
13609 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
13612 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
13614 if (TREE_CODE (field
) != FIELD_DECL
)
13617 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
13620 count
= count
> sub_count
? count
: sub_count
;
13623 /* There must be no padding. */
13624 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
13625 count
* GET_MODE_BITSIZE (*modep
)))
13638 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13639 type as described in AAPCS64 \S 4.1.2.
13641 See the comment above aarch64_composite_type_p for the notes on MODE. */
13644 aarch64_short_vector_p (const_tree type
,
13647 poly_int64 size
= -1;
13649 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
13650 size
= int_size_in_bytes (type
);
13651 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
13652 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
13653 size
= GET_MODE_SIZE (mode
);
13655 return known_eq (size
, 8) || known_eq (size
, 16);
13658 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
13659 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
13660 array types. The C99 floating-point complex types are also considered
13661 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
13662 types, which are GCC extensions and out of the scope of AAPCS64, are
13663 treated as composite types here as well.
13665 Note that MODE itself is not sufficient in determining whether a type
13666 is such a composite type or not. This is because
13667 stor-layout.c:compute_record_mode may have already changed the MODE
13668 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
13669 structure with only one field may have its MODE set to the mode of the
13670 field. Also an integer mode whose size matches the size of the
13671 RECORD_TYPE type may be used to substitute the original mode
13672 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
13673 solely relied on. */
13676 aarch64_composite_type_p (const_tree type
,
13679 if (aarch64_short_vector_p (type
, mode
))
13682 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
13685 if (mode
== BLKmode
13686 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
13687 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
13693 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
13694 shall be passed or returned in simd/fp register(s) (providing these
13695 parameter passing registers are available).
13697 Upon successful return, *COUNT returns the number of needed registers,
13698 *BASE_MODE returns the mode of the individual register and when IS_HAF
13699 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
13700 floating-point aggregate or a homogeneous short-vector aggregate. */
13703 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
13705 machine_mode
*base_mode
,
13709 machine_mode new_mode
= VOIDmode
;
13710 bool composite_p
= aarch64_composite_type_p (type
, mode
);
13712 if (is_ha
!= NULL
) *is_ha
= false;
13714 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
13715 || aarch64_short_vector_p (type
, mode
))
13720 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
13722 if (is_ha
!= NULL
) *is_ha
= true;
13724 new_mode
= GET_MODE_INNER (mode
);
13726 else if (type
&& composite_p
)
13728 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
13730 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
13732 if (is_ha
!= NULL
) *is_ha
= true;
13741 *base_mode
= new_mode
;
13745 /* Implement TARGET_STRUCT_VALUE_RTX. */
13748 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
13749 int incoming ATTRIBUTE_UNUSED
)
13751 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
13754 /* Implements target hook vector_mode_supported_p. */
13756 aarch64_vector_mode_supported_p (machine_mode mode
)
13758 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
13759 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
13762 /* Return appropriate SIMD container
13763 for MODE within a vector of WIDTH bits. */
13764 static machine_mode
13765 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
13767 if (TARGET_SVE
&& known_eq (width
, BITS_PER_SVE_VECTOR
))
13783 return VNx16QImode
;
13788 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
13791 if (known_eq (width
, 128))
13831 /* Return 128-bit container as the preferred SIMD mode for MODE. */
13832 static machine_mode
13833 aarch64_preferred_simd_mode (scalar_mode mode
)
13835 poly_int64 bits
= TARGET_SVE
? BITS_PER_SVE_VECTOR
: 128;
13836 return aarch64_simd_container_mode (mode
, bits
);
13839 /* Return a list of possible vector sizes for the vectorizer
13840 to iterate over. */
13842 aarch64_autovectorize_vector_sizes (vector_sizes
*sizes
)
13845 sizes
->safe_push (BYTES_PER_SVE_VECTOR
);
13846 sizes
->safe_push (16);
13847 sizes
->safe_push (8);
13850 /* Implement TARGET_MANGLE_TYPE. */
13852 static const char *
13853 aarch64_mangle_type (const_tree type
)
13855 /* The AArch64 ABI documents say that "__va_list" has to be
13856 mangled as if it is in the "std" namespace. */
13857 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
13858 return "St9__va_list";
13860 /* Half-precision float. */
13861 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
13864 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
13866 if (TYPE_NAME (type
) != NULL
)
13867 return aarch64_mangle_builtin_type (type
);
13869 /* Use the default mangling. */
13873 /* Find the first rtx_insn before insn that will generate an assembly
13877 aarch64_prev_real_insn (rtx_insn
*insn
)
13884 insn
= prev_real_insn (insn
);
13886 while (insn
&& recog_memoized (insn
) < 0);
13892 is_madd_op (enum attr_type t1
)
13895 /* A number of these may be AArch32 only. */
13896 enum attr_type mlatypes
[] = {
13897 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
13898 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
13899 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
13902 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
13904 if (t1
== mlatypes
[i
])
13911 /* Check if there is a register dependency between a load and the insn
13912 for which we hold recog_data. */
13915 dep_between_memop_and_curr (rtx memop
)
13920 gcc_assert (GET_CODE (memop
) == SET
);
13922 if (!REG_P (SET_DEST (memop
)))
13925 load_reg
= SET_DEST (memop
);
13926 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
13928 rtx operand
= recog_data
.operand
[opno
];
13929 if (REG_P (operand
)
13930 && reg_overlap_mentioned_p (load_reg
, operand
))
13938 /* When working around the Cortex-A53 erratum 835769,
13939 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13940 instruction and has a preceding memory instruction such that a NOP
13941 should be inserted between them. */
13944 aarch64_madd_needs_nop (rtx_insn
* insn
)
13946 enum attr_type attr_type
;
13950 if (!TARGET_FIX_ERR_A53_835769
)
13953 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
13956 attr_type
= get_attr_type (insn
);
13957 if (!is_madd_op (attr_type
))
13960 prev
= aarch64_prev_real_insn (insn
);
13961 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13962 Restore recog state to INSN to avoid state corruption. */
13963 extract_constrain_insn_cached (insn
);
13965 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
13968 body
= single_set (prev
);
13970 /* If the previous insn is a memory op and there is no dependency between
13971 it and the DImode madd, emit a NOP between them. If body is NULL then we
13972 have a complex memory operation, probably a load/store pair.
13973 Be conservative for now and emit a NOP. */
13974 if (GET_MODE (recog_data
.operand
[0]) == DImode
13975 && (!body
|| !dep_between_memop_and_curr (body
)))
13983 /* Implement FINAL_PRESCAN_INSN. */
13986 aarch64_final_prescan_insn (rtx_insn
*insn
)
13988 if (aarch64_madd_needs_nop (insn
))
13989 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
13993 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13997 aarch64_sve_index_immediate_p (rtx base_or_step
)
13999 return (CONST_INT_P (base_or_step
)
14000 && IN_RANGE (INTVAL (base_or_step
), -16, 15));
14003 /* Return true if X is a valid immediate for the SVE ADD and SUB
14004 instructions. Negate X first if NEGATE_P is true. */
14007 aarch64_sve_arith_immediate_p (rtx x
, bool negate_p
)
14011 if (!const_vec_duplicate_p (x
, &elt
)
14012 || !CONST_INT_P (elt
))
14015 HOST_WIDE_INT val
= INTVAL (elt
);
14018 val
&= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x
)));
14021 return IN_RANGE (val
, 0, 0xff);
14022 return IN_RANGE (val
, 0, 0xff00);
14025 /* Return true if X is a valid immediate operand for an SVE logical
14026 instruction such as AND. */
14029 aarch64_sve_bitmask_immediate_p (rtx x
)
14033 return (const_vec_duplicate_p (x
, &elt
)
14034 && CONST_INT_P (elt
)
14035 && aarch64_bitmask_imm (INTVAL (elt
),
14036 GET_MODE_INNER (GET_MODE (x
))));
14039 /* Return true if X is a valid immediate for the SVE DUP and CPY
14043 aarch64_sve_dup_immediate_p (rtx x
)
14047 if (!const_vec_duplicate_p (x
, &elt
)
14048 || !CONST_INT_P (elt
))
14051 HOST_WIDE_INT val
= INTVAL (elt
);
14053 return IN_RANGE (val
, -0x80, 0x7f);
14054 return IN_RANGE (val
, -0x8000, 0x7f00);
14057 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14058 SIGNED_P says whether the operand is signed rather than unsigned. */
14061 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
14065 return (const_vec_duplicate_p (x
, &elt
)
14066 && CONST_INT_P (elt
)
14068 ? IN_RANGE (INTVAL (elt
), -16, 15)
14069 : IN_RANGE (INTVAL (elt
), 0, 127)));
14072 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14073 instruction. Negate X first if NEGATE_P is true. */
14076 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
14081 if (!const_vec_duplicate_p (x
, &elt
)
14082 || GET_CODE (elt
) != CONST_DOUBLE
)
14085 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
14088 r
= real_value_negate (&r
);
14090 if (real_equal (&r
, &dconst1
))
14092 if (real_equal (&r
, &dconsthalf
))
14097 /* Return true if X is a valid immediate operand for an SVE FMUL
14101 aarch64_sve_float_mul_immediate_p (rtx x
)
14105 /* GCC will never generate a multiply with an immediate of 2, so there is no
14106 point testing for it (even though it is a valid constant). */
14107 return (const_vec_duplicate_p (x
, &elt
)
14108 && GET_CODE (elt
) == CONST_DOUBLE
14109 && real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
));
14112 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14113 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14114 is nonnull, use it to describe valid immediates. */
14116 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
14117 simd_immediate_info
*info
,
14118 enum simd_immediate_check which
,
14119 simd_immediate_info::insn_type insn
)
14121 /* Try a 4-byte immediate with LSL. */
14122 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
14123 if ((val32
& (0xff << shift
)) == val32
)
14126 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
14127 simd_immediate_info::LSL
, shift
);
14131 /* Try a 2-byte immediate with LSL. */
14132 unsigned int imm16
= val32
& 0xffff;
14133 if (imm16
== (val32
>> 16))
14134 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
14135 if ((imm16
& (0xff << shift
)) == imm16
)
14138 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
14139 simd_immediate_info::LSL
, shift
);
14143 /* Try a 4-byte immediate with MSL, except for cases that MVN
14145 if (which
== AARCH64_CHECK_MOV
)
14146 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
14148 unsigned int low
= (1 << shift
) - 1;
14149 if (((val32
& (0xff << shift
)) | low
) == val32
)
14152 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
14153 simd_immediate_info::MSL
, shift
);
14161 /* Return true if replicating VAL64 is a valid immediate for the
14162 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14163 use it to describe valid immediates. */
14165 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
14166 simd_immediate_info
*info
,
14167 enum simd_immediate_check which
)
14169 unsigned int val32
= val64
& 0xffffffff;
14170 unsigned int val16
= val64
& 0xffff;
14171 unsigned int val8
= val64
& 0xff;
14173 if (val32
== (val64
>> 32))
14175 if ((which
& AARCH64_CHECK_ORR
) != 0
14176 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
14177 simd_immediate_info::MOV
))
14180 if ((which
& AARCH64_CHECK_BIC
) != 0
14181 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
14182 simd_immediate_info::MVN
))
14185 /* Try using a replicated byte. */
14186 if (which
== AARCH64_CHECK_MOV
14187 && val16
== (val32
>> 16)
14188 && val8
== (val16
>> 8))
14191 *info
= simd_immediate_info (QImode
, val8
);
14196 /* Try using a bit-to-bytemask. */
14197 if (which
== AARCH64_CHECK_MOV
)
14200 for (i
= 0; i
< 64; i
+= 8)
14202 unsigned char byte
= (val64
>> i
) & 0xff;
14203 if (byte
!= 0 && byte
!= 0xff)
14209 *info
= simd_immediate_info (DImode
, val64
);
14216 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14217 instruction. If INFO is nonnull, use it to describe valid immediates. */
14220 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
14221 simd_immediate_info
*info
)
14223 scalar_int_mode mode
= DImode
;
14224 unsigned int val32
= val64
& 0xffffffff;
14225 if (val32
== (val64
>> 32))
14228 unsigned int val16
= val32
& 0xffff;
14229 if (val16
== (val32
>> 16))
14232 unsigned int val8
= val16
& 0xff;
14233 if (val8
== (val16
>> 8))
14237 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
14238 if (IN_RANGE (val
, -0x80, 0x7f))
14240 /* DUP with no shift. */
14242 *info
= simd_immediate_info (mode
, val
);
14245 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
14247 /* DUP with LSL #8. */
14249 *info
= simd_immediate_info (mode
, val
);
14252 if (aarch64_bitmask_imm (val64
, mode
))
14256 *info
= simd_immediate_info (mode
, val
);
14262 /* Return true if OP is a valid SIMD immediate for the operation
14263 described by WHICH. If INFO is nonnull, use it to describe valid
14266 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
14267 enum simd_immediate_check which
)
14269 machine_mode mode
= GET_MODE (op
);
14270 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14271 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
14274 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
14276 unsigned int n_elts
;
14277 if (GET_CODE (op
) == CONST_VECTOR
14278 && CONST_VECTOR_DUPLICATE_P (op
))
14279 n_elts
= CONST_VECTOR_NPATTERNS (op
);
14280 else if ((vec_flags
& VEC_SVE_DATA
)
14281 && const_vec_series_p (op
, &base
, &step
))
14283 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
14284 if (!aarch64_sve_index_immediate_p (base
)
14285 || !aarch64_sve_index_immediate_p (step
))
14289 *info
= simd_immediate_info (elt_mode
, base
, step
);
14292 else if (GET_CODE (op
) == CONST_VECTOR
14293 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
14294 /* N_ELTS set above. */;
14298 /* Handle PFALSE and PTRUE. */
14299 if (vec_flags
& VEC_SVE_PRED
)
14300 return (op
== CONST0_RTX (mode
)
14301 || op
== CONSTM1_RTX (mode
));
14303 scalar_float_mode elt_float_mode
;
14305 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
14307 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
14308 if (aarch64_float_const_zero_rtx_p (elt
)
14309 || aarch64_float_const_representable_p (elt
))
14312 *info
= simd_immediate_info (elt_float_mode
, elt
);
14317 unsigned int elt_size
= GET_MODE_SIZE (elt_mode
);
14321 scalar_int_mode elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
14323 /* Expand the vector constant out into a byte vector, with the least
14324 significant byte of the register first. */
14325 auto_vec
<unsigned char, 16> bytes
;
14326 bytes
.reserve (n_elts
* elt_size
);
14327 for (unsigned int i
= 0; i
< n_elts
; i
++)
14329 /* The vector is provided in gcc endian-neutral fashion.
14330 For aarch64_be Advanced SIMD, it must be laid out in the vector
14331 register in reverse order. */
14332 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
14333 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
14335 if (elt_mode
!= elt_int_mode
)
14336 elt
= gen_lowpart (elt_int_mode
, elt
);
14338 if (!CONST_INT_P (elt
))
14341 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
14342 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
14344 bytes
.quick_push (elt_val
& 0xff);
14345 elt_val
>>= BITS_PER_UNIT
;
14349 /* The immediate must repeat every eight bytes. */
14350 unsigned int nbytes
= bytes
.length ();
14351 for (unsigned i
= 8; i
< nbytes
; ++i
)
14352 if (bytes
[i
] != bytes
[i
- 8])
14355 /* Get the repeating 8-byte value as an integer. No endian correction
14356 is needed here because bytes is already in lsb-first order. */
14357 unsigned HOST_WIDE_INT val64
= 0;
14358 for (unsigned int i
= 0; i
< 8; i
++)
14359 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
14360 << (i
* BITS_PER_UNIT
));
14362 if (vec_flags
& VEC_SVE_DATA
)
14363 return aarch64_sve_valid_immediate (val64
, info
);
14365 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
14368 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14369 has a step in the range of INDEX. Return the index expression if so,
14370 otherwise return null. */
14372 aarch64_check_zero_based_sve_index_immediate (rtx x
)
14375 if (const_vec_series_p (x
, &base
, &step
)
14376 && base
== const0_rtx
14377 && aarch64_sve_index_immediate_p (step
))
14382 /* Check of immediate shift constants are within range. */
14384 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
14386 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
14388 return aarch64_const_vec_all_same_in_range_p (x
, 0, bit_width
- 1);
14390 return aarch64_const_vec_all_same_in_range_p (x
, 1, bit_width
);
14393 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14394 operation of width WIDTH at bit position POS. */
14397 aarch64_mask_from_zextract_ops (rtx width
, rtx pos
)
14399 gcc_assert (CONST_INT_P (width
));
14400 gcc_assert (CONST_INT_P (pos
));
14402 unsigned HOST_WIDE_INT mask
14403 = ((unsigned HOST_WIDE_INT
) 1 << UINTVAL (width
)) - 1;
14404 return GEN_INT (mask
<< UINTVAL (pos
));
14408 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
14410 if (GET_CODE (x
) == HIGH
14411 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
14414 if (CONST_INT_P (x
))
14417 if (VECTOR_MODE_P (GET_MODE (x
)))
14418 return aarch64_simd_valid_immediate (x
, NULL
);
14420 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
14423 if (aarch64_sve_cnt_immediate_p (x
))
14426 return aarch64_classify_symbolic_expression (x
)
14427 == SYMBOL_TINY_ABSOLUTE
;
14430 /* Return a const_int vector of VAL. */
14432 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
14434 rtx c
= gen_int_mode (val
, GET_MODE_INNER (mode
));
14435 return gen_const_vec_duplicate (mode
, c
);
14438 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14441 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
14443 machine_mode vmode
;
14445 vmode
= aarch64_simd_container_mode (mode
, 64);
14446 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
14447 return aarch64_simd_valid_immediate (op_v
, NULL
);
14450 /* Construct and return a PARALLEL RTX vector with elements numbering the
14451 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14452 the vector - from the perspective of the architecture. This does not
14453 line up with GCC's perspective on lane numbers, so we end up with
14454 different masks depending on our target endian-ness. The diagram
14455 below may help. We must draw the distinction when building masks
14456 which select one half of the vector. An instruction selecting
14457 architectural low-lanes for a big-endian target, must be described using
14458 a mask selecting GCC high-lanes.
14460 Big-Endian Little-Endian
14462 GCC 0 1 2 3 3 2 1 0
14463 | x | x | x | x | | x | x | x | x |
14464 Architecture 3 2 1 0 3 2 1 0
14466 Low Mask: { 2, 3 } { 0, 1 }
14467 High Mask: { 0, 1 } { 2, 3 }
14469 MODE Is the mode of the vector and NUNITS is the number of units in it. */
14472 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
14474 rtvec v
= rtvec_alloc (nunits
/ 2);
14475 int high_base
= nunits
/ 2;
14481 if (BYTES_BIG_ENDIAN
)
14482 base
= high
? low_base
: high_base
;
14484 base
= high
? high_base
: low_base
;
14486 for (i
= 0; i
< nunits
/ 2; i
++)
14487 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
14489 t1
= gen_rtx_PARALLEL (mode
, v
);
14493 /* Check OP for validity as a PARALLEL RTX vector with elements
14494 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14495 from the perspective of the architecture. See the diagram above
14496 aarch64_simd_vect_par_cnst_half for more details. */
14499 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
14503 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
14506 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
14507 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
14508 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
14511 if (count_op
!= count_ideal
)
14514 for (i
= 0; i
< count_ideal
; i
++)
14516 rtx elt_op
= XVECEXP (op
, 0, i
);
14517 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
14519 if (!CONST_INT_P (elt_op
)
14520 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
14526 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14527 HIGH (exclusive). */
14529 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
14532 HOST_WIDE_INT lane
;
14533 gcc_assert (CONST_INT_P (operand
));
14534 lane
= INTVAL (operand
);
14536 if (lane
< low
|| lane
>= high
)
14539 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
14541 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
14545 /* Peform endian correction on lane number N, which indexes a vector
14546 of mode MODE, and return the result as an SImode rtx. */
14549 aarch64_endian_lane_rtx (machine_mode mode
, unsigned int n
)
14551 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode
), n
), SImode
);
14554 /* Return TRUE if OP is a valid vector addressing mode. */
14557 aarch64_simd_mem_operand_p (rtx op
)
14559 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
14560 || REG_P (XEXP (op
, 0)));
14563 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
14566 aarch64_sve_ld1r_operand_p (rtx op
)
14568 struct aarch64_address_info addr
;
14572 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
14573 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
14574 && addr
.type
== ADDRESS_REG_IMM
14575 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
14578 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14579 The conditions for STR are the same. */
14581 aarch64_sve_ldr_operand_p (rtx op
)
14583 struct aarch64_address_info addr
;
14586 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
14587 false, ADDR_QUERY_ANY
)
14588 && addr
.type
== ADDRESS_REG_IMM
);
14591 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14592 We need to be able to access the individual pieces, so the range
14593 is different from LD[234] and ST[234]. */
14595 aarch64_sve_struct_memory_operand_p (rtx op
)
14600 machine_mode mode
= GET_MODE (op
);
14601 struct aarch64_address_info addr
;
14602 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
14604 || addr
.type
!= ADDRESS_REG_IMM
)
14607 poly_int64 first
= addr
.const_offset
;
14608 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
14609 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
14610 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
14613 /* Emit a register copy from operand to operand, taking care not to
14614 early-clobber source registers in the process.
14616 COUNT is the number of components into which the copy needs to be
14619 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
14620 unsigned int count
)
14623 int rdest
= REGNO (operands
[0]);
14624 int rsrc
= REGNO (operands
[1]);
14626 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
14628 for (i
= 0; i
< count
; i
++)
14629 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
14630 gen_rtx_REG (mode
, rsrc
+ i
));
14632 for (i
= 0; i
< count
; i
++)
14633 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
14634 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
14637 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14638 one of VSTRUCT modes: OI, CI, or XI. */
14640 aarch64_simd_attr_length_rglist (machine_mode mode
)
14642 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
14643 return (GET_MODE_SIZE (mode
).to_constant () / UNITS_PER_VREG
) * 4;
14646 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
14647 alignment of a vector to 128 bits. SVE predicates have an alignment of
14649 static HOST_WIDE_INT
14650 aarch64_simd_vector_alignment (const_tree type
)
14652 if (TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14653 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
14654 be set for non-predicate vectors of booleans. Modes are the most
14655 direct way we have of identifying real SVE predicate types. */
14656 return GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
? 16 : 128;
14657 HOST_WIDE_INT align
= tree_to_shwi (TYPE_SIZE (type
));
14658 return MIN (align
, 128);
14661 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
14663 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
14665 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
14667 /* If the length of the vector is fixed, try to align to that length,
14668 otherwise don't try to align at all. */
14669 HOST_WIDE_INT result
;
14670 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
14671 result
= TYPE_ALIGN (TREE_TYPE (type
));
14674 return TYPE_ALIGN (type
);
14677 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
14679 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
14684 /* For fixed-length vectors, check that the vectorizer will aim for
14685 full-vector alignment. This isn't true for generic GCC vectors
14686 that are wider than the ABI maximum of 128 bits. */
14687 poly_uint64 preferred_alignment
=
14688 aarch64_vectorize_preferred_vector_alignment (type
);
14689 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
14690 && maybe_ne (wi::to_widest (TYPE_SIZE (type
)),
14691 preferred_alignment
))
14694 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
14698 /* Return true if the vector misalignment factor is supported by the
14701 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
14702 const_tree type
, int misalignment
,
14705 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
14707 /* Return if movmisalign pattern is not supported for this mode. */
14708 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
14711 /* Misalignment factor is unknown at compile time. */
14712 if (misalignment
== -1)
14715 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
14719 /* If VALS is a vector constant that can be loaded into a register
14720 using DUP, generate instructions to do so and return an RTX to
14721 assign to the register. Otherwise return NULL_RTX. */
14723 aarch64_simd_dup_constant (rtx vals
)
14725 machine_mode mode
= GET_MODE (vals
);
14726 machine_mode inner_mode
= GET_MODE_INNER (mode
);
14729 if (!const_vec_duplicate_p (vals
, &x
))
14732 /* We can load this constant by using DUP and a constant in a
14733 single ARM register. This will be cheaper than a vector
14735 x
= copy_to_mode_reg (inner_mode
, x
);
14736 return gen_vec_duplicate (mode
, x
);
14740 /* Generate code to load VALS, which is a PARALLEL containing only
14741 constants (for vec_init) or CONST_VECTOR, efficiently into a
14742 register. Returns an RTX to copy into the register, or NULL_RTX
14743 for a PARALLEL that can not be converted into a CONST_VECTOR. */
14745 aarch64_simd_make_constant (rtx vals
)
14747 machine_mode mode
= GET_MODE (vals
);
14749 rtx const_vec
= NULL_RTX
;
14753 if (GET_CODE (vals
) == CONST_VECTOR
)
14755 else if (GET_CODE (vals
) == PARALLEL
)
14757 /* A CONST_VECTOR must contain only CONST_INTs and
14758 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
14759 Only store valid constants in a CONST_VECTOR. */
14760 int n_elts
= XVECLEN (vals
, 0);
14761 for (i
= 0; i
< n_elts
; ++i
)
14763 rtx x
= XVECEXP (vals
, 0, i
);
14764 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
14767 if (n_const
== n_elts
)
14768 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
14771 gcc_unreachable ();
14773 if (const_vec
!= NULL_RTX
14774 && aarch64_simd_valid_immediate (const_vec
, NULL
))
14775 /* Load using MOVI/MVNI. */
14777 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
14778 /* Loaded using DUP. */
14780 else if (const_vec
!= NULL_RTX
)
14781 /* Load from constant pool. We can not take advantage of single-cycle
14782 LD1 because we need a PC-relative addressing mode. */
14785 /* A PARALLEL containing something not valid inside CONST_VECTOR.
14786 We can not construct an initializer. */
14790 /* Expand a vector initialisation sequence, such that TARGET is
14791 initialised to contain VALS. */
14794 aarch64_expand_vector_init (rtx target
, rtx vals
)
14796 machine_mode mode
= GET_MODE (target
);
14797 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
14798 /* The number of vector elements. */
14799 int n_elts
= XVECLEN (vals
, 0);
14800 /* The number of vector elements which are not constant. */
14802 rtx any_const
= NULL_RTX
;
14803 /* The first element of vals. */
14804 rtx v0
= XVECEXP (vals
, 0, 0);
14805 bool all_same
= true;
14807 /* Count the number of variable elements to initialise. */
14808 for (int i
= 0; i
< n_elts
; ++i
)
14810 rtx x
= XVECEXP (vals
, 0, i
);
14811 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
14816 all_same
&= rtx_equal_p (x
, v0
);
14819 /* No variable elements, hand off to aarch64_simd_make_constant which knows
14820 how best to handle this. */
14823 rtx constant
= aarch64_simd_make_constant (vals
);
14824 if (constant
!= NULL_RTX
)
14826 emit_move_insn (target
, constant
);
14831 /* Splat a single non-constant element if we can. */
14834 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
14835 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
14839 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
14840 gcc_assert (icode
!= CODE_FOR_nothing
);
14842 /* If there are only variable elements, try to optimize
14843 the insertion using dup for the most common element
14844 followed by insertions. */
14846 /* The algorithm will fill matches[*][0] with the earliest matching element,
14847 and matches[X][1] with the count of duplicate elements (if X is the
14848 earliest element which has duplicates). */
14850 if (n_var
== n_elts
&& n_elts
<= 16)
14852 int matches
[16][2] = {0};
14853 for (int i
= 0; i
< n_elts
; i
++)
14855 for (int j
= 0; j
<= i
; j
++)
14857 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
14865 int maxelement
= 0;
14867 for (int i
= 0; i
< n_elts
; i
++)
14868 if (matches
[i
][1] > maxv
)
14871 maxv
= matches
[i
][1];
14874 /* Create a duplicate of the most common element, unless all elements
14875 are equally useless to us, in which case just immediately set the
14876 vector register using the first element. */
14880 /* For vectors of two 64-bit elements, we can do even better. */
14882 && (inner_mode
== E_DImode
14883 || inner_mode
== E_DFmode
))
14886 rtx x0
= XVECEXP (vals
, 0, 0);
14887 rtx x1
= XVECEXP (vals
, 0, 1);
14888 /* Combine can pick up this case, but handling it directly
14889 here leaves clearer RTL.
14891 This is load_pair_lanes<mode>, and also gives us a clean-up
14892 for store_pair_lanes<mode>. */
14893 if (memory_operand (x0
, inner_mode
)
14894 && memory_operand (x1
, inner_mode
)
14895 && !STRICT_ALIGNMENT
14896 && rtx_equal_p (XEXP (x1
, 0),
14897 plus_constant (Pmode
,
14899 GET_MODE_SIZE (inner_mode
))))
14902 if (inner_mode
== DFmode
)
14903 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
14905 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
14910 /* The subreg-move sequence below will move into lane zero of the
14911 vector register. For big-endian we want that position to hold
14912 the last element of VALS. */
14913 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
14914 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
14915 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
14919 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
14920 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
14923 /* Insert the rest. */
14924 for (int i
= 0; i
< n_elts
; i
++)
14926 rtx x
= XVECEXP (vals
, 0, i
);
14927 if (matches
[i
][0] == maxelement
)
14929 x
= copy_to_mode_reg (inner_mode
, x
);
14930 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
14935 /* Initialise a vector which is part-variable. We want to first try
14936 to build those lanes which are constant in the most efficient way we
14938 if (n_var
!= n_elts
)
14940 rtx copy
= copy_rtx (vals
);
14942 /* Load constant part of vector. We really don't care what goes into the
14943 parts we will overwrite, but we're more likely to be able to load the
14944 constant efficiently if it has fewer, larger, repeating parts
14945 (see aarch64_simd_valid_immediate). */
14946 for (int i
= 0; i
< n_elts
; i
++)
14948 rtx x
= XVECEXP (vals
, 0, i
);
14949 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
14951 rtx subst
= any_const
;
14952 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
14954 /* Look in the copied vector, as more elements are const. */
14955 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
14956 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
14962 XVECEXP (copy
, 0, i
) = subst
;
14964 aarch64_expand_vector_init (target
, copy
);
14967 /* Insert the variable lanes directly. */
14968 for (int i
= 0; i
< n_elts
; i
++)
14970 rtx x
= XVECEXP (vals
, 0, i
);
14971 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
14973 x
= copy_to_mode_reg (inner_mode
, x
);
14974 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
14978 static unsigned HOST_WIDE_INT
14979 aarch64_shift_truncation_mask (machine_mode mode
)
14981 if (!SHIFT_COUNT_TRUNCATED
|| aarch64_vector_data_mode_p (mode
))
14983 return GET_MODE_UNIT_BITSIZE (mode
) - 1;
14986 /* Select a format to encode pointers in exception handling data. */
14988 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
14991 switch (aarch64_cmodel
)
14993 case AARCH64_CMODEL_TINY
:
14994 case AARCH64_CMODEL_TINY_PIC
:
14995 case AARCH64_CMODEL_SMALL
:
14996 case AARCH64_CMODEL_SMALL_PIC
:
14997 case AARCH64_CMODEL_SMALL_SPIC
:
14998 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15000 type
= DW_EH_PE_sdata4
;
15003 /* No assumptions here. 8-byte relocs required. */
15004 type
= DW_EH_PE_sdata8
;
15007 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
15010 /* The last .arch and .tune assembly strings that we printed. */
15011 static std::string aarch64_last_printed_arch_string
;
15012 static std::string aarch64_last_printed_tune_string
;
15014 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15015 by the function fndecl. */
15018 aarch64_declare_function_name (FILE *stream
, const char* name
,
15021 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
15023 struct cl_target_option
*targ_options
;
15025 targ_options
= TREE_TARGET_OPTION (target_parts
);
15027 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
15028 gcc_assert (targ_options
);
15030 const struct processor
*this_arch
15031 = aarch64_get_arch (targ_options
->x_explicit_arch
);
15033 unsigned long isa_flags
= targ_options
->x_aarch64_isa_flags
;
15034 std::string extension
15035 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
15037 /* Only update the assembler .arch string if it is distinct from the last
15038 such string we printed. */
15039 std::string to_print
= this_arch
->name
+ extension
;
15040 if (to_print
!= aarch64_last_printed_arch_string
)
15042 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
15043 aarch64_last_printed_arch_string
= to_print
;
15046 /* Print the cpu name we're tuning for in the comments, might be
15047 useful to readers of the generated asm. Do it only when it changes
15048 from function to function and verbose assembly is requested. */
15049 const struct processor
*this_tune
15050 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
15052 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
15054 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
15056 aarch64_last_printed_tune_string
= this_tune
->name
;
15059 /* Don't forget the type directive for ELF. */
15060 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
15061 ASM_OUTPUT_LABEL (stream
, name
);
15064 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15067 aarch64_start_file (void)
15069 struct cl_target_option
*default_options
15070 = TREE_TARGET_OPTION (target_option_default_node
);
15072 const struct processor
*default_arch
15073 = aarch64_get_arch (default_options
->x_explicit_arch
);
15074 unsigned long default_isa_flags
= default_options
->x_aarch64_isa_flags
;
15075 std::string extension
15076 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
15077 default_arch
->flags
);
15079 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
15080 aarch64_last_printed_tune_string
= "";
15081 asm_fprintf (asm_out_file
, "\t.arch %s\n",
15082 aarch64_last_printed_arch_string
.c_str ());
15084 default_file_start ();
15087 /* Emit load exclusive. */
15090 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
15091 rtx mem
, rtx model_rtx
)
15093 emit_insn (gen_aarch64_load_exclusive (mode
, rval
, mem
, model_rtx
));
15096 /* Emit store exclusive. */
15099 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
15100 rtx rval
, rtx mem
, rtx model_rtx
)
15102 emit_insn (gen_aarch64_store_exclusive (mode
, bval
, rval
, mem
, model_rtx
));
15105 /* Mark the previous jump instruction as unlikely. */
15108 aarch64_emit_unlikely_jump (rtx insn
)
15110 rtx_insn
*jump
= emit_jump_insn (insn
);
15111 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
15114 /* Expand a compare and swap pattern. */
15117 aarch64_expand_compare_and_swap (rtx operands
[])
15119 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
, cc_reg
;
15120 machine_mode mode
, r_mode
;
15122 bval
= operands
[0];
15123 rval
= operands
[1];
15125 oldval
= operands
[3];
15126 newval
= operands
[4];
15127 is_weak
= operands
[5];
15128 mod_s
= operands
[6];
15129 mod_f
= operands
[7];
15130 mode
= GET_MODE (mem
);
15132 /* Normally the succ memory model must be stronger than fail, but in the
15133 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15134 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15135 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
15136 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
15137 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
15140 if (mode
== QImode
|| mode
== HImode
)
15143 rval
= gen_reg_rtx (r_mode
);
15148 /* The CAS insn requires oldval and rval overlap, but we need to
15149 have a copy of oldval saved across the operation to tell if
15150 the operation is successful. */
15151 if (reg_overlap_mentioned_p (rval
, oldval
))
15152 rval
= copy_to_mode_reg (r_mode
, oldval
);
15154 emit_move_insn (rval
, gen_lowpart (r_mode
, oldval
));
15156 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
,
15158 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
15162 /* The oldval predicate varies by mode. Test it and force to reg. */
15163 insn_code code
= code_for_aarch64_compare_and_swap (mode
);
15164 if (!insn_data
[code
].operand
[2].predicate (oldval
, mode
))
15165 oldval
= force_reg (mode
, oldval
);
15167 emit_insn (GEN_FCN (code
) (rval
, mem
, oldval
, newval
,
15168 is_weak
, mod_s
, mod_f
));
15169 cc_reg
= gen_rtx_REG (CCmode
, CC_REGNUM
);
15172 if (r_mode
!= mode
)
15173 rval
= gen_lowpart (mode
, rval
);
15174 emit_move_insn (operands
[1], rval
);
15176 x
= gen_rtx_EQ (SImode
, cc_reg
, const0_rtx
);
15177 emit_insn (gen_rtx_SET (bval
, x
));
15180 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
15181 sequence implementing an atomic operation. */
15184 aarch64_emit_post_barrier (enum memmodel model
)
15186 const enum memmodel base_model
= memmodel_base (model
);
15188 if (is_mm_sync (model
)
15189 && (base_model
== MEMMODEL_ACQUIRE
15190 || base_model
== MEMMODEL_ACQ_REL
15191 || base_model
== MEMMODEL_SEQ_CST
))
15193 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
15197 /* Split a compare and swap pattern. */
15200 aarch64_split_compare_and_swap (rtx operands
[])
15202 rtx rval
, mem
, oldval
, newval
, scratch
;
15205 rtx_code_label
*label1
, *label2
;
15207 enum memmodel model
;
15210 rval
= operands
[0];
15212 oldval
= operands
[2];
15213 newval
= operands
[3];
15214 is_weak
= (operands
[4] != const0_rtx
);
15215 model_rtx
= operands
[5];
15216 scratch
= operands
[7];
15217 mode
= GET_MODE (mem
);
15218 model
= memmodel_from_int (INTVAL (model_rtx
));
15220 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15223 LD[A]XR rval, [mem]
15225 ST[L]XR scratch, newval, [mem]
15226 CBNZ scratch, .label1
15229 bool strong_zero_p
= !is_weak
&& oldval
== const0_rtx
;
15234 label1
= gen_label_rtx ();
15235 emit_label (label1
);
15237 label2
= gen_label_rtx ();
15239 /* The initial load can be relaxed for a __sync operation since a final
15240 barrier will be emitted to stop code hoisting. */
15241 if (is_mm_sync (model
))
15242 aarch64_emit_load_exclusive (mode
, rval
, mem
,
15243 GEN_INT (MEMMODEL_RELAXED
));
15245 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
15249 if (aarch64_track_speculation
)
15251 /* Emit an explicit compare instruction, so that we can correctly
15252 track the condition codes. */
15253 rtx cc_reg
= aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
15254 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
15257 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
15259 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
15260 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
15261 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
15265 cond
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
15266 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
15267 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
15268 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
15269 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
15272 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
15276 if (aarch64_track_speculation
)
15278 /* Emit an explicit compare instruction, so that we can correctly
15279 track the condition codes. */
15280 rtx cc_reg
= aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
15281 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
15284 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
15286 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
15287 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
15288 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
15292 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
15293 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
15294 emit_insn (gen_rtx_SET (cond
, x
));
15297 emit_label (label2
);
15298 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
15299 to set the condition flags. If this is not used it will be removed by
15303 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
15304 x
= gen_rtx_COMPARE (CCmode
, rval
, const0_rtx
);
15305 emit_insn (gen_rtx_SET (cond
, x
));
15307 /* Emit any final barrier needed for a __sync operation. */
15308 if (is_mm_sync (model
))
15309 aarch64_emit_post_barrier (model
);
15312 /* Split an atomic operation. */
15315 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
15316 rtx value
, rtx model_rtx
, rtx cond
)
15318 machine_mode mode
= GET_MODE (mem
);
15319 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
15320 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
15321 const bool is_sync
= is_mm_sync (model
);
15322 rtx_code_label
*label
;
15325 /* Split the atomic operation into a sequence. */
15326 label
= gen_label_rtx ();
15327 emit_label (label
);
15330 new_out
= gen_lowpart (wmode
, new_out
);
15332 old_out
= gen_lowpart (wmode
, old_out
);
15335 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
15337 /* The initial load can be relaxed for a __sync operation since a final
15338 barrier will be emitted to stop code hoisting. */
15340 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
15341 GEN_INT (MEMMODEL_RELAXED
));
15343 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
15352 x
= gen_rtx_AND (wmode
, old_out
, value
);
15353 emit_insn (gen_rtx_SET (new_out
, x
));
15354 x
= gen_rtx_NOT (wmode
, new_out
);
15355 emit_insn (gen_rtx_SET (new_out
, x
));
15359 if (CONST_INT_P (value
))
15361 value
= GEN_INT (-INTVAL (value
));
15364 /* Fall through. */
15367 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
15368 emit_insn (gen_rtx_SET (new_out
, x
));
15372 aarch64_emit_store_exclusive (mode
, cond
, mem
,
15373 gen_lowpart (mode
, new_out
), model_rtx
);
15375 if (aarch64_track_speculation
)
15377 /* Emit an explicit compare instruction, so that we can correctly
15378 track the condition codes. */
15379 rtx cc_reg
= aarch64_gen_compare_reg (NE
, cond
, const0_rtx
);
15380 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
15383 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
15385 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
15386 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
15387 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
15389 /* Emit any final barrier needed for a __sync operation. */
15391 aarch64_emit_post_barrier (model
);
15395 aarch64_init_libfuncs (void)
15397 /* Half-precision float operations. The compiler handles all operations
15398 with NULL libfuncs by converting to SFmode. */
15401 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
15402 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
15405 set_optab_libfunc (add_optab
, HFmode
, NULL
);
15406 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
15407 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
15408 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
15409 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
15412 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
15413 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
15414 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
15415 set_optab_libfunc (le_optab
, HFmode
, NULL
);
15416 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
15417 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
15418 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
15421 /* Target hook for c_mode_for_suffix. */
15422 static machine_mode
15423 aarch64_c_mode_for_suffix (char suffix
)
15431 /* We can only represent floating point constants which will fit in
15432 "quarter-precision" values. These values are characterised by
15433 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
15436 (-1)^s * (n/16) * 2^r
15439 's' is the sign bit.
15440 'n' is an integer in the range 16 <= n <= 31.
15441 'r' is an integer in the range -3 <= r <= 4. */
15443 /* Return true iff X can be represented by a quarter-precision
15444 floating point immediate operand X. Note, we cannot represent 0.0. */
15446 aarch64_float_const_representable_p (rtx x
)
15448 /* This represents our current view of how many bits
15449 make up the mantissa. */
15450 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
15452 unsigned HOST_WIDE_INT mantissa
, mask
;
15453 REAL_VALUE_TYPE r
, m
;
15456 if (!CONST_DOUBLE_P (x
))
15459 if (GET_MODE (x
) == VOIDmode
15460 || (GET_MODE (x
) == HFmode
&& !TARGET_FP_F16INST
))
15463 r
= *CONST_DOUBLE_REAL_VALUE (x
);
15465 /* We cannot represent infinities, NaNs or +/-zero. We won't
15466 know if we have +zero until we analyse the mantissa, but we
15467 can reject the other invalid values. */
15468 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
15469 || REAL_VALUE_MINUS_ZERO (r
))
15472 /* Extract exponent. */
15473 r
= real_value_abs (&r
);
15474 exponent
= REAL_EXP (&r
);
15476 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
15477 highest (sign) bit, with a fixed binary point at bit point_pos.
15478 m1 holds the low part of the mantissa, m2 the high part.
15479 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
15480 bits for the mantissa, this can fail (low bits will be lost). */
15481 real_ldexp (&m
, &r
, point_pos
- exponent
);
15482 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
15484 /* If the low part of the mantissa has bits set we cannot represent
15486 if (w
.ulow () != 0)
15488 /* We have rejected the lower HOST_WIDE_INT, so update our
15489 understanding of how many bits lie in the mantissa and
15490 look only at the high HOST_WIDE_INT. */
15491 mantissa
= w
.elt (1);
15492 point_pos
-= HOST_BITS_PER_WIDE_INT
;
15494 /* We can only represent values with a mantissa of the form 1.xxxx. */
15495 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
15496 if ((mantissa
& mask
) != 0)
15499 /* Having filtered unrepresentable values, we may now remove all
15500 but the highest 5 bits. */
15501 mantissa
>>= point_pos
- 5;
15503 /* We cannot represent the value 0.0, so reject it. This is handled
15508 /* Then, as bit 4 is always set, we can mask it off, leaving
15509 the mantissa in the range [0, 15]. */
15510 mantissa
&= ~(1 << 4);
15511 gcc_assert (mantissa
<= 15);
15513 /* GCC internally does not use IEEE754-like encoding (where normalized
15514 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
15515 Our mantissa values are shifted 4 places to the left relative to
15516 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
15517 by 5 places to correct for GCC's representation. */
15518 exponent
= 5 - exponent
;
15520 return (exponent
>= 0 && exponent
<= 7);
15523 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
15524 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
15525 output MOVI/MVNI, ORR or BIC immediate. */
15527 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
15528 enum simd_immediate_check which
)
15531 static char templ
[40];
15532 const char *mnemonic
;
15533 const char *shift_op
;
15534 unsigned int lane_count
= 0;
15537 struct simd_immediate_info info
;
15539 /* This will return true to show const_vector is legal for use as either
15540 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
15541 It will also update INFO to show how the immediate should be generated.
15542 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
15543 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
15544 gcc_assert (is_valid
);
15546 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
15547 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
15549 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
15551 gcc_assert (info
.shift
== 0 && info
.insn
== simd_immediate_info::MOV
);
15552 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15553 move immediate path. */
15554 if (aarch64_float_const_zero_rtx_p (info
.value
))
15555 info
.value
= GEN_INT (0);
15558 const unsigned int buf_size
= 20;
15559 char float_buf
[buf_size
] = {'\0'};
15560 real_to_decimal_for_mode (float_buf
,
15561 CONST_DOUBLE_REAL_VALUE (info
.value
),
15562 buf_size
, buf_size
, 1, info
.elt_mode
);
15564 if (lane_count
== 1)
15565 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
15567 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
15568 lane_count
, element_char
, float_buf
);
15573 gcc_assert (CONST_INT_P (info
.value
));
15575 if (which
== AARCH64_CHECK_MOV
)
15577 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
15578 shift_op
= info
.modifier
== simd_immediate_info::MSL
? "msl" : "lsl";
15579 if (lane_count
== 1)
15580 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
15581 mnemonic
, UINTVAL (info
.value
));
15582 else if (info
.shift
)
15583 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
15584 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
15585 element_char
, UINTVAL (info
.value
), shift_op
, info
.shift
);
15587 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
15588 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
15589 element_char
, UINTVAL (info
.value
));
15593 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15594 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
15596 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
15597 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
15598 element_char
, UINTVAL (info
.value
), "lsl", info
.shift
);
15600 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
15601 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
15602 element_char
, UINTVAL (info
.value
));
15608 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
15611 /* If a floating point number was passed and we desire to use it in an
15612 integer mode do the conversion to integer. */
15613 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
15615 unsigned HOST_WIDE_INT ival
;
15616 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
15617 gcc_unreachable ();
15618 immediate
= gen_int_mode (ival
, mode
);
15621 machine_mode vmode
;
15622 /* use a 64 bit mode for everything except for DI/DF mode, where we use
15623 a 128 bit vector mode. */
15624 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
15626 vmode
= aarch64_simd_container_mode (mode
, width
);
15627 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
15628 return aarch64_output_simd_mov_immediate (v_op
, width
);
15631 /* Return the output string to use for moving immediate CONST_VECTOR
15632 into an SVE register. */
15635 aarch64_output_sve_mov_immediate (rtx const_vector
)
15637 static char templ
[40];
15638 struct simd_immediate_info info
;
15641 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
15642 gcc_assert (is_valid
);
15644 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
15648 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
15649 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
15650 element_char
, INTVAL (info
.value
), INTVAL (info
.step
));
15654 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
15656 if (aarch64_float_const_zero_rtx_p (info
.value
))
15657 info
.value
= GEN_INT (0);
15660 const int buf_size
= 20;
15661 char float_buf
[buf_size
] = {};
15662 real_to_decimal_for_mode (float_buf
,
15663 CONST_DOUBLE_REAL_VALUE (info
.value
),
15664 buf_size
, buf_size
, 1, info
.elt_mode
);
15666 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
15667 element_char
, float_buf
);
15672 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
15673 element_char
, INTVAL (info
.value
));
15677 /* Return the asm format for a PTRUE instruction whose destination has
15678 mode MODE. SUFFIX is the element size suffix. */
15681 aarch64_output_ptrue (machine_mode mode
, char suffix
)
15683 unsigned int nunits
;
15684 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
15685 if (GET_MODE_NUNITS (mode
).is_constant (&nunits
))
15686 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", suffix
, nunits
);
15688 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, all", suffix
);
15692 /* Split operands into moves from op[1] + op[2] into op[0]. */
15695 aarch64_split_combinev16qi (rtx operands
[3])
15697 unsigned int dest
= REGNO (operands
[0]);
15698 unsigned int src1
= REGNO (operands
[1]);
15699 unsigned int src2
= REGNO (operands
[2]);
15700 machine_mode halfmode
= GET_MODE (operands
[1]);
15701 unsigned int halfregs
= REG_NREGS (operands
[1]);
15702 rtx destlo
, desthi
;
15704 gcc_assert (halfmode
== V16QImode
);
15706 if (src1
== dest
&& src2
== dest
+ halfregs
)
15708 /* No-op move. Can't split to nothing; emit something. */
15709 emit_note (NOTE_INSN_DELETED
);
15713 /* Preserve register attributes for variable tracking. */
15714 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
15715 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
15716 GET_MODE_SIZE (halfmode
));
15718 /* Special case of reversed high/low parts. */
15719 if (reg_overlap_mentioned_p (operands
[2], destlo
)
15720 && reg_overlap_mentioned_p (operands
[1], desthi
))
15722 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
15723 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
15724 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
15726 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
15728 /* Try to avoid unnecessary moves if part of the result
15729 is in the right place already. */
15731 emit_move_insn (destlo
, operands
[1]);
15732 if (src2
!= dest
+ halfregs
)
15733 emit_move_insn (desthi
, operands
[2]);
15737 if (src2
!= dest
+ halfregs
)
15738 emit_move_insn (desthi
, operands
[2]);
15740 emit_move_insn (destlo
, operands
[1]);
15744 /* vec_perm support. */
15746 struct expand_vec_perm_d
15748 rtx target
, op0
, op1
;
15749 vec_perm_indices perm
;
15750 machine_mode vmode
;
15751 unsigned int vec_flags
;
15756 /* Generate a variable permutation. */
15759 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
15761 machine_mode vmode
= GET_MODE (target
);
15762 bool one_vector_p
= rtx_equal_p (op0
, op1
);
15764 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
15765 gcc_checking_assert (GET_MODE (op0
) == vmode
);
15766 gcc_checking_assert (GET_MODE (op1
) == vmode
);
15767 gcc_checking_assert (GET_MODE (sel
) == vmode
);
15768 gcc_checking_assert (TARGET_SIMD
);
15772 if (vmode
== V8QImode
)
15774 /* Expand the argument to a V16QI mode by duplicating it. */
15775 rtx pair
= gen_reg_rtx (V16QImode
);
15776 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
15777 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
15781 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
15788 if (vmode
== V8QImode
)
15790 pair
= gen_reg_rtx (V16QImode
);
15791 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
15792 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
15796 pair
= gen_reg_rtx (OImode
);
15797 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
15798 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
15803 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15804 NELT is the number of elements in the vector. */
15807 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
15810 machine_mode vmode
= GET_MODE (target
);
15811 bool one_vector_p
= rtx_equal_p (op0
, op1
);
15814 /* The TBL instruction does not use a modulo index, so we must take care
15815 of that ourselves. */
15816 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
15817 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
15818 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
15820 /* For big-endian, we also need to reverse the index within the vector
15821 (but not which vector). */
15822 if (BYTES_BIG_ENDIAN
)
15824 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15826 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
15827 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
15828 NULL
, 0, OPTAB_LIB_WIDEN
);
15830 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
15833 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15836 emit_unspec2 (rtx target
, int code
, rtx op0
, rtx op1
)
15838 emit_insn (gen_rtx_SET (target
,
15839 gen_rtx_UNSPEC (GET_MODE (target
),
15840 gen_rtvec (2, op0
, op1
), code
)));
15843 /* Expand an SVE vec_perm with the given operands. */
15846 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
15848 machine_mode data_mode
= GET_MODE (target
);
15849 machine_mode sel_mode
= GET_MODE (sel
);
15850 /* Enforced by the pattern condition. */
15851 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
15853 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15854 size of the two value vectors, i.e. the upper bits of the indices
15855 are effectively ignored. SVE TBL instead produces 0 for any
15856 out-of-range indices, so we need to modulo all the vec_perm indices
15857 to ensure they are all in range. */
15858 rtx sel_reg
= force_reg (sel_mode
, sel
);
15860 /* Check if the sel only references the first values vector. */
15861 if (GET_CODE (sel
) == CONST_VECTOR
15862 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
15864 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
15868 /* Check if the two values vectors are the same. */
15869 if (rtx_equal_p (op0
, op1
))
15871 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
15872 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
15873 NULL
, 0, OPTAB_DIRECT
);
15874 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
15878 /* Run TBL on for each value vector and combine the results. */
15880 rtx res0
= gen_reg_rtx (data_mode
);
15881 rtx res1
= gen_reg_rtx (data_mode
);
15882 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
15883 if (GET_CODE (sel
) != CONST_VECTOR
15884 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
15886 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
15888 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
15889 NULL
, 0, OPTAB_DIRECT
);
15891 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
15892 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
15893 NULL
, 0, OPTAB_DIRECT
);
15894 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
15895 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
15896 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
15898 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
15901 /* Recognize patterns suitable for the TRN instructions. */
15903 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
15906 poly_uint64 nelt
= d
->perm
.length ();
15907 rtx out
, in0
, in1
, x
;
15908 machine_mode vmode
= d
->vmode
;
15910 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15913 /* Note that these are little-endian tests.
15914 We correct for big-endian later. */
15915 if (!d
->perm
[0].is_constant (&odd
)
15916 || (odd
!= 0 && odd
!= 1)
15917 || !d
->perm
.series_p (0, 2, odd
, 2)
15918 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
15927 /* We don't need a big-endian lane correction for SVE; see the comment
15928 at the head of aarch64-sve.md for details. */
15929 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
15931 x
= in0
, in0
= in1
, in1
= x
;
15936 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
15937 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
15941 /* Recognize patterns suitable for the UZP instructions. */
15943 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
15946 rtx out
, in0
, in1
, x
;
15947 machine_mode vmode
= d
->vmode
;
15949 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15952 /* Note that these are little-endian tests.
15953 We correct for big-endian later. */
15954 if (!d
->perm
[0].is_constant (&odd
)
15955 || (odd
!= 0 && odd
!= 1)
15956 || !d
->perm
.series_p (0, 1, odd
, 2))
15965 /* We don't need a big-endian lane correction for SVE; see the comment
15966 at the head of aarch64-sve.md for details. */
15967 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
15969 x
= in0
, in0
= in1
, in1
= x
;
15974 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
15975 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
15979 /* Recognize patterns suitable for the ZIP instructions. */
15981 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
15984 poly_uint64 nelt
= d
->perm
.length ();
15985 rtx out
, in0
, in1
, x
;
15986 machine_mode vmode
= d
->vmode
;
15988 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15991 /* Note that these are little-endian tests.
15992 We correct for big-endian later. */
15993 poly_uint64 first
= d
->perm
[0];
15994 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
15995 || !d
->perm
.series_p (0, 2, first
, 1)
15996 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
15998 high
= maybe_ne (first
, 0U);
16006 /* We don't need a big-endian lane correction for SVE; see the comment
16007 at the head of aarch64-sve.md for details. */
16008 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
16010 x
= in0
, in0
= in1
, in1
= x
;
16015 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
16016 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
16020 /* Recognize patterns for the EXT insn. */
16023 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
16025 HOST_WIDE_INT location
;
16028 /* The first element always refers to the first vector.
16029 Check if the extracted indices are increasing by one. */
16030 if (d
->vec_flags
== VEC_SVE_PRED
16031 || !d
->perm
[0].is_constant (&location
)
16032 || !d
->perm
.series_p (0, 1, location
, 1))
16039 /* The case where (location == 0) is a no-op for both big- and little-endian,
16040 and is removed by the mid-end at optimization levels -O1 and higher.
16042 We don't need a big-endian lane correction for SVE; see the comment
16043 at the head of aarch64-sve.md for details. */
16044 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
16046 /* After setup, we want the high elements of the first vector (stored
16047 at the LSB end of the register), and the low elements of the second
16048 vector (stored at the MSB end of the register). So swap. */
16049 std::swap (d
->op0
, d
->op1
);
16050 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16051 to_constant () is safe since this is restricted to Advanced SIMD
16053 location
= d
->perm
.length ().to_constant () - location
;
16056 offset
= GEN_INT (location
);
16057 emit_set_insn (d
->target
,
16058 gen_rtx_UNSPEC (d
->vmode
,
16059 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
16064 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16065 within each 64-bit, 32-bit or 16-bit granule. */
16068 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
16070 HOST_WIDE_INT diff
;
16071 unsigned int i
, size
, unspec
;
16072 machine_mode pred_mode
;
16074 if (d
->vec_flags
== VEC_SVE_PRED
16075 || !d
->one_vector_p
16076 || !d
->perm
[0].is_constant (&diff
))
16079 size
= (diff
+ 1) * GET_MODE_UNIT_SIZE (d
->vmode
);
16082 unspec
= UNSPEC_REV64
;
16083 pred_mode
= VNx2BImode
;
16085 else if (size
== 4)
16087 unspec
= UNSPEC_REV32
;
16088 pred_mode
= VNx4BImode
;
16090 else if (size
== 2)
16092 unspec
= UNSPEC_REV16
;
16093 pred_mode
= VNx8BImode
;
16098 unsigned int step
= diff
+ 1;
16099 for (i
= 0; i
< step
; ++i
)
16100 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
16107 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
16108 if (d
->vec_flags
== VEC_SVE_DATA
)
16110 rtx pred
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
16111 src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (2, pred
, src
),
16112 UNSPEC_MERGE_PTRUE
);
16114 emit_set_insn (d
->target
, src
);
16118 /* Recognize patterns for the REV insn, which reverses elements within
16122 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
16124 poly_uint64 nelt
= d
->perm
.length ();
16126 if (!d
->one_vector_p
|| d
->vec_flags
!= VEC_SVE_DATA
)
16129 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
16136 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
16137 emit_set_insn (d
->target
, src
);
16142 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
16144 rtx out
= d
->target
;
16147 machine_mode vmode
= d
->vmode
;
16150 if (d
->vec_flags
== VEC_SVE_PRED
16151 || d
->perm
.encoding ().encoded_nelts () != 1
16152 || !d
->perm
[0].is_constant (&elt
))
16155 if (d
->vec_flags
== VEC_SVE_DATA
&& elt
>= 64 * GET_MODE_UNIT_SIZE (vmode
))
16162 /* The generic preparation in aarch64_expand_vec_perm_const_1
16163 swaps the operand order and the permute indices if it finds
16164 d->perm[0] to be in the second operand. Thus, we can always
16165 use d->op0 and need not do any extra arithmetic to get the
16166 correct lane number. */
16168 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
16170 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
16171 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
16172 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
16177 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
16179 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
16180 machine_mode vmode
= d
->vmode
;
16182 /* Make sure that the indices are constant. */
16183 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
16184 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
16185 if (!d
->perm
[i
].is_constant ())
16191 /* Generic code will try constant permutation twice. Once with the
16192 original mode and again with the elements lowered to QImode.
16193 So wait and don't do the selector expansion ourselves. */
16194 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
16197 /* to_constant is safe since this routine is specific to Advanced SIMD
16199 unsigned int nelt
= d
->perm
.length ().to_constant ();
16200 for (unsigned int i
= 0; i
< nelt
; ++i
)
16201 /* If big-endian and two vectors we end up with a weird mixed-endian
16202 mode on NEON. Reverse the index within each word but not the word
16203 itself. to_constant is safe because we checked is_constant above. */
16204 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
16205 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
16206 : d
->perm
[i
].to_constant ());
16208 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
16209 sel
= force_reg (vmode
, sel
);
16211 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
16215 /* Try to implement D using an SVE TBL instruction. */
16218 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
16220 unsigned HOST_WIDE_INT nelt
;
16222 /* Permuting two variable-length vectors could overflow the
16224 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
16230 machine_mode sel_mode
= mode_for_int_vector (d
->vmode
).require ();
16231 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
16232 if (d
->one_vector_p
)
16233 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
16235 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
16240 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
16242 /* The pattern matching functions above are written to look for a small
16243 number to begin the sequence (0, 1, N/2). If we begin with an index
16244 from the second operand, we can swap the operands. */
16245 poly_int64 nelt
= d
->perm
.length ();
16246 if (known_ge (d
->perm
[0], nelt
))
16248 d
->perm
.rotate_inputs (1);
16249 std::swap (d
->op0
, d
->op1
);
16252 if ((d
->vec_flags
== VEC_ADVSIMD
16253 || d
->vec_flags
== VEC_SVE_DATA
16254 || d
->vec_flags
== VEC_SVE_PRED
)
16255 && known_gt (nelt
, 1))
16257 if (aarch64_evpc_rev_local (d
))
16259 else if (aarch64_evpc_rev_global (d
))
16261 else if (aarch64_evpc_ext (d
))
16263 else if (aarch64_evpc_dup (d
))
16265 else if (aarch64_evpc_zip (d
))
16267 else if (aarch64_evpc_uzp (d
))
16269 else if (aarch64_evpc_trn (d
))
16271 if (d
->vec_flags
== VEC_SVE_DATA
)
16272 return aarch64_evpc_sve_tbl (d
);
16273 else if (d
->vec_flags
== VEC_ADVSIMD
)
16274 return aarch64_evpc_tbl (d
);
16279 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
16282 aarch64_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
16283 rtx op1
, const vec_perm_indices
&sel
)
16285 struct expand_vec_perm_d d
;
16287 /* Check whether the mask can be applied to a single vector. */
16288 if (sel
.ninputs () == 1
16289 || (op0
&& rtx_equal_p (op0
, op1
)))
16290 d
.one_vector_p
= true;
16291 else if (sel
.all_from_input_p (0))
16293 d
.one_vector_p
= true;
16296 else if (sel
.all_from_input_p (1))
16298 d
.one_vector_p
= true;
16302 d
.one_vector_p
= false;
16304 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
16305 sel
.nelts_per_input ());
16307 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
16311 d
.testing_p
= !target
;
16314 return aarch64_expand_vec_perm_const_1 (&d
);
16316 rtx_insn
*last
= get_last_insn ();
16317 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
16318 gcc_assert (last
== get_last_insn ());
16323 /* Generate a byte permute mask for a register of mode MODE,
16324 which has NUNITS units. */
16327 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
16329 /* We have to reverse each vector because we dont have
16330 a permuted load that can reverse-load according to ABI rules. */
16332 rtvec v
= rtvec_alloc (16);
16334 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
16336 gcc_assert (BYTES_BIG_ENDIAN
);
16337 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
16339 for (i
= 0; i
< nunits
; i
++)
16340 for (j
= 0; j
< usize
; j
++)
16341 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
16342 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
16343 return force_reg (V16QImode
, mask
);
16346 /* Return true if X is a valid second operand for the SVE instruction
16347 that implements integer comparison OP_CODE. */
16350 aarch64_sve_cmp_operand_p (rtx_code op_code
, rtx x
)
16352 if (register_operand (x
, VOIDmode
))
16361 return aarch64_sve_cmp_immediate_p (x
, false);
16368 return aarch64_sve_cmp_immediate_p (x
, true);
16370 gcc_unreachable ();
16374 /* Use predicated SVE instructions to implement the equivalent of:
16378 given that PTRUE is an all-true predicate of the appropriate mode. */
16381 aarch64_emit_sve_ptrue_op (rtx target
, rtx ptrue
, rtx op
)
16383 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
16384 gen_rtvec (2, ptrue
, op
),
16385 UNSPEC_MERGE_PTRUE
);
16386 rtx_insn
*insn
= emit_set_insn (target
, unspec
);
16387 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
16390 /* Likewise, but also clobber the condition codes. */
16393 aarch64_emit_sve_ptrue_op_cc (rtx target
, rtx ptrue
, rtx op
)
16395 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
16396 gen_rtvec (2, ptrue
, op
),
16397 UNSPEC_MERGE_PTRUE
);
16398 rtx_insn
*insn
= emit_insn (gen_set_clobber_cc (target
, unspec
));
16399 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
16402 /* Return the UNSPEC_COND_* code for comparison CODE. */
16404 static unsigned int
16405 aarch64_unspec_cond_code (rtx_code code
)
16410 return UNSPEC_COND_NE
;
16412 return UNSPEC_COND_EQ
;
16414 return UNSPEC_COND_LT
;
16416 return UNSPEC_COND_GT
;
16418 return UNSPEC_COND_LE
;
16420 return UNSPEC_COND_GE
;
16422 gcc_unreachable ();
16428 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
16430 where <X> is the operation associated with comparison CODE. This form
16431 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
16432 semantics, such as when PRED might not be all-true and when comparing
16433 inactive lanes could have side effects. */
16436 aarch64_emit_sve_predicated_cond (rtx target
, rtx_code code
,
16437 rtx pred
, rtx op0
, rtx op1
)
16439 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (pred
),
16440 gen_rtvec (3, pred
, op0
, op1
),
16441 aarch64_unspec_cond_code (code
));
16442 emit_set_insn (target
, unspec
);
16445 /* Expand an SVE integer comparison using the SVE equivalent of:
16447 (set TARGET (CODE OP0 OP1)). */
16450 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
16452 machine_mode pred_mode
= GET_MODE (target
);
16453 machine_mode data_mode
= GET_MODE (op0
);
16455 if (!aarch64_sve_cmp_operand_p (code
, op1
))
16456 op1
= force_reg (data_mode
, op1
);
16458 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
16459 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
16460 aarch64_emit_sve_ptrue_op_cc (target
, ptrue
, cond
);
16463 /* Emit the SVE equivalent of:
16465 (set TMP1 (CODE1 OP0 OP1))
16466 (set TMP2 (CODE2 OP0 OP1))
16467 (set TARGET (ior:PRED_MODE TMP1 TMP2))
16469 PTRUE is an all-true predicate with the same mode as TARGET. */
16472 aarch64_emit_sve_or_conds (rtx target
, rtx_code code1
, rtx_code code2
,
16473 rtx ptrue
, rtx op0
, rtx op1
)
16475 machine_mode pred_mode
= GET_MODE (ptrue
);
16476 rtx tmp1
= gen_reg_rtx (pred_mode
);
16477 aarch64_emit_sve_ptrue_op (tmp1
, ptrue
,
16478 gen_rtx_fmt_ee (code1
, pred_mode
, op0
, op1
));
16479 rtx tmp2
= gen_reg_rtx (pred_mode
);
16480 aarch64_emit_sve_ptrue_op (tmp2
, ptrue
,
16481 gen_rtx_fmt_ee (code2
, pred_mode
, op0
, op1
));
16482 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
16485 /* Emit the SVE equivalent of:
16487 (set TMP (CODE OP0 OP1))
16488 (set TARGET (not TMP))
16490 PTRUE is an all-true predicate with the same mode as TARGET. */
16493 aarch64_emit_sve_inverted_cond (rtx target
, rtx ptrue
, rtx_code code
,
16496 machine_mode pred_mode
= GET_MODE (ptrue
);
16497 rtx tmp
= gen_reg_rtx (pred_mode
);
16498 aarch64_emit_sve_ptrue_op (tmp
, ptrue
,
16499 gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
));
16500 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
16503 /* Expand an SVE floating-point comparison using the SVE equivalent of:
16505 (set TARGET (CODE OP0 OP1))
16507 If CAN_INVERT_P is true, the caller can also handle inverted results;
16508 return true if the result is in fact inverted. */
16511 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
16512 rtx op0
, rtx op1
, bool can_invert_p
)
16514 machine_mode pred_mode
= GET_MODE (target
);
16515 machine_mode data_mode
= GET_MODE (op0
);
16517 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
16521 /* UNORDERED has no immediate form. */
16522 op1
= force_reg (data_mode
, op1
);
16531 /* There is native support for the comparison. */
16532 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
16533 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
16538 /* This is a trapping operation (LT or GT). */
16539 aarch64_emit_sve_or_conds (target
, LT
, GT
, ptrue
, op0
, op1
);
16543 if (!flag_trapping_math
)
16545 /* This would trap for signaling NaNs. */
16546 op1
= force_reg (data_mode
, op1
);
16547 aarch64_emit_sve_or_conds (target
, UNORDERED
, EQ
, ptrue
, op0
, op1
);
16555 if (flag_trapping_math
)
16557 /* Work out which elements are ordered. */
16558 rtx ordered
= gen_reg_rtx (pred_mode
);
16559 op1
= force_reg (data_mode
, op1
);
16560 aarch64_emit_sve_inverted_cond (ordered
, ptrue
, UNORDERED
, op0
, op1
);
16562 /* Test the opposite condition for the ordered elements,
16563 then invert the result. */
16567 code
= reverse_condition_maybe_unordered (code
);
16570 aarch64_emit_sve_predicated_cond (target
, code
,
16571 ordered
, op0
, op1
);
16574 rtx tmp
= gen_reg_rtx (pred_mode
);
16575 aarch64_emit_sve_predicated_cond (tmp
, code
, ordered
, op0
, op1
);
16576 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
16582 /* ORDERED has no immediate form. */
16583 op1
= force_reg (data_mode
, op1
);
16587 gcc_unreachable ();
16590 /* There is native support for the inverse comparison. */
16591 code
= reverse_condition_maybe_unordered (code
);
16594 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
16595 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
16598 aarch64_emit_sve_inverted_cond (target
, ptrue
, code
, op0
, op1
);
16602 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16603 of the data being selected and CMP_MODE is the mode of the values being
16607 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
16610 machine_mode pred_mode
16611 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode
),
16612 GET_MODE_SIZE (cmp_mode
)).require ();
16613 rtx pred
= gen_reg_rtx (pred_mode
);
16614 if (FLOAT_MODE_P (cmp_mode
))
16616 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
16617 ops
[4], ops
[5], true))
16618 std::swap (ops
[1], ops
[2]);
16621 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
16623 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
16624 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
16627 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16628 true. However due to issues with register allocation it is preferable
16629 to avoid tieing integer scalar and FP scalar modes. Executing integer
16630 operations in general registers is better than treating them as scalar
16631 vector operations. This reduces latency and avoids redundant int<->FP
16632 moves. So tie modes if they are either the same class, or vector modes
16633 with other vector modes, vector structs or any scalar mode. */
16636 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
16638 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
16641 /* We specifically want to allow elements of "structure" modes to
16642 be tieable to the structure. This more general condition allows
16643 other rarer situations too. The reason we don't extend this to
16644 predicate modes is that there are no predicate structure modes
16645 nor any specific instructions for extracting part of a predicate
16647 if (aarch64_vector_data_mode_p (mode1
)
16648 && aarch64_vector_data_mode_p (mode2
))
16651 /* Also allow any scalar modes with vectors. */
16652 if (aarch64_vector_mode_supported_p (mode1
)
16653 || aarch64_vector_mode_supported_p (mode2
))
16659 /* Return a new RTX holding the result of moving POINTER forward by
16663 aarch64_move_pointer (rtx pointer
, poly_int64 amount
)
16665 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
16667 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
16671 /* Return a new RTX holding the result of moving POINTER forward by the
16672 size of the mode it points to. */
16675 aarch64_progress_pointer (rtx pointer
)
16677 return aarch64_move_pointer (pointer
, GET_MODE_SIZE (GET_MODE (pointer
)));
16680 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16684 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
16687 rtx reg
= gen_reg_rtx (mode
);
16689 /* "Cast" the pointers to the correct mode. */
16690 *src
= adjust_address (*src
, mode
, 0);
16691 *dst
= adjust_address (*dst
, mode
, 0);
16692 /* Emit the memcpy. */
16693 emit_move_insn (reg
, *src
);
16694 emit_move_insn (*dst
, reg
);
16695 /* Move the pointers forward. */
16696 *src
= aarch64_progress_pointer (*src
);
16697 *dst
= aarch64_progress_pointer (*dst
);
16700 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16701 we succeed, otherwise return false. */
16704 aarch64_expand_movmem (rtx
*operands
)
16707 rtx dst
= operands
[0];
16708 rtx src
= operands
[1];
16710 machine_mode cur_mode
= BLKmode
, next_mode
;
16711 bool speed_p
= !optimize_function_for_size_p (cfun
);
16713 /* When optimizing for size, give a better estimate of the length of a
16714 memcpy call, but use the default otherwise. Moves larger than 8 bytes
16715 will always require an even number of instructions to do now. And each
16716 operation requires both a load+store, so devide the max number by 2. */
16717 int max_num_moves
= (speed_p
? 16 : AARCH64_CALL_RATIO
) / 2;
16719 /* We can't do anything smart if the amount to copy is not constant. */
16720 if (!CONST_INT_P (operands
[2]))
16723 n
= INTVAL (operands
[2]);
16725 /* Try to keep the number of instructions low. For all cases we will do at
16726 most two moves for the residual amount, since we'll always overlap the
16728 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_num_moves
)
16731 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
16732 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
16734 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
16735 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
16737 /* Convert n to bits to make the rest of the code simpler. */
16738 n
= n
* BITS_PER_UNIT
;
16740 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
16741 larger than TImode, but we should not use them for loads/stores here. */
16742 const int copy_limit
= GET_MODE_BITSIZE (TImode
);
16746 /* Find the largest mode in which to do the copy in without over reading
16748 opt_scalar_int_mode mode_iter
;
16749 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
16750 if (GET_MODE_BITSIZE (mode_iter
.require ()) <= MIN (n
, copy_limit
))
16751 cur_mode
= mode_iter
.require ();
16753 gcc_assert (cur_mode
!= BLKmode
);
16755 mode_bits
= GET_MODE_BITSIZE (cur_mode
).to_constant ();
16756 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, cur_mode
);
16760 /* Do certain trailing copies as overlapping if it's going to be
16761 cheaper. i.e. less instructions to do so. For instance doing a 15
16762 byte copy it's more efficient to do two overlapping 8 byte copies than
16764 if (n
> 0 && n
<= 8 * BITS_PER_UNIT
)
16766 next_mode
= smallest_mode_for_size (n
, MODE_INT
);
16767 int n_bits
= GET_MODE_BITSIZE (next_mode
).to_constant ();
16768 src
= aarch64_move_pointer (src
, (n
- n_bits
) / BITS_PER_UNIT
);
16769 dst
= aarch64_move_pointer (dst
, (n
- n_bits
) / BITS_PER_UNIT
);
16777 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16778 SImode stores. Handle the case when the constant has identical
16779 bottom and top halves. This is beneficial when the two stores can be
16780 merged into an STP and we avoid synthesising potentially expensive
16781 immediates twice. Return true if such a split is possible. */
16784 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
16786 rtx lo
= gen_lowpart (SImode
, src
);
16787 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
16789 bool size_p
= optimize_function_for_size_p (cfun
);
16791 if (!rtx_equal_p (lo
, hi
))
16794 unsigned int orig_cost
16795 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
16796 unsigned int lo_cost
16797 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
16799 /* We want to transform:
16801 MOVK x1, 0x140, lsl 16
16802 MOVK x1, 0xc0da, lsl 32
16803 MOVK x1, 0x140, lsl 48
16807 MOVK w1, 0x140, lsl 16
16809 So we want to perform this only when we save two instructions
16810 or more. When optimizing for size, however, accept any code size
16812 if (size_p
&& orig_cost
<= lo_cost
)
16816 && (orig_cost
<= lo_cost
+ 1))
16819 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
16820 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
16823 rtx tmp_reg
= gen_reg_rtx (SImode
);
16824 aarch64_expand_mov_immediate (tmp_reg
, lo
);
16825 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
16826 /* Don't emit an explicit store pair as this may not be always profitable.
16827 Let the sched-fusion logic decide whether to merge them. */
16828 emit_move_insn (mem_lo
, tmp_reg
);
16829 emit_move_insn (mem_hi
, tmp_reg
);
16834 /* Generate RTL for a conditional branch with rtx comparison CODE in
16835 mode CC_MODE. The destination of the unlikely conditional branch
16839 aarch64_gen_unlikely_cbranch (enum rtx_code code
, machine_mode cc_mode
,
16843 x
= gen_rtx_fmt_ee (code
, VOIDmode
,
16844 gen_rtx_REG (cc_mode
, CC_REGNUM
),
16847 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16848 gen_rtx_LABEL_REF (VOIDmode
, label_ref
),
16850 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
16853 /* Generate DImode scratch registers for 128-bit (TImode) addition.
16855 OP1 represents the TImode destination operand 1
16856 OP2 represents the TImode destination operand 2
16857 LOW_DEST represents the low half (DImode) of TImode operand 0
16858 LOW_IN1 represents the low half (DImode) of TImode operand 1
16859 LOW_IN2 represents the low half (DImode) of TImode operand 2
16860 HIGH_DEST represents the high half (DImode) of TImode operand 0
16861 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16862 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16865 aarch64_addti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
16866 rtx
*low_in1
, rtx
*low_in2
,
16867 rtx
*high_dest
, rtx
*high_in1
,
16870 *low_dest
= gen_reg_rtx (DImode
);
16871 *low_in1
= gen_lowpart (DImode
, op1
);
16872 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
16873 subreg_lowpart_offset (DImode
, TImode
));
16874 *high_dest
= gen_reg_rtx (DImode
);
16875 *high_in1
= gen_highpart (DImode
, op1
);
16876 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
16877 subreg_highpart_offset (DImode
, TImode
));
16880 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
16882 This function differs from 'arch64_addti_scratch_regs' in that
16883 OP1 can be an immediate constant (zero). We must call
16884 subreg_highpart_offset with DImode and TImode arguments, otherwise
16885 VOIDmode will be used for the const_int which generates an internal
16886 error from subreg_size_highpart_offset which does not expect a size of zero.
16888 OP1 represents the TImode destination operand 1
16889 OP2 represents the TImode destination operand 2
16890 LOW_DEST represents the low half (DImode) of TImode operand 0
16891 LOW_IN1 represents the low half (DImode) of TImode operand 1
16892 LOW_IN2 represents the low half (DImode) of TImode operand 2
16893 HIGH_DEST represents the high half (DImode) of TImode operand 0
16894 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16895 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16899 aarch64_subvti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
16900 rtx
*low_in1
, rtx
*low_in2
,
16901 rtx
*high_dest
, rtx
*high_in1
,
16904 *low_dest
= gen_reg_rtx (DImode
);
16905 *low_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
16906 subreg_lowpart_offset (DImode
, TImode
));
16908 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
16909 subreg_lowpart_offset (DImode
, TImode
));
16910 *high_dest
= gen_reg_rtx (DImode
);
16912 *high_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
16913 subreg_highpart_offset (DImode
, TImode
));
16914 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
16915 subreg_highpart_offset (DImode
, TImode
));
16918 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
16920 OP0 represents the TImode destination operand 0
16921 LOW_DEST represents the low half (DImode) of TImode operand 0
16922 LOW_IN1 represents the low half (DImode) of TImode operand 1
16923 LOW_IN2 represents the low half (DImode) of TImode operand 2
16924 HIGH_DEST represents the high half (DImode) of TImode operand 0
16925 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16926 HIGH_IN2 represents the high half (DImode) of TImode operand 2
16927 UNSIGNED_P is true if the operation is being performed on unsigned
16930 aarch64_expand_subvti (rtx op0
, rtx low_dest
, rtx low_in1
,
16931 rtx low_in2
, rtx high_dest
, rtx high_in1
,
16932 rtx high_in2
, bool unsigned_p
)
16934 if (low_in2
== const0_rtx
)
16936 low_dest
= low_in1
;
16937 high_in2
= force_reg (DImode
, high_in2
);
16939 emit_insn (gen_subdi3_compare1 (high_dest
, high_in1
, high_in2
));
16941 emit_insn (gen_subvdi_insn (high_dest
, high_in1
, high_in2
));
16945 if (CONST_INT_P (low_in2
))
16947 high_in2
= force_reg (DImode
, high_in2
);
16948 emit_insn (gen_subdi3_compare1_imm (low_dest
, low_in1
, low_in2
,
16949 GEN_INT (-INTVAL (low_in2
))));
16952 emit_insn (gen_subdi3_compare1 (low_dest
, low_in1
, low_in2
));
16955 emit_insn (gen_usubdi3_carryinC (high_dest
, high_in1
, high_in2
));
16957 emit_insn (gen_subdi3_carryinV (high_dest
, high_in1
, high_in2
));
16960 emit_move_insn (gen_lowpart (DImode
, op0
), low_dest
);
16961 emit_move_insn (gen_highpart (DImode
, op0
), high_dest
);
16965 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16967 static unsigned HOST_WIDE_INT
16968 aarch64_asan_shadow_offset (void)
16970 return (HOST_WIDE_INT_1
<< 36);
16974 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
16975 int code
, tree treeop0
, tree treeop1
)
16977 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
16979 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
16981 struct expand_operand ops
[4];
16984 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
16986 op_mode
= GET_MODE (op0
);
16987 if (op_mode
== VOIDmode
)
16988 op_mode
= GET_MODE (op1
);
16996 icode
= CODE_FOR_cmpsi
;
17001 icode
= CODE_FOR_cmpdi
;
17006 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
17007 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
17012 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
17013 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
17021 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
17022 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
17028 *prep_seq
= get_insns ();
17031 create_fixed_operand (&ops
[0], op0
);
17032 create_fixed_operand (&ops
[1], op1
);
17035 if (!maybe_expand_insn (icode
, 2, ops
))
17040 *gen_seq
= get_insns ();
17043 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
17044 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
17048 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
17049 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
17051 rtx op0
, op1
, target
;
17052 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
17053 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
17055 struct expand_operand ops
[6];
17058 push_to_sequence (*prep_seq
);
17059 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
17061 op_mode
= GET_MODE (op0
);
17062 if (op_mode
== VOIDmode
)
17063 op_mode
= GET_MODE (op1
);
17071 icode
= CODE_FOR_ccmpsi
;
17076 icode
= CODE_FOR_ccmpdi
;
17081 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
17082 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
17087 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
17088 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
17096 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
17097 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
17103 *prep_seq
= get_insns ();
17106 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
17107 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
17109 if (bit_code
!= AND
)
17111 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
17112 GET_MODE (XEXP (prev
, 0))),
17113 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
17114 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
17117 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
17118 create_fixed_operand (&ops
[1], target
);
17119 create_fixed_operand (&ops
[2], op0
);
17120 create_fixed_operand (&ops
[3], op1
);
17121 create_fixed_operand (&ops
[4], prev
);
17122 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
17124 push_to_sequence (*gen_seq
);
17125 if (!maybe_expand_insn (icode
, 6, ops
))
17131 *gen_seq
= get_insns ();
17134 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
17137 #undef TARGET_GEN_CCMP_FIRST
17138 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17140 #undef TARGET_GEN_CCMP_NEXT
17141 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17143 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
17144 instruction fusion of some sort. */
17147 aarch64_macro_fusion_p (void)
17149 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
17153 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17154 should be kept together during scheduling. */
17157 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
17160 rtx prev_set
= single_set (prev
);
17161 rtx curr_set
= single_set (curr
);
17162 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
17163 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
17165 if (!aarch64_macro_fusion_p ())
17168 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
17170 /* We are trying to match:
17171 prev (mov) == (set (reg r0) (const_int imm16))
17172 curr (movk) == (set (zero_extract (reg r0)
17175 (const_int imm16_1)) */
17177 set_dest
= SET_DEST (curr_set
);
17179 if (GET_CODE (set_dest
) == ZERO_EXTRACT
17180 && CONST_INT_P (SET_SRC (curr_set
))
17181 && CONST_INT_P (SET_SRC (prev_set
))
17182 && CONST_INT_P (XEXP (set_dest
, 2))
17183 && INTVAL (XEXP (set_dest
, 2)) == 16
17184 && REG_P (XEXP (set_dest
, 0))
17185 && REG_P (SET_DEST (prev_set
))
17186 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
17192 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
17195 /* We're trying to match:
17196 prev (adrp) == (set (reg r1)
17197 (high (symbol_ref ("SYM"))))
17198 curr (add) == (set (reg r0)
17200 (symbol_ref ("SYM"))))
17201 Note that r0 need not necessarily be the same as r1, especially
17202 during pre-regalloc scheduling. */
17204 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
17205 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
17207 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
17208 && REG_P (XEXP (SET_SRC (curr_set
), 0))
17209 && REGNO (XEXP (SET_SRC (curr_set
), 0))
17210 == REGNO (SET_DEST (prev_set
))
17211 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
17212 XEXP (SET_SRC (curr_set
), 1)))
17217 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
17220 /* We're trying to match:
17221 prev (movk) == (set (zero_extract (reg r0)
17224 (const_int imm16_1))
17225 curr (movk) == (set (zero_extract (reg r0)
17228 (const_int imm16_2)) */
17230 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
17231 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
17232 && REG_P (XEXP (SET_DEST (prev_set
), 0))
17233 && REG_P (XEXP (SET_DEST (curr_set
), 0))
17234 && REGNO (XEXP (SET_DEST (prev_set
), 0))
17235 == REGNO (XEXP (SET_DEST (curr_set
), 0))
17236 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
17237 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
17238 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
17239 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
17240 && CONST_INT_P (SET_SRC (prev_set
))
17241 && CONST_INT_P (SET_SRC (curr_set
)))
17245 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
17247 /* We're trying to match:
17248 prev (adrp) == (set (reg r0)
17249 (high (symbol_ref ("SYM"))))
17250 curr (ldr) == (set (reg r1)
17251 (mem (lo_sum (reg r0)
17252 (symbol_ref ("SYM")))))
17254 curr (ldr) == (set (reg r1)
17257 (symbol_ref ("SYM")))))) */
17258 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
17259 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
17261 rtx curr_src
= SET_SRC (curr_set
);
17263 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
17264 curr_src
= XEXP (curr_src
, 0);
17266 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
17267 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
17268 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
17269 == REGNO (SET_DEST (prev_set
))
17270 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
17271 XEXP (SET_SRC (prev_set
), 0)))
17276 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC
)
17277 && aarch_crypto_can_dual_issue (prev
, curr
))
17280 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
17281 && any_condjump_p (curr
))
17283 unsigned int condreg1
, condreg2
;
17285 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
17286 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
17288 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
17290 && modified_in_p (cc_reg_1
, prev
))
17292 enum attr_type prev_type
= get_attr_type (prev
);
17294 /* FIXME: this misses some which is considered simple arthematic
17295 instructions for ThunderX. Simple shifts are missed here. */
17296 if (prev_type
== TYPE_ALUS_SREG
17297 || prev_type
== TYPE_ALUS_IMM
17298 || prev_type
== TYPE_LOGICS_REG
17299 || prev_type
== TYPE_LOGICS_IMM
)
17306 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
17307 && any_condjump_p (curr
))
17309 /* We're trying to match:
17310 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17311 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
17313 (label_ref ("SYM"))
17315 if (SET_DEST (curr_set
) == (pc_rtx
)
17316 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
17317 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
17318 && REG_P (SET_DEST (prev_set
))
17319 && REGNO (SET_DEST (prev_set
))
17320 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
17322 /* Fuse ALU operations followed by conditional branch instruction. */
17323 switch (get_attr_type (prev
))
17326 case TYPE_ALU_SREG
:
17329 case TYPE_ADCS_REG
:
17330 case TYPE_ADCS_IMM
:
17331 case TYPE_LOGIC_REG
:
17332 case TYPE_LOGIC_IMM
:
17336 case TYPE_SHIFT_REG
:
17337 case TYPE_SHIFT_IMM
:
17352 /* Return true iff the instruction fusion described by OP is enabled. */
17355 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op
)
17357 return (aarch64_tune_params
.fusible_ops
& op
) != 0;
17360 /* If MEM is in the form of [base+offset], extract the two parts
17361 of address and set to BASE and OFFSET, otherwise return false
17362 after clearing BASE and OFFSET. */
17365 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
17369 gcc_assert (MEM_P (mem
));
17371 addr
= XEXP (mem
, 0);
17376 *offset
= const0_rtx
;
17380 if (GET_CODE (addr
) == PLUS
17381 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
17383 *base
= XEXP (addr
, 0);
17384 *offset
= XEXP (addr
, 1);
17389 *offset
= NULL_RTX
;
17394 /* Types for scheduling fusion. */
17395 enum sched_fusion_type
17397 SCHED_FUSION_NONE
= 0,
17398 SCHED_FUSION_LD_SIGN_EXTEND
,
17399 SCHED_FUSION_LD_ZERO_EXTEND
,
17405 /* If INSN is a load or store of address in the form of [base+offset],
17406 extract the two parts and set to BASE and OFFSET. Return scheduling
17407 fusion type this INSN is. */
17409 static enum sched_fusion_type
17410 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
17413 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
17415 gcc_assert (INSN_P (insn
));
17416 x
= PATTERN (insn
);
17417 if (GET_CODE (x
) != SET
)
17418 return SCHED_FUSION_NONE
;
17421 dest
= SET_DEST (x
);
17423 machine_mode dest_mode
= GET_MODE (dest
);
17425 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
17426 return SCHED_FUSION_NONE
;
17428 if (GET_CODE (src
) == SIGN_EXTEND
)
17430 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
17431 src
= XEXP (src
, 0);
17432 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
17433 return SCHED_FUSION_NONE
;
17435 else if (GET_CODE (src
) == ZERO_EXTEND
)
17437 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
17438 src
= XEXP (src
, 0);
17439 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
17440 return SCHED_FUSION_NONE
;
17443 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
17444 extract_base_offset_in_addr (src
, base
, offset
);
17445 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
17447 fusion
= SCHED_FUSION_ST
;
17448 extract_base_offset_in_addr (dest
, base
, offset
);
17451 return SCHED_FUSION_NONE
;
17453 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
17454 fusion
= SCHED_FUSION_NONE
;
17459 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
17461 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
17462 and PRI are only calculated for these instructions. For other instruction,
17463 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
17464 type instruction fusion can be added by returning different priorities.
17466 It's important that irrelevant instructions get the largest FUSION_PRI. */
17469 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
17470 int *fusion_pri
, int *pri
)
17474 enum sched_fusion_type fusion
;
17476 gcc_assert (INSN_P (insn
));
17479 fusion
= fusion_load_store (insn
, &base
, &offset
);
17480 if (fusion
== SCHED_FUSION_NONE
)
17487 /* Set FUSION_PRI according to fusion type and base register. */
17488 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
17490 /* Calculate PRI. */
17493 /* INSN with smaller offset goes first. */
17494 off_val
= (int)(INTVAL (offset
));
17496 tmp
-= (off_val
& 0xfffff);
17498 tmp
+= ((- off_val
) & 0xfffff);
17504 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
17505 Adjust priority of sha1h instructions so they are scheduled before
17506 other SHA1 instructions. */
17509 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
17511 rtx x
= PATTERN (insn
);
17513 if (GET_CODE (x
) == SET
)
17517 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
17518 return priority
+ 10;
17524 /* Given OPERANDS of consecutive load/store, check if we can merge
17525 them into ldp/stp. LOAD is true if they are load instructions.
17526 MODE is the mode of memory operands. */
17529 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
17532 HOST_WIDE_INT offval_1
, offval_2
, msize
;
17533 enum reg_class rclass_1
, rclass_2
;
17534 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
17538 mem_1
= operands
[1];
17539 mem_2
= operands
[3];
17540 reg_1
= operands
[0];
17541 reg_2
= operands
[2];
17542 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
17543 if (REGNO (reg_1
) == REGNO (reg_2
))
17548 mem_1
= operands
[0];
17549 mem_2
= operands
[2];
17550 reg_1
= operands
[1];
17551 reg_2
= operands
[3];
17554 /* The mems cannot be volatile. */
17555 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
17558 /* If we have SImode and slow unaligned ldp,
17559 check the alignment to be at least 8 byte. */
17561 && (aarch64_tune_params
.extra_tuning_flags
17562 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
17564 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
17567 /* Check if the addresses are in the form of [base+offset]. */
17568 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
17569 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
17571 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
17572 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
17575 /* Check if the bases are same. */
17576 if (!rtx_equal_p (base_1
, base_2
))
17579 /* The operands must be of the same size. */
17580 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
17581 GET_MODE_SIZE (GET_MODE (mem_2
))));
17583 offval_1
= INTVAL (offset_1
);
17584 offval_2
= INTVAL (offset_2
);
17585 /* We should only be trying this for fixed-sized modes. There is no
17586 SVE LDP/STP instruction. */
17587 msize
= GET_MODE_SIZE (mode
).to_constant ();
17588 /* Check if the offsets are consecutive. */
17589 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
17592 /* Check if the addresses are clobbered by load. */
17595 if (reg_mentioned_p (reg_1
, mem_1
))
17598 /* In increasing order, the last load can clobber the address. */
17599 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
17603 /* One of the memory accesses must be a mempair operand.
17604 If it is not the first one, they need to be swapped by the
17606 if (!aarch64_mem_pair_operand (mem_1
, GET_MODE (mem_1
))
17607 && !aarch64_mem_pair_operand (mem_2
, GET_MODE (mem_2
)))
17610 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
17611 rclass_1
= FP_REGS
;
17613 rclass_1
= GENERAL_REGS
;
17615 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
17616 rclass_2
= FP_REGS
;
17618 rclass_2
= GENERAL_REGS
;
17620 /* Check if the registers are of same class. */
17621 if (rclass_1
!= rclass_2
)
17627 /* Given OPERANDS of consecutive load/store that can be merged,
17628 swap them if they are not in ascending order. */
17630 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
17632 rtx mem_1
, mem_2
, base_1
, base_2
, offset_1
, offset_2
;
17633 HOST_WIDE_INT offval_1
, offval_2
;
17637 mem_1
= operands
[1];
17638 mem_2
= operands
[3];
17642 mem_1
= operands
[0];
17643 mem_2
= operands
[2];
17646 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
17647 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
17649 offval_1
= INTVAL (offset_1
);
17650 offval_2
= INTVAL (offset_2
);
17652 if (offval_1
> offval_2
)
17654 /* Irrespective of whether this is a load or a store,
17655 we do the same swap. */
17656 std::swap (operands
[0], operands
[2]);
17657 std::swap (operands
[1], operands
[3]);
17661 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17662 comparison between the two. */
17664 aarch64_host_wide_int_compare (const void *x
, const void *y
)
17666 return wi::cmps (* ((const HOST_WIDE_INT
*) x
),
17667 * ((const HOST_WIDE_INT
*) y
));
17670 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
17671 other pointing to a REG rtx containing an offset, compare the offsets
17676 1 iff offset (X) > offset (Y)
17677 0 iff offset (X) == offset (Y)
17678 -1 iff offset (X) < offset (Y) */
17680 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
17682 const rtx
* operands_1
= (const rtx
*) x
;
17683 const rtx
* operands_2
= (const rtx
*) y
;
17684 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
17686 if (MEM_P (operands_1
[0]))
17687 mem_1
= operands_1
[0];
17689 mem_1
= operands_1
[1];
17691 if (MEM_P (operands_2
[0]))
17692 mem_2
= operands_2
[0];
17694 mem_2
= operands_2
[1];
17696 /* Extract the offsets. */
17697 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
17698 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
17700 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
17702 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
17705 /* Given OPERANDS of consecutive load/store, check if we can merge
17706 them into ldp/stp by adjusting the offset. LOAD is true if they
17707 are load instructions. MODE is the mode of memory operands.
17709 Given below consecutive stores:
17711 str w1, [xb, 0x100]
17712 str w1, [xb, 0x104]
17713 str w1, [xb, 0x108]
17714 str w1, [xb, 0x10c]
17716 Though the offsets are out of the range supported by stp, we can
17717 still pair them after adjusting the offset, like:
17719 add scratch, xb, 0x100
17720 stp w1, w1, [scratch]
17721 stp w1, w1, [scratch, 0x8]
17723 The peephole patterns detecting this opportunity should guarantee
17724 the scratch register is avaliable. */
17727 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
17730 const int num_insns
= 4;
17731 enum reg_class rclass
;
17732 HOST_WIDE_INT offvals
[num_insns
], msize
;
17733 rtx mem
[num_insns
], reg
[num_insns
], base
[num_insns
], offset
[num_insns
];
17737 for (int i
= 0; i
< num_insns
; i
++)
17739 reg
[i
] = operands
[2 * i
];
17740 mem
[i
] = operands
[2 * i
+ 1];
17742 gcc_assert (REG_P (reg
[i
]));
17745 /* Do not attempt to merge the loads if the loads clobber each other. */
17746 for (int i
= 0; i
< 8; i
+= 2)
17747 for (int j
= i
+ 2; j
< 8; j
+= 2)
17748 if (reg_overlap_mentioned_p (operands
[i
], operands
[j
]))
17752 for (int i
= 0; i
< num_insns
; i
++)
17754 mem
[i
] = operands
[2 * i
];
17755 reg
[i
] = operands
[2 * i
+ 1];
17758 /* Skip if memory operand is by itself valid for ldp/stp. */
17759 if (!MEM_P (mem
[0]) || aarch64_mem_pair_operand (mem
[0], mode
))
17762 for (int i
= 0; i
< num_insns
; i
++)
17764 /* The mems cannot be volatile. */
17765 if (MEM_VOLATILE_P (mem
[i
]))
17768 /* Check if the addresses are in the form of [base+offset]. */
17769 extract_base_offset_in_addr (mem
[i
], base
+ i
, offset
+ i
);
17770 if (base
[i
] == NULL_RTX
|| offset
[i
] == NULL_RTX
)
17774 /* Check if the registers are of same class. */
17775 rclass
= REG_P (reg
[0]) && FP_REGNUM_P (REGNO (reg
[0]))
17776 ? FP_REGS
: GENERAL_REGS
;
17778 for (int i
= 1; i
< num_insns
; i
++)
17779 if (REG_P (reg
[i
]) && FP_REGNUM_P (REGNO (reg
[i
])))
17781 if (rclass
!= FP_REGS
)
17786 if (rclass
!= GENERAL_REGS
)
17790 /* Only the last register in the order in which they occur
17791 may be clobbered by the load. */
17792 if (rclass
== GENERAL_REGS
&& load
)
17793 for (int i
= 0; i
< num_insns
- 1; i
++)
17794 if (reg_mentioned_p (reg
[i
], mem
[i
]))
17797 /* Check if the bases are same. */
17798 for (int i
= 0; i
< num_insns
- 1; i
++)
17799 if (!rtx_equal_p (base
[i
], base
[i
+ 1]))
17802 for (int i
= 0; i
< num_insns
; i
++)
17803 offvals
[i
] = INTVAL (offset
[i
]);
17805 msize
= GET_MODE_SIZE (mode
);
17807 /* Check if the offsets can be put in the right order to do a ldp/stp. */
17808 qsort (offvals
, num_insns
, sizeof (HOST_WIDE_INT
),
17809 aarch64_host_wide_int_compare
);
17811 if (!(offvals
[1] == offvals
[0] + msize
17812 && offvals
[3] == offvals
[2] + msize
))
17815 /* Check that offsets are within range of each other. The ldp/stp
17816 instructions have 7 bit immediate offsets, so use 0x80. */
17817 if (offvals
[2] - offvals
[0] >= msize
* 0x80)
17820 /* The offsets must be aligned with respect to each other. */
17821 if (offvals
[0] % msize
!= offvals
[2] % msize
)
17824 /* If we have SImode and slow unaligned ldp,
17825 check the alignment to be at least 8 byte. */
17827 && (aarch64_tune_params
.extra_tuning_flags
17828 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
17830 && MEM_ALIGN (mem
[0]) < 8 * BITS_PER_UNIT
)
17836 /* Given OPERANDS of consecutive load/store, this function pairs them
17837 into LDP/STP after adjusting the offset. It depends on the fact
17838 that the operands can be sorted so the offsets are correct for STP.
17839 MODE is the mode of memory operands. CODE is the rtl operator
17840 which should be applied to all memory operands, it's SIGN_EXTEND,
17841 ZERO_EXTEND or UNKNOWN. */
17844 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
17845 scalar_mode mode
, RTX_CODE code
)
17847 rtx base
, offset_1
, offset_3
, t1
, t2
;
17848 rtx mem_1
, mem_2
, mem_3
, mem_4
;
17849 rtx temp_operands
[8];
17850 HOST_WIDE_INT off_val_1
, off_val_3
, base_off
, new_off_1
, new_off_3
,
17851 stp_off_upper_limit
, stp_off_lower_limit
, msize
;
17853 /* We make changes on a copy as we may still bail out. */
17854 for (int i
= 0; i
< 8; i
++)
17855 temp_operands
[i
] = operands
[i
];
17857 /* Sort the operands. */
17858 qsort (temp_operands
, 4, 2 * sizeof (rtx
*), aarch64_ldrstr_offset_compare
);
17862 mem_1
= temp_operands
[1];
17863 mem_2
= temp_operands
[3];
17864 mem_3
= temp_operands
[5];
17865 mem_4
= temp_operands
[7];
17869 mem_1
= temp_operands
[0];
17870 mem_2
= temp_operands
[2];
17871 mem_3
= temp_operands
[4];
17872 mem_4
= temp_operands
[6];
17873 gcc_assert (code
== UNKNOWN
);
17876 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
17877 extract_base_offset_in_addr (mem_3
, &base
, &offset_3
);
17878 gcc_assert (base
!= NULL_RTX
&& offset_1
!= NULL_RTX
17879 && offset_3
!= NULL_RTX
);
17881 /* Adjust offset so it can fit in LDP/STP instruction. */
17882 msize
= GET_MODE_SIZE (mode
);
17883 stp_off_upper_limit
= msize
* (0x40 - 1);
17884 stp_off_lower_limit
= - msize
* 0x40;
17886 off_val_1
= INTVAL (offset_1
);
17887 off_val_3
= INTVAL (offset_3
);
17889 /* The base offset is optimally half way between the two STP/LDP offsets. */
17891 base_off
= (off_val_1
+ off_val_3
) / 2;
17893 /* However, due to issues with negative LDP/STP offset generation for
17894 larger modes, for DF, DI and vector modes. we must not use negative
17895 addresses smaller than 9 signed unadjusted bits can store. This
17896 provides the most range in this case. */
17897 base_off
= off_val_1
;
17899 /* Adjust the base so that it is aligned with the addresses but still
17901 if (base_off
% msize
!= off_val_1
% msize
)
17902 /* Fix the offset, bearing in mind we want to make it bigger not
17904 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
17905 else if (msize
<= 4)
17906 /* The negative range of LDP/STP is one larger than the positive range. */
17909 /* Check if base offset is too big or too small. We can attempt to resolve
17910 this issue by setting it to the maximum value and seeing if the offsets
17912 if (base_off
>= 0x1000)
17914 base_off
= 0x1000 - 1;
17915 /* We must still make sure that the base offset is aligned with respect
17916 to the address. But it may may not be made any bigger. */
17917 base_off
-= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
17920 /* Likewise for the case where the base is too small. */
17921 if (base_off
<= -0x1000)
17923 base_off
= -0x1000 + 1;
17924 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
17927 /* Offset of the first STP/LDP. */
17928 new_off_1
= off_val_1
- base_off
;
17930 /* Offset of the second STP/LDP. */
17931 new_off_3
= off_val_3
- base_off
;
17933 /* The offsets must be within the range of the LDP/STP instructions. */
17934 if (new_off_1
> stp_off_upper_limit
|| new_off_1
< stp_off_lower_limit
17935 || new_off_3
> stp_off_upper_limit
|| new_off_3
< stp_off_lower_limit
)
17938 replace_equiv_address_nv (mem_1
, plus_constant (Pmode
, operands
[8],
17940 replace_equiv_address_nv (mem_2
, plus_constant (Pmode
, operands
[8],
17941 new_off_1
+ msize
), true);
17942 replace_equiv_address_nv (mem_3
, plus_constant (Pmode
, operands
[8],
17944 replace_equiv_address_nv (mem_4
, plus_constant (Pmode
, operands
[8],
17945 new_off_3
+ msize
), true);
17947 if (!aarch64_mem_pair_operand (mem_1
, mode
)
17948 || !aarch64_mem_pair_operand (mem_3
, mode
))
17951 if (code
== ZERO_EXTEND
)
17953 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
17954 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
17955 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
17956 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
17958 else if (code
== SIGN_EXTEND
)
17960 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
17961 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
17962 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
17963 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
17968 operands
[0] = temp_operands
[0];
17969 operands
[1] = mem_1
;
17970 operands
[2] = temp_operands
[2];
17971 operands
[3] = mem_2
;
17972 operands
[4] = temp_operands
[4];
17973 operands
[5] = mem_3
;
17974 operands
[6] = temp_operands
[6];
17975 operands
[7] = mem_4
;
17979 operands
[0] = mem_1
;
17980 operands
[1] = temp_operands
[1];
17981 operands
[2] = mem_2
;
17982 operands
[3] = temp_operands
[3];
17983 operands
[4] = mem_3
;
17984 operands
[5] = temp_operands
[5];
17985 operands
[6] = mem_4
;
17986 operands
[7] = temp_operands
[7];
17989 /* Emit adjusting instruction. */
17990 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, base_off
)));
17991 /* Emit ldp/stp instructions. */
17992 t1
= gen_rtx_SET (operands
[0], operands
[1]);
17993 t2
= gen_rtx_SET (operands
[2], operands
[3]);
17994 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
17995 t1
= gen_rtx_SET (operands
[4], operands
[5]);
17996 t2
= gen_rtx_SET (operands
[6], operands
[7]);
17997 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
   it isn't worth branching around empty masked ops (including masked
   stores).  */

static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}
18011 /* Return 1 if pseudo register should be created and used to hold
18012 GOT address for PIC code. */
18015 aarch64_use_pseudo_pic_reg (void)
18017 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
;
18020 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18023 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
18025 switch (XINT (x
, 1))
18027 case UNSPEC_GOTSMALLPIC
:
18028 case UNSPEC_GOTSMALLPIC28K
:
18029 case UNSPEC_GOTTINYPIC
:
18035 return default_unspec_may_trap_p (x
, flags
);
18039 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18040 return the log2 of that value. Otherwise return -1. */
18043 aarch64_fpconst_pow_of_2 (rtx x
)
18045 const REAL_VALUE_TYPE
*r
;
18047 if (!CONST_DOUBLE_P (x
))
18050 r
= CONST_DOUBLE_REAL_VALUE (x
);
18052 if (REAL_VALUE_NEGATIVE (*r
)
18053 || REAL_VALUE_ISNAN (*r
)
18054 || REAL_VALUE_ISINF (*r
)
18055 || !real_isinteger (r
, DFmode
))
18058 return exact_log2 (real_to_integer (r
));
18061 /* If X is a vector of equal CONST_DOUBLE values and that value is
18062 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18065 aarch64_vec_fpconst_pow_of_2 (rtx x
)
18068 if (GET_CODE (x
) != CONST_VECTOR
18069 || !CONST_VECTOR_NUNITS (x
).is_constant (&nelts
))
18072 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
18075 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
18079 for (int i
= 1; i
< nelts
; i
++)
18080 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
18086 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18089 __fp16 always promotes through this hook.
18090 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18091 through the generic excess precision logic rather than here. */
18094 aarch64_promoted_type (const_tree t
)
18096 if (SCALAR_FLOAT_TYPE_P (t
)
18097 && TYPE_MAIN_VARIANT (t
) == aarch64_fp16_type_node
)
18098 return float_type_node
;
18103 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18106 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
18107 optimization_type opt_type
)
18112 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
18119 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18121 static unsigned int
18122 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
18125 /* Polynomial invariant 1 == (VG / 2) - 1. */
18126 gcc_assert (i
== 1);
18129 return AARCH64_DWARF_VG
;
18132 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
18133 if MODE is HFmode, and punt to the generic implementation otherwise. */
18136 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
18138 return (mode
== HFmode
18140 : default_libgcc_floating_mode_supported_p (mode
));
18143 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18144 if MODE is HFmode, and punt to the generic implementation otherwise. */
18147 aarch64_scalar_mode_supported_p (scalar_mode mode
)
18149 return (mode
== HFmode
18151 : default_scalar_mode_supported_p (mode
));
18154 /* Set the value of FLT_EVAL_METHOD.
18155 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18157 0: evaluate all operations and constants, whose semantic type has at
18158 most the range and precision of type float, to the range and
18159 precision of float; evaluate all other operations and constants to
18160 the range and precision of the semantic type;
18162 N, where _FloatN is a supported interchange floating type
18163 evaluate all operations and constants, whose semantic type has at
18164 most the range and precision of _FloatN type, to the range and
18165 precision of the _FloatN type; evaluate all other operations and
18166 constants to the range and precision of the semantic type;
18168 If we have the ARMv8.2-A extensions then we support _Float16 in native
18169 precision, so we should set this to 16. Otherwise, we support the type,
18170 but want to evaluate expressions in float precision, so set this to
18173 static enum flt_eval_method
18174 aarch64_excess_precision (enum excess_precision_type type
)
18178 case EXCESS_PRECISION_TYPE_FAST
:
18179 case EXCESS_PRECISION_TYPE_STANDARD
:
18180 /* We can calculate either in 16-bit range and precision or
18181 32-bit range and precision. Make that decision based on whether
18182 we have native support for the ARMv8.2-A 16-bit floating-point
18183 instructions or not. */
18184 return (TARGET_FP_F16INST
18185 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18186 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
18187 case EXCESS_PRECISION_TYPE_IMPLICIT
:
18188 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
18190 gcc_unreachable ();
18192 return FLT_EVAL_METHOD_UNPREDICTABLE
;
18195 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18196 scheduled for speculative execution. Reject the long-running division
18197 and square-root instructions. */
18200 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
18202 switch (get_attr_type (insn
))
18210 case TYPE_NEON_FP_SQRT_S
:
18211 case TYPE_NEON_FP_SQRT_D
:
18212 case TYPE_NEON_FP_SQRT_S_Q
:
18213 case TYPE_NEON_FP_SQRT_D_Q
:
18214 case TYPE_NEON_FP_DIV_S
:
18215 case TYPE_NEON_FP_DIV_D
:
18216 case TYPE_NEON_FP_DIV_S_Q
:
18217 case TYPE_NEON_FP_DIV_D_Q
:
18224 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18227 aarch64_compute_pressure_classes (reg_class
*classes
)
18230 classes
[i
++] = GENERAL_REGS
;
18231 classes
[i
++] = FP_REGS
;
18232 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18233 registers need to go in PR_LO_REGS at some point during their
18234 lifetime. Splitting it into two halves has the effect of making
18235 all predicates count against PR_LO_REGS, so that we try whenever
18236 possible to restrict the number of live predicates to 8. This
18237 greatly reduces the amount of spilling in certain loops. */
18238 classes
[i
++] = PR_LO_REGS
;
18239 classes
[i
++] = PR_HI_REGS
;
18243 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18246 aarch64_can_change_mode_class (machine_mode from
,
18247 machine_mode to
, reg_class_t
)
18249 if (BYTES_BIG_ENDIAN
)
18251 bool from_sve_p
= aarch64_sve_data_mode_p (from
);
18252 bool to_sve_p
= aarch64_sve_data_mode_p (to
);
18254 /* Don't allow changes between SVE data modes and non-SVE modes.
18255 See the comment at the head of aarch64-sve.md for details. */
18256 if (from_sve_p
!= to_sve_p
)
18259 /* Don't allow changes in element size: lane 0 of the new vector
18260 would not then be lane 0 of the old vector. See the comment
18261 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18264 In the worst case, this forces a register to be spilled in
18265 one mode and reloaded in the other, which handles the
18266 endianness correctly. */
18267 if (from_sve_p
&& GET_MODE_UNIT_SIZE (from
) != GET_MODE_UNIT_SIZE (to
))
18273 /* Implement TARGET_EARLY_REMAT_MODES. */
18276 aarch64_select_early_remat_modes (sbitmap modes
)
18278 /* SVE values are not normally live across a call, so it should be
18279 worth doing early rematerialization even in VL-specific mode. */
18280 for (int i
= 0; i
< NUM_MACHINE_MODES
; ++i
)
18282 machine_mode mode
= (machine_mode
) i
;
18283 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
18284 if (vec_flags
& VEC_ANY_SVE
)
18285 bitmap_set_bit (modes
, i
);
18289 /* Override the default target speculation_safe_value. */
18291 aarch64_speculation_safe_value (machine_mode mode
,
18292 rtx result
, rtx val
, rtx failval
)
18294 /* Maybe we should warn if falling back to hard barriers. They are
18295 likely to be noticably more expensive than the alternative below. */
18296 if (!aarch64_track_speculation
)
18297 return default_speculation_safe_value (mode
, result
, val
, failval
);
18300 val
= copy_to_mode_reg (mode
, val
);
18302 if (!aarch64_reg_or_zero (failval
, mode
))
18303 failval
= copy_to_mode_reg (mode
, failval
);
18305 emit_insn (gen_despeculate_copy (mode
, result
, val
, failval
));
18309 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18310 Look into the tuning structure for an estimate.
18311 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18312 Advanced SIMD 128 bits. */
18314 static HOST_WIDE_INT
18315 aarch64_estimated_poly_value (poly_int64 val
)
18317 enum aarch64_sve_vector_bits_enum width_source
18318 = aarch64_tune_params
.sve_width
;
18320 /* If we still don't have an estimate, use the default. */
18321 if (width_source
== SVE_SCALABLE
)
18322 return default_estimated_poly_value (val
);
18324 HOST_WIDE_INT over_128
= width_source
- 128;
18325 return val
.coeffs
[0] + val
.coeffs
[1] * over_128
/ 128;
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */
/* Target hook vector initialization.  Each pair first #undef's the
   default from target-def.h and then installs the AArch64-specific
   implementation (or a generic hook_* helper).  */

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook will determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components
18630 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
18631 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
18632 aarch64_set_handled_components
18634 #undef TARGET_TRAMPOLINE_INIT
18635 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
18637 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
18638 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
18640 #undef TARGET_VECTOR_MODE_SUPPORTED_P
18641 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
18643 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
18644 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
18645 aarch64_builtin_support_vector_misalignment
18647 #undef TARGET_ARRAY_MODE
18648 #define TARGET_ARRAY_MODE aarch64_array_mode
18650 #undef TARGET_ARRAY_MODE_SUPPORTED_P
18651 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
18653 #undef TARGET_VECTORIZE_ADD_STMT_COST
18654 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
18656 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
18657 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
18658 aarch64_builtin_vectorization_cost
18660 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
18661 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
18663 #undef TARGET_VECTORIZE_BUILTINS
18664 #define TARGET_VECTORIZE_BUILTINS
18666 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
18667 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
18668 aarch64_builtin_vectorized_function
18670 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
18671 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
18672 aarch64_autovectorize_vector_sizes
18674 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
18675 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
18676 aarch64_atomic_assign_expand_fenv
18678 /* Section anchor support. */
18680 #undef TARGET_MIN_ANCHOR_OFFSET
18681 #define TARGET_MIN_ANCHOR_OFFSET -256
18683 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
18684 byte offset; we can do much more for larger data types, but have no way
18685 to determine the size of the access. We assume accesses are aligned. */
18686 #undef TARGET_MAX_ANCHOR_OFFSET
18687 #define TARGET_MAX_ANCHOR_OFFSET 4095
18689 #undef TARGET_VECTOR_ALIGNMENT
18690 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
18692 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
18693 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
18694 aarch64_vectorize_preferred_vector_alignment
18695 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
18696 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
18697 aarch64_simd_vector_alignment_reachable
18699 /* vec_perm support. */
18701 #undef TARGET_VECTORIZE_VEC_PERM_CONST
18702 #define TARGET_VECTORIZE_VEC_PERM_CONST \
18703 aarch64_vectorize_vec_perm_const
18705 #undef TARGET_VECTORIZE_GET_MASK_MODE
18706 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
18707 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
18708 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
18709 aarch64_empty_mask_is_expensive
18710 #undef TARGET_PREFERRED_ELSE_VALUE
18711 #define TARGET_PREFERRED_ELSE_VALUE \
18712 aarch64_preferred_else_value
18714 #undef TARGET_INIT_LIBFUNCS
18715 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
18717 #undef TARGET_FIXED_CONDITION_CODE_REGS
18718 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
18720 #undef TARGET_FLAGS_REGNUM
18721 #define TARGET_FLAGS_REGNUM CC_REGNUM
18723 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
18724 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
18726 #undef TARGET_ASAN_SHADOW_OFFSET
18727 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
18729 #undef TARGET_LEGITIMIZE_ADDRESS
18730 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
18732 #undef TARGET_SCHED_CAN_SPECULATE_INSN
18733 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
18735 #undef TARGET_CAN_USE_DOLOOP_P
18736 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
18738 #undef TARGET_SCHED_ADJUST_PRIORITY
18739 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
18741 #undef TARGET_SCHED_MACRO_FUSION_P
18742 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
18744 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
18745 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
18747 #undef TARGET_SCHED_FUSION_PRIORITY
18748 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
18750 #undef TARGET_UNSPEC_MAY_TRAP_P
18751 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
18753 #undef TARGET_USE_PSEUDO_PIC_REG
18754 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
18756 #undef TARGET_PRINT_OPERAND
18757 #define TARGET_PRINT_OPERAND aarch64_print_operand
18759 #undef TARGET_PRINT_OPERAND_ADDRESS
18760 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
18762 #undef TARGET_OPTAB_SUPPORTED_P
18763 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
18765 #undef TARGET_OMIT_STRUCT_RETURN_REG
18766 #define TARGET_OMIT_STRUCT_RETURN_REG true
18768 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
18769 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
18770 aarch64_dwarf_poly_indeterminate_value
18772 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
18773 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
18774 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
18776 #undef TARGET_HARD_REGNO_NREGS
18777 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
18778 #undef TARGET_HARD_REGNO_MODE_OK
18779 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
18781 #undef TARGET_MODES_TIEABLE_P
18782 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
18784 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
18785 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
18786 aarch64_hard_regno_call_part_clobbered
18788 #undef TARGET_CONSTANT_ALIGNMENT
18789 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
18791 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
18792 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
18793 aarch64_stack_clash_protection_alloca_probe_range
18795 #undef TARGET_COMPUTE_PRESSURE_CLASSES
18796 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
18798 #undef TARGET_CAN_CHANGE_MODE_CLASS
18799 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
18801 #undef TARGET_SELECT_EARLY_REMAT_MODES
18802 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
18804 #undef TARGET_SPECULATION_SAFE_VALUE
18805 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
18807 #undef TARGET_ESTIMATED_POLY_VALUE
18808 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
18810 #undef TARGET_ATTRIBUTE_TABLE
18811 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
18814 #undef TARGET_RUN_TARGET_SELFTESTS
18815 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18816 #endif /* #if CHECKING_P */
18818 struct gcc_target targetm
= TARGET_INITIALIZER
;
18820 #include "gt-aarch64.h"