1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_STRING
24 #include "coretypes.h"
35 #include "stringpool.h"
40 #include "diagnostic.h"
41 #include "insn-attr.h"
43 #include "fold-const.h"
44 #include "stor-layout.h"
52 #include "langhooks.h"
57 #include "gimple-iterator.h"
58 #include "tree-vectorizer.h"
59 #include "aarch64-cost-tables.h"
63 #include "tm-constrs.h"
64 #include "sched-int.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
68 #include "selftest-rtl.h"
70 /* This file should be included last. */
71 #include "target-def.h"
73 /* Defined for convenience. */
74 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
76 /* Classifies an address.
79 A simple base register plus immediate offset.
82 A base register indexed by immediate offset with writeback.
85 A base register indexed by (optionally scaled) register.
88 A base register indexed by (optionally scaled) zero-extended register.
91 A base register indexed by (optionally scaled) sign-extended register.
94 A LO_SUM rtx with a base register and "LO12" symbol relocation.
97 A constant symbolic address, in pc-relative literal pool. */
99 enum aarch64_address_type
{
109 struct aarch64_address_info
{
110 enum aarch64_address_type type
;
114 enum aarch64_symbol_type symbol_type
;
117 struct simd_immediate_info
126 /* The current code model. */
127 enum aarch64_code_model aarch64_cmodel
;
130 #undef TARGET_HAVE_TLS
131 #define TARGET_HAVE_TLS 1
134 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
135 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
137 machine_mode
*, int *,
139 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
140 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
141 static void aarch64_override_options_after_change (void);
142 static bool aarch64_vector_mode_supported_p (machine_mode
);
143 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode
,
144 const unsigned char *sel
);
145 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
146 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
151 /* Major revision number of the ARM Architecture implemented by the target. */
152 unsigned aarch64_architecture_version
;
154 /* The processor for which instructions should be scheduled. */
155 enum aarch64_processor aarch64_tune
= cortexa53
;
157 /* Mask to specify which instruction scheduling options should be used. */
158 unsigned long aarch64_tune_flags
= 0;
160 /* Global flag for PC relative loads. */
161 bool aarch64_pcrelative_literal_loads
;
163 /* Support for command line parsing of boolean flags in the tuning
165 struct aarch64_flag_desc
171 #define AARCH64_FUSION_PAIR(name, internal_name) \
172 { name, AARCH64_FUSE_##internal_name },
173 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
175 { "none", AARCH64_FUSE_NOTHING
},
176 #include "aarch64-fusion-pairs.def"
177 { "all", AARCH64_FUSE_ALL
},
178 { NULL
, AARCH64_FUSE_NOTHING
}
181 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
182 { name, AARCH64_EXTRA_TUNE_##internal_name },
183 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
185 { "none", AARCH64_EXTRA_TUNE_NONE
},
186 #include "aarch64-tuning-flags.def"
187 { "all", AARCH64_EXTRA_TUNE_ALL
},
188 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
191 /* Tuning parameters. */
193 static const struct cpu_addrcost_table generic_addrcost_table
=
203 0, /* register_offset */
204 0, /* register_sextend */
205 0, /* register_zextend */
209 static const struct cpu_addrcost_table cortexa57_addrcost_table
=
219 0, /* register_offset */
220 0, /* register_sextend */
221 0, /* register_zextend */
225 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
235 1, /* register_offset */
236 1, /* register_sextend */
237 2, /* register_zextend */
241 static const struct cpu_addrcost_table xgene1_addrcost_table
=
251 0, /* register_offset */
252 1, /* register_sextend */
253 1, /* register_zextend */
257 static const struct cpu_addrcost_table qdf24xx_addrcost_table
=
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
273 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
283 2, /* register_offset */
284 3, /* register_sextend */
285 3, /* register_zextend */
289 static const struct cpu_regmove_cost generic_regmove_cost
=
292 /* Avoid the use of slow int<->fp moves for spilling by setting
293 their cost higher than memmov_cost. */
299 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
302 /* Avoid the use of slow int<->fp moves for spilling by setting
303 their cost higher than memmov_cost. */
309 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
312 /* Avoid the use of slow int<->fp moves for spilling by setting
313 their cost higher than memmov_cost. */
319 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
322 /* Avoid the use of slow int<->fp moves for spilling by setting
323 their cost higher than memmov_cost (actual, 4 and 9). */
329 static const struct cpu_regmove_cost thunderx_regmove_cost
=
337 static const struct cpu_regmove_cost xgene1_regmove_cost
=
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
347 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
350 /* Avoid the use of int<->fp moves for spilling. */
356 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
359 /* Avoid the use of int<->fp moves for spilling. */
365 /* Generic costs for vector insn classes. */
366 static const struct cpu_vector_cost generic_vector_cost
=
368 1, /* scalar_int_stmt_cost */
369 1, /* scalar_fp_stmt_cost */
370 1, /* scalar_load_cost */
371 1, /* scalar_store_cost */
372 1, /* vec_int_stmt_cost */
373 1, /* vec_fp_stmt_cost */
374 2, /* vec_permute_cost */
375 1, /* vec_to_scalar_cost */
376 1, /* scalar_to_vec_cost */
377 1, /* vec_align_load_cost */
378 1, /* vec_unalign_load_cost */
379 1, /* vec_unalign_store_cost */
380 1, /* vec_store_cost */
381 3, /* cond_taken_branch_cost */
382 1 /* cond_not_taken_branch_cost */
385 /* ThunderX costs for vector insn classes. */
386 static const struct cpu_vector_cost thunderx_vector_cost
=
388 1, /* scalar_int_stmt_cost */
389 1, /* scalar_fp_stmt_cost */
390 3, /* scalar_load_cost */
391 1, /* scalar_store_cost */
392 4, /* vec_int_stmt_cost */
393 4, /* vec_fp_stmt_cost */
394 4, /* vec_permute_cost */
395 2, /* vec_to_scalar_cost */
396 2, /* scalar_to_vec_cost */
397 3, /* vec_align_load_cost */
398 10, /* vec_unalign_load_cost */
399 10, /* vec_unalign_store_cost */
400 1, /* vec_store_cost */
401 3, /* cond_taken_branch_cost */
402 3 /* cond_not_taken_branch_cost */
405 /* Generic costs for vector insn classes. */
406 static const struct cpu_vector_cost cortexa57_vector_cost
=
408 1, /* scalar_int_stmt_cost */
409 1, /* scalar_fp_stmt_cost */
410 4, /* scalar_load_cost */
411 1, /* scalar_store_cost */
412 2, /* vec_int_stmt_cost */
413 2, /* vec_fp_stmt_cost */
414 3, /* vec_permute_cost */
415 8, /* vec_to_scalar_cost */
416 8, /* scalar_to_vec_cost */
417 4, /* vec_align_load_cost */
418 4, /* vec_unalign_load_cost */
419 1, /* vec_unalign_store_cost */
420 1, /* vec_store_cost */
421 1, /* cond_taken_branch_cost */
422 1 /* cond_not_taken_branch_cost */
425 static const struct cpu_vector_cost exynosm1_vector_cost
=
427 1, /* scalar_int_stmt_cost */
428 1, /* scalar_fp_stmt_cost */
429 5, /* scalar_load_cost */
430 1, /* scalar_store_cost */
431 3, /* vec_int_stmt_cost */
432 3, /* vec_fp_stmt_cost */
433 3, /* vec_permute_cost */
434 3, /* vec_to_scalar_cost */
435 3, /* scalar_to_vec_cost */
436 5, /* vec_align_load_cost */
437 5, /* vec_unalign_load_cost */
438 1, /* vec_unalign_store_cost */
439 1, /* vec_store_cost */
440 1, /* cond_taken_branch_cost */
441 1 /* cond_not_taken_branch_cost */
444 /* Generic costs for vector insn classes. */
445 static const struct cpu_vector_cost xgene1_vector_cost
=
447 1, /* scalar_int_stmt_cost */
448 1, /* scalar_fp_stmt_cost */
449 5, /* scalar_load_cost */
450 1, /* scalar_store_cost */
451 2, /* vec_int_stmt_cost */
452 2, /* vec_fp_stmt_cost */
453 2, /* vec_permute_cost */
454 4, /* vec_to_scalar_cost */
455 4, /* scalar_to_vec_cost */
456 10, /* vec_align_load_cost */
457 10, /* vec_unalign_load_cost */
458 2, /* vec_unalign_store_cost */
459 2, /* vec_store_cost */
460 2, /* cond_taken_branch_cost */
461 1 /* cond_not_taken_branch_cost */
464 /* Costs for vector insn classes for Vulcan. */
465 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
467 1, /* scalar_int_stmt_cost */
468 6, /* scalar_fp_stmt_cost */
469 4, /* scalar_load_cost */
470 1, /* scalar_store_cost */
471 5, /* vec_int_stmt_cost */
472 6, /* vec_fp_stmt_cost */
473 3, /* vec_permute_cost */
474 6, /* vec_to_scalar_cost */
475 5, /* scalar_to_vec_cost */
476 8, /* vec_align_load_cost */
477 8, /* vec_unalign_load_cost */
478 4, /* vec_unalign_store_cost */
479 4, /* vec_store_cost */
480 2, /* cond_taken_branch_cost */
481 1 /* cond_not_taken_branch_cost */
484 /* Generic costs for branch instructions. */
485 static const struct cpu_branch_cost generic_branch_cost
=
487 1, /* Predictable. */
488 3 /* Unpredictable. */
491 /* Branch costs for Cortex-A57. */
492 static const struct cpu_branch_cost cortexa57_branch_cost
=
494 1, /* Predictable. */
495 3 /* Unpredictable. */
498 /* Branch costs for Vulcan. */
499 static const struct cpu_branch_cost thunderx2t99_branch_cost
=
501 1, /* Predictable. */
502 3 /* Unpredictable. */
505 /* Generic approximation modes. */
506 static const cpu_approx_modes generic_approx_modes
=
508 AARCH64_APPROX_NONE
, /* division */
509 AARCH64_APPROX_NONE
, /* sqrt */
510 AARCH64_APPROX_NONE
/* recip_sqrt */
513 /* Approximation modes for Exynos M1. */
514 static const cpu_approx_modes exynosm1_approx_modes
=
516 AARCH64_APPROX_NONE
, /* division */
517 AARCH64_APPROX_ALL
, /* sqrt */
518 AARCH64_APPROX_ALL
/* recip_sqrt */
521 /* Approximation modes for X-Gene 1. */
522 static const cpu_approx_modes xgene1_approx_modes
=
524 AARCH64_APPROX_NONE
, /* division */
525 AARCH64_APPROX_NONE
, /* sqrt */
526 AARCH64_APPROX_ALL
/* recip_sqrt */
529 /* Generic prefetch settings (which disable prefetch). */
530 static const cpu_prefetch_tune generic_prefetch_tune
=
533 -1, /* l1_cache_size */
534 -1, /* l1_cache_line_size */
535 -1, /* l2_cache_size */
536 -1 /* default_opt_level */
539 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
542 -1, /* l1_cache_size */
543 64, /* l1_cache_line_size */
544 -1, /* l2_cache_size */
545 -1 /* default_opt_level */
548 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
551 32, /* l1_cache_size */
552 64, /* l1_cache_line_size */
553 1024, /* l2_cache_size */
554 3 /* default_opt_level */
557 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
560 32, /* l1_cache_size */
561 128, /* l1_cache_line_size */
562 16*1024, /* l2_cache_size */
563 3 /* default_opt_level */
566 static const cpu_prefetch_tune thunderx_prefetch_tune
=
569 32, /* l1_cache_size */
570 128, /* l1_cache_line_size */
571 -1, /* l2_cache_size */
572 -1 /* default_opt_level */
575 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
578 32, /* l1_cache_size */
579 64, /* l1_cache_line_size */
580 256, /* l2_cache_size */
581 -1 /* default_opt_level */
584 static const struct tune_params generic_tunings
=
586 &cortexa57_extra_costs
,
587 &generic_addrcost_table
,
588 &generic_regmove_cost
,
589 &generic_vector_cost
,
590 &generic_branch_cost
,
591 &generic_approx_modes
,
594 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
595 8, /* function_align. */
598 2, /* int_reassoc_width. */
599 4, /* fp_reassoc_width. */
600 1, /* vec_reassoc_width. */
601 2, /* min_div_recip_mul_sf. */
602 2, /* min_div_recip_mul_df. */
603 0, /* max_case_values. */
604 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
605 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
606 &generic_prefetch_tune
609 static const struct tune_params cortexa35_tunings
=
611 &cortexa53_extra_costs
,
612 &generic_addrcost_table
,
613 &cortexa53_regmove_cost
,
614 &generic_vector_cost
,
615 &cortexa57_branch_cost
,
616 &generic_approx_modes
,
619 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
620 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
621 16, /* function_align. */
624 2, /* int_reassoc_width. */
625 4, /* fp_reassoc_width. */
626 1, /* vec_reassoc_width. */
627 2, /* min_div_recip_mul_sf. */
628 2, /* min_div_recip_mul_df. */
629 0, /* max_case_values. */
630 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
631 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
632 &generic_prefetch_tune
635 static const struct tune_params cortexa53_tunings
=
637 &cortexa53_extra_costs
,
638 &generic_addrcost_table
,
639 &cortexa53_regmove_cost
,
640 &generic_vector_cost
,
641 &cortexa57_branch_cost
,
642 &generic_approx_modes
,
645 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
646 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
647 16, /* function_align. */
650 2, /* int_reassoc_width. */
651 4, /* fp_reassoc_width. */
652 1, /* vec_reassoc_width. */
653 2, /* min_div_recip_mul_sf. */
654 2, /* min_div_recip_mul_df. */
655 0, /* max_case_values. */
656 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
657 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
658 &generic_prefetch_tune
661 static const struct tune_params cortexa57_tunings
=
663 &cortexa57_extra_costs
,
664 &cortexa57_addrcost_table
,
665 &cortexa57_regmove_cost
,
666 &cortexa57_vector_cost
,
667 &cortexa57_branch_cost
,
668 &generic_approx_modes
,
671 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
672 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
673 16, /* function_align. */
676 2, /* int_reassoc_width. */
677 4, /* fp_reassoc_width. */
678 1, /* vec_reassoc_width. */
679 2, /* min_div_recip_mul_sf. */
680 2, /* min_div_recip_mul_df. */
681 0, /* max_case_values. */
682 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
683 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
684 &generic_prefetch_tune
687 static const struct tune_params cortexa72_tunings
=
689 &cortexa57_extra_costs
,
690 &cortexa57_addrcost_table
,
691 &cortexa57_regmove_cost
,
692 &cortexa57_vector_cost
,
693 &cortexa57_branch_cost
,
694 &generic_approx_modes
,
697 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
698 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
699 16, /* function_align. */
702 2, /* int_reassoc_width. */
703 4, /* fp_reassoc_width. */
704 1, /* vec_reassoc_width. */
705 2, /* min_div_recip_mul_sf. */
706 2, /* min_div_recip_mul_df. */
707 0, /* max_case_values. */
708 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
709 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
710 &generic_prefetch_tune
713 static const struct tune_params cortexa73_tunings
=
715 &cortexa57_extra_costs
,
716 &cortexa57_addrcost_table
,
717 &cortexa57_regmove_cost
,
718 &cortexa57_vector_cost
,
719 &cortexa57_branch_cost
,
720 &generic_approx_modes
,
721 4, /* memmov_cost. */
723 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
724 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
725 16, /* function_align. */
728 2, /* int_reassoc_width. */
729 4, /* fp_reassoc_width. */
730 1, /* vec_reassoc_width. */
731 2, /* min_div_recip_mul_sf. */
732 2, /* min_div_recip_mul_df. */
733 0, /* max_case_values. */
734 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
735 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
736 &generic_prefetch_tune
741 static const struct tune_params exynosm1_tunings
=
743 &exynosm1_extra_costs
,
744 &exynosm1_addrcost_table
,
745 &exynosm1_regmove_cost
,
746 &exynosm1_vector_cost
,
747 &generic_branch_cost
,
748 &exynosm1_approx_modes
,
751 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
752 4, /* function_align. */
755 2, /* int_reassoc_width. */
756 4, /* fp_reassoc_width. */
757 1, /* vec_reassoc_width. */
758 2, /* min_div_recip_mul_sf. */
759 2, /* min_div_recip_mul_df. */
760 48, /* max_case_values. */
761 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
762 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
763 &exynosm1_prefetch_tune
766 static const struct tune_params thunderxt88_tunings
=
768 &thunderx_extra_costs
,
769 &generic_addrcost_table
,
770 &thunderx_regmove_cost
,
771 &thunderx_vector_cost
,
772 &generic_branch_cost
,
773 &generic_approx_modes
,
776 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
777 8, /* function_align. */
780 2, /* int_reassoc_width. */
781 4, /* fp_reassoc_width. */
782 1, /* vec_reassoc_width. */
783 2, /* min_div_recip_mul_sf. */
784 2, /* min_div_recip_mul_df. */
785 0, /* max_case_values. */
786 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
787 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
788 &thunderxt88_prefetch_tune
791 static const struct tune_params thunderx_tunings
=
793 &thunderx_extra_costs
,
794 &generic_addrcost_table
,
795 &thunderx_regmove_cost
,
796 &thunderx_vector_cost
,
797 &generic_branch_cost
,
798 &generic_approx_modes
,
801 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
802 8, /* function_align. */
805 2, /* int_reassoc_width. */
806 4, /* fp_reassoc_width. */
807 1, /* vec_reassoc_width. */
808 2, /* min_div_recip_mul_sf. */
809 2, /* min_div_recip_mul_df. */
810 0, /* max_case_values. */
811 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
812 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
813 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
814 &thunderx_prefetch_tune
817 static const struct tune_params xgene1_tunings
=
820 &xgene1_addrcost_table
,
821 &xgene1_regmove_cost
,
823 &generic_branch_cost
,
824 &xgene1_approx_modes
,
827 AARCH64_FUSE_NOTHING
, /* fusible_ops */
828 16, /* function_align. */
830 16, /* loop_align. */
831 2, /* int_reassoc_width. */
832 4, /* fp_reassoc_width. */
833 1, /* vec_reassoc_width. */
834 2, /* min_div_recip_mul_sf. */
835 2, /* min_div_recip_mul_df. */
836 0, /* max_case_values. */
837 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
838 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
839 &generic_prefetch_tune
842 static const struct tune_params qdf24xx_tunings
=
844 &qdf24xx_extra_costs
,
845 &qdf24xx_addrcost_table
,
846 &qdf24xx_regmove_cost
,
847 &generic_vector_cost
,
848 &generic_branch_cost
,
849 &generic_approx_modes
,
852 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
853 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
854 16, /* function_align. */
856 16, /* loop_align. */
857 2, /* int_reassoc_width. */
858 4, /* fp_reassoc_width. */
859 1, /* vec_reassoc_width. */
860 2, /* min_div_recip_mul_sf. */
861 2, /* min_div_recip_mul_df. */
862 0, /* max_case_values. */
863 tune_params::AUTOPREFETCHER_STRONG
, /* autoprefetcher_model. */
864 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
865 &qdf24xx_prefetch_tune
868 static const struct tune_params thunderx2t99_tunings
=
870 &thunderx2t99_extra_costs
,
871 &thunderx2t99_addrcost_table
,
872 &thunderx2t99_regmove_cost
,
873 &thunderx2t99_vector_cost
,
874 &thunderx2t99_branch_cost
,
875 &generic_approx_modes
,
876 4, /* memmov_cost. */
878 (AARCH64_FUSE_CMP_BRANCH
| AARCH64_FUSE_AES_AESMC
879 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
880 16, /* function_align. */
882 16, /* loop_align. */
883 3, /* int_reassoc_width. */
884 2, /* fp_reassoc_width. */
885 2, /* vec_reassoc_width. */
886 2, /* min_div_recip_mul_sf. */
887 2, /* min_div_recip_mul_df. */
888 0, /* max_case_values. */
889 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
890 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
891 &thunderx2t99_prefetch_tune
894 /* Support for fine-grained override of the tuning structures. */
895 struct aarch64_tuning_override_function
898 void (*parse_override
)(const char*, struct tune_params
*);
901 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
902 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
904 static const struct aarch64_tuning_override_function
905 aarch64_tuning_override_functions
[] =
907 { "fuse", aarch64_parse_fuse_string
},
908 { "tune", aarch64_parse_tune_string
},
912 /* A processor implementing AArch64. */
915 const char *const name
;
916 enum aarch64_processor ident
;
917 enum aarch64_processor sched_core
;
918 enum aarch64_arch arch
;
919 unsigned architecture_version
;
920 const unsigned long flags
;
921 const struct tune_params
*const tune
;
924 /* Architectures implementing AArch64. */
925 static const struct processor all_architectures
[] =
927 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
928 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
929 #include "aarch64-arches.def"
930 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
933 /* Processor cores implementing AArch64. */
934 static const struct processor all_cores
[] =
936 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
937 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
938 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
939 FLAGS, &COSTS##_tunings},
940 #include "aarch64-cores.def"
941 {"generic", generic
, cortexa53
, AARCH64_ARCH_8A
, 8,
942 AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
943 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
947 /* Target specification. These are populated by the -march, -mtune, -mcpu
948 handling code or by target attributes. */
949 static const struct processor
*selected_arch
;
950 static const struct processor
*selected_cpu
;
951 static const struct processor
*selected_tune
;
953 /* The current tuning set. */
954 struct tune_params aarch64_tune_params
= generic_tunings
;
956 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
958 /* An ISA extension in the co-processor and main instruction set space. */
959 struct aarch64_option_extension
961 const char *const name
;
962 const unsigned long flags_on
;
963 const unsigned long flags_off
;
966 typedef enum aarch64_cond_code
968 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
969 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
970 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
974 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
976 /* The condition codes of the processor, and the inverse function. */
977 static const char * const aarch64_condition_codes
[] =
979 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
980 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
983 /* Generate code to enable conditional branches in functions over 1 MiB. */
985 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
986 const char * branch_format
)
988 rtx_code_label
* tmp_label
= gen_label_rtx ();
991 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
992 CODE_LABEL_NUMBER (tmp_label
));
993 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
994 rtx dest_label
= operands
[pos_label
];
995 operands
[pos_label
] = tmp_label
;
997 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
998 output_asm_insn (buffer
, operands
);
1000 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
1001 operands
[pos_label
] = dest_label
;
1002 output_asm_insn (buffer
, operands
);
1007 aarch64_err_no_fpadvsimd (machine_mode mode
, const char *msg
)
1009 const char *mc
= FLOAT_MODE_P (mode
) ? "floating-point" : "vector";
1010 if (TARGET_GENERAL_REGS_ONLY
)
1011 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc
, msg
);
1013 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc
, msg
);
1016 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1017 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1018 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1019 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1020 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1021 irrespectively of its cost results in bad allocations with many redundant
1022 int<->FP moves which are expensive on various cores.
1023 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1024 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1025 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1026 Otherwise set the allocno class depending on the mode.
1027 The result of this is that it is no longer inefficient to have a higher
1028 memory move cost than the register move cost.
1032 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
1033 reg_class_t best_class
)
1037 if (allocno_class
!= ALL_REGS
)
1038 return allocno_class
;
1040 if (best_class
!= ALL_REGS
)
1043 mode
= PSEUDO_REGNO_MODE (regno
);
1044 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
1048 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1050 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1051 return aarch64_tune_params
.min_div_recip_mul_sf
;
1052 return aarch64_tune_params
.min_div_recip_mul_df
;
1056 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED
,
1059 if (VECTOR_MODE_P (mode
))
1060 return aarch64_tune_params
.vec_reassoc_width
;
1061 if (INTEGRAL_MODE_P (mode
))
1062 return aarch64_tune_params
.int_reassoc_width
;
1063 if (FLOAT_MODE_P (mode
))
1064 return aarch64_tune_params
.fp_reassoc_width
;
1068 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1070 aarch64_dbx_register_number (unsigned regno
)
1072 if (GP_REGNUM_P (regno
))
1073 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1074 else if (regno
== SP_REGNUM
)
1075 return AARCH64_DWARF_SP
;
1076 else if (FP_REGNUM_P (regno
))
1077 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1079 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1080 equivalent DWARF register. */
1081 return DWARF_FRAME_REGISTERS
;
1084 /* Return TRUE if MODE is any of the large INT modes. */
1086 aarch64_vect_struct_mode_p (machine_mode mode
)
1088 return mode
== OImode
|| mode
== CImode
|| mode
== XImode
;
1091 /* Return TRUE if MODE is any of the vector modes. */
1093 aarch64_vector_mode_p (machine_mode mode
)
1095 return aarch64_vector_mode_supported_p (mode
)
1096 || aarch64_vect_struct_mode_p (mode
);
1099 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1101 aarch64_array_mode_supported_p (machine_mode mode
,
1102 unsigned HOST_WIDE_INT nelems
)
1105 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1106 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1107 && (nelems
>= 2 && nelems
<= 4))
1113 /* Implement HARD_REGNO_NREGS. */
1116 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1118 switch (aarch64_regno_regclass (regno
))
1122 return (GET_MODE_SIZE (mode
) + UNITS_PER_VREG
- 1) / UNITS_PER_VREG
;
1124 return (GET_MODE_SIZE (mode
) + UNITS_PER_WORD
- 1) / UNITS_PER_WORD
;
1129 /* Implement HARD_REGNO_MODE_OK. */
1132 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
1134 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1135 return regno
== CC_REGNUM
;
1137 if (regno
== SP_REGNUM
)
1138 /* The purpose of comparing with ptr_mode is to support the
1139 global register variable associated with the stack pointer
1140 register via the syntax of asm ("wsp") in ILP32. */
1141 return mode
== Pmode
|| mode
== ptr_mode
;
1143 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
1144 return mode
== Pmode
;
1146 if (GP_REGNUM_P (regno
) && ! aarch64_vect_struct_mode_p (mode
))
1149 if (FP_REGNUM_P (regno
))
1151 if (aarch64_vect_struct_mode_p (mode
))
1153 (regno
+ aarch64_hard_regno_nregs (regno
, mode
) - 1) <= V31_REGNUM
;
1161 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1163 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned nregs
,
1166 /* Handle modes that fit within single registers. */
1167 if (nregs
== 1 && GET_MODE_SIZE (mode
) <= 16)
1169 if (GET_MODE_SIZE (mode
) >= 4)
1174 /* Fall back to generic for multi-reg and very large modes. */
1176 return choose_hard_reg_mode (regno
, nregs
, false);
1179 /* Return true if calls to DECL should be treated as
1180 long-calls (ie called via a register). */
1182 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
1187 /* Return true if calls to symbol-ref SYM should be treated as
1188 long-calls (ie called via a register). */
1190 aarch64_is_long_call_p (rtx sym
)
1192 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
1195 /* Return true if calls to symbol-ref SYM should not go through
1199 aarch64_is_noplt_call_p (rtx sym
)
1201 const_tree decl
= SYMBOL_REF_DECL (sym
);
1206 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
1207 && !targetm
.binds_local_p (decl
))
1213 /* Return true if the offsets to a zero/sign-extract operation
1214 represent an expression that matches an extend operation. The
1215 operands represent the paramters from
1217 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1219 aarch64_is_extend_from_extract (machine_mode mode
, rtx mult_imm
,
1222 HOST_WIDE_INT mult_val
, extract_val
;
1224 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
1227 mult_val
= INTVAL (mult_imm
);
1228 extract_val
= INTVAL (extract_imm
);
1231 && extract_val
< GET_MODE_BITSIZE (mode
)
1232 && exact_log2 (extract_val
& ~7) > 0
1233 && (extract_val
& 7) <= 4
1234 && mult_val
== (1 << (extract_val
& 7)))
1240 /* Emit an insn that's a simple single-set. Both the operands must be
1241 known to be valid. */
1242 inline static rtx_insn
*
1243 emit_set_insn (rtx x
, rtx y
)
1245 return emit_insn (gen_rtx_SET (x
, y
));
1248 /* X and Y are two things to compare using CODE. Emit the compare insn and
1249 return the rtx for register 0 in the proper mode. */
1251 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
1253 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
1254 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
1256 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
1260 /* Build the SYMBOL_REF for __tls_get_addr. */
1262 static GTY(()) rtx tls_get_addr_libfunc
;
1265 aarch64_tls_get_addr (void)
1267 if (!tls_get_addr_libfunc
)
1268 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
1269 return tls_get_addr_libfunc
;
1272 /* Return the TLS model to use for ADDR. */
1274 static enum tls_model
1275 tls_symbolic_operand_type (rtx addr
)
1277 enum tls_model tls_kind
= TLS_MODEL_NONE
;
1280 if (GET_CODE (addr
) == CONST
)
1282 split_const (addr
, &sym
, &addend
);
1283 if (GET_CODE (sym
) == SYMBOL_REF
)
1284 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
1286 else if (GET_CODE (addr
) == SYMBOL_REF
)
1287 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
1292 /* We'll allow lo_sum's in addresses in our legitimate addresses
1293 so that combine would take care of combining addresses where
1294 necessary, but for generation purposes, we'll generate the address
1297 tmp = hi (symbol_ref); adrp x1, foo
1298 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1302 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1303 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1307 Load TLS symbol, depending on TLS mechanism and TLS access model.
1309 Global Dynamic - Traditional TLS:
1310 adrp tmp, :tlsgd:imm
1311 add dest, tmp, #:tlsgd_lo12:imm
1314 Global Dynamic - TLS Descriptors:
1315 adrp dest, :tlsdesc:imm
1316 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1317 add dest, dest, #:tlsdesc_lo12:imm
1324 adrp tmp, :gottprel:imm
1325 ldr dest, [tmp, #:gottprel_lo12:imm]
1330 add t0, tp, #:tprel_hi12:imm, lsl #12
1331 add t0, t0, #:tprel_lo12_nc:imm
1335 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
1336 enum aarch64_symbol_type type
)
1340 case SYMBOL_SMALL_ABSOLUTE
:
1342 /* In ILP32, the mode of dest can be either SImode or DImode. */
1344 machine_mode mode
= GET_MODE (dest
);
1346 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1348 if (can_create_pseudo_p ())
1349 tmp_reg
= gen_reg_rtx (mode
);
1351 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1352 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
1356 case SYMBOL_TINY_ABSOLUTE
:
1357 emit_insn (gen_rtx_SET (dest
, imm
));
1360 case SYMBOL_SMALL_GOT_28K
:
1362 machine_mode mode
= GET_MODE (dest
);
1363 rtx gp_rtx
= pic_offset_table_rtx
;
1367 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1368 here before rtl expand. Tree IVOPT will generate rtl pattern to
1369 decide rtx costs, in which case pic_offset_table_rtx is not
1370 initialized. For that case no need to generate the first adrp
1371 instruction as the final cost for global variable access is
1375 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1376 using the page base as GOT base, the first page may be wasted,
1377 in the worst scenario, there is only 28K space for GOT).
1379 The generate instruction sequence for accessing global variable
1382 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1384 Only one instruction needed. But we must initialize
1385 pic_offset_table_rtx properly. We generate initialize insn for
1386 every global access, and allow CSE to remove all redundant.
1388 The final instruction sequences will look like the following
1389 for multiply global variables access.
1391 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1393 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1394 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1395 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1398 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
1399 crtl
->uses_pic_offset_table
= 1;
1400 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
1402 if (mode
!= GET_MODE (gp_rtx
))
1403 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
1407 if (mode
== ptr_mode
)
1410 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
1412 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
1414 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1418 gcc_assert (mode
== Pmode
);
1420 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
1421 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1424 /* The operand is expected to be MEM. Whenever the related insn
1425 pattern changed, above code which calculate mem should be
1427 gcc_assert (GET_CODE (mem
) == MEM
);
1428 MEM_READONLY_P (mem
) = 1;
1429 MEM_NOTRAP_P (mem
) = 1;
1434 case SYMBOL_SMALL_GOT_4G
:
1436 /* In ILP32, the mode of dest can be either SImode or DImode,
1437 while the got entry is always of SImode size. The mode of
1438 dest depends on how dest is used: if dest is assigned to a
1439 pointer (e.g. in the memory), it has SImode; it may have
1440 DImode if dest is dereferenced to access the memeory.
1441 This is why we have to handle three different ldr_got_small
1442 patterns here (two patterns for ILP32). */
1447 machine_mode mode
= GET_MODE (dest
);
1449 if (can_create_pseudo_p ())
1450 tmp_reg
= gen_reg_rtx (mode
);
1452 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1453 if (mode
== ptr_mode
)
1456 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
1458 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
1460 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1464 gcc_assert (mode
== Pmode
);
1466 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
1467 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1470 gcc_assert (GET_CODE (mem
) == MEM
);
1471 MEM_READONLY_P (mem
) = 1;
1472 MEM_NOTRAP_P (mem
) = 1;
1477 case SYMBOL_SMALL_TLSGD
:
1480 machine_mode mode
= GET_MODE (dest
);
1481 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
1485 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
1487 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
1488 insns
= get_insns ();
1491 RTL_CONST_CALL_P (insns
) = 1;
1492 emit_libcall_block (insns
, dest
, result
, imm
);
1496 case SYMBOL_SMALL_TLSDESC
:
1498 machine_mode mode
= GET_MODE (dest
);
1499 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
1502 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1504 /* In ILP32, the got entry is always of SImode size. Unlike
1505 small GOT, the dest is fixed at reg 0. */
1507 emit_insn (gen_tlsdesc_small_si (imm
));
1509 emit_insn (gen_tlsdesc_small_di (imm
));
1510 tp
= aarch64_load_tp (NULL
);
1513 tp
= gen_lowpart (mode
, tp
);
1515 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
1516 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1520 case SYMBOL_SMALL_TLSIE
:
1522 /* In ILP32, the mode of dest can be either SImode or DImode,
1523 while the got entry is always of SImode size. The mode of
1524 dest depends on how dest is used: if dest is assigned to a
1525 pointer (e.g. in the memory), it has SImode; it may have
1526 DImode if dest is dereferenced to access the memeory.
1527 This is why we have to handle three different tlsie_small
1528 patterns here (two patterns for ILP32). */
1529 machine_mode mode
= GET_MODE (dest
);
1530 rtx tmp_reg
= gen_reg_rtx (mode
);
1531 rtx tp
= aarch64_load_tp (NULL
);
1533 if (mode
== ptr_mode
)
1536 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
1539 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
1540 tp
= gen_lowpart (mode
, tp
);
1545 gcc_assert (mode
== Pmode
);
1546 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
1549 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
1550 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1554 case SYMBOL_TLSLE12
:
1555 case SYMBOL_TLSLE24
:
1556 case SYMBOL_TLSLE32
:
1557 case SYMBOL_TLSLE48
:
1559 machine_mode mode
= GET_MODE (dest
);
1560 rtx tp
= aarch64_load_tp (NULL
);
1563 tp
= gen_lowpart (mode
, tp
);
1567 case SYMBOL_TLSLE12
:
1568 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
1571 case SYMBOL_TLSLE24
:
1572 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
1575 case SYMBOL_TLSLE32
:
1576 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
1578 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1581 case SYMBOL_TLSLE48
:
1582 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
1584 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1591 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1595 case SYMBOL_TINY_GOT
:
1596 emit_insn (gen_ldr_got_tiny (dest
, imm
));
1599 case SYMBOL_TINY_TLSIE
:
1601 machine_mode mode
= GET_MODE (dest
);
1602 rtx tp
= aarch64_load_tp (NULL
);
1604 if (mode
== ptr_mode
)
1607 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
1610 tp
= gen_lowpart (mode
, tp
);
1611 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
1616 gcc_assert (mode
== Pmode
);
1617 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
1620 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1629 /* Emit a move from SRC to DEST. Assume that the move expanders can
1630 handle all moves if !can_create_pseudo_p (). The distinction is
1631 important because, unlike emit_move_insn, the move expanders know
1632 how to force Pmode objects into the constant pool even when the
1633 constant pool address is not itself legitimate. */
1635 aarch64_emit_move (rtx dest
, rtx src
)
1637 return (can_create_pseudo_p ()
1638 ? emit_move_insn (dest
, src
)
1639 : emit_move_insn_1 (dest
, src
));
1642 /* Split a 128-bit move operation into two 64-bit move operations,
1643 taking care to handle partial overlap of register to register
1644 copies. Special cases are needed when moving between GP regs and
1645 FP regs. SRC can be a register, constant or memory; DST a register
1646 or memory. If either operand is memory it must not have any side
1649 aarch64_split_128bit_move (rtx dst
, rtx src
)
1654 machine_mode mode
= GET_MODE (dst
);
1656 gcc_assert (mode
== TImode
|| mode
== TFmode
);
1657 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
1658 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
1660 if (REG_P (dst
) && REG_P (src
))
1662 int src_regno
= REGNO (src
);
1663 int dst_regno
= REGNO (dst
);
1665 /* Handle FP <-> GP regs. */
1666 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
1668 src_lo
= gen_lowpart (word_mode
, src
);
1669 src_hi
= gen_highpart (word_mode
, src
);
1673 emit_insn (gen_aarch64_movtilow_di (dst
, src_lo
));
1674 emit_insn (gen_aarch64_movtihigh_di (dst
, src_hi
));
1678 emit_insn (gen_aarch64_movtflow_di (dst
, src_lo
));
1679 emit_insn (gen_aarch64_movtfhigh_di (dst
, src_hi
));
1683 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
1685 dst_lo
= gen_lowpart (word_mode
, dst
);
1686 dst_hi
= gen_highpart (word_mode
, dst
);
1690 emit_insn (gen_aarch64_movdi_tilow (dst_lo
, src
));
1691 emit_insn (gen_aarch64_movdi_tihigh (dst_hi
, src
));
1695 emit_insn (gen_aarch64_movdi_tflow (dst_lo
, src
));
1696 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi
, src
));
1702 dst_lo
= gen_lowpart (word_mode
, dst
);
1703 dst_hi
= gen_highpart (word_mode
, dst
);
1704 src_lo
= gen_lowpart (word_mode
, src
);
1705 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
1707 /* At most one pairing may overlap. */
1708 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
1710 aarch64_emit_move (dst_hi
, src_hi
);
1711 aarch64_emit_move (dst_lo
, src_lo
);
1715 aarch64_emit_move (dst_lo
, src_lo
);
1716 aarch64_emit_move (dst_hi
, src_hi
);
1721 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
1723 return (! REG_P (src
)
1724 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
1727 /* Split a complex SIMD combine. */
1730 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
1732 machine_mode src_mode
= GET_MODE (src1
);
1733 machine_mode dst_mode
= GET_MODE (dst
);
1735 gcc_assert (VECTOR_MODE_P (dst_mode
));
1736 gcc_assert (register_operand (dst
, dst_mode
)
1737 && register_operand (src1
, src_mode
)
1738 && register_operand (src2
, src_mode
));
1740 rtx (*gen
) (rtx
, rtx
, rtx
);
1745 gen
= gen_aarch64_simd_combinev8qi
;
1748 gen
= gen_aarch64_simd_combinev4hi
;
1751 gen
= gen_aarch64_simd_combinev2si
;
1754 gen
= gen_aarch64_simd_combinev4hf
;
1757 gen
= gen_aarch64_simd_combinev2sf
;
1760 gen
= gen_aarch64_simd_combinedi
;
1763 gen
= gen_aarch64_simd_combinedf
;
1769 emit_insn (gen (dst
, src1
, src2
));
1773 /* Split a complex SIMD move. */
1776 aarch64_split_simd_move (rtx dst
, rtx src
)
1778 machine_mode src_mode
= GET_MODE (src
);
1779 machine_mode dst_mode
= GET_MODE (dst
);
1781 gcc_assert (VECTOR_MODE_P (dst_mode
));
1783 if (REG_P (dst
) && REG_P (src
))
1785 rtx (*gen
) (rtx
, rtx
);
1787 gcc_assert (VECTOR_MODE_P (src_mode
));
1792 gen
= gen_aarch64_split_simd_movv16qi
;
1795 gen
= gen_aarch64_split_simd_movv8hi
;
1798 gen
= gen_aarch64_split_simd_movv4si
;
1801 gen
= gen_aarch64_split_simd_movv2di
;
1804 gen
= gen_aarch64_split_simd_movv8hf
;
1807 gen
= gen_aarch64_split_simd_movv4sf
;
1810 gen
= gen_aarch64_split_simd_movv2df
;
1816 emit_insn (gen (dst
, src
));
1822 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
1823 machine_mode ymode
, rtx y
)
1825 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
1826 gcc_assert (r
!= NULL
);
1827 return rtx_equal_p (x
, r
);
1832 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
1834 if (can_create_pseudo_p ())
1835 return force_reg (mode
, value
);
1838 x
= aarch64_emit_move (x
, value
);
1845 aarch64_add_offset (machine_mode mode
, rtx temp
, rtx reg
, HOST_WIDE_INT offset
)
1847 if (!aarch64_plus_immediate (GEN_INT (offset
), mode
))
1850 /* Load the full offset into a register. This
1851 might be improvable in the future. */
1852 high
= GEN_INT (offset
);
1854 high
= aarch64_force_temporary (mode
, temp
, high
);
1855 reg
= aarch64_force_temporary (mode
, temp
,
1856 gen_rtx_PLUS (mode
, high
, reg
));
1858 return plus_constant (mode
, reg
, offset
);
1862 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
1866 unsigned HOST_WIDE_INT val
, val2
, mask
;
1867 int one_match
, zero_match
;
1872 if (aarch64_move_imm (val
, mode
))
1875 emit_insn (gen_rtx_SET (dest
, imm
));
1879 if ((val
>> 32) == 0 || mode
== SImode
)
1883 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
1885 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
1886 GEN_INT ((val
>> 16) & 0xffff)));
1888 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
1889 GEN_INT ((val
>> 16) & 0xffff)));
1894 /* Remaining cases are all for DImode. */
1897 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
1898 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
1899 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
1900 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
1902 if (zero_match
!= 2 && one_match
!= 2)
1904 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1905 For a 64-bit bitmask try whether changing 16 bits to all ones or
1906 zeroes creates a valid bitmask. To check any repeated bitmask,
1907 try using 16 bits from the other 32-bit half of val. */
1909 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
1912 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1915 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1917 val2
= val2
& ~mask
;
1918 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
1919 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1926 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
1927 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1928 GEN_INT ((val
>> i
) & 0xffff)));
1934 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1935 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1936 otherwise skip zero bits. */
1940 val2
= one_match
> zero_match
? ~val
: val
;
1941 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
1944 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
1945 ? (val
| ~(mask
<< i
))
1946 : (val
& (mask
<< i
)))));
1947 for (i
+= 16; i
< 64; i
+= 16)
1949 if ((val2
& (mask
<< i
)) == 0)
1952 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1953 GEN_INT ((val
>> i
) & 0xffff)));
1962 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
1964 machine_mode mode
= GET_MODE (dest
);
1966 gcc_assert (mode
== SImode
|| mode
== DImode
);
1968 /* Check on what type of symbol it is. */
1969 if (GET_CODE (imm
) == SYMBOL_REF
1970 || GET_CODE (imm
) == LABEL_REF
1971 || GET_CODE (imm
) == CONST
)
1973 rtx mem
, base
, offset
;
1974 enum aarch64_symbol_type sty
;
1976 /* If we have (const (plus symbol offset)), separate out the offset
1977 before we start classifying the symbol. */
1978 split_const (imm
, &base
, &offset
);
1980 sty
= aarch64_classify_symbol (base
, offset
);
1983 case SYMBOL_FORCE_TO_MEM
:
1984 if (offset
!= const0_rtx
1985 && targetm
.cannot_force_const_mem (mode
, imm
))
1987 gcc_assert (can_create_pseudo_p ());
1988 base
= aarch64_force_temporary (mode
, dest
, base
);
1989 base
= aarch64_add_offset (mode
, NULL
, base
, INTVAL (offset
));
1990 aarch64_emit_move (dest
, base
);
1994 mem
= force_const_mem (ptr_mode
, imm
);
1997 /* If we aren't generating PC relative literals, then
1998 we need to expand the literal pool access carefully.
1999 This is something that needs to be done in a number
2000 of places, so could well live as a separate function. */
2001 if (!aarch64_pcrelative_literal_loads
)
2003 gcc_assert (can_create_pseudo_p ());
2004 base
= gen_reg_rtx (ptr_mode
);
2005 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
2006 if (ptr_mode
!= Pmode
)
2007 base
= convert_memory_address (Pmode
, base
);
2008 mem
= gen_rtx_MEM (ptr_mode
, base
);
2011 if (mode
!= ptr_mode
)
2012 mem
= gen_rtx_ZERO_EXTEND (mode
, mem
);
2014 emit_insn (gen_rtx_SET (dest
, mem
));
2018 case SYMBOL_SMALL_TLSGD
:
2019 case SYMBOL_SMALL_TLSDESC
:
2020 case SYMBOL_SMALL_TLSIE
:
2021 case SYMBOL_SMALL_GOT_28K
:
2022 case SYMBOL_SMALL_GOT_4G
:
2023 case SYMBOL_TINY_GOT
:
2024 case SYMBOL_TINY_TLSIE
:
2025 if (offset
!= const0_rtx
)
2027 gcc_assert(can_create_pseudo_p ());
2028 base
= aarch64_force_temporary (mode
, dest
, base
);
2029 base
= aarch64_add_offset (mode
, NULL
, base
, INTVAL (offset
));
2030 aarch64_emit_move (dest
, base
);
2035 case SYMBOL_SMALL_ABSOLUTE
:
2036 case SYMBOL_TINY_ABSOLUTE
:
2037 case SYMBOL_TLSLE12
:
2038 case SYMBOL_TLSLE24
:
2039 case SYMBOL_TLSLE32
:
2040 case SYMBOL_TLSLE48
:
2041 aarch64_load_symref_appropriately (dest
, imm
, sty
);
2049 if (!CONST_INT_P (imm
))
2051 if (GET_CODE (imm
) == HIGH
)
2052 emit_insn (gen_rtx_SET (dest
, imm
));
2055 rtx mem
= force_const_mem (mode
, imm
);
2057 emit_insn (gen_rtx_SET (dest
, mem
));
2063 aarch64_internal_mov_immediate (dest
, imm
, true, GET_MODE (dest
));
2066 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2067 temporary value if necessary. FRAME_RELATED_P should be true if
2068 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2069 to the generated instructions. If SCRATCHREG is known to hold
2070 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2073 Since this function may be used to adjust the stack pointer, we must
2074 ensure that it cannot cause transient stack deallocation (for example
2075 by first incrementing SP and then decrementing when adjusting by a
2076 large immediate). */
2079 aarch64_add_constant_internal (machine_mode mode
, int regnum
, int scratchreg
,
2080 HOST_WIDE_INT delta
, bool frame_related_p
,
2083 HOST_WIDE_INT mdelta
= abs_hwi (delta
);
2084 rtx this_rtx
= gen_rtx_REG (mode
, regnum
);
2090 /* Single instruction adjustment. */
2091 if (aarch64_uimm12_shift (mdelta
))
2093 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (delta
)));
2094 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2098 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2099 Only do this if mdelta is not a 16-bit move as adjusting using a move
2101 if (mdelta
< 0x1000000 && !aarch64_move_imm (mdelta
, mode
))
2103 HOST_WIDE_INT low_off
= mdelta
& 0xfff;
2105 low_off
= delta
< 0 ? -low_off
: low_off
;
2106 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (low_off
)));
2107 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2108 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (delta
- low_off
)));
2109 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2113 /* Emit a move immediate if required and an addition/subtraction. */
2114 rtx scratch_rtx
= gen_rtx_REG (mode
, scratchreg
);
2116 aarch64_internal_mov_immediate (scratch_rtx
, GEN_INT (mdelta
), true, mode
);
2117 insn
= emit_insn (delta
< 0 ? gen_sub2_insn (this_rtx
, scratch_rtx
)
2118 : gen_add2_insn (this_rtx
, scratch_rtx
));
2119 if (frame_related_p
)
2121 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2122 rtx adj
= plus_constant (mode
, this_rtx
, delta
);
2123 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (this_rtx
, adj
));
2128 aarch64_add_constant (machine_mode mode
, int regnum
, int scratchreg
,
2129 HOST_WIDE_INT delta
)
2131 aarch64_add_constant_internal (mode
, regnum
, scratchreg
, delta
, false, true);
2135 aarch64_add_sp (int scratchreg
, HOST_WIDE_INT delta
, bool emit_move_imm
)
2137 aarch64_add_constant_internal (Pmode
, SP_REGNUM
, scratchreg
, delta
,
2138 true, emit_move_imm
);
2142 aarch64_sub_sp (int scratchreg
, HOST_WIDE_INT delta
, bool frame_related_p
)
2144 aarch64_add_constant_internal (Pmode
, SP_REGNUM
, scratchreg
, -delta
,
2145 frame_related_p
, true);
2149 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
2150 tree exp ATTRIBUTE_UNUSED
)
2152 /* Currently, always true. */
2156 /* Implement TARGET_PASS_BY_REFERENCE. */
2159 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
2162 bool named ATTRIBUTE_UNUSED
)
2165 machine_mode dummymode
;
2168 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2169 size
= (mode
== BLKmode
&& type
)
2170 ? int_size_in_bytes (type
) : (int) GET_MODE_SIZE (mode
);
2172 /* Aggregates are passed by reference based on their size. */
2173 if (type
&& AGGREGATE_TYPE_P (type
))
2175 size
= int_size_in_bytes (type
);
2178 /* Variable sized arguments are always returned by reference. */
2182 /* Can this be a candidate to be passed in fp/simd register(s)? */
2183 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
2188 /* Arguments which are variable sized or larger than 2 registers are
2189 passed by reference unless they are a homogenous floating point
2191 return size
> 2 * UNITS_PER_WORD
;
2194 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2196 aarch64_return_in_msb (const_tree valtype
)
2198 machine_mode dummy_mode
;
2201 /* Never happens in little-endian mode. */
2202 if (!BYTES_BIG_ENDIAN
)
2205 /* Only composite types smaller than or equal to 16 bytes can
2206 be potentially returned in registers. */
2207 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
2208 || int_size_in_bytes (valtype
) <= 0
2209 || int_size_in_bytes (valtype
) > 16)
2212 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2213 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2214 is always passed/returned in the least significant bits of fp/simd
2216 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
2217 &dummy_mode
, &dummy_int
, NULL
))
2223 /* Implement TARGET_FUNCTION_VALUE.
2224 Define how to find the value returned by a function. */
2227 aarch64_function_value (const_tree type
, const_tree func
,
2228 bool outgoing ATTRIBUTE_UNUSED
)
2233 machine_mode ag_mode
;
2235 mode
= TYPE_MODE (type
);
2236 if (INTEGRAL_TYPE_P (type
))
2237 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
2239 if (aarch64_return_in_msb (type
))
2241 HOST_WIDE_INT size
= int_size_in_bytes (type
);
2243 if (size
% UNITS_PER_WORD
!= 0)
2245 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
2246 mode
= mode_for_size (size
* BITS_PER_UNIT
, MODE_INT
, 0);
2250 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
2251 &ag_mode
, &count
, NULL
))
2253 if (!aarch64_composite_type_p (type
, mode
))
2255 gcc_assert (count
== 1 && mode
== ag_mode
);
2256 return gen_rtx_REG (mode
, V0_REGNUM
);
2263 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
2264 for (i
= 0; i
< count
; i
++)
2266 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
2267 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
2268 GEN_INT (i
* GET_MODE_SIZE (ag_mode
)));
2269 XVECEXP (par
, 0, i
) = tmp
;
2275 return gen_rtx_REG (mode
, R0_REGNUM
);
2278 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2279 Return true if REGNO is the number of a hard register in which the values
2280 of called function may come back. */
2283 aarch64_function_value_regno_p (const unsigned int regno
)
2285 /* Maximum of 16 bytes can be returned in the general registers. Examples
2286 of 16-byte return values are: 128-bit integers and 16-byte small
2287 structures (excluding homogeneous floating-point aggregates). */
2288 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
2291 /* Up to four fp/simd registers can return a function value, e.g. a
2292 homogeneous floating-point aggregate having four members. */
2293 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
2294 return TARGET_FLOAT
;
2299 /* Implement TARGET_RETURN_IN_MEMORY.
2301 If the type T of the result of a function is such that
2303 would require that arg be passed as a value in a register (or set of
2304 registers) according to the parameter passing rules, then the result
2305 is returned in the same registers as would be used for such an
2309 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
2312 machine_mode ag_mode
;
2315 if (!AGGREGATE_TYPE_P (type
)
2316 && TREE_CODE (type
) != COMPLEX_TYPE
2317 && TREE_CODE (type
) != VECTOR_TYPE
)
2318 /* Simple scalar types always returned in registers. */
2321 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
2328 /* Types larger than 2 registers returned in memory. */
2329 size
= int_size_in_bytes (type
);
2330 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
2334 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
2335 const_tree type
, int *nregs
)
2337 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2338 return aarch64_vfp_is_call_or_return_candidate (mode
,
2340 &pcum
->aapcs_vfp_rmode
,
/* Given MODE and TYPE of a function argument, return the alignment in
   bits.  The idea is to suppress any stronger alignment requested by
   the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
   This is a helper function for local use only.  */

static unsigned int
aarch64_function_arg_alignment (machine_mode mode, const_tree type)
{
  if (!type)
    return GET_MODE_ALIGNMENT (mode);

  if (integer_zerop (TYPE_SIZE (type)))
    return 0;

  gcc_assert (TYPE_MODE (type) == mode);

  if (!AGGREGATE_TYPE_P (type))
    return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));

  if (TREE_CODE (type) == ARRAY_TYPE)
    return TYPE_ALIGN (TREE_TYPE (type));

  unsigned int alignment = 0;

  for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
    if (TREE_CODE (field) == FIELD_DECL)
      alignment = std::max (alignment, DECL_ALIGN (field));

  return alignment;
}
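/* Illustrative sketch of the computation above (hypothetical type, not
   taken from the sources): for

     struct s { int32_t a; int64_t b; };

   the loop takes the maximum DECL_ALIGN over the FIELD_DECLs, i.e.
   max (32, 64) = 64 bits, which is the natural alignment regardless of any
   stronger alignment the user requested on the struct itself.  A scalar
   argument passed with no type, e.g. in DFmode, simply gets
   GET_MODE_ALIGNMENT, i.e. its natural 64-bit alignment.  */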
2375 /* Layout a function argument according to the AAPCS64 rules. The rule
2376 numbers refer to the rule numbers in the AAPCS64. */
2379 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
2381 bool named ATTRIBUTE_UNUSED
)
2383 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2384 int ncrn
, nvrn
, nregs
;
2385 bool allocate_ncrn
, allocate_nvrn
;
2388 /* We need to do this once per argument. */
2389 if (pcum
->aapcs_arg_processed
)
2392 pcum
->aapcs_arg_processed
= true;
2394 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2396 = ROUND_UP (type
? int_size_in_bytes (type
) : GET_MODE_SIZE (mode
),
2399 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
2400 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
2405 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2406 The following code thus handles passing by SIMD/FP registers first. */
2408 nvrn
= pcum
->aapcs_nvrn
;
2410 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2411 and homogenous short-vector aggregates (HVA). */
2415 aarch64_err_no_fpadvsimd (mode
, "argument");
2417 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
2419 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
2420 if (!aarch64_composite_type_p (type
, mode
))
2422 gcc_assert (nregs
== 1);
2423 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
2429 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
2430 for (i
= 0; i
< nregs
; i
++)
2432 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
2433 V0_REGNUM
+ nvrn
+ i
);
2434 tmp
= gen_rtx_EXPR_LIST
2436 GEN_INT (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
)));
2437 XVECEXP (par
, 0, i
) = tmp
;
2439 pcum
->aapcs_reg
= par
;
2445 /* C.3 NSRN is set to 8. */
2446 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
2451 ncrn
= pcum
->aapcs_ncrn
;
2452 nregs
= size
/ UNITS_PER_WORD
;
2454 /* C6 - C9. though the sign and zero extension semantics are
2455 handled elsewhere. This is the case where the argument fits
2456 entirely general registers. */
2457 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
2460 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
2462 /* C.8 if the argument has an alignment of 16 then the NGRN is
2463 rounded up to the next even number. */
2466 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2467 comparison is there because for > 16 * BITS_PER_UNIT
2468 alignment nregs should be > 2 and therefore it should be
2469 passed by reference rather than value. */
2470 && aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
2473 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
2476 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2477 A reg is still generated for it, but the caller should be smart
2478 enough not to use it. */
2479 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
2480 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
2486 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
2487 for (i
= 0; i
< nregs
; i
++)
2489 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
2490 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
2491 GEN_INT (i
* UNITS_PER_WORD
));
2492 XVECEXP (par
, 0, i
) = tmp
;
2494 pcum
->aapcs_reg
= par
;
2497 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
2502 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
2504 /* The argument is passed on stack; record the needed number of words for
2505 this argument and align the total size if necessary. */
2507 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
2509 if (aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
2510 pcum
->aapcs_stack_size
= ROUND_UP (pcum
->aapcs_stack_size
,
2511 16 / UNITS_PER_WORD
);
2515 /* Implement TARGET_FUNCTION_ARG. */
2518 aarch64_function_arg (cumulative_args_t pcum_v
, machine_mode mode
,
2519 const_tree type
, bool named
)
2521 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2522 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
2524 if (mode
== VOIDmode
)
2527 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
2528 return pcum
->aapcs_reg
;
2532 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
2533 const_tree fntype ATTRIBUTE_UNUSED
,
2534 rtx libname ATTRIBUTE_UNUSED
,
2535 const_tree fndecl ATTRIBUTE_UNUSED
,
2536 unsigned n_named ATTRIBUTE_UNUSED
)
2538 pcum
->aapcs_ncrn
= 0;
2539 pcum
->aapcs_nvrn
= 0;
2540 pcum
->aapcs_nextncrn
= 0;
2541 pcum
->aapcs_nextnvrn
= 0;
2542 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
2543 pcum
->aapcs_reg
= NULL_RTX
;
2544 pcum
->aapcs_arg_processed
= false;
2545 pcum
->aapcs_stack_words
= 0;
2546 pcum
->aapcs_stack_size
= 0;
2549 && fndecl
&& TREE_PUBLIC (fndecl
)
2550 && fntype
&& fntype
!= error_mark_node
)
2552 const_tree type
= TREE_TYPE (fntype
);
2553 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
2554 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
2555 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
2556 &mode
, &nregs
, NULL
))
2557 aarch64_err_no_fpadvsimd (TYPE_MODE (type
), "return type");
2563 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
2568 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2569 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
2571 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
2572 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
2573 != (pcum
->aapcs_stack_words
!= 0));
2574 pcum
->aapcs_arg_processed
= false;
2575 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
2576 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
2577 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
2578 pcum
->aapcs_stack_words
= 0;
2579 pcum
->aapcs_reg
= NULL_RTX
;
2584 aarch64_function_arg_regno_p (unsigned regno
)
2586 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
2587 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
/* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
   PARM_BOUNDARY bits of alignment, but will be given anything up
   to STACK_BOUNDARY bits if the type requires it.  This makes sure
   that both before and after the layout of each argument, the Next
   Stacked Argument Address (NSAA) will have a minimum alignment of
   8 bytes.  */

static unsigned int
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int alignment = aarch64_function_arg_alignment (mode, type);

  return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
}
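/* Worked example of the clamp above (assuming the usual aarch64 values
   PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128): a 'char' argument gives
   MIN (MAX (8, 64), 128) == 64, a plain 64-bit integer also gives 64, and
   a 16-byte-aligned composite gives MIN (MAX (128, 64), 128) == 128, so no
   argument slot is ever aligned to more than 16 bytes.  */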
/* For use by FUNCTION_ARG_PADDING (MODE, TYPE).

   Return true if an argument passed on the stack should be padded upwards,
   i.e. if the least-significant byte of the stack slot has useful data.

   Small aggregate types are placed in the lowest memory address.

   The related parameter passing rules are B.4, C.3, C.5 and C.14.  */

bool
aarch64_pad_arg_upward (machine_mode mode, const_tree type)
{
  /* On little-endian targets, the least significant byte of every stack
     argument is passed at the lowest byte address of the stack slot.  */
  if (!BYTES_BIG_ENDIAN)
    return true;

  /* Otherwise, integral, floating-point and pointer types are padded downward:
     the least significant byte of a stack argument is passed at the highest
     byte address of the stack slot.  */
  if (type
      ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
         || POINTER_TYPE_P (type))
      : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
    return false;

  /* Everything else padded upward, i.e. data in first byte of stack slot.  */
  return true;
}
/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).

   It specifies padding for the last (may also be the only)
   element of a block move between registers and memory.  If
   assuming the block is in the memory, padding upward means that
   the last element is padded after its highest significant byte,
   while in downward padding, the last element is padded at its
   least significant byte side.

   Small aggregates and small complex types are always padded
   upwards.

   We don't need to worry about homogeneous floating-point or
   short-vector aggregates; their move is not affected by the
   padding direction determined here.  Regardless of endianness,
   each element of such an aggregate is put in the least
   significant bits of a fp/simd register.

   Return !BYTES_BIG_ENDIAN if the least significant byte of the
   register has useful data, and return the opposite if the most
   significant byte does.  */

bool
aarch64_pad_reg_upward (machine_mode mode, const_tree type,
                        bool first ATTRIBUTE_UNUSED)
{
  /* Small composite types are always padded upward.  */
  if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
    {
      HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
                            : GET_MODE_SIZE (mode));
      if (size < 2 * UNITS_PER_WORD)
        return true;
    }

  /* Otherwise, use the default padding.  */
  return !BYTES_BIG_ENDIAN;
}
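/* Illustrative example of the rule above (hypothetical types): on a
   big-endian target a 12-byte struct (smaller than 2 * UNITS_PER_WORD ==
   16 bytes) is padded upward, so its bytes sit at the least significant
   end of the register; a 24-byte struct falls through to the default and
   gets !BYTES_BIG_ENDIAN == false, i.e. downward padding.  */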
2675 aarch64_libgcc_cmp_return_mode (void)
2680 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2682 /* We use the 12-bit shifted immediate arithmetic instructions so values
2683 must be multiple of (1 << 12), i.e. 4096. */
2684 #define ARITH_FACTOR 4096
2686 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2687 #error Cannot use simple address calculation for stack probing
2690 /* The pair of scratch registers used for stack probing. */
2691 #define PROBE_STACK_FIRST_REG 9
2692 #define PROBE_STACK_SECOND_REG 10
2694 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2695 inclusive. These are offsets from the current stack pointer. */
2698 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, HOST_WIDE_INT size
)
2700 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
2702 /* See the same assertion on PROBE_INTERVAL above. */
2703 gcc_assert ((first
% ARITH_FACTOR
) == 0);
2705 /* See if we have a constant small number of probes to generate. If so,
2706 that's the easy case. */
2707 if (size
<= PROBE_INTERVAL
)
2709 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
2711 emit_set_insn (reg1
,
2712 plus_constant (Pmode
,
2713 stack_pointer_rtx
, -(first
+ base
)));
2714 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
2717 /* The run-time loop is made up of 8 insns in the generic case while the
2718 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
2719 else if (size
<= 4 * PROBE_INTERVAL
)
2721 HOST_WIDE_INT i
, rem
;
2723 emit_set_insn (reg1
,
2724 plus_constant (Pmode
,
2726 -(first
+ PROBE_INTERVAL
)));
2727 emit_stack_probe (reg1
);
2729 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2730 it exceeds SIZE. If only two probes are needed, this will not
2731 generate any code. Then probe at FIRST + SIZE. */
2732 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
2734 emit_set_insn (reg1
,
2735 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
2736 emit_stack_probe (reg1
);
2739 rem
= size
- (i
- PROBE_INTERVAL
);
2742 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
2744 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
2745 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
2748 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
2751 /* Otherwise, do the same as above, but in a loop. Note that we must be
2752 extra careful with variables wrapping around because we might be at
2753 the very top (or the very bottom) of the address space and we have
2754 to be able to handle this case properly; in particular, we use an
2755 equality test for the loop condition. */
2758 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
2760 /* Step 1: round SIZE to the previous multiple of the interval. */
2762 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
2765 /* Step 2: compute initial and final value of the loop counter. */
2767 /* TEST_ADDR = SP + FIRST. */
2768 emit_set_insn (reg1
,
2769 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
2771 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2772 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
2773 if (! aarch64_uimm12_shift (adjustment
))
2775 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
2777 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
2781 emit_set_insn (reg2
,
2782 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
2789 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2792 while (TEST_ADDR != LAST_ADDR)
2794 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2795 until it is equal to ROUNDED_SIZE. */
2797 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
2800 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2801 that SIZE is equal to ROUNDED_SIZE. */
2803 if (size
!= rounded_size
)
2805 HOST_WIDE_INT rem
= size
- rounded_size
;
2809 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
2811 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
2812 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
2815 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
2819 /* Make sure nothing is scheduled before we are done. */
2820 emit_insn (gen_blockage ());
2823 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2824 absolute addresses. */
2827 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
2829 static int labelno
= 0;
2833 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
2836 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
2838 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2840 xops
[1] = GEN_INT (PROBE_INTERVAL
);
2841 output_asm_insn ("sub\t%0, %0, %1", xops
);
2843 /* Probe at TEST_ADDR. */
2844 output_asm_insn ("str\txzr, [%0]", xops
);
2846 /* Test if TEST_ADDR == LAST_ADDR. */
2848 output_asm_insn ("cmp\t%0, %1", xops
);
2851 fputs ("\tb.ne\t", asm_out_file
);
2852 assemble_name_raw (asm_out_file
, loop_lab
);
2853 fputc ('\n', asm_out_file
);
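/* For reference, with the default PROBE_INTERVAL of 4096 (an assumption;
   it is 1 << STACK_CHECK_PROBE_INTERVAL_EXP) and the scratch registers
   x9/x10 selected above, the loop emitted here looks roughly like:

       .LPSRL0:
         sub   x9, x9, 4096      // TEST_ADDR -= PROBE_INTERVAL
         str   xzr, [x9]         // probe at TEST_ADDR
         cmp   x9, x10           // reached LAST_ADDR yet?
         b.ne  .LPSRL0

   This is only a sketch of the generated assembly, not output captured
   from the compiler.  */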
2859 aarch64_frame_pointer_required (void)
2861 /* In aarch64_override_options_after_change
2862 flag_omit_leaf_frame_pointer turns off the frame pointer by
2863 default. Turn it back on now if we've not got a leaf
2865 if (flag_omit_leaf_frame_pointer
2866 && (!crtl
->is_leaf
|| df_regs_ever_live_p (LR_REGNUM
)))
2869 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2870 if (crtl
->calls_eh_return
)
2876 /* Mark the registers that need to be saved by the callee and calculate
2877 the size of the callee-saved registers area and frame record (both FP
2878 and LR may be omitted). */
2880 aarch64_layout_frame (void)
2882 HOST_WIDE_INT offset
= 0;
2883 int regno
, last_fp_reg
= INVALID_REGNUM
;
2885 if (reload_completed
&& cfun
->machine
->frame
.laid_out
)
2888 #define SLOT_NOT_REQUIRED (-2)
2889 #define SLOT_REQUIRED (-1)
2891 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
2892 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
2894 /* First mark all the registers that really need to be saved... */
2895 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2896 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
2898 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2899 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
2901 /* ... that includes the eh data registers (if needed)... */
2902 if (crtl
->calls_eh_return
)
2903 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
2904 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
2907 /* ... and any callee saved register that dataflow says is live. */
2908 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2909 if (df_regs_ever_live_p (regno
)
2910 && (regno
== R30_REGNUM
2911 || !call_used_regs
[regno
]))
2912 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
2914 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2915 if (df_regs_ever_live_p (regno
)
2916 && !call_used_regs
[regno
])
2918 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
2919 last_fp_reg
= regno
;
2922 if (frame_pointer_needed
)
2924 /* FP and LR are placed in the linkage record. */
2925 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
2926 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
2927 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
2928 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
2929 offset
+= 2 * UNITS_PER_WORD
;
2932 /* Now assign stack slots for them. */
2933 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2934 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
2936 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
2937 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
2938 cfun
->machine
->frame
.wb_candidate1
= regno
;
2939 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
)
2940 cfun
->machine
->frame
.wb_candidate2
= regno
;
2941 offset
+= UNITS_PER_WORD
;
2944 HOST_WIDE_INT max_int_offset
= offset
;
2945 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
2946 bool has_align_gap
= offset
!= max_int_offset
;
2948 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2949 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
2951 /* If there is an alignment gap between integer and fp callee-saves,
2952 allocate the last fp register to it if possible. */
2953 if (regno
== last_fp_reg
&& has_align_gap
&& (offset
& 8) == 0)
2955 cfun
->machine
->frame
.reg_offset
[regno
] = max_int_offset
;
2959 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
2960 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
2961 cfun
->machine
->frame
.wb_candidate1
= regno
;
2962 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
2963 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
2964 cfun
->machine
->frame
.wb_candidate2
= regno
;
2965 offset
+= UNITS_PER_WORD
;
2968 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
2970 cfun
->machine
->frame
.saved_regs_size
= offset
;
2972 HOST_WIDE_INT varargs_and_saved_regs_size
2973 = offset
+ cfun
->machine
->frame
.saved_varargs_size
;
2975 cfun
->machine
->frame
.hard_fp_offset
2976 = ROUND_UP (varargs_and_saved_regs_size
+ get_frame_size (),
2977 STACK_BOUNDARY
/ BITS_PER_UNIT
);
2979 cfun
->machine
->frame
.frame_size
2980 = ROUND_UP (cfun
->machine
->frame
.hard_fp_offset
2981 + crtl
->outgoing_args_size
,
2982 STACK_BOUNDARY
/ BITS_PER_UNIT
);
2984 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.saved_varargs_size
;
2986 cfun
->machine
->frame
.initial_adjust
= 0;
2987 cfun
->machine
->frame
.final_adjust
= 0;
2988 cfun
->machine
->frame
.callee_adjust
= 0;
2989 cfun
->machine
->frame
.callee_offset
= 0;
2991 HOST_WIDE_INT max_push_offset
= 0;
2992 if (cfun
->machine
->frame
.wb_candidate2
!= INVALID_REGNUM
)
2993 max_push_offset
= 512;
2994 else if (cfun
->machine
->frame
.wb_candidate1
!= INVALID_REGNUM
)
2995 max_push_offset
= 256;
2997 if (cfun
->machine
->frame
.frame_size
< max_push_offset
2998 && crtl
->outgoing_args_size
== 0)
3000 /* Simple, small frame with no outgoing arguments:
3001 stp reg1, reg2, [sp, -frame_size]!
3002 stp reg3, reg4, [sp, 16] */
3003 cfun
->machine
->frame
.callee_adjust
= cfun
->machine
->frame
.frame_size
;
3005 else if ((crtl
->outgoing_args_size
3006 + cfun
->machine
->frame
.saved_regs_size
< 512)
3007 && !(cfun
->calls_alloca
3008 && cfun
->machine
->frame
.hard_fp_offset
< max_push_offset
))
3010 /* Frame with small outgoing arguments:
3011 sub sp, sp, frame_size
3012 stp reg1, reg2, [sp, outgoing_args_size]
3013 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3014 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.frame_size
;
3015 cfun
->machine
->frame
.callee_offset
3016 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.hard_fp_offset
;
3018 else if (cfun
->machine
->frame
.hard_fp_offset
< max_push_offset
)
3020 /* Frame with large outgoing arguments but a small local area:
3021 stp reg1, reg2, [sp, -hard_fp_offset]!
3022 stp reg3, reg4, [sp, 16]
3023 sub sp, sp, outgoing_args_size */
3024 cfun
->machine
->frame
.callee_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
3025 cfun
->machine
->frame
.final_adjust
3026 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
3028 else if (!frame_pointer_needed
3029 && varargs_and_saved_regs_size
< max_push_offset
)
3031 /* Frame with large local area and outgoing arguments (this pushes the
3032 callee-saves first, followed by the locals and outgoing area):
3033 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3034 stp reg3, reg4, [sp, 16]
3035 sub sp, sp, frame_size - varargs_and_saved_regs_size */
3036 cfun
->machine
->frame
.callee_adjust
= varargs_and_saved_regs_size
;
3037 cfun
->machine
->frame
.final_adjust
3038 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
3039 cfun
->machine
->frame
.hard_fp_offset
= cfun
->machine
->frame
.callee_adjust
;
3040 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.hard_fp_offset
;
3044 /* Frame with large local area and outgoing arguments using frame pointer:
3045 sub sp, sp, hard_fp_offset
3046 stp x29, x30, [sp, 0]
3048 stp reg3, reg4, [sp, 16]
3049 sub sp, sp, outgoing_args_size */
3050 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
3051 cfun
->machine
->frame
.final_adjust
3052 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.initial_adjust
;
3055 cfun
->machine
->frame
.laid_out
= true;
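/* Worked example of the layout choices above (numbers are illustrative
   only): a function saving x29/x30 plus two other callee-saves, with
   8 bytes of locals and no outgoing arguments, gets saved_regs_size == 32,
   hard_fp_offset == 48 and frame_size == 48 < max_push_offset, so
   callee_adjust == 48 and the whole frame is allocated by
   "stp x29, x30, [sp, -48]!" followed by "stp reg3, reg4, [sp, 16]".
   If the same function instead needs 1024 bytes of outgoing arguments,
   the second case no longer applies (1024 + 32 >= 512), but
   hard_fp_offset (48) is still below max_push_offset, so callee_adjust
   == 48 and final_adjust == frame_size - 48 drops SP the rest of the way
   after the register saves.  */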
3058 /* Return true if the register REGNO is saved on entry to
3059 the current function. */
3062 aarch64_register_saved_on_entry (int regno
)
3064 return cfun
->machine
->frame
.reg_offset
[regno
] >= 0;
3067 /* Return the next register up from REGNO up to LIMIT for the callee
3071 aarch64_next_callee_save (unsigned regno
, unsigned limit
)
3073 while (regno
<= limit
&& !aarch64_register_saved_on_entry (regno
))
3078 /* Push the register number REGNO of mode MODE to the stack with write-back
3079 adjusting the stack by ADJUSTMENT. */
3082 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
3083 HOST_WIDE_INT adjustment
)
3085 rtx base_rtx
= stack_pointer_rtx
;
3088 reg
= gen_rtx_REG (mode
, regno
);
3089 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
3090 plus_constant (Pmode
, base_rtx
, -adjustment
));
3091 mem
= gen_rtx_MEM (mode
, mem
);
3093 insn
= emit_move_insn (mem
, reg
);
3094 RTX_FRAME_RELATED_P (insn
) = 1;
3097 /* Generate and return an instruction to store the pair of registers
3098 REG and REG2 of mode MODE to location BASE with write-back adjusting
3099 the stack location BASE by ADJUSTMENT. */
3102 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
3103 HOST_WIDE_INT adjustment
)
3108 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
3109 GEN_INT (-adjustment
),
3110 GEN_INT (UNITS_PER_WORD
- adjustment
));
3112 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
3113 GEN_INT (-adjustment
),
3114 GEN_INT (UNITS_PER_WORD
- adjustment
));
3120 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3121 stack pointer by ADJUSTMENT. */
3124 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
3127 machine_mode mode
= (regno1
<= R30_REGNUM
) ? DImode
: DFmode
;
3129 if (regno2
== INVALID_REGNUM
)
3130 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
3132 rtx reg1
= gen_rtx_REG (mode
, regno1
);
3133 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3135 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
3137 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
3138 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
3139 RTX_FRAME_RELATED_P (insn
) = 1;
3142 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3143 adjusting it by ADJUSTMENT afterwards. */
3146 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
3147 HOST_WIDE_INT adjustment
)
3152 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
3153 GEN_INT (UNITS_PER_WORD
));
3155 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
3156 GEN_INT (UNITS_PER_WORD
));
3162 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3163 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3167 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
3170 machine_mode mode
= (regno1
<= R30_REGNUM
) ? DImode
: DFmode
;
3171 rtx reg1
= gen_rtx_REG (mode
, regno1
);
3173 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
3175 if (regno2
== INVALID_REGNUM
)
3177 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
3178 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
3179 emit_move_insn (reg1
, gen_rtx_MEM (mode
, mem
));
3183 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3184 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
3185 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
3190 /* Generate and return a store pair instruction of mode MODE to store
3191 register REG1 to MEM1 and register REG2 to MEM2. */
3194 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
3200 return gen_store_pairdi (mem1
, reg1
, mem2
, reg2
);
3203 return gen_store_pairdf (mem1
, reg1
, mem2
, reg2
);
/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */

static rtx
aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
                       rtx mem2)
{
  switch (mode)
    {
    case DImode:
      return gen_load_pairdi (reg1, mem1, reg2, mem2);

    case DFmode:
      return gen_load_pairdf (reg1, mem1, reg2, mem2);

    default:
      gcc_unreachable ();
    }
}
/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */

bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after the frame is laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
     if its LR is pushed onto the stack.  */
  return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
          || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
              && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
}
3246 /* Emit code to save the callee-saved registers from register number START
3247 to LIMIT to the stack at the location starting at offset START_OFFSET,
3248 skipping any write-back candidates if SKIP_WB is true. */
3251 aarch64_save_callee_saves (machine_mode mode
, HOST_WIDE_INT start_offset
,
3252 unsigned start
, unsigned limit
, bool skip_wb
)
3255 rtx (*gen_mem_ref
) (machine_mode
, rtx
) = (frame_pointer_needed
3256 ? gen_frame_mem
: gen_rtx_MEM
);
3260 for (regno
= aarch64_next_callee_save (start
, limit
);
3262 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
3265 HOST_WIDE_INT offset
;
3268 && (regno
== cfun
->machine
->frame
.wb_candidate1
3269 || regno
== cfun
->machine
->frame
.wb_candidate2
))
3272 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
3275 reg
= gen_rtx_REG (mode
, regno
);
3276 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
3277 mem
= gen_mem_ref (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
3280 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
3283 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
3284 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
3285 == cfun
->machine
->frame
.reg_offset
[regno2
]))
3288 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3291 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
3292 mem2
= gen_mem_ref (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
3294 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
3297 /* The first part of a frame-related parallel insn is
3298 always assumed to be relevant to the frame
3299 calculations; subsequent parts, are only
3300 frame-related if explicitly marked. */
3301 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
3305 insn
= emit_move_insn (mem
, reg
);
3307 RTX_FRAME_RELATED_P (insn
) = 1;
3311 /* Emit code to restore the callee registers of mode MODE from register
3312 number START up to and including LIMIT. Restore from the stack offset
3313 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3314 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3317 aarch64_restore_callee_saves (machine_mode mode
,
3318 HOST_WIDE_INT start_offset
, unsigned start
,
3319 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
3321 rtx base_rtx
= stack_pointer_rtx
;
3322 rtx (*gen_mem_ref
) (machine_mode
, rtx
) = (frame_pointer_needed
3323 ? gen_frame_mem
: gen_rtx_MEM
);
3326 HOST_WIDE_INT offset
;
3328 for (regno
= aarch64_next_callee_save (start
, limit
);
3330 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
3332 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
3338 && (regno
== cfun
->machine
->frame
.wb_candidate1
3339 || regno
== cfun
->machine
->frame
.wb_candidate2
))
3342 reg
= gen_rtx_REG (mode
, regno
);
3343 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
3344 mem
= gen_mem_ref (mode
, plus_constant (Pmode
, base_rtx
, offset
));
3346 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
3349 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
3350 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
3351 == cfun
->machine
->frame
.reg_offset
[regno2
]))
3353 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3356 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
3357 mem2
= gen_mem_ref (mode
, plus_constant (Pmode
, base_rtx
, offset
));
3358 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
3360 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
3364 emit_move_insn (reg
, mem
);
3365 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
/* Return true if OFFSET fits in a signed 9-bit field with no scaling
   (ldur/stur style addressing).  */

static bool
offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
                               HOST_WIDE_INT offset)
{
  return offset >= -256 && offset < 256;
}

/* Return true if OFFSET fits in an unsigned 12-bit field scaled by the
   size of MODE.  */

static bool
offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
{
  return (offset >= 0
          && offset < 4096 * GET_MODE_SIZE (mode)
          && offset % GET_MODE_SIZE (mode) == 0);
}

/* Return true if OFFSET fits in a signed 7-bit field scaled by the size
   of MODE (ldp/stp style addressing).  */

bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
{
  return (offset >= -64 * GET_MODE_SIZE (mode)
          && offset < 64 * GET_MODE_SIZE (mode)
          && offset % GET_MODE_SIZE (mode) == 0);
}
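/* For DImode (GET_MODE_SIZE == 8) the three predicates above accept,
   respectively: unscaled offsets in [-256, 255], scaled unsigned offsets
   in [0, 32760] that are multiples of 8, and scaled signed offsets in
   [-512, 504] that are multiples of 8.  These ranges are worked out from
   the checks above and are shown only as an illustration.  */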
3392 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3395 aarch64_get_separate_components (void)
3397 aarch64_layout_frame ();
3399 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
3400 bitmap_clear (components
);
3402 /* The registers we need saved to the frame. */
3403 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
3404 if (aarch64_register_saved_on_entry (regno
))
3406 HOST_WIDE_INT offset
= cfun
->machine
->frame
.reg_offset
[regno
];
3407 if (!frame_pointer_needed
)
3408 offset
+= cfun
->machine
->frame
.frame_size
3409 - cfun
->machine
->frame
.hard_fp_offset
;
3410 /* Check that we can access the stack slot of the register with one
3411 direct load with no adjustments needed. */
3412 if (offset_12bit_unsigned_scaled_p (DImode
, offset
))
3413 bitmap_set_bit (components
, regno
);
3416 /* Don't mess with the hard frame pointer. */
3417 if (frame_pointer_needed
)
3418 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
3420 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
3421 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
3422 /* If aarch64_layout_frame has chosen registers to store/restore with
3423 writeback don't interfere with them to avoid having to output explicit
3424 stack adjustment instructions. */
3425 if (reg2
!= INVALID_REGNUM
)
3426 bitmap_clear_bit (components
, reg2
);
3427 if (reg1
!= INVALID_REGNUM
)
3428 bitmap_clear_bit (components
, reg1
);
3430 bitmap_clear_bit (components
, LR_REGNUM
);
3431 bitmap_clear_bit (components
, SP_REGNUM
);
3436 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3439 aarch64_components_for_bb (basic_block bb
)
3441 bitmap in
= DF_LIVE_IN (bb
);
3442 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
3443 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
3445 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
3446 bitmap_clear (components
);
3448 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3449 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
3450 if ((!call_used_regs
[regno
])
3451 && (bitmap_bit_p (in
, regno
)
3452 || bitmap_bit_p (gen
, regno
)
3453 || bitmap_bit_p (kill
, regno
)))
3454 bitmap_set_bit (components
, regno
);
3459 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3460 Nothing to do for aarch64. */
3463 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
3467 /* Return the next set bit in BMP from START onwards. Return the total number
3468 of bits in BMP if no set bit is found at or after START. */
3471 aarch64_get_next_set_bit (sbitmap bmp
, unsigned int start
)
3473 unsigned int nbits
= SBITMAP_SIZE (bmp
);
3477 gcc_assert (start
< nbits
);
3478 for (unsigned int i
= start
; i
< nbits
; i
++)
3479 if (bitmap_bit_p (bmp
, i
))
3485 /* Do the work for aarch64_emit_prologue_components and
3486 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3487 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3488 for these components or the epilogue sequence. That is, it determines
3489 whether we should emit stores or loads and what kind of CFA notes to attach
3490 to the insns. Otherwise the logic for the two sequences is very
3494 aarch64_process_components (sbitmap components
, bool prologue_p
)
3496 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
3497 ? HARD_FRAME_POINTER_REGNUM
3498 : STACK_POINTER_REGNUM
);
3500 unsigned last_regno
= SBITMAP_SIZE (components
);
3501 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
3502 rtx_insn
*insn
= NULL
;
3504 while (regno
!= last_regno
)
3506 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3507 so DFmode for the vector registers is enough. */
3508 machine_mode mode
= GP_REGNUM_P (regno
) ? DImode
: DFmode
;
3509 rtx reg
= gen_rtx_REG (mode
, regno
);
3510 HOST_WIDE_INT offset
= cfun
->machine
->frame
.reg_offset
[regno
];
3511 if (!frame_pointer_needed
)
3512 offset
+= cfun
->machine
->frame
.frame_size
3513 - cfun
->machine
->frame
.hard_fp_offset
;
3514 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
3515 rtx mem
= gen_frame_mem (mode
, addr
);
3517 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
3518 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
3519 /* No more registers to handle after REGNO.
3520 Emit a single save/restore and exit. */
3521 if (regno2
== last_regno
)
3523 insn
= emit_insn (set
);
3524 RTX_FRAME_RELATED_P (insn
) = 1;
3526 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
3528 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
3532 HOST_WIDE_INT offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
3533 /* The next register is not of the same class or its offset is not
3534 mergeable with the current one into a pair. */
3535 if (!satisfies_constraint_Ump (mem
)
3536 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
3537 || (offset2
- cfun
->machine
->frame
.reg_offset
[regno
])
3538 != GET_MODE_SIZE (mode
))
3540 insn
= emit_insn (set
);
3541 RTX_FRAME_RELATED_P (insn
) = 1;
3543 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
3545 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
3551 /* REGNO2 can be saved/restored in a pair with REGNO. */
3552 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3553 if (!frame_pointer_needed
)
3554 offset2
+= cfun
->machine
->frame
.frame_size
3555 - cfun
->machine
->frame
.hard_fp_offset
;
3556 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
3557 rtx mem2
= gen_frame_mem (mode
, addr2
);
3558 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
3559 : gen_rtx_SET (reg2
, mem2
);
3562 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
, reg2
));
3564 insn
= emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
3566 RTX_FRAME_RELATED_P (insn
) = 1;
3569 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
3570 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
3574 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
3575 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
3578 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
3582 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3585 aarch64_emit_prologue_components (sbitmap components
)
3587 aarch64_process_components (components
, true);
3590 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3593 aarch64_emit_epilogue_components (sbitmap components
)
3595 aarch64_process_components (components
, false);
3598 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3601 aarch64_set_handled_components (sbitmap components
)
3603 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
3604 if (bitmap_bit_p (components
, regno
))
3605 cfun
->machine
->reg_is_wrapped_separately
[regno
] = true;
3608 /* AArch64 stack frames generated by this compiler look like:
3610 +-------------------------------+
3612 | incoming stack arguments |
3614 +-------------------------------+
3615 | | <-- incoming stack pointer (aligned)
3616 | callee-allocated save area |
3617 | for register varargs |
3619 +-------------------------------+
3620 | local variables | <-- frame_pointer_rtx
3622 +-------------------------------+
3624 +-------------------------------+ |
3625 | callee-saved registers | | frame.saved_regs_size
3626 +-------------------------------+ |
3628 +-------------------------------+ |
3629 | FP' | / <- hard_frame_pointer_rtx (aligned)
3630 +-------------------------------+
3631 | dynamic allocation |
3632 +-------------------------------+
3634 +-------------------------------+
3635 | outgoing stack arguments | <-- arg_pointer
3637 +-------------------------------+
3638 | | <-- stack_pointer_rtx (aligned)
3640 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3641 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3644 /* Generate the prologue instructions for entry into a function.
3645 Establish the stack frame by decreasing the stack pointer with a
3646 properly calculated size and, if necessary, create a frame record
3647 filled with the values of LR and previous frame pointer. The
3648 current FP is also set up if it is in use. */
3651 aarch64_expand_prologue (void)
3653 aarch64_layout_frame ();
3655 HOST_WIDE_INT frame_size
= cfun
->machine
->frame
.frame_size
;
3656 HOST_WIDE_INT initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
3657 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
3658 HOST_WIDE_INT final_adjust
= cfun
->machine
->frame
.final_adjust
;
3659 HOST_WIDE_INT callee_offset
= cfun
->machine
->frame
.callee_offset
;
3660 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
3661 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
3664 /* Sign return address for functions. */
3665 if (aarch64_return_address_signing_enabled ())
3667 insn
= emit_insn (gen_pacisp ());
3668 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
3669 RTX_FRAME_RELATED_P (insn
) = 1;
3672 if (flag_stack_usage_info
)
3673 current_function_static_stack_size
= frame_size
;
3675 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
3677 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
3679 if (frame_size
> PROBE_INTERVAL
&& frame_size
> STACK_CHECK_PROTECT
)
3680 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT
,
3681 frame_size
- STACK_CHECK_PROTECT
);
3683 else if (frame_size
> 0)
3684 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT
, frame_size
);
3687 aarch64_sub_sp (IP0_REGNUM
, initial_adjust
, true);
3689 if (callee_adjust
!= 0)
3690 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
3692 if (frame_pointer_needed
)
3694 if (callee_adjust
== 0)
3695 aarch64_save_callee_saves (DImode
, callee_offset
, R29_REGNUM
,
3697 insn
= emit_insn (gen_add3_insn (hard_frame_pointer_rtx
,
3699 GEN_INT (callee_offset
)));
3700 RTX_FRAME_RELATED_P (insn
) = 1;
3701 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
3704 aarch64_save_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
3705 callee_adjust
!= 0 || frame_pointer_needed
);
3706 aarch64_save_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
3707 callee_adjust
!= 0 || frame_pointer_needed
);
3708 aarch64_sub_sp (IP1_REGNUM
, final_adjust
, !frame_pointer_needed
);
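/* For orientation, a typical small frame with a frame pointer and two
   extra callee-saves produces a prologue along the lines of:

       stp   x29, x30, [sp, -48]!   // callee_adjust: push FP/LR, drop SP
       mov   x29, sp                // establish the frame record
       stp   x19, x20, [sp, 16]     // remaining callee-saves

   followed, when final_adjust is non-zero, by a further "sub sp, sp, N"
   for the outgoing argument area.  This is an illustrative sketch, not
   output captured from the compiler.  */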
3711 /* Return TRUE if we can use a simple_return insn.
3713 This function checks whether the callee saved stack is empty, which
3714 means no restore actions are need. The pro_and_epilogue will use
3715 this to check whether shrink-wrapping opt is feasible. */
3718 aarch64_use_return_insn_p (void)
3720 if (!reload_completed
)
3726 aarch64_layout_frame ();
3728 return cfun
->machine
->frame
.frame_size
== 0;
3731 /* Generate the epilogue instructions for returning from a function.
3732 This is almost exactly the reverse of the prolog sequence, except
3733 that we need to insert barriers to avoid scheduling loads that read
3734 from a deallocated stack, and we optimize the unwind records by
3735 emitting them all together if possible. */
3737 aarch64_expand_epilogue (bool for_sibcall
)
3739 aarch64_layout_frame ();
3741 HOST_WIDE_INT initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
3742 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
3743 HOST_WIDE_INT final_adjust
= cfun
->machine
->frame
.final_adjust
;
3744 HOST_WIDE_INT callee_offset
= cfun
->machine
->frame
.callee_offset
;
3745 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
3746 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
3750 /* We need to add memory barrier to prevent read from deallocated stack. */
3751 bool need_barrier_p
= (get_frame_size ()
3752 + cfun
->machine
->frame
.saved_varargs_size
) != 0;
3754 /* Emit a barrier to prevent loads from a deallocated stack. */
3755 if (final_adjust
> crtl
->outgoing_args_size
|| cfun
->calls_alloca
3756 || crtl
->calls_eh_return
)
3758 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
3759 need_barrier_p
= false;
3762 /* Restore the stack pointer from the frame pointer if it may not
3763 be the same as the stack pointer. */
3764 if (frame_pointer_needed
&& (final_adjust
|| cfun
->calls_alloca
))
3766 insn
= emit_insn (gen_add3_insn (stack_pointer_rtx
,
3767 hard_frame_pointer_rtx
,
3768 GEN_INT (-callee_offset
)));
3769 /* If writeback is used when restoring callee-saves, the CFA
3770 is restored on the instruction doing the writeback. */
3771 RTX_FRAME_RELATED_P (insn
) = callee_adjust
== 0;
3774 aarch64_add_sp (IP1_REGNUM
, final_adjust
, df_regs_ever_live_p (IP1_REGNUM
));
3776 aarch64_restore_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
3777 callee_adjust
!= 0, &cfi_ops
);
3778 aarch64_restore_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
3779 callee_adjust
!= 0, &cfi_ops
);
3782 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
3784 if (callee_adjust
!= 0)
3785 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
3787 if (callee_adjust
!= 0 || initial_adjust
> 65536)
3789 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3790 insn
= get_last_insn ();
3791 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
3792 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
3793 RTX_FRAME_RELATED_P (insn
) = 1;
3797 aarch64_add_sp (IP0_REGNUM
, initial_adjust
, df_regs_ever_live_p (IP0_REGNUM
));
3801 /* Emit delayed restores and reset the CFA to be SP. */
3802 insn
= get_last_insn ();
3803 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
3804 REG_NOTES (insn
) = cfi_ops
;
3805 RTX_FRAME_RELATED_P (insn
) = 1;
3808 /* We prefer to emit the combined return/authenticate instruction RETAA,
3809 however there are three cases in which we must instead emit an explicit
3810 authentication instruction.
3812 1) Sibcalls don't return in a normal way, so if we're about to call one
3813 we must authenticate.
3815 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3816 generating code for !TARGET_ARMV8_3 we can't use it and must
3817 explicitly authenticate.
3819 3) On an eh_return path we make extra stack adjustments to update the
3820 canonical frame address to be the exception handler's CFA. We want
3821 to authenticate using the CFA of the function which calls eh_return.
3823 if (aarch64_return_address_signing_enabled ()
3824 && (for_sibcall
|| !TARGET_ARMV8_3
|| crtl
->calls_eh_return
))
3826 insn
= emit_insn (gen_autisp ());
3827 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
3828 RTX_FRAME_RELATED_P (insn
) = 1;
3831 /* Stack adjustment for exception handler. */
3832 if (crtl
->calls_eh_return
)
3834 /* We need to unwind the stack by the offset computed by
3835 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3836 to be SP; letting the CFA move during this adjustment
3837 is just as correct as retaining the CFA from the body
3838 of the function. Therefore, do nothing special. */
3839 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
3842 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
3844 emit_jump_insn (ret_rtx
);
3847 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3848 normally or return to a previous frame after unwinding.
3850 An EH return uses a single shared return sequence. The epilogue is
3851 exactly like a normal epilogue except that it has an extra input
3852 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3853 that must be applied after the frame has been destroyed. An extra label
3854 is inserted before the epilogue which initializes this register to zero,
3855 and this is the entry point for a normal return.
3857 An actual EH return updates the return address, initializes the stack
3858 adjustment and jumps directly into the epilogue (bypassing the zeroing
3859 of the adjustment). Since the return address is typically saved on the
3860 stack when a function makes a call, the saved LR must be updated outside
3863 This poses problems as the store is generated well before the epilogue,
3864 so the offset of LR is not known yet. Also optimizations will remove the
3865 store as it appears dead, even after the epilogue is generated (as the
3866 base or offset for loading LR is different in many cases).
3868 To avoid these problems this implementation forces the frame pointer
3869 in eh_return functions so that the location of LR is fixed and known early.
3870 It also marks the store volatile, so no optimization is permitted to
3871 remove the store. */
3873 aarch64_eh_return_handler_rtx (void)
3875 rtx tmp
= gen_frame_mem (Pmode
,
3876 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
3878 /* Mark the store volatile, so no optimization is permitted to remove it. */
3879 MEM_VOLATILE_P (tmp
) = true;
3883 /* Output code to add DELTA to the first argument, and then jump
3884 to FUNCTION. Used for C++ multiple inheritance. */
3886 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
3887 HOST_WIDE_INT delta
,
3888 HOST_WIDE_INT vcall_offset
,
3891 /* The this pointer is always in x0. Note that this differs from
3892 Arm where the this pointer maybe bumped to r1 if r0 is required
3893 to return a pointer to an aggregate. On AArch64 a result value
3894 pointer will be in x8. */
3895 int this_regno
= R0_REGNUM
;
3896 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
3899 reload_completed
= 1;
3900 emit_note (NOTE_INSN_PROLOGUE_END
);
3902 if (vcall_offset
== 0)
3903 aarch64_add_constant (Pmode
, this_regno
, IP1_REGNUM
, delta
);
3906 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
3908 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
3909 temp0
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
3910 temp1
= gen_rtx_REG (Pmode
, IP1_REGNUM
);
3915 if (delta
>= -256 && delta
< 256)
3916 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
3917 plus_constant (Pmode
, this_rtx
, delta
));
3919 aarch64_add_constant (Pmode
, this_regno
, IP1_REGNUM
, delta
);
3922 if (Pmode
== ptr_mode
)
3923 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
3925 aarch64_emit_move (temp0
,
3926 gen_rtx_ZERO_EXTEND (Pmode
,
3927 gen_rtx_MEM (ptr_mode
, addr
)));
3929 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
3930 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
3933 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
3935 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
3938 if (Pmode
== ptr_mode
)
3939 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
3941 aarch64_emit_move (temp1
,
3942 gen_rtx_SIGN_EXTEND (Pmode
,
3943 gen_rtx_MEM (ptr_mode
, addr
)));
3945 emit_insn (gen_add2_insn (this_rtx
, temp1
));
3948 /* Generate a tail call to the target function. */
3949 if (!TREE_USED (function
))
3951 assemble_external (function
);
3952 TREE_USED (function
) = 1;
3954 funexp
= XEXP (DECL_RTL (function
), 0);
3955 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
3956 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
3957 SIBLING_CALL_P (insn
) = 1;
3959 insn
= get_insns ();
3960 shorten_branches (insn
);
3961 final_start_function (insn
, file
, 1);
3962 final (insn
, file
, 1);
3963 final_end_function ();
3965 /* Stop pretending to be a post-reload pass. */
3966 reload_completed
= 0;
3970 aarch64_tls_referenced_p (rtx x
)
3972 if (!TARGET_HAVE_TLS
)
3974 subrtx_iterator::array_type array
;
3975 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
3977 const_rtx x
= *iter
;
3978 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
3980 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3981 TLS offsets, not real symbol references. */
3982 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
3983 iter
.skip_subrtxes ();
/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
bool
aarch64_uimm12_shift (HOST_WIDE_INT val)
{
  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
          || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
}

/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction.  */
static bool
aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
          || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
        return 1;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
          || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
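/* Illustrative values for the predicates above: 0x2a and 0xfff000 satisfy
   aarch64_uimm12_shift (a 12-bit immediate, optionally shifted by 12);
   0x2a, 0xbeef0000 and, for 64-bit modes, 0x1234ull << 32 satisfy
   aarch64_movw_imm (a single 16-bit chunk at a 16-bit-aligned position),
   while 0x12345 satisfies neither and must be synthesized with more than
   one instruction.  */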
/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,
    0x0001000100010001ull,
    0x0101010101010101ull,
    0x1111111111111111ull,
    0x5555555555555555ull,
  };

/* Return true if val is a valid bitmask immediate.  */

bool
aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
{
  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
  int bits;

  /* Check for a single sequence of one bits and return quickly if so.
     The special cases of all ones and all zeroes returns false.  */
  val = (unsigned HOST_WIDE_INT) val_in;
  tmp = val + (val & -val);

  if (tmp == (tmp & -tmp))
    return (val + 1) > 1;

  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
  if (mode == SImode)
    val = (val << 32) | (val & 0xffffffff);

  /* Invert if the immediate doesn't start with a zero bit - this means we
     only need to search for sequences of one bits.  */
  if (val & 1)
    val = ~val;

  /* Find the first set bit and set tmp to val with the first sequence of one
     bits removed.  Return success if there is a single sequence of ones.  */
  first_one = val & -val;
  tmp = val & (val + first_one);

  if (tmp == 0)
    return true;

  /* Find the next set bit and compute the difference in bit position.  */
  next_one = tmp & -tmp;
  bits = clz_hwi (first_one) - clz_hwi (next_one);
  mask = val ^ tmp;

  /* Check the bit position difference is a power of 2, and that the first
     sequence of one bits fits within 'bits' bits.  */
  if ((mask >> bits) != 0 || bits != (bits & -bits))
    return false;

  /* Check the sequence of one bits is repeated 64/bits times.  */
  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
}
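/* Illustrative bitmask immediates accepted by the check above:
   0x5555555555555555 (the 2-bit element 0b01 repeated 32 times),
   0x00ff00ff00ff00ff (a run of 8 ones in each 16-bit element) and
   0x0003ffffc0000000 (a single contiguous run of ones).  Values such as
   0, ~0 or 0x12345 are rejected.  These are worked examples only, not an
   exhaustive characterisation.  */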
/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
   Assumed precondition: VAL_IN is not zero.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
{
  int lowest_bit_set = ctz_hwi (val_in);
  int highest_bit_set = floor_log2 (val_in);
  gcc_assert (val_in != 0);

  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
          (HOST_WIDE_INT_1U << lowest_bit_set));
}

/* Create constant where bits outside of lowest bit set to highest bit set
   are set to 1.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
{
  return val_in | ~aarch64_and_split_imm1 (val_in);
}

/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */

bool
aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
{
  if (aarch64_bitmask_imm (val_in, mode))
    return false;

  if (aarch64_move_imm (val_in, mode))
    return false;

  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);

  return aarch64_bitmask_imm (imm2, mode);
}
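/* Worked example (an illustration, not a case taken from the testsuite):
   val_in = 0x000ffff0ffff0000 is neither a bitmask nor a move immediate.
   aarch64_and_split_imm1 gives the contiguous mask 0x000fffffffff0000 and
   aarch64_and_split_imm2 gives 0xfffffff0ffffffff, both of which are valid
   bitmask immediates, so an AND with val_in can be expanded as two ANDs:

       and  x0, x1, 0x000fffffffff0000
       and  x0, x0, 0xfffffff0ffffffff  */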
/* Return true if val is an immediate that can be loaded into a
   register in a single instruction.  */
bool
aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
{
  if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
    return 1;
  return aarch64_bitmask_imm (val, mode);
}
4129 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
4133 if (GET_CODE (x
) == HIGH
)
4136 split_const (x
, &base
, &offset
);
4137 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
4139 if (aarch64_classify_symbol (base
, offset
)
4140 != SYMBOL_FORCE_TO_MEM
)
4143 /* Avoid generating a 64-bit relocation in ILP32; leave
4144 to aarch64_expand_mov_immediate to handle it properly. */
4145 return mode
!= ptr_mode
;
4148 return aarch64_tls_referenced_p (x
);
/* Implement TARGET_CASE_VALUES_THRESHOLD.
   The expansion for a table switch is quite expensive due to the number
   of instructions, the table lookup and hard to predict indirect jump.
   When optimizing for speed, and -O3 enabled, use the per-core tuning if
   set, otherwise use tables for > 16 cases as a tradeoff between size and
   performance.  When optimizing for size, use the default setting.  */

static unsigned int
aarch64_case_values_threshold (void)
{
  /* Use the specified limit for the number of cases before using jump
     tables at higher optimization levels.  */
  if (optimize > 2
      && selected_cpu->tune->max_case_values != 0)
    return selected_cpu->tune->max_case_values;
  else
    return optimize_size ? default_case_values_threshold () : 17;
}
/* Return true if register REGNO is a valid index register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_index_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
        return true;

      if (!reg_renumber)
        return false;

      regno = reg_renumber[regno];
    }
  return GP_REGNUM_P (regno);
}

/* Return true if register REGNO is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_base_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
        return true;

      if (!reg_renumber)
        return false;

      regno = reg_renumber[regno];
    }

  /* The fake registers will be eliminated to either the stack or
     hard frame pointer, both of which are usually valid base registers.
     Reload deals with the cases where the eliminated form isn't valid.  */
  return (GP_REGNUM_P (regno)
          || regno == SP_REGNUM
          || regno == FRAME_POINTER_REGNUM
          || regno == ARG_POINTER_REGNUM);
}
/* Return true if X is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_base_register_rtx_p (rtx x, bool strict_p)
{
  if (!strict_p && GET_CODE (x) == SUBREG)
    x = SUBREG_REG (x);

  return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
}
4227 /* Return true if address offset is a valid index. If it is, fill in INFO
4228 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4231 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
4232 machine_mode mode
, bool strict_p
)
4234 enum aarch64_address_type type
;
4239 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
4240 && GET_MODE (x
) == Pmode
)
4242 type
= ADDRESS_REG_REG
;
4246 /* (sign_extend:DI (reg:SI)) */
4247 else if ((GET_CODE (x
) == SIGN_EXTEND
4248 || GET_CODE (x
) == ZERO_EXTEND
)
4249 && GET_MODE (x
) == DImode
4250 && GET_MODE (XEXP (x
, 0)) == SImode
)
4252 type
= (GET_CODE (x
) == SIGN_EXTEND
)
4253 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4254 index
= XEXP (x
, 0);
4257 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4258 else if (GET_CODE (x
) == MULT
4259 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
4260 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
4261 && GET_MODE (XEXP (x
, 0)) == DImode
4262 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
4263 && CONST_INT_P (XEXP (x
, 1)))
4265 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
4266 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4267 index
= XEXP (XEXP (x
, 0), 0);
4268 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
4270 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4271 else if (GET_CODE (x
) == ASHIFT
4272 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
4273 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
4274 && GET_MODE (XEXP (x
, 0)) == DImode
4275 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
4276 && CONST_INT_P (XEXP (x
, 1)))
4278 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
4279 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4280 index
= XEXP (XEXP (x
, 0), 0);
4281 shift
= INTVAL (XEXP (x
, 1));
4283 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4284 else if ((GET_CODE (x
) == SIGN_EXTRACT
4285 || GET_CODE (x
) == ZERO_EXTRACT
)
4286 && GET_MODE (x
) == DImode
4287 && GET_CODE (XEXP (x
, 0)) == MULT
4288 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
4289 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
4291 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
4292 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4293 index
= XEXP (XEXP (x
, 0), 0);
4294 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
4295 if (INTVAL (XEXP (x
, 1)) != 32 + shift
4296 || INTVAL (XEXP (x
, 2)) != 0)
4299 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4300 (const_int 0xffffffff<<shift)) */
4301 else if (GET_CODE (x
) == AND
4302 && GET_MODE (x
) == DImode
4303 && GET_CODE (XEXP (x
, 0)) == MULT
4304 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
4305 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
4306 && CONST_INT_P (XEXP (x
, 1)))
4308 type
= ADDRESS_REG_UXTW
;
4309 index
= XEXP (XEXP (x
, 0), 0);
4310 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
4311 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
4314 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4315 else if ((GET_CODE (x
) == SIGN_EXTRACT
4316 || GET_CODE (x
) == ZERO_EXTRACT
)
4317 && GET_MODE (x
) == DImode
4318 && GET_CODE (XEXP (x
, 0)) == ASHIFT
4319 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
4320 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
4322 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
4323 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4324 index
= XEXP (XEXP (x
, 0), 0);
4325 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
4326 if (INTVAL (XEXP (x
, 1)) != 32 + shift
4327 || INTVAL (XEXP (x
, 2)) != 0)
4330 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4331 (const_int 0xffffffff<<shift)) */
4332 else if (GET_CODE (x
) == AND
4333 && GET_MODE (x
) == DImode
4334 && GET_CODE (XEXP (x
, 0)) == ASHIFT
4335 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
4336 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
4337 && CONST_INT_P (XEXP (x
, 1)))
4339 type
= ADDRESS_REG_UXTW
;
4340 index
= XEXP (XEXP (x
, 0), 0);
4341 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
4342 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
4345 /* (mult:P (reg:P) (const_int scale)) */
4346 else if (GET_CODE (x
) == MULT
4347 && GET_MODE (x
) == Pmode
4348 && GET_MODE (XEXP (x
, 0)) == Pmode
4349 && CONST_INT_P (XEXP (x
, 1)))
4351 type
= ADDRESS_REG_REG
;
4352 index
= XEXP (x
, 0);
4353 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
4355 /* (ashift:P (reg:P) (const_int shift)) */
4356 else if (GET_CODE (x
) == ASHIFT
4357 && GET_MODE (x
) == Pmode
4358 && GET_MODE (XEXP (x
, 0)) == Pmode
4359 && CONST_INT_P (XEXP (x
, 1)))
4361 type
= ADDRESS_REG_REG
;
4362 index
= XEXP (x
, 0);
4363 shift
= INTVAL (XEXP (x
, 1));
4368 if (GET_CODE (index
) == SUBREG
)
4369 index
= SUBREG_REG (index
);
4372 (shift
> 0 && shift
<= 3
4373 && (1 << shift
) == GET_MODE_SIZE (mode
)))
4375 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
4378 info
->offset
= index
;
4379 info
->shift
= shift
;
/* Return true if MODE is one of the modes for which we
   support LDP/STP operations.  */

static bool
aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
{
  return mode == SImode || mode == DImode
         || mode == SFmode || mode == DFmode
         || (aarch64_vector_mode_supported_p (mode)
             && GET_MODE_SIZE (mode) == 8);
}

/* Return true if REGNO is a virtual pointer register, or an eliminable
   "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
   include stack_pointer or hard_frame_pointer.  */
static bool
virt_or_elim_regno_p (unsigned regno)
{
  return ((regno >= FIRST_VIRTUAL_REGISTER
           && regno <= LAST_VIRTUAL_POINTER_REGISTER)
          || regno == FRAME_POINTER_REGNUM
          || regno == ARG_POINTER_REGNUM);
}
4410 /* Return true if X is a valid address for machine mode MODE. If it is,
4411 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4412 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4415 aarch64_classify_address (struct aarch64_address_info
*info
,
4416 rtx x
, machine_mode mode
,
4417 RTX_CODE outer_code
, bool strict_p
)
4419 enum rtx_code code
= GET_CODE (x
);
4422 /* On BE, we use load/store pair for all large int mode load/stores.
4423 TI/TFmode may also use a load/store pair. */
4424 bool load_store_pair_p
= (outer_code
== PARALLEL
4427 || (BYTES_BIG_ENDIAN
4428 && aarch64_vect_struct_mode_p (mode
)));
4430 bool allow_reg_index_p
=
4432 && (GET_MODE_SIZE (mode
) != 16 || aarch64_vector_mode_supported_p (mode
))
4433 && !aarch64_vect_struct_mode_p (mode
);
4435 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4437 if (aarch64_vect_struct_mode_p (mode
) && !BYTES_BIG_ENDIAN
4438 && (code
!= POST_INC
&& code
!= REG
))
4445 info
->type
= ADDRESS_REG_IMM
;
4447 info
->offset
= const0_rtx
;
4448 return aarch64_base_register_rtx_p (x
, strict_p
);
4456 && virt_or_elim_regno_p (REGNO (op0
))
4457 && CONST_INT_P (op1
))
4459 info
->type
= ADDRESS_REG_IMM
;
4466 if (GET_MODE_SIZE (mode
) != 0
4467 && CONST_INT_P (op1
)
4468 && aarch64_base_register_rtx_p (op0
, strict_p
))
4470 HOST_WIDE_INT offset
= INTVAL (op1
);
4472 info
->type
= ADDRESS_REG_IMM
;
4476 /* TImode and TFmode values are allowed in both pairs of X
4477 registers and individual Q registers. The available
4479 X,X: 7-bit signed scaled offset
4480 Q: 9-bit signed offset
4481 We conservatively require an offset representable in either mode.
4482 When performing the check for pairs of X registers i.e. LDP/STP
4483 pass down DImode since that is the natural size of the LDP/STP
4484 instruction memory accesses. */
4485 if (mode
== TImode
|| mode
== TFmode
)
4486 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
4487 && (offset_9bit_signed_unscaled_p (mode
, offset
)
4488 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
4490 /* A 7bit offset check because OImode will emit a ldp/stp
4491 instruction (only big endian will get here).
4492 For ldp/stp instructions, the offset is scaled for the size of a
4493 single element of the pair. */
4495 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
4497 /* Three 9/12 bit offsets checks because CImode will emit three
4498 ldr/str instructions (only big endian will get here). */
4500 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
4501 && (offset_9bit_signed_unscaled_p (V16QImode
, offset
+ 32)
4502 || offset_12bit_unsigned_scaled_p (V16QImode
,
4505 /* Two 7bit offsets checks because XImode will emit two ldp/stp
4506 instructions (only big endian will get here). */
4508 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
4509 && aarch64_offset_7bit_signed_scaled_p (TImode
,
4512 if (load_store_pair_p
)
4513 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
4514 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
4516 return (offset_9bit_signed_unscaled_p (mode
, offset
)
4517 || offset_12bit_unsigned_scaled_p (mode
, offset
));
4520 if (allow_reg_index_p
)
4522 /* Look for base + (scaled/extended) index register. */
4523 if (aarch64_base_register_rtx_p (op0
, strict_p
)
4524 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
4529 if (aarch64_base_register_rtx_p (op1
, strict_p
)
4530 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
4543 info
->type
= ADDRESS_REG_WB
;
4544 info
->base
= XEXP (x
, 0);
4545 info
->offset
= NULL_RTX
;
4546 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
4550 info
->type
= ADDRESS_REG_WB
;
4551 info
->base
= XEXP (x
, 0);
4552 if (GET_CODE (XEXP (x
, 1)) == PLUS
4553 && CONST_INT_P (XEXP (XEXP (x
, 1), 1))
4554 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
4555 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
4557 HOST_WIDE_INT offset
;
4558 info
->offset
= XEXP (XEXP (x
, 1), 1);
4559 offset
= INTVAL (info
->offset
);
4561 /* TImode and TFmode values are allowed in both pairs of X
4562 registers and individual Q registers. The available
4564 X,X: 7-bit signed scaled offset
4565 Q: 9-bit signed offset
4566 We conservatively require an offset representable in either mode.
4568 if (mode
== TImode
|| mode
== TFmode
)
4569 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
4570 && offset_9bit_signed_unscaled_p (mode
, offset
));
4572 if (load_store_pair_p
)
4573 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
4574 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
4576 return offset_9bit_signed_unscaled_p (mode
, offset
);
4583 /* load literal: pc-relative constant pool entry. Only supported
4584 for SI mode or larger. */
4585 info
->type
= ADDRESS_SYMBOLIC
;
4587 if (!load_store_pair_p
&& GET_MODE_SIZE (mode
) >= 4)
4591 split_const (x
, &sym
, &addend
);
4592 return ((GET_CODE (sym
) == LABEL_REF
4593 || (GET_CODE (sym
) == SYMBOL_REF
4594 && CONSTANT_POOL_ADDRESS_P (sym
)
4595 && aarch64_pcrelative_literal_loads
)));
4600 info
->type
= ADDRESS_LO_SUM
;
4601 info
->base
= XEXP (x
, 0);
4602 info
->offset
= XEXP (x
, 1);
4603 if (allow_reg_index_p
4604 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
4607 split_const (info
->offset
, &sym
, &offs
);
4608 if (GET_CODE (sym
) == SYMBOL_REF
4609 && (aarch64_classify_symbol (sym
, offs
) == SYMBOL_SMALL_ABSOLUTE
))
4611 /* The symbol and offset must be aligned to the access size. */
4613 unsigned int ref_size
;
4615 if (CONSTANT_POOL_ADDRESS_P (sym
))
4616 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
4617 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
4619 tree exp
= SYMBOL_REF_DECL (sym
);
4620 align
= TYPE_ALIGN (TREE_TYPE (exp
));
4621 align
= CONSTANT_ALIGNMENT (exp
, align
);
4623 else if (SYMBOL_REF_DECL (sym
))
4624 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
4625 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
4626 && SYMBOL_REF_BLOCK (sym
) != NULL
)
4627 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
4629 align
= BITS_PER_UNIT
;
4631 ref_size
= GET_MODE_SIZE (mode
);
4633 ref_size
= GET_MODE_SIZE (DImode
);
4635 return ((INTVAL (offs
) & (ref_size
- 1)) == 0
4636 && ((align
/ BITS_PER_UNIT
) & (ref_size
- 1)) == 0);
/* Return true if the address X is valid for a PRFM instruction.
   STRICT_P is true if we should do strict checking with
   aarch64_classify_address.  */

bool
aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  /* PRFM accepts the same addresses as DImode...  */
  bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
  if (!res)
    return false;

  /* ... except writeback forms.  */
  return addr.type != ADDRESS_REG_WB;
}

bool
aarch64_symbolic_address_p (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
}

/* Classify the base of symbolic expression X.  */

enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return aarch64_classify_symbol (x, offset);
}
/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  */
static bool
aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
}

/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
   pair operation.  */
bool
aarch64_legitimate_address_p (machine_mode mode, rtx x,
                              RTX_CODE outer_code, bool strict_p)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
}
/* Split an out-of-range address displacement into a base and offset.
   Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
   to increase opportunities for sharing the base address of different sizes.
   For unaligned accesses and TI/TF mode use the signed 9-bit range.  */
static bool
aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
{
  HOST_WIDE_INT offset = INTVAL (*disp);
  HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);

  if (mode == TImode || mode == TFmode
      || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
    base = (offset + 0x100) & ~0x1ff;

  *off = GEN_INT (base);
  *disp = GEN_INT (offset - base);
  return true;
}
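/* Illustrative example (hypothetical value, hand-checked): an SImode access at
   offset 0x12344 is out of range for a single LDR.  The aligned case above
   computes base = 0x12344 & ~0x3ffc = 0x10000 and leaves a displacement of
   0x2344, which fits the 12-bit scaled unsigned offset form, so the address
   can be formed as an ADD of #0x10000 followed by LDR w0, [xN, #0x2344]
   (register numbers hypothetical).  */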
/* Return TRUE if rtx X is immediate constant 0.0  */
bool
aarch64_float_const_zero_rtx_p (rtx x)
{
  if (GET_MODE (x) == VOIDmode)
    return false;

  if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
    return !HONOR_SIGNED_ZEROS (GET_MODE (x));
  return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
}

/* Return the fixed registers used for condition codes.  */

static bool
aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
{
  *p1 = CC_REGNUM;
  *p2 = INVALID_REGNUM;
  return true;
}
/* This function is used by the call expanders of the machine description.
   RESULT is the register in which the result is returned.  It's NULL for
   "call" and "sibcall".
   MEM is the location of the function call.
   SIBCALL indicates whether this function call is normal call or sibling call.
   It will generate different pattern accordingly.  */

void
aarch64_expand_call (rtx result, rtx mem, bool sibcall)
{
  rtx call, callee, tmp;
  rtvec vec;
  machine_mode mode;

  gcc_assert (MEM_P (mem));
  callee = XEXP (mem, 0);
  mode = GET_MODE (callee);
  gcc_assert (mode == Pmode);

  /* Decide if we should generate indirect calls by loading the
     address of the callee into a register before performing
     the branch-and-link.  */
  if (SYMBOL_REF_P (callee)
      ? (aarch64_is_long_call_p (callee)
         || aarch64_is_noplt_call_p (callee))
      : !REG_P (callee))
    XEXP (mem, 0) = force_reg (mode, callee);

  call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);

  if (result != NULL_RTX)
    call = gen_rtx_SET (result, call);

  if (sibcall)
    tmp = ret_rtx;
  else
    tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));

  vec = gen_rtvec (2, call, tmp);
  call = gen_rtx_PARALLEL (VOIDmode, vec);

  aarch64_emit_call_insn (call);
}

/* Emit call insn with PAT and do aarch64-specific handling.  */

void
aarch64_emit_call_insn (rtx pat)
{
  rtx insn = emit_call_insn (pat);

  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
}
4805 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
4807 /* All floating point compares return CCFP if it is an equality
4808 comparison, and CCFPE otherwise. */
4809 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
4836 /* Equality comparisons of short modes against zero can be performed
4837 using the TST instruction with the appropriate bitmask. */
4838 if (y
== const0_rtx
&& REG_P (x
)
4839 && (code
== EQ
|| code
== NE
)
4840 && (GET_MODE (x
) == HImode
|| GET_MODE (x
) == QImode
))
4843 /* Similarly, comparisons of zero_extends from shorter modes can
4844 be performed using an ANDS with an immediate mask. */
4845 if (y
== const0_rtx
&& GET_CODE (x
) == ZERO_EXTEND
4846 && (GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4847 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
4848 && (code
== EQ
|| code
== NE
))
4851 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4853 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
4854 && (GET_CODE (x
) == PLUS
|| GET_CODE (x
) == MINUS
|| GET_CODE (x
) == AND
4855 || GET_CODE (x
) == NEG
4856 || (GET_CODE (x
) == ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
4857 && CONST_INT_P (XEXP (x
, 2)))))
4860 /* A compare with a shifted operand. Because of canonicalization,
4861 the comparison will have to be swapped when we emit the assembly
4863 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4864 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
4865 && (GET_CODE (x
) == ASHIFT
|| GET_CODE (x
) == ASHIFTRT
4866 || GET_CODE (x
) == LSHIFTRT
4867 || GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
))
4870 /* Similarly for a negated operand, but we can only do this for
4872 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4873 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
4874 && (code
== EQ
|| code
== NE
)
4875 && GET_CODE (x
) == NEG
)
4878 /* A test for unsigned overflow. */
4879 if ((GET_MODE (x
) == DImode
|| GET_MODE (x
) == TImode
)
4881 && GET_CODE (x
) == PLUS
4882 && GET_CODE (y
) == ZERO_EXTEND
)
4885 /* For everything else, return CCmode. */
static int
aarch64_get_condition_code_1 (machine_mode, enum rtx_code);

int
aarch64_get_condition_code (rtx x)
{
  machine_mode mode = GET_MODE (XEXP (x, 0));
  enum rtx_code comp_code = GET_CODE (x);

  if (GET_MODE_CLASS (mode) != MODE_CC)
    mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
  return aarch64_get_condition_code_1 (mode, comp_code);
}
4904 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
4912 case GE
: return AARCH64_GE
;
4913 case GT
: return AARCH64_GT
;
4914 case LE
: return AARCH64_LS
;
4915 case LT
: return AARCH64_MI
;
4916 case NE
: return AARCH64_NE
;
4917 case EQ
: return AARCH64_EQ
;
4918 case ORDERED
: return AARCH64_VC
;
4919 case UNORDERED
: return AARCH64_VS
;
4920 case UNLT
: return AARCH64_LT
;
4921 case UNLE
: return AARCH64_LE
;
4922 case UNGT
: return AARCH64_HI
;
4923 case UNGE
: return AARCH64_PL
;
4931 case NE
: return AARCH64_NE
;
4932 case EQ
: return AARCH64_EQ
;
4933 case GE
: return AARCH64_GE
;
4934 case GT
: return AARCH64_GT
;
4935 case LE
: return AARCH64_LE
;
4936 case LT
: return AARCH64_LT
;
4937 case GEU
: return AARCH64_CS
;
4938 case GTU
: return AARCH64_HI
;
4939 case LEU
: return AARCH64_LS
;
4940 case LTU
: return AARCH64_CC
;
4948 case NE
: return AARCH64_NE
;
4949 case EQ
: return AARCH64_EQ
;
4950 case GE
: return AARCH64_LE
;
4951 case GT
: return AARCH64_LT
;
4952 case LE
: return AARCH64_GE
;
4953 case LT
: return AARCH64_GT
;
4954 case GEU
: return AARCH64_LS
;
4955 case GTU
: return AARCH64_CC
;
4956 case LEU
: return AARCH64_CS
;
4957 case LTU
: return AARCH64_HI
;
4965 case NE
: return AARCH64_NE
;
4966 case EQ
: return AARCH64_EQ
;
4967 case GE
: return AARCH64_PL
;
4968 case LT
: return AARCH64_MI
;
4976 case NE
: return AARCH64_NE
;
4977 case EQ
: return AARCH64_EQ
;
4985 case NE
: return AARCH64_CS
;
4986 case EQ
: return AARCH64_CC
;
static bool
aarch64_const_vec_all_same_in_range_p (rtx x,
                                       HOST_WIDE_INT minval,
                                       HOST_WIDE_INT maxval)
{
  HOST_WIDE_INT firstval;
  int count, i;

  if (GET_CODE (x) != CONST_VECTOR
      || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
    return false;

  firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
  if (firstval < minval || firstval > maxval)
    return false;

  count = CONST_VECTOR_NUNITS (x);
  for (i = 1; i < count; i++)
    if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
      return false;

  return true;
}

bool
aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
{
  return aarch64_const_vec_all_same_in_range_p (x, val, val);
}
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,            /* EQ, Z == 1.  */
  AARCH64_CC_Z, /* NE, Z == 0.  */
  0,            /* CS, C == 1.  */
  AARCH64_CC_C, /* CC, C == 0.  */
  0,            /* MI, N == 1.  */
  AARCH64_CC_N, /* PL, N == 0.  */
  0,            /* VS, V == 1.  */
  AARCH64_CC_V, /* VC, V == 0.  */
  0,            /* HI, C == 1 && Z == 0.  */
  AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V, /* GE, N == V.  */
  0,            /* LT, N != V.  */
  AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
  0,            /* LE, !(Z == 0 && N == V).  */
  0,            /* AL, Any.  */
  0             /* NV, Any.  */
};
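/* For example (illustrative, as I read the table): when the comparison that
   guards a CCMP fails, the instruction writes the immediate NZCV value above
   so that the condition being built evaluates to false.  For GE the entry is
   AARCH64_CC_V, i.e. NZCV = 0b0001: N = 0 and V = 1, hence N != V and a
   following B.GE is not taken.  */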
5057 aarch64_print_operand (FILE *f
, rtx x
, int code
)
5061 /* An integer or symbol address without a preceding # sign. */
5063 switch (GET_CODE (x
))
5066 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
5070 output_addr_const (f
, x
);
5074 if (GET_CODE (XEXP (x
, 0)) == PLUS
5075 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
5077 output_addr_const (f
, x
);
5083 output_operand_lossage ("Unsupported operand for code '%c'", code
);
5088 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
5092 if (!CONST_INT_P (x
)
5093 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
5095 output_operand_lossage ("invalid operand for '%%%c'", code
);
5111 output_operand_lossage ("invalid operand for '%%%c'", code
);
5121 /* Print N such that 2^N == X. */
5122 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
5124 output_operand_lossage ("invalid operand for '%%%c'", code
);
5128 asm_fprintf (f
, "%d", n
);
5133 /* Print the number of non-zero bits in X (a const_int). */
5134 if (!CONST_INT_P (x
))
5136 output_operand_lossage ("invalid operand for '%%%c'", code
);
5140 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
5144 /* Print the higher numbered register of a pair (TImode) of regs. */
5145 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
5147 output_operand_lossage ("invalid operand for '%%%c'", code
);
5151 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
5158 /* Print a condition (eq, ne, etc) or its inverse. */
5160 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5161 if (x
== const_true_rtx
)
5168 if (!COMPARISON_P (x
))
5170 output_operand_lossage ("invalid operand for '%%%c'", code
);
5174 cond_code
= aarch64_get_condition_code (x
);
5175 gcc_assert (cond_code
>= 0);
5177 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
5178 fputs (aarch64_condition_codes
[cond_code
], f
);
5187 /* Print a scalar FP/SIMD register name. */
5188 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
5190 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
5193 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
5200 /* Print the first FP/SIMD register name in a list. */
5201 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
5203 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
5206 asm_fprintf (f
, "v%d", REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
5210 /* Print a scalar FP/SIMD register name + 1. */
5211 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
5213 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
5216 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
5220 /* Print bottom 16 bits of integer constant in hex. */
5221 if (!CONST_INT_P (x
))
5223 output_operand_lossage ("invalid operand for '%%%c'", code
);
5226 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
5231 /* Print a general register name or the zero register (32-bit or
5234 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
5236 asm_fprintf (f
, "%czr", code
);
5240 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
5242 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
5246 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
5248 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
5255 /* Print a normal operand, if it's a general register, then we
5259 output_operand_lossage ("missing operand");
5263 switch (GET_CODE (x
))
5266 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
5270 output_address (GET_MODE (x
), XEXP (x
, 0));
5271 /* Check all memory references are Pmode - even with ILP32. */
5272 gcc_assert (GET_MODE (XEXP (x
, 0)) == Pmode
);
5278 output_addr_const (asm_out_file
, x
);
5282 asm_fprintf (f
, "%wd", INTVAL (x
));
5286 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
5289 aarch64_const_vec_all_same_in_range_p (x
,
5291 HOST_WIDE_INT_MAX
));
5292 asm_fprintf (f
, "%wd", INTVAL (CONST_VECTOR_ELT (x
, 0)));
5294 else if (aarch64_simd_imm_zero_p (x
, GET_MODE (x
)))
5303 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5304 be getting CONST_DOUBLEs holding integers. */
5305 gcc_assert (GET_MODE (x
) != VOIDmode
);
5306 if (aarch64_float_const_zero_rtx_p (x
))
5311 else if (aarch64_float_const_representable_p (x
))
5314 char float_buf
[buf_size
] = {'\0'};
5315 real_to_decimal_for_mode (float_buf
,
5316 CONST_DOUBLE_REAL_VALUE (x
),
5319 asm_fprintf (asm_out_file
, "%s", float_buf
);
5323 output_operand_lossage ("invalid constant");
5326 output_operand_lossage ("invalid operand");
5332 if (GET_CODE (x
) == HIGH
)
5335 switch (aarch64_classify_symbolic_expression (x
))
5337 case SYMBOL_SMALL_GOT_4G
:
5338 asm_fprintf (asm_out_file
, ":got:");
5341 case SYMBOL_SMALL_TLSGD
:
5342 asm_fprintf (asm_out_file
, ":tlsgd:");
5345 case SYMBOL_SMALL_TLSDESC
:
5346 asm_fprintf (asm_out_file
, ":tlsdesc:");
5349 case SYMBOL_SMALL_TLSIE
:
5350 asm_fprintf (asm_out_file
, ":gottprel:");
5353 case SYMBOL_TLSLE24
:
5354 asm_fprintf (asm_out_file
, ":tprel:");
5357 case SYMBOL_TINY_GOT
:
5364 output_addr_const (asm_out_file
, x
);
5368 switch (aarch64_classify_symbolic_expression (x
))
5370 case SYMBOL_SMALL_GOT_4G
:
5371 asm_fprintf (asm_out_file
, ":lo12:");
5374 case SYMBOL_SMALL_TLSGD
:
5375 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
5378 case SYMBOL_SMALL_TLSDESC
:
5379 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
5382 case SYMBOL_SMALL_TLSIE
:
5383 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
5386 case SYMBOL_TLSLE12
:
5387 asm_fprintf (asm_out_file
, ":tprel_lo12:");
5390 case SYMBOL_TLSLE24
:
5391 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
5394 case SYMBOL_TINY_GOT
:
5395 asm_fprintf (asm_out_file
, ":got:");
5398 case SYMBOL_TINY_TLSIE
:
5399 asm_fprintf (asm_out_file
, ":gottprel:");
5405 output_addr_const (asm_out_file
, x
);
5410 switch (aarch64_classify_symbolic_expression (x
))
5412 case SYMBOL_TLSLE24
:
5413 asm_fprintf (asm_out_file
, ":tprel_hi12:");
5418 output_addr_const (asm_out_file
, x
);
5423 HOST_WIDE_INT cond_code
;
5426 if (!CONST_INT_P (x
))
5428 output_operand_lossage ("invalid operand for '%%%c'", code
);
5432 cond_code
= INTVAL (x
);
5433 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
5434 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
5439 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
5445 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
5447 struct aarch64_address_info addr
;
5449 if (aarch64_classify_address (&addr
, x
, mode
, MEM
, true))
5452 case ADDRESS_REG_IMM
:
5453 if (addr
.offset
== const0_rtx
)
5454 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
5456 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
5457 INTVAL (addr
.offset
));
5460 case ADDRESS_REG_REG
:
5461 if (addr
.shift
== 0)
5462 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
5463 reg_names
[REGNO (addr
.offset
)]);
5465 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
5466 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
5469 case ADDRESS_REG_UXTW
:
5470 if (addr
.shift
== 0)
5471 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
5472 REGNO (addr
.offset
) - R0_REGNUM
);
5474 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
5475 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
5478 case ADDRESS_REG_SXTW
:
5479 if (addr
.shift
== 0)
5480 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
5481 REGNO (addr
.offset
) - R0_REGNUM
);
5483 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
5484 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
5487 case ADDRESS_REG_WB
:
5488 switch (GET_CODE (x
))
5491 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)],
5492 GET_MODE_SIZE (mode
));
5495 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)],
5496 GET_MODE_SIZE (mode
));
5499 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)],
5500 GET_MODE_SIZE (mode
));
5503 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)],
5504 GET_MODE_SIZE (mode
));
5507 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
5508 INTVAL (addr
.offset
));
5511 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
5512 INTVAL (addr
.offset
));
5519 case ADDRESS_LO_SUM
:
5520 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
5521 output_addr_const (f
, addr
.offset
);
5522 asm_fprintf (f
, "]");
5525 case ADDRESS_SYMBOLIC
:
5529 output_addr_const (f
, x
);
int
aarch64_label_mentioned_p (rtx x)
{
  const char *fmt;
  int i;

  if (GET_CODE (x) == LABEL_REF)
    return 1;

  /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
     referencing instruction, but they are constant offsets, not
     symbols.  */
  if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
    return 0;

  fmt = GET_RTX_FORMAT (GET_CODE (x));
  for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
    {
      if (fmt[i] == 'E')
        {
          int j;

          for (j = XVECLEN (x, i) - 1; j >= 0; j--)
            if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
              return 1;
        }
      else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
        return 1;
    }

  return 0;
}
/* Implement REGNO_REG_CLASS.  */

enum reg_class
aarch64_regno_regclass (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return GENERAL_REGS;

  if (regno == SP_REGNUM)
    return STACK_REG;

  if (regno == FRAME_POINTER_REGNUM
      || regno == ARG_POINTER_REGNUM)
    return POINTER_REGS;

  if (FP_REGNUM_P (regno))
    return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;

  return NO_REGS;
}
5587 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
5589 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5590 where mask is selected by alignment and size of the offset.
5591 We try to pick as large a range for the offset as possible to
5592 maximize the chance of a CSE. However, for aligned addresses
5593 we limit the range to 4k so that structures with different sized
5594 elements are likely to use the same base. We need to be careful
5595 not to split a CONST for some forms of address expression, otherwise
5596 it will generate sub-optimal code. */
5598 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
5600 rtx base
= XEXP (x
, 0);
5601 rtx offset_rtx
= XEXP (x
, 1);
5602 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
5604 if (GET_CODE (base
) == PLUS
)
5606 rtx op0
= XEXP (base
, 0);
5607 rtx op1
= XEXP (base
, 1);
5609 /* Force any scaling into a temp for CSE. */
5610 op0
= force_reg (Pmode
, op0
);
5611 op1
= force_reg (Pmode
, op1
);
5613 /* Let the pointer register be in op0. */
5614 if (REG_POINTER (op1
))
5615 std::swap (op0
, op1
);
5617 /* If the pointer is virtual or frame related, then we know that
5618 virtual register instantiation or register elimination is going
5619 to apply a second constant. We want the two constants folded
5620 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5621 if (virt_or_elim_regno_p (REGNO (op0
)))
5623 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
5624 NULL_RTX
, true, OPTAB_DIRECT
);
5625 return gen_rtx_PLUS (Pmode
, base
, op1
);
5628 /* Otherwise, in order to encourage CSE (and thence loop strength
5629 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5630 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
5631 NULL_RTX
, true, OPTAB_DIRECT
);
5632 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
5635 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5636 HOST_WIDE_INT base_offset
;
5637 if (GET_MODE_SIZE (mode
) > 16)
5638 base_offset
= (offset
+ 0x400) & ~0x7f0;
5639 /* For offsets aren't a multiple of the access size, the limit is
5641 else if (offset
& (GET_MODE_SIZE (mode
) - 1))
5643 base_offset
= (offset
+ 0x100) & ~0x1ff;
5645 /* BLKmode typically uses LDP of X-registers. */
5646 if (mode
== BLKmode
)
5647 base_offset
= (offset
+ 512) & ~0x3ff;
5649 /* Small negative offsets are supported. */
5650 else if (IN_RANGE (offset
, -256, 0))
5652 else if (mode
== TImode
|| mode
== TFmode
)
5653 base_offset
= (offset
+ 0x100) & ~0x1ff;
5654 /* Use 12-bit offset by access size. */
5656 base_offset
= offset
& (~0xfff * GET_MODE_SIZE (mode
));
5658 if (base_offset
!= 0)
5660 base
= plus_constant (Pmode
, base
, base_offset
);
5661 base
= force_operand (base
, NULL_RTX
);
5662 return plus_constant (Pmode
, base
, offset
- base_offset
);
5669 /* Return the reload icode required for a constant pool in mode. */
5670 static enum insn_code
5671 aarch64_constant_pool_reload_icode (machine_mode mode
)
5676 return CODE_FOR_aarch64_reload_movcpsfdi
;
5679 return CODE_FOR_aarch64_reload_movcpdfdi
;
5682 return CODE_FOR_aarch64_reload_movcptfdi
;
5685 return CODE_FOR_aarch64_reload_movcpv8qidi
;
5688 return CODE_FOR_aarch64_reload_movcpv16qidi
;
5691 return CODE_FOR_aarch64_reload_movcpv4hidi
;
5694 return CODE_FOR_aarch64_reload_movcpv8hidi
;
5697 return CODE_FOR_aarch64_reload_movcpv2sidi
;
5700 return CODE_FOR_aarch64_reload_movcpv4sidi
;
5703 return CODE_FOR_aarch64_reload_movcpv2didi
;
5706 return CODE_FOR_aarch64_reload_movcpv2dfdi
;
5715 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
5718 secondary_reload_info
*sri
)
5721 /* If we have to disable direct literal pool loads and stores because the
5722 function is too big, then we need a scratch register. */
5723 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
5724 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
5725 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
5726 && !aarch64_pcrelative_literal_loads
)
5728 sri
->icode
= aarch64_constant_pool_reload_icode (mode
);
5732 /* Without the TARGET_SIMD instructions we cannot move a Q register
5733 to a Q register directly. We need a scratch. */
5734 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
5735 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
5736 && reg_class_subset_p (rclass
, FP_REGS
))
5739 sri
->icode
= CODE_FOR_aarch64_reload_movtf
;
5740 else if (mode
== TImode
)
5741 sri
->icode
= CODE_FOR_aarch64_reload_movti
;
5745 /* A TFmode or TImode memory access should be handled via an FP_REGS
5746 because AArch64 has richer addressing modes for LDR/STR instructions
5747 than LDP/STP instructions. */
5748 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
5749 && GET_MODE_SIZE (mode
) == 16 && MEM_P (x
))
5752 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
5753 return GENERAL_REGS
;
bool
aarch64_can_eliminate (const int from, const int to)
{
  /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
     HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */

  if (frame_pointer_needed)
    {
      if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
        return true;
      if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
        return false;
      if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
          && !cfun->calls_alloca)
        return true;
      if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
        return true;

      return false;
    }
  else
    {
      /* If we decided that we didn't need a leaf frame pointer but then used
         LR in the function, then we'll want a frame pointer after all, so
         prevent this elimination to ensure a frame pointer is used.  */
      if (to == STACK_POINTER_REGNUM
          && flag_omit_leaf_frame_pointer
          && df_regs_ever_live_p (LR_REGNUM))
        return false;
    }

  return true;
}
HOST_WIDE_INT
aarch64_initial_elimination_offset (unsigned from, unsigned to)
{
  aarch64_layout_frame ();

  if (to == HARD_FRAME_POINTER_REGNUM)
    {
      if (from == ARG_POINTER_REGNUM)
        return cfun->machine->frame.hard_fp_offset;

      if (from == FRAME_POINTER_REGNUM)
        return cfun->machine->frame.hard_fp_offset
               - cfun->machine->frame.locals_offset;
    }

  if (to == STACK_POINTER_REGNUM)
    {
      if (from == FRAME_POINTER_REGNUM)
        return cfun->machine->frame.frame_size
               - cfun->machine->frame.locals_offset;
    }

  return cfun->machine->frame.frame_size;
}
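/* Worked example with purely illustrative numbers: if aarch64_layout_frame
   computed frame_size = 96, hard_fp_offset = 80 and locals_offset = 16, then
   ARG_POINTER -> HARD_FRAME_POINTER eliminates to 80,
   FRAME_POINTER -> HARD_FRAME_POINTER to 80 - 16 = 64,
   FRAME_POINTER -> STACK_POINTER to 96 - 16 = 80, and
   ARG_POINTER -> STACK_POINTER to the full frame_size of 96.  */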
/* Implement RETURN_ADDR_RTX.  We do not support moving back to a
   previous frame.  */

rtx
aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
{
  if (count != 0)
    return const0_rtx;
  return get_hard_reg_initial_val (Pmode, LR_REGNUM);
}

static void
aarch64_asm_trampoline_template (FILE *f)
{
  if (TARGET_ILP32)
    {
      asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
      asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
    }
  else
    {
      asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
      asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
    }
  asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
  assemble_aligned_integer (4, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
}
static void
aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
{
  rtx fnaddr, mem, a_tramp;
  const int tramp_code_sz = 16;

  /* Don't need to copy the trailing D-words, we fill those in below.  */
  emit_block_move (m_tramp, assemble_trampoline_template (),
                   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
  fnaddr = XEXP (DECL_RTL (fndecl), 0);
  if (GET_MODE (fnaddr) != ptr_mode)
    fnaddr = convert_memory_address (ptr_mode, fnaddr);
  emit_move_insn (mem, fnaddr);

  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
  emit_move_insn (mem, chain_value);

  /* XXX We should really define a "clear_cache" pattern and use
     gen_clear_cache().  */
  a_tramp = XEXP (m_tramp, 0);
  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
                     LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
                     plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
                     ptr_mode);
}
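/* The resulting LP64 trampoline layout (sketch, assuming POINTER_BYTES == 8):
   bytes 0-15 hold the code emitted by aarch64_asm_trampoline_template, the
   target function address is stored at offset 16 and the static chain value
   at offset 24, and __clear_cache is invoked over the whole TRAMPOLINE_SIZE
   region so the copied code becomes visible to the instruction stream.  */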
5875 static unsigned char
5876 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
5880 case CALLER_SAVE_REGS
:
5887 aarch64_vector_mode_p (mode
)
5888 ? (GET_MODE_SIZE (mode
) + UNITS_PER_VREG
- 1) / UNITS_PER_VREG
5889 : (GET_MODE_SIZE (mode
) + UNITS_PER_WORD
- 1) / UNITS_PER_WORD
;
5903 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
5905 if (regclass
== POINTER_REGS
)
5906 return GENERAL_REGS
;
5908 if (regclass
== STACK_REG
)
5911 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
5917 /* If it's an integer immediate that MOVI can't handle, then
5918 FP_REGS is not an option, so we return NO_REGS instead. */
5919 if (CONST_INT_P (x
) && reg_class_subset_p (regclass
, FP_REGS
)
5920 && !aarch64_simd_imm_scalar_p (x
, GET_MODE (x
)))
5923 /* Register eliminiation can result in a request for
5924 SP+constant->FP_REGS. We cannot support such operations which
5925 use SP as source and an FP_REG as destination, so reject out
5927 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
5929 rtx lhs
= XEXP (x
, 0);
5931 /* Look through a possible SUBREG introduced by ILP32. */
5932 if (GET_CODE (lhs
) == SUBREG
)
5933 lhs
= SUBREG_REG (lhs
);
5935 gcc_assert (REG_P (lhs
));
5936 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
static void
aarch64_asm_output_labelref (FILE* f, const char *name)
{
  asm_fprintf (f, "%U%s", name);
}

static void
aarch64_elf_asm_constructor (rtx symbol, int priority)
{
  if (priority == DEFAULT_INIT_PRIORITY)
    default_ctor_section_asm_out_constructor (symbol, priority);
  else
    {
      section *s;
      /* While priority is known to be in range [0, 65535], so 18 bytes
         would be enough, the compiler might not know that.  To avoid
         -Wformat-truncation false positive, use a larger size.  */
      char buf[23];
      snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
      s = get_section (buf, SECTION_WRITE, NULL);
      switch_to_section (s);
      assemble_align (POINTER_SIZE);
      assemble_aligned_integer (POINTER_BYTES, symbol);
    }
}

static void
aarch64_elf_asm_destructor (rtx symbol, int priority)
{
  if (priority == DEFAULT_INIT_PRIORITY)
    default_dtor_section_asm_out_destructor (symbol, priority);
  else
    {
      section *s;
      /* While priority is known to be in range [0, 65535], so 18 bytes
         would be enough, the compiler might not know that.  To avoid
         -Wformat-truncation false positive, use a larger size.  */
      char buf[23];
      snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
      s = get_section (buf, SECTION_WRITE, NULL);
      switch_to_section (s);
      assemble_align (POINTER_SIZE);
      assemble_aligned_integer (POINTER_BYTES, symbol);
    }
}
5991 aarch64_output_casesi (rtx
*operands
)
5995 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
5997 static const char *const patterns
[4][2] =
6000 "ldrb\t%w3, [%0,%w1,uxtw]",
6001 "add\t%3, %4, %w3, sxtb #2"
6004 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6005 "add\t%3, %4, %w3, sxth #2"
6008 "ldr\t%w3, [%0,%w1,uxtw #2]",
6009 "add\t%3, %4, %w3, sxtw #2"
6011 /* We assume that DImode is only generated when not optimizing and
6012 that we don't really need 64-bit address offsets. That would
6013 imply an object file with 8GB of code in a single function! */
6015 "ldr\t%w3, [%0,%w1,uxtw #2]",
6016 "add\t%3, %4, %w3, sxtw #2"
6020 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
6022 index
= exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec
)));
6024 gcc_assert (index
>= 0 && index
<= 3);
6026 /* Need to implement table size reduction, by chaning the code below. */
6027 output_asm_insn (patterns
[index
][0], operands
);
6028 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
6029 snprintf (buf
, sizeof (buf
),
6030 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
6031 output_asm_insn (buf
, operands
);
6032 output_asm_insn (patterns
[index
][1], operands
);
6033 output_asm_insn ("br\t%3", operands
);
6034 assemble_label (asm_out_file
, label
);
/* Return size in bits of an arithmetic operand which is shifted/scaled and
   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
   operator.  */

int
aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
{
  if (shift >= 0 && shift <= 3)
    {
      int size;
      for (size = 8; size <= 32; size *= 2)
        {
          HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
          if (mask == bits << shift)
            return size;
        }
    }
  return 0;
}
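/* Illustrative example (hand-checked): for the operand
   (and (ashift (reg) (const_int 2)) (const_int 0x3fc)) we have shift = 2 and
   mask = 0x3fc; with size = 8 the loop finds 0xff << 2 == 0x3fc, so the
   function returns 8 and the operand can use a UXTB #2 extended-register
   form, e.g. "add x0, x1, w2, uxtb #2" (register numbers hypothetical).  */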
/* Constant pools are per function only when PC relative
   literal loads are true or we are in the large memory
   model.  */

static inline bool
aarch64_can_use_per_function_literal_pools_p (void)
{
  return (aarch64_pcrelative_literal_loads
          || aarch64_cmodel == AARCH64_CMODEL_LARGE);
}

static bool
aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
{
  /* Fixme:: In an ideal world this would work similar
     to the logic in aarch64_select_rtx_section but this
     breaks bootstrap in gcc go.  For now we workaround
     this by returning false here.  */
  return false;
}

/* Select appropriate section for constants depending
   on where we place literal pools.  */

static section *
aarch64_select_rtx_section (machine_mode mode,
                            rtx x,
                            unsigned HOST_WIDE_INT align)
{
  if (aarch64_can_use_per_function_literal_pools_p ())
    return function_section (current_function_decl);

  return default_elf_select_rtx_section (mode, x, align);
}

/* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
void
aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
                                  HOST_WIDE_INT offset)
{
  /* When using per-function literal pools, we must ensure that any code
     section is aligned to the minimal instruction length, lest we get
     errors from the assembler re "unaligned instructions".  */
  if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
    ASM_OUTPUT_ALIGN (f, 2);
}
/* Helper function for rtx cost calculation.  Strip a shift expression
   from X.  Returns the inner operand if successful, or the original
   expression on failure.  */
static rtx
aarch64_strip_shift (rtx x)
{
  rtx op = x;

  /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
     we can convert both to ROR during final output.  */
  if ((GET_CODE (op) == ASHIFT
       || GET_CODE (op) == ASHIFTRT
       || GET_CODE (op) == LSHIFTRT
       || GET_CODE (op) == ROTATERT
       || GET_CODE (op) == ROTATE)
      && CONST_INT_P (XEXP (op, 1)))
    return XEXP (op, 0);

  if (GET_CODE (op) == MULT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
    return XEXP (op, 0);

  return x;
}
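/* Illustrative example (hand-checked): given
   (plus (mult (reg A) (const_int 4)) (reg B)), stripping the MULT operand
   yields (reg A) because 4 is a power of two below 2^64, so the caller can
   cost the expression as an add-with-shift, e.g. "add x0, xB, xA, lsl #2"
   (register names hypothetical).  */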
6134 /* Helper function for rtx cost calculation. Strip an extend
6135 expression from X. Returns the inner operand if successful, or the
6136 original expression on failure. We deal with a number of possible
6137 canonicalization variations here. If STRIP_SHIFT is true, then
6138 we can strip off a shift also. */
6140 aarch64_strip_extend (rtx x
, bool strip_shift
)
6144 /* Zero and sign extraction of a widened value. */
6145 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
6146 && XEXP (op
, 2) == const0_rtx
6147 && GET_CODE (XEXP (op
, 0)) == MULT
6148 && aarch64_is_extend_from_extract (GET_MODE (op
), XEXP (XEXP (op
, 0), 1),
6150 return XEXP (XEXP (op
, 0), 0);
6152 /* It can also be represented (for zero-extend) as an AND with an
6154 if (GET_CODE (op
) == AND
6155 && GET_CODE (XEXP (op
, 0)) == MULT
6156 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
6157 && CONST_INT_P (XEXP (op
, 1))
6158 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
6159 INTVAL (XEXP (op
, 1))) != 0)
6160 return XEXP (XEXP (op
, 0), 0);
6162 /* Now handle extended register, as this may also have an optional
6163 left shift by 1..4. */
6165 && GET_CODE (op
) == ASHIFT
6166 && CONST_INT_P (XEXP (op
, 1))
6167 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
6170 if (GET_CODE (op
) == ZERO_EXTEND
6171 || GET_CODE (op
) == SIGN_EXTEND
)
/* Return true iff CODE is a shift supported in combination
   with arithmetic instructions.  */

static bool
aarch64_shift_p (enum rtx_code code)
{
  return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
}

/* Return true iff X is a cheap shift without a sign extend.  */

static bool
aarch64_cheap_mult_shift_p (rtx x)
{
  rtx op0, op1;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if (!(aarch64_tune_params.extra_tuning_flags
        & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
    return false;

  if (GET_CODE (op0) == SIGN_EXTEND)
    return false;

  if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
      && UINTVAL (op1) <= 4)
    return true;

  if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
    return false;

  HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));

  if (l2 > 0 && l2 <= 4)
    return true;

  return false;
}
6222 /* Helper function for rtx cost calculation. Calculate the cost of
6223 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6224 Return the calculated cost of the expression, recursing manually in to
6225 operands where needed. */
6228 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
6231 const struct cpu_cost_table
*extra_cost
6232 = aarch64_tune_params
.insn_extra_cost
;
6234 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
6235 machine_mode mode
= GET_MODE (x
);
6237 gcc_checking_assert (code
== MULT
);
6242 if (VECTOR_MODE_P (mode
))
6243 mode
= GET_MODE_INNER (mode
);
6245 /* Integer multiply/fma. */
6246 if (GET_MODE_CLASS (mode
) == MODE_INT
)
6248 /* The multiply will be canonicalized as a shift, cost it as such. */
6249 if (aarch64_shift_p (GET_CODE (x
))
6250 || (CONST_INT_P (op1
)
6251 && exact_log2 (INTVAL (op1
)) > 0))
6253 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
6254 || GET_CODE (op0
) == SIGN_EXTEND
;
6259 /* If the shift is considered cheap,
6260 then don't add any cost. */
6261 if (aarch64_cheap_mult_shift_p (x
))
6263 else if (REG_P (op1
))
6264 /* ARITH + shift-by-register. */
6265 cost
+= extra_cost
->alu
.arith_shift_reg
;
6267 /* ARITH + extended register. We don't have a cost field
6268 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6269 cost
+= extra_cost
->alu
.extend_arith
;
6271 /* ARITH + shift-by-immediate. */
6272 cost
+= extra_cost
->alu
.arith_shift
;
6275 /* LSL (immediate). */
6276 cost
+= extra_cost
->alu
.shift
;
6279 /* Strip extends as we will have costed them in the case above. */
6281 op0
= aarch64_strip_extend (op0
, true);
6283 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
6288 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6289 compound and let the below cases handle it. After all, MNEG is a
6290 special-case alias of MSUB. */
6291 if (GET_CODE (op0
) == NEG
)
6293 op0
= XEXP (op0
, 0);
6297 /* Integer multiplies or FMAs have zero/sign extending variants. */
6298 if ((GET_CODE (op0
) == ZERO_EXTEND
6299 && GET_CODE (op1
) == ZERO_EXTEND
)
6300 || (GET_CODE (op0
) == SIGN_EXTEND
6301 && GET_CODE (op1
) == SIGN_EXTEND
))
6303 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
6304 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
6309 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6310 cost
+= extra_cost
->mult
[0].extend_add
;
6312 /* MUL/SMULL/UMULL. */
6313 cost
+= extra_cost
->mult
[0].extend
;
6319 /* This is either an integer multiply or a MADD. In both cases
6320 we want to recurse and cost the operands. */
6321 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
6322 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
6328 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
6331 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
6340 /* Floating-point FMA/FMUL can also support negations of the
6341 operands, unless the rounding mode is upward or downward in
6342 which case FNMUL is different than FMUL with operand negation. */
6343 bool neg0
= GET_CODE (op0
) == NEG
;
6344 bool neg1
= GET_CODE (op1
) == NEG
;
6345 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
6348 op0
= XEXP (op0
, 0);
6350 op1
= XEXP (op1
, 0);
6354 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6355 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
6358 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
6361 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
6362 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
6368 aarch64_address_cost (rtx x
,
6370 addr_space_t as ATTRIBUTE_UNUSED
,
6373 enum rtx_code c
= GET_CODE (x
);
6374 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
6375 struct aarch64_address_info info
;
6379 if (!aarch64_classify_address (&info
, x
, mode
, c
, false))
6381 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
6383 /* This is a CONST or SYMBOL ref which will be split
6384 in a different way depending on the code model in use.
6385 Cost it through the generic infrastructure. */
6386 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
6387 /* Divide through by the cost of one instruction to
6388 bring it to the same units as the address costs. */
6389 cost_symbol_ref
/= COSTS_N_INSNS (1);
6390 /* The cost is then the cost of preparing the address,
6391 followed by an immediate (possibly 0) offset. */
6392 return cost_symbol_ref
+ addr_cost
->imm_offset
;
6396 /* This is most likely a jump table from a case
6398 return addr_cost
->register_offset
;
6404 case ADDRESS_LO_SUM
:
6405 case ADDRESS_SYMBOLIC
:
6406 case ADDRESS_REG_IMM
:
6407 cost
+= addr_cost
->imm_offset
;
6410 case ADDRESS_REG_WB
:
6411 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
6412 cost
+= addr_cost
->pre_modify
;
6413 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
6414 cost
+= addr_cost
->post_modify
;
6420 case ADDRESS_REG_REG
:
6421 cost
+= addr_cost
->register_offset
;
6424 case ADDRESS_REG_SXTW
:
6425 cost
+= addr_cost
->register_sextend
;
6428 case ADDRESS_REG_UXTW
:
6429 cost
+= addr_cost
->register_zextend
;
6439 /* For the sake of calculating the cost of the shifted register
6440 component, we can treat same sized modes in the same way. */
6441 switch (GET_MODE_BITSIZE (mode
))
6444 cost
+= addr_cost
->addr_scale_costs
.hi
;
6448 cost
+= addr_cost
->addr_scale_costs
.si
;
6452 cost
+= addr_cost
->addr_scale_costs
.di
;
6455 /* We can't tell, or this is a 128-bit vector. */
6457 cost
+= addr_cost
->addr_scale_costs
.ti
;
/* Return the cost of a branch.  If SPEED_P is true then the compiler is
   optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
   to be taken.  */

static int
aarch64_branch_cost (bool speed_p, bool predictable_p)
{
  /* When optimizing for speed, use the cost of unpredictable branches.  */
  const struct cpu_branch_cost *branch_costs =
    aarch64_tune_params.branch_costs;

  if (!speed_p || predictable_p)
    return branch_costs->predictable;
  else
    return branch_costs->unpredictable;
}
6482 /* Return true if the RTX X in mode MODE is a zero or sign extract
6483 usable in an ADD or SUB (extended register) instruction. */
6485 aarch64_rtx_arith_op_extract_p (rtx x
, machine_mode mode
)
6487 /* Catch add with a sign extract.
6488 This is add_<optab><mode>_multp2. */
6489 if (GET_CODE (x
) == SIGN_EXTRACT
6490 || GET_CODE (x
) == ZERO_EXTRACT
)
6492 rtx op0
= XEXP (x
, 0);
6493 rtx op1
= XEXP (x
, 1);
6494 rtx op2
= XEXP (x
, 2);
6496 if (GET_CODE (op0
) == MULT
6497 && CONST_INT_P (op1
)
6498 && op2
== const0_rtx
6499 && CONST_INT_P (XEXP (op0
, 1))
6500 && aarch64_is_extend_from_extract (mode
,
6507 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6509 else if (GET_CODE (x
) == SIGN_EXTEND
6510 || GET_CODE (x
) == ZERO_EXTEND
)
6511 return REG_P (XEXP (x
, 0));
6517 aarch64_frint_unspec_p (unsigned int u
)
6535 /* Return true iff X is an rtx that will match an extr instruction
6536 i.e. as described in the *extr<mode>5_insn family of patterns.
6537 OP0 and OP1 will be set to the operands of the shifts involved
6538 on success and will be NULL_RTX otherwise. */
6541 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
6544 machine_mode mode
= GET_MODE (x
);
6546 *res_op0
= NULL_RTX
;
6547 *res_op1
= NULL_RTX
;
6549 if (GET_CODE (x
) != IOR
)
6555 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
6556 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
6558 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6559 if (GET_CODE (op1
) == ASHIFT
)
6560 std::swap (op0
, op1
);
6562 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
6565 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
6566 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
6568 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
6569 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
6571 *res_op0
= XEXP (op0
, 0);
6572 *res_op1
= XEXP (op1
, 0);
/* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
   storing it in *COST.  Result is true if the total cost of the operation
   has now been calculated.  */
static bool
aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
{
  rtx inner;
  rtx comparator;
  enum rtx_code cmpcode;

  if (COMPARISON_P (op0))
    {
      inner = XEXP (op0, 0);
      comparator = XEXP (op0, 1);
      cmpcode = GET_CODE (op0);
    }
  else
    {
      inner = op0;
      comparator = const0_rtx;
      cmpcode = NE;
    }

  if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
    {
      /* Conditional branch.  */
      if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
	return true;
      else if (cmpcode == NE || cmpcode == EQ)
	{
	  if (comparator == const0_rtx)
	    {
	      /* TBZ/TBNZ/CBZ/CBNZ.  */
	      if (GET_CODE (inner) == ZERO_EXTRACT)
		/* TBZ/TBNZ.  */
		*cost += rtx_cost (XEXP (inner, 0), VOIDmode,
				   ZERO_EXTRACT, 0, speed);
	      else
		/* CBZ/CBNZ.  */
		*cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);

	      return true;
	    }
	}
      else if (cmpcode == LT || cmpcode == GE)
	{
	  /* TBZ/TBNZ.  */
	  if (comparator == const0_rtx)
	    return true;
	}
    }
  else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
    {
      /* CCMP.  */
      if (GET_CODE (op1) == COMPARE)
	{
	  /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
	  if (XEXP (op1, 1) == const0_rtx)
	    *cost += 1;
	  if (speed)
	    {
	      machine_mode mode = GET_MODE (XEXP (op1, 0));
	      const struct cpu_cost_table *extra_cost
		= aarch64_tune_params.insn_extra_cost;

	      if (GET_MODE_CLASS (mode) == MODE_INT)
		*cost += extra_cost->alu.arith;
	      else
		*cost += extra_cost->fp[mode == DFmode].compare;
	    }
	  return true;
	}

      /* It's a conditional operation based on the status flags,
	 so it must be some flavor of CSEL.  */

      /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
      if (GET_CODE (op1) == NEG
	  || GET_CODE (op1) == NOT
	  || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
	op1 = XEXP (op1, 0);
      else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
	{
	  /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
	  op1 = XEXP (op1, 0);
	  op2 = XEXP (op2, 0);
	}

      *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
      *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
      return true;
    }

  /* We don't know what this is, cost all operands.  */
  return false;
}
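
/* Illustrative sketch (not from the original sources):
   (if_then_else (eq (reg) (const_int 0)) (label_ref) (pc)) is costed on the
   register alone because it maps onto a single CBZ, while for a select such
   as (if_then_else (cc) (neg (reg)) (reg)) the NEG is stripped above since
   the whole expression is one CSNEG.  */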
/* Check whether X is a bitfield operation of the form shift + extend that
   maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
   operand to which the bitfield operation is applied.  Otherwise return
   NULL_RTX.  */

static rtx
aarch64_extend_bitfield_pattern_p (rtx x)
{
  rtx_code outer_code = GET_CODE (x);
  machine_mode outer_mode = GET_MODE (x);

  if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
      && outer_mode != SImode && outer_mode != DImode)
    return NULL_RTX;

  rtx inner = XEXP (x, 0);
  rtx_code inner_code = GET_CODE (inner);
  machine_mode inner_mode = GET_MODE (inner);
  rtx op = NULL_RTX;

  switch (inner_code)
    {
    case ASHIFT:
      if (CONST_INT_P (XEXP (inner, 1))
	  && (inner_mode == QImode || inner_mode == HImode))
	op = XEXP (inner, 0);
      break;
    case LSHIFTRT:
      if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
	  && (inner_mode == QImode || inner_mode == HImode))
	op = XEXP (inner, 0);
      break;
    case ASHIFTRT:
      if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
	  && (inner_mode == QImode || inner_mode == HImode))
	op = XEXP (inner, 0);
      break;
    default:
      break;
    }

  return op;
}
/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */

static bool
aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
{
  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
	 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
	 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
	 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
}
/* Calculate the cost of calculating X, storing it in *COST.  Result
   is true if the total cost of the operation has now been calculated.  */
static bool
aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
		   int param ATTRIBUTE_UNUSED, int *cost, bool speed)
{
  rtx op0, op1, op2;
  const struct cpu_cost_table *extra_cost
    = aarch64_tune_params.insn_extra_cost;
  int code = GET_CODE (x);

  /* By default, assume that everything has equivalent cost to the
     cheapest instruction.  Any additional costs are applied as a delta
     above this default.  */
  *cost = COSTS_N_INSNS (1);

  switch (code)
    {
    case SET:
      /* The cost depends entirely on the operands to SET.  */
      *cost = 0;
      op0 = SET_DEST (x);
      op1 = SET_SRC (x);

      switch (GET_CODE (op0))
	{
	case MEM:
	  if (speed)
	    {
	      rtx address = XEXP (op0, 0);
	      if (VECTOR_MODE_P (mode))
		*cost += extra_cost->ldst.storev;
	      else if (GET_MODE_CLASS (mode) == MODE_INT)
		*cost += extra_cost->ldst.store;
	      else if (mode == SFmode)
		*cost += extra_cost->ldst.storef;
	      else if (mode == DFmode)
		*cost += extra_cost->ldst.stored;

	      *cost +=
		COSTS_N_INSNS (aarch64_address_cost (address, mode,
						     0, speed));
	    }

	  *cost += rtx_cost (op1, mode, SET, 1, speed);
	  return true;

	case SUBREG:
	  if (! REG_P (SUBREG_REG (op0)))
	    *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);

	  /* Fall through.  */
	case REG:
	  /* The cost is one per vector-register copied.  */
	  if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
	    {
	      int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
			      / GET_MODE_SIZE (V4SImode);
	      *cost = COSTS_N_INSNS (n_minus_1 + 1);
	    }
	  /* const0_rtx is in general free, but we will use an
	     instruction to set a register to 0.  */
	  else if (REG_P (op1) || op1 == const0_rtx)
	    {
	      /* The cost is 1 per register copied.  */
	      int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
			      / UNITS_PER_WORD;
	      *cost = COSTS_N_INSNS (n_minus_1 + 1);
	    }
	  else
	    /* Cost is just the cost of the RHS of the set.  */
	    *cost += rtx_cost (op1, mode, SET, 1, speed);
	  return true;

	case ZERO_EXTRACT:
	case SIGN_EXTRACT:
	  /* Bit-field insertion.  Strip any redundant widening of
	     the RHS to meet the width of the target.  */
	  if (GET_CODE (op1) == SUBREG)
	    op1 = SUBREG_REG (op1);
	  if ((GET_CODE (op1) == ZERO_EXTEND
	       || GET_CODE (op1) == SIGN_EXTEND)
	      && CONST_INT_P (XEXP (op0, 1))
	      && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
		  >= INTVAL (XEXP (op0, 1))))
	    op1 = XEXP (op1, 0);

	  if (CONST_INT_P (op1))
	    {
	      /* MOV immediate is assumed to always be cheap.  */
	      *cost = COSTS_N_INSNS (1);
	    }
	  else
	    {
	      /* BFM.  */
	      if (speed)
		*cost += extra_cost->alu.bfi;
	      *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
	    }

	  return true;

	default:
	  /* We can't make sense of this, assume default cost.  */
	  *cost = COSTS_N_INSNS (1);
	  return false;
	}
    case CONST_INT:
      /* If an instruction can incorporate a constant within the
	 instruction, the instruction's expression avoids calling
	 rtx_cost() on the constant.  If rtx_cost() is called on a
	 constant, then it is usually because the constant must be
	 moved into a register by one or more instructions.

	 The exception is constant 0, which can be expressed
	 as XZR/WZR and is therefore free.  The exception to this is
	 if we have (set (reg) (const0_rtx)) in which case we must cost
	 the move.  However, we can catch that when we cost the SET, so
	 we don't need to consider that here.  */
      if (x == const0_rtx)
	*cost = 0;
      else
	{
	  /* To an approximation, building any other constant is
	     proportionally expensive to the number of instructions
	     required to build that constant.  This is true whether we
	     are compiling for SPEED or otherwise.  */
	  *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
				 (NULL_RTX, x, false, mode));
	}
      return true;

    case CONST_DOUBLE:
      if (speed)
	{
	  /* mov[df,sf]_aarch64.  */
	  if (aarch64_float_const_representable_p (x))
	    /* FMOV (scalar immediate).  */
	    *cost += extra_cost->fp[mode == DFmode].fpconst;
	  else if (!aarch64_float_const_zero_rtx_p (x))
	    {
	      /* This will be a load from memory.  */
	      if (mode == DFmode)
		*cost += extra_cost->ldst.loadd;
	      else
		*cost += extra_cost->ldst.loadf;
	    }
	  else
	    {
	      /* Otherwise this is +0.0.  We get this using MOVI d0, #0
		 or MOV v0.s[0], wzr - neither of which are modeled by the
		 cost tables.  Just use the default cost.  */
	    }
	}
      return true;

    case MEM:
      if (speed)
	{
	  /* For loads we want the base cost of a load, plus an
	     approximation for the additional cost of the addressing
	     mode.  */
	  rtx address = XEXP (x, 0);
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->ldst.loadv;
	  else if (GET_MODE_CLASS (mode) == MODE_INT)
	    *cost += extra_cost->ldst.load;
	  else if (mode == SFmode)
	    *cost += extra_cost->ldst.loadf;
	  else if (mode == DFmode)
	    *cost += extra_cost->ldst.loadd;

	  *cost +=
	    COSTS_N_INSNS (aarch64_address_cost (address, mode,
						 0, speed));
	}
      return true;
    case NEG:
      op0 = XEXP (x, 0);

      if (VECTOR_MODE_P (mode))
	{
	  if (speed)
	    /* FNEG.  */
	    *cost += extra_cost->vect.alu;
	  return false;
	}

      if (GET_MODE_CLASS (mode) == MODE_INT)
	{
	  if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
	      || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
	    {
	      /* CSETM.  */
	      *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
	      return true;
	    }

	  /* Cost this as SUB wzr, X.  */
	  op0 = CONST0_RTX (mode);
	  op1 = XEXP (x, 0);
	  goto cost_minus;
	}

      if (GET_MODE_CLASS (mode) == MODE_FLOAT)
	{
	  /* Support (neg(fma...)) as a single instruction only if
	     sign of zeros is unimportant.  This matches the decision
	     making in aarch64.md.  */
	  if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
	    {
	      /* FNMADD.  */
	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
	      return true;
	    }
	  if (GET_CODE (op0) == MULT)
	    {
	      /* FNMUL.  */
	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
	      return true;
	    }
	  if (speed)
	    /* FNEG.  */
	    *cost += extra_cost->fp[mode == DFmode].neg;
	  return false;
	}

      return false;

    case CLRSB:
    case CLZ:
      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else
	    *cost += extra_cost->alu.clz;
	}
      return false;

    case COMPARE:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (op1 == const0_rtx
	  && GET_CODE (op0) == AND)
	{
	  x = op0;
	  mode = GET_MODE (op0);
	  goto cost_logic;
	}

      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
	{
	  /* TODO: A write to the CC flags possibly costs extra, this
	     needs encoding in the cost tables.  */

	  mode = GET_MODE (op0);
	  /* ANDS.  */
	  if (GET_CODE (op0) == AND)
	    {
	      x = op0;
	      goto cost_logic;
	    }

	  if (GET_CODE (op0) == PLUS)
	    {
	      /* ADDS (and CMN alias).  */
	      x = op0;
	      goto cost_plus;
	    }

	  if (GET_CODE (op0) == MINUS)
	    {
	      /* SUBS.  */
	      x = op0;
	      goto cost_minus;
	    }

	  if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
	      && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
	      && CONST_INT_P (XEXP (op0, 2)))
	    {
	      /* COMPARE of ZERO_EXTRACT form of TST-immediate.
		 Handle it here directly rather than going to cost_logic
		 since we know the immediate generated for the TST is valid
		 so we can avoid creating an intermediate rtx for it only
		 for costing purposes.  */
	      if (speed)
		*cost += extra_cost->alu.logical;

	      *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
				 ZERO_EXTRACT, 0, speed);
	      return true;
	    }

	  if (GET_CODE (op1) == NEG)
	    {
	      /* CMN.  */
	      if (speed)
		*cost += extra_cost->alu.arith;

	      *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
	      *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
	      return true;
	    }

	  /* CMP.

	     Compare can freely swap the order of operands, and
	     canonicalization puts the more complex operation first.
	     But the integer MINUS logic expects the shift/extend
	     operation in op1.  */
	  if (! (REG_P (op0)
		 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
	    {
	      op0 = XEXP (x, 1);
	      op1 = XEXP (x, 0);
	    }
	  goto cost_minus;
	}

      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
	{
	  /* FCMP.  */
	  if (speed)
	    *cost += extra_cost->fp[mode == DFmode].compare;

	  if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
	    {
	      *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
	      /* FCMP supports constant 0.0 for no extra cost.  */
	      return true;
	    }
	  return false;
	}

      if (VECTOR_MODE_P (mode))
	{
	  /* Vector compare.  */
	  if (speed)
	    *cost += extra_cost->vect.alu;

	  if (aarch64_float_const_zero_rtx_p (op1))
	    {
	      /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
		 cost.  */
	      return true;
	    }
	  return false;
	}
      return false;
    case MINUS:
      {
	op0 = XEXP (x, 0);
	op1 = XEXP (x, 1);

cost_minus:
	*cost += rtx_cost (op0, mode, MINUS, 0, speed);

	/* Detect valid immediates.  */
	if ((GET_MODE_CLASS (mode) == MODE_INT
	     || (GET_MODE_CLASS (mode) == MODE_CC
		 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
	    && CONST_INT_P (op1)
	    && aarch64_uimm12_shift (INTVAL (op1)))
	  {
	    if (speed)
	      /* SUB(S) (immediate).  */
	      *cost += extra_cost->alu.arith;
	    return true;
	  }

	/* Look for SUB (extended register).  */
	if (aarch64_rtx_arith_op_extract_p (op1, mode))
	  {
	    if (speed)
	      *cost += extra_cost->alu.extend_arith;

	    op1 = aarch64_strip_extend (op1, true);
	    *cost += rtx_cost (op1, VOIDmode,
			       (enum rtx_code) GET_CODE (op1), 0, speed);
	    return true;
	  }

	rtx new_op1 = aarch64_strip_extend (op1, false);

	/* Cost this as an FMA-alike operation.  */
	if ((GET_CODE (new_op1) == MULT
	     || aarch64_shift_p (GET_CODE (new_op1)))
	    && code != COMPARE)
	  {
	    *cost += aarch64_rtx_mult_cost (new_op1, MULT,
					    (enum rtx_code) code,
					    speed);
	    return true;
	  }

	*cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);

	if (speed)
	  {
	    if (VECTOR_MODE_P (mode))
	      /* Vector SUB.  */
	      *cost += extra_cost->vect.alu;
	    else if (GET_MODE_CLASS (mode) == MODE_INT)
	      /* SUB(S).  */
	      *cost += extra_cost->alu.arith;
	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
	      /* FSUB.  */
	      *cost += extra_cost->fp[mode == DFmode].addsub;
	  }
	return true;
      }

    case PLUS:
      {
	rtx new_op0;

	op0 = XEXP (x, 0);
	op1 = XEXP (x, 1);

cost_plus:
	if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
	    || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
	  {
	    /* CSINC.  */
	    *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
	    *cost += rtx_cost (op1, mode, PLUS, 1, speed);
	    return true;
	  }

	if (GET_MODE_CLASS (mode) == MODE_INT
	    && CONST_INT_P (op1)
	    && aarch64_uimm12_shift (INTVAL (op1)))
	  {
	    *cost += rtx_cost (op0, mode, PLUS, 0, speed);

	    if (speed)
	      /* ADD (immediate).  */
	      *cost += extra_cost->alu.arith;
	    return true;
	  }

	*cost += rtx_cost (op1, mode, PLUS, 1, speed);

	/* Look for ADD (extended register).  */
	if (aarch64_rtx_arith_op_extract_p (op0, mode))
	  {
	    if (speed)
	      *cost += extra_cost->alu.extend_arith;

	    op0 = aarch64_strip_extend (op0, true);
	    *cost += rtx_cost (op0, VOIDmode,
			       (enum rtx_code) GET_CODE (op0), 0, speed);
	    return true;
	  }

	/* Strip any extend, leave shifts behind as we will
	   cost them through mult_cost.  */
	new_op0 = aarch64_strip_extend (op0, false);

	if (GET_CODE (new_op0) == MULT
	    || aarch64_shift_p (GET_CODE (new_op0)))
	  {
	    *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS, speed);
	    return true;
	  }

	*cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);

	if (speed)
	  {
	    if (VECTOR_MODE_P (mode))
	      /* Vector ADD.  */
	      *cost += extra_cost->vect.alu;
	    else if (GET_MODE_CLASS (mode) == MODE_INT)
	      /* ADD.  */
	      *cost += extra_cost->alu.arith;
	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
	      /* FADD.  */
	      *cost += extra_cost->fp[mode == DFmode].addsub;
	  }
	return true;
      }
    case BSWAP:
      *cost = COSTS_N_INSNS (1);

      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else
	    *cost += extra_cost->alu.rev;
	}
      return false;

    case IOR:
      if (aarch_rev16_p (x))
	{
	  *cost = COSTS_N_INSNS (1);

	  if (speed)
	    {
	      if (VECTOR_MODE_P (mode))
		*cost += extra_cost->vect.alu;
	      else
		*cost += extra_cost->alu.rev;
	    }
	  return true;
	}

      if (aarch64_extr_rtx_p (x, &op0, &op1))
	{
	  *cost += rtx_cost (op0, mode, IOR, 0, speed);
	  *cost += rtx_cost (op1, mode, IOR, 1, speed);
	  if (speed)
	    *cost += extra_cost->alu.shift;
	  return true;
	}
      /* Fall through.  */
    case XOR:
    case AND:
    cost_logic:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (VECTOR_MODE_P (mode))
	{
	  if (speed)
	    *cost += extra_cost->vect.alu;
	  return true;
	}

      if (code == AND
	  && GET_CODE (op0) == MULT
	  && CONST_INT_P (XEXP (op0, 1))
	  && CONST_INT_P (op1)
	  && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
			       INTVAL (op1)) != 0)
	{
	  /* This is a UBFM/SBFM.  */
	  *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
	  if (speed)
	    *cost += extra_cost->alu.bfx;
	  return true;
	}

      if (GET_MODE_CLASS (mode) == MODE_INT)
	{
	  if (CONST_INT_P (op1))
	    {
	      /* We have a mask + shift version of a UBFIZ
		 i.e. the *andim_ashift<mode>_bfiz pattern.  */
	      if (GET_CODE (op0) == ASHIFT
		  && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
							 XEXP (op0, 1)))
		{
		  *cost += rtx_cost (XEXP (op0, 0), mode,
				     (enum rtx_code) code, 0, speed);
		  if (speed)
		    *cost += extra_cost->alu.bfx;
		  return true;
		}
	      else if (aarch64_bitmask_imm (INTVAL (op1), mode))
		{
		  /* We possibly get the immediate for free, this is not
		     modelled.  */
		  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
		  if (speed)
		    *cost += extra_cost->alu.logical;
		  return true;
		}
	    }
	  else
	    {
	      /* Handle ORN, EON, or BIC.  */
	      if (GET_CODE (op0) == NOT)
		op0 = XEXP (op0, 0);

	      rtx new_op0 = aarch64_strip_shift (op0);

	      /* If we had a shift on op0 then this is a logical-shift-
		 by-register/immediate operation.  Otherwise, this is just
		 a logical operation.  */
	      if (speed)
		{
		  if (new_op0 != op0)
		    {
		      /* Shift by immediate.  */
		      if (CONST_INT_P (XEXP (op0, 1)))
			*cost += extra_cost->alu.log_shift;
		      else
			*cost += extra_cost->alu.log_shift_reg;
		    }
		  else
		    *cost += extra_cost->alu.logical;
		}

	      /* In both cases we want to cost both operands.  */
	      *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
	      *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);

	      return true;
	    }
	}
      return false;
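
      /* Worked example (a sketch, not from the original sources): for
	 (and (mult (reg w) (const_int 8)) (const_int 0x7f8)) we have
	 exact_log2 (8) == 3 and 0x7f8 == 0xff << 3, so aarch64_uxt_size
	 accepts it and the whole expression is costed as one UBFM-style
	 bitfield insert of width 8 at position 3 rather than as a multiply
	 followed by an AND.  */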
    case NOT:
      x = XEXP (x, 0);
      op0 = aarch64_strip_shift (x);

      if (VECTOR_MODE_P (mode))
	{
	  /* Vector NOT.  */
	  *cost += extra_cost->vect.alu;
	  return false;
	}

      /* MVN-shifted-reg.  */
      if (op0 != x)
	{
	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);

	  if (speed)
	    *cost += extra_cost->alu.log_shift;

	  return true;
	}
      /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
	 Handle the second form here taking care that 'a' in the above can
	 be a shift.  */
      else if (GET_CODE (op0) == XOR)
	{
	  rtx newop0 = XEXP (op0, 0);
	  rtx newop1 = XEXP (op0, 1);
	  rtx op0_stripped = aarch64_strip_shift (newop0);

	  *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
	  *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);

	  if (speed)
	    {
	      if (op0_stripped != newop0)
		*cost += extra_cost->alu.log_shift;
	      else
		*cost += extra_cost->alu.logical;
	    }

	  return true;
	}
      /* MVN.  */
      if (speed)
	*cost += extra_cost->alu.logical;

      return false;

    case ZERO_EXTEND:
      op0 = XEXP (x, 0);
      /* If a value is written in SI mode, then zero extended to DI
	 mode, the operation will in general be free as a write to
	 a 'w' register implicitly zeroes the upper bits of an 'x'
	 register.  However, if this is

	   (set (reg) (zero_extend (reg)))

	 we must cost the explicit register move.  */
      if (mode == DImode
	  && GET_MODE (op0) == SImode
	  && outer == SET)
	{
	  int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);

	  /* If OP_COST is non-zero, then the cost of the zero extend
	     is effectively the cost of the inner operation.  Otherwise
	     we have a MOV instruction and we take the cost from the MOV
	     itself.  This is true independently of whether we are
	     optimizing for space or time.  */
	  if (op_cost)
	    *cost = op_cost;

	  return true;
	}
      else if (MEM_P (op0))
	{
	  /* All loads can zero extend to any size for free.  */
	  *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
	  return true;
	}

      op0 = aarch64_extend_bitfield_pattern_p (x);
      if (op0)
	{
	  *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
	  if (speed)
	    *cost += extra_cost->alu.bfx;
	  return true;
	}

      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    /* UMOV.  */
	    *cost += extra_cost->vect.alu;
	  else
	    /* We generate an AND instead of UXTB/UXTH.  */
	    *cost += extra_cost->alu.logical;
	}
      return false;

    case SIGN_EXTEND:
      if (MEM_P (XEXP (x, 0)))
	{
	  /* LDRSH.  */
	  if (speed)
	    {
	      rtx address = XEXP (XEXP (x, 0), 0);
	      *cost += extra_cost->ldst.load_sign_extend;

	      *cost +=
		COSTS_N_INSNS (aarch64_address_cost (address, mode,
						     0, speed));
	    }
	  return true;
	}

      op0 = aarch64_extend_bitfield_pattern_p (x);
      if (op0)
	{
	  *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
	  if (speed)
	    *cost += extra_cost->alu.bfx;
	  return true;
	}

      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else
	    *cost += extra_cost->alu.extend;
	}
      return false;
    case ASHIFT:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (CONST_INT_P (op1))
	{
	  if (speed)
	    {
	      if (VECTOR_MODE_P (mode))
		/* Vector shift (immediate).  */
		*cost += extra_cost->vect.alu;
	      else
		/* LSL (immediate), UBMF, UBFIZ and friends.  These are all
		   aliases.  */
		*cost += extra_cost->alu.shift;
	    }

	  /* We can incorporate zero/sign extend for free.  */
	  if (GET_CODE (op0) == ZERO_EXTEND
	      || GET_CODE (op0) == SIGN_EXTEND)
	    op0 = XEXP (op0, 0);

	  *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
	  return true;
	}
      else
	{
	  if (VECTOR_MODE_P (mode))
	    {
	      if (speed)
		/* Vector shift (register).  */
		*cost += extra_cost->vect.alu;
	    }
	  else
	    {
	      if (speed)
		/* LSLV.  */
		*cost += extra_cost->alu.shift_reg;

	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
		  && CONST_INT_P (XEXP (op1, 1))
		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
		{
		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
		  /* We already demanded XEXP (op1, 0) to be REG_P, so
		     don't recurse into it.  */
		  return true;
		}
	    }
	  return false;  /* All arguments need to be in registers.  */
	}

    case ROTATE:
    case ROTATERT:
    case LSHIFTRT:
    case ASHIFTRT:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (CONST_INT_P (op1))
	{
	  /* ASR (immediate) and friends.  */
	  if (speed)
	    {
	      if (VECTOR_MODE_P (mode))
		*cost += extra_cost->vect.alu;
	      else
		*cost += extra_cost->alu.shift;
	    }

	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
	  return true;
	}
      else
	{
	  if (VECTOR_MODE_P (mode))
	    {
	      if (speed)
		/* Vector shift (register).  */
		*cost += extra_cost->vect.alu;
	    }
	  else
	    {
	      if (speed)
		/* ASR (register) and friends.  */
		*cost += extra_cost->alu.shift_reg;

	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
		  && CONST_INT_P (XEXP (op1, 1))
		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
		{
		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
		  /* We already demanded XEXP (op1, 0) to be REG_P, so
		     don't recurse into it.  */
		  return true;
		}
	    }
	  return false;  /* All arguments need to be in registers.  */
	}
    case SYMBOL_REF:
      if (aarch64_cmodel == AARCH64_CMODEL_LARGE
	  || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
	{
	  /* LDR.  */
	  if (speed)
	    *cost += extra_cost->ldst.load;
	}
      else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
	       || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
	{
	  /* ADRP, followed by ADD.  */
	  *cost += COSTS_N_INSNS (1);
	  if (speed)
	    *cost += 2 * extra_cost->alu.arith;
	}
      else if (aarch64_cmodel == AARCH64_CMODEL_TINY
	       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
	{
	  /* ADR.  */
	  if (speed)
	    *cost += extra_cost->alu.arith;
	}

      if (flag_pic)
	{
	  /* One extra load instruction, after accessing the GOT.  */
	  *cost += COSTS_N_INSNS (1);
	  if (speed)
	    *cost += extra_cost->ldst.load;
	}
      return true;

    case HIGH:
    case LO_SUM:
      /* ADRP/ADD (immediate).  */
      if (speed)
	*cost += extra_cost->alu.arith;
      return true;

    case ZERO_EXTRACT:
    case SIGN_EXTRACT:
      /* UBFX/SBFX.  */
      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else
	    *cost += extra_cost->alu.bfx;
	}

      /* We can trust that the immediates used will be correct (there
	 are no by-register forms), so we need only cost op0.  */
      *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
      return true;

    case MULT:
      *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
      /* aarch64_rtx_mult_cost always handles recursion to its
	 operands.  */
      return true;
    case MOD:
      /* We can expand signed mod by power of 2 using a NEGS, two parallel
	 ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
	 an unconditional negate.  This case should only ever be reached through
	 the set_smod_pow2_cheap check in expmed.c.  */
      if (CONST_INT_P (XEXP (x, 1))
	  && exact_log2 (INTVAL (XEXP (x, 1))) > 0
	  && (mode == SImode || mode == DImode))
	{
	  /* We expand to 4 instructions.  Reset the baseline.  */
	  *cost = COSTS_N_INSNS (4);

	  if (speed)
	    *cost += 2 * extra_cost->alu.logical
		     + 2 * extra_cost->alu.arith;

	  return true;
	}

      /* Fall-through.  */
    case UMOD:
      if (speed)
	{
	  /* Slightly prefer UMOD over SMOD.  */
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else if (GET_MODE_CLASS (mode) == MODE_INT)
	    *cost += (extra_cost->mult[mode == DImode].add
		      + extra_cost->mult[mode == DImode].idiv
		      + (code == MOD ? 1 : 0));
	}
      return false;  /* All arguments need to be in registers.  */

    case DIV:
    case UDIV:
    case SQRT:
      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else if (GET_MODE_CLASS (mode) == MODE_INT)
	    /* There is no integer SQRT, so only DIV and UDIV can get
	       here.  */
	    *cost += (extra_cost->mult[mode == DImode].idiv
		      /* Slightly prefer UDIV over SDIV.  */
		      + (code == DIV ? 1 : 0));
	  else
	    *cost += extra_cost->fp[mode == DFmode].div;
	}
      return false;  /* All arguments need to be in registers.  */

    case IF_THEN_ELSE:
      return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
					 XEXP (x, 2), cost, speed);
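
      /* As an illustrative sketch (not from the original sources), x % 4 in
	 SImode expands along the lines of:

	   negs  w1, w0            // w1 = -x, setting the flags on x
	   and   w0, w0, #3        // remainder for a non-negative x
	   and   w1, w1, #3        // remainder magnitude for a negative x
	   csneg w0, w0, w1, mi    // select and negate for the negative case

	 which is why the MOD baseline above is reset to four instructions.  */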
      return false; /* All arguments must be in registers.  */

    case FMA:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);
      op2 = XEXP (x, 2);

      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else
	    *cost += extra_cost->fp[mode == DFmode].fma;
	}

      /* FMSUB, FNMADD, and FNMSUB are free.  */
      if (GET_CODE (op0) == NEG)
	op0 = XEXP (op0, 0);

      if (GET_CODE (op2) == NEG)
	op2 = XEXP (op2, 0);

      /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
	 and the by-element operand as operand 0.  */
      if (GET_CODE (op1) == NEG)
	op1 = XEXP (op1, 0);

      /* Catch vector-by-element operations.  The by-element operand can
	 either be (vec_duplicate (vec_select (x))) or just
	 (vec_select (x)), depending on whether we are multiplying by
	 a vector or a scalar.

	 Canonicalization is not very good in these cases, FMA4 will put the
	 by-element operand as operand 0, FNMA4 will have it as operand 1.  */
      if (GET_CODE (op0) == VEC_DUPLICATE)
	op0 = XEXP (op0, 0);
      else if (GET_CODE (op1) == VEC_DUPLICATE)
	op1 = XEXP (op1, 0);

      if (GET_CODE (op0) == VEC_SELECT)
	op0 = XEXP (op0, 0);
      else if (GET_CODE (op1) == VEC_SELECT)
	op1 = XEXP (op1, 0);

      /* If the remaining parameters are not registers,
	 get the cost to put them into registers.  */
      *cost += rtx_cost (op0, mode, FMA, 0, speed);
      *cost += rtx_cost (op1, mode, FMA, 1, speed);
      *cost += rtx_cost (op2, mode, FMA, 2, speed);
      return true;
    case FLOAT:
    case UNSIGNED_FLOAT:
      if (speed)
	*cost += extra_cost->fp[mode == DFmode].fromint;
      return false;

    case FLOAT_EXTEND:
      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    /* Vector truncate.  */
	    *cost += extra_cost->vect.alu;
	  else
	    *cost += extra_cost->fp[mode == DFmode].widen;
	}
      return false;

    case FLOAT_TRUNCATE:
      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    /* Vector conversion.  */
	    *cost += extra_cost->vect.alu;
	  else
	    *cost += extra_cost->fp[mode == DFmode].narrow;
	}
      return false;

    case FIX:
    case UNSIGNED_FIX:
      x = XEXP (x, 0);
      /* Strip the rounding part.  They will all be implemented
	 by the fcvt* family of instructions anyway.  */
      if (GET_CODE (x) == UNSPEC)
	{
	  unsigned int uns_code = XINT (x, 1);

	  if (uns_code == UNSPEC_FRINTA
	      || uns_code == UNSPEC_FRINTM
	      || uns_code == UNSPEC_FRINTN
	      || uns_code == UNSPEC_FRINTP
	      || uns_code == UNSPEC_FRINTZ)
	    x = XVECEXP (x, 0, 0);
	}

      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else
	    *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
	}

      /* We can combine fmul by a power of 2 followed by a fcvt into a single
	 fixed-point fcvt.  */
      if (GET_CODE (x) == MULT
	  && ((VECTOR_MODE_P (mode)
	       && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
	      || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
	{
	  *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
			     0, speed);
	  return true;
	}

      *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
      return true;
    case ABS:
      if (VECTOR_MODE_P (mode))
	{
	  /* ABS (vector).  */
	  if (speed)
	    *cost += extra_cost->vect.alu;
	}
      else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
	{
	  op0 = XEXP (x, 0);

	  /* FABD, which is analogous to FADD.  */
	  if (GET_CODE (op0) == MINUS)
	    {
	      *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
	      *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
	      if (speed)
		*cost += extra_cost->fp[mode == DFmode].addsub;

	      return true;
	    }
	  /* Simple FABS is analogous to FNEG.  */
	  if (speed)
	    *cost += extra_cost->fp[mode == DFmode].neg;
	}
      else
	{
	  /* Integer ABS will either be split to
	     two arithmetic instructions, or will be an ABS
	     (scalar), which we don't model.  */
	  *cost = COSTS_N_INSNS (2);
	  if (speed)
	    *cost += 2 * extra_cost->alu.arith;
	}
      return false;

    case SMAX:
    case SMIN:
      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else
	    {
	      /* FMAXNM/FMINNM/FMAX/FMIN.
		 TODO: This may not be accurate for all implementations, but
		 we do not model this in the cost tables.  */
	      *cost += extra_cost->fp[mode == DFmode].addsub;
	    }
	}
      return false;

    case UNSPEC:
      /* The floating point round to integer frint* instructions.  */
      if (aarch64_frint_unspec_p (XINT (x, 1)))
	{
	  if (speed)
	    *cost += extra_cost->fp[mode == DFmode].roundint;

	  return false;
	}

      if (XINT (x, 1) == UNSPEC_RBIT)
	{
	  if (speed)
	    *cost += extra_cost->alu.rev;

	  return false;
	}
      break;

    case TRUNCATE:
      /* Decompose <su>muldi3_highpart.  */
      if (/* (truncate:DI  */
	  mode == DImode
	  /*   (lshiftrt:TI  */
	  && GET_MODE (XEXP (x, 0)) == TImode
	  && GET_CODE (XEXP (x, 0)) == LSHIFTRT
	  /*      (mult:TI  */
	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
	  /*        (ANY_EXTEND:TI (reg:DI))
		    (ANY_EXTEND:TI (reg:DI)))  */
	  && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
	       && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
	      || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
		  && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
	  && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
	  && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
	  /*     (const_int 64)  */
	  && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	  && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
	{
	  /* UMULH/SMULH.  */
	  if (speed)
	    *cost += extra_cost->mult[mode == DImode].extend;
	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
			     mode, MULT, 0, speed);
	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
			     mode, MULT, 1, speed);
	  return true;
	}

      /* Fall through.  */
    default:
      break;
    }

  if (dump_file
      && flag_aarch64_verbose_cost)
    fprintf (dump_file,
	     "\nFailed to cost RTX.  Assuming default cost.\n");

  return true;
}
/* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
   calculated for X.  This cost is stored in *COST.  Returns true
   if the total cost of X was calculated.  */
static bool
aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
			   int param, int *cost, bool speed)
{
  bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);

  if (dump_file
      && flag_aarch64_verbose_cost)
    {
      print_rtl_single (dump_file, x);
      fprintf (dump_file, "\n%s cost: %d (%s)\n",
	       speed ? "Hot" : "Cold",
	       *cost, result ? "final" : "partial");
    }

  return result;
}
static int
aarch64_register_move_cost (machine_mode mode,
			    reg_class_t from_i, reg_class_t to_i)
{
  enum reg_class from = (enum reg_class) from_i;
  enum reg_class to = (enum reg_class) to_i;
  const struct cpu_regmove_cost *regmove_cost
    = aarch64_tune_params.regmove_cost;

  /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
  if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
    to = GENERAL_REGS;

  if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
    from = GENERAL_REGS;

  /* Moving between GPR and stack cost is the same as GP2GP.  */
  if ((from == GENERAL_REGS && to == STACK_REG)
      || (to == GENERAL_REGS && from == STACK_REG))
    return regmove_cost->GP2GP;

  /* To/From the stack register, we move via the gprs.  */
  if (to == STACK_REG || from == STACK_REG)
    return aarch64_register_move_cost (mode, from, GENERAL_REGS)
	   + aarch64_register_move_cost (mode, GENERAL_REGS, to);

  if (GET_MODE_SIZE (mode) == 16)
    {
      /* 128-bit operations on general registers require 2 instructions.  */
      if (from == GENERAL_REGS && to == GENERAL_REGS)
	return regmove_cost->GP2GP * 2;
      else if (from == GENERAL_REGS)
	return regmove_cost->GP2FP * 2;
      else if (to == GENERAL_REGS)
	return regmove_cost->FP2GP * 2;

      /* When AdvSIMD instructions are disabled it is not possible to move
	 a 128-bit value directly between Q registers.  This is handled in
	 secondary reload.  A general register is used as a scratch to move
	 the upper DI value and the lower DI value is moved directly,
	 hence the cost is the sum of three moves.  */
      if (! TARGET_SIMD)
	return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;

      return regmove_cost->FP2FP;
    }

  if (from == GENERAL_REGS && to == GENERAL_REGS)
    return regmove_cost->GP2GP;
  else if (from == GENERAL_REGS)
    return regmove_cost->GP2FP;
  else if (to == GENERAL_REGS)
    return regmove_cost->FP2GP;

  return regmove_cost->FP2FP;
}
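
/* Worked example (a sketch, not from the original sources): moving a 128-bit
   value from the FP/SIMD registers to GENERAL_REGS costs FP2GP * 2 because
   it needs two 64-bit transfers, and without AdvSIMD a Q-register copy is
   costed as GP2FP + FP2GP + FP2FP to reflect the general-register scratch
   used by secondary reload.  */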
static int
aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
			  reg_class_t rclass ATTRIBUTE_UNUSED,
			  bool in ATTRIBUTE_UNUSED)
{
  return aarch64_tune_params.memmov_cost;
}
/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
   to optimize 1.0/sqrt.  */

static bool
use_rsqrt_p (machine_mode mode)
{
  return (!flag_trapping_math
	  && flag_unsafe_math_optimizations
	  && ((aarch64_tune_params.approx_modes->recip_sqrt
	       & AARCH64_APPROX_MODE (mode))
	      || flag_mrecip_low_precision_sqrt));
}

/* Function to decide when to use the approximate reciprocal square root
   builtin.  */

static tree
aarch64_builtin_reciprocal (tree fndecl)
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));

  if (!use_rsqrt_p (mode))
    return NULL_TREE;
  return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
}
typedef rtx (*rsqrte_type) (rtx, rtx);

/* Select reciprocal square root initial estimate insn depending on machine
   mode.  */

static rsqrte_type
get_rsqrte_type (machine_mode mode)
{
  switch (mode)
  {
    case DFmode:   return gen_aarch64_rsqrtedf;
    case SFmode:   return gen_aarch64_rsqrtesf;
    case V2DFmode: return gen_aarch64_rsqrtev2df;
    case V2SFmode: return gen_aarch64_rsqrtev2sf;
    case V4SFmode: return gen_aarch64_rsqrtev4sf;
    default: gcc_unreachable ();
  }
}

typedef rtx (*rsqrts_type) (rtx, rtx, rtx);

/* Select reciprocal square root series step insn depending on machine mode.  */

static rsqrts_type
get_rsqrts_type (machine_mode mode)
{
  switch (mode)
  {
    case DFmode:   return gen_aarch64_rsqrtsdf;
    case SFmode:   return gen_aarch64_rsqrtssf;
    case V2DFmode: return gen_aarch64_rsqrtsv2df;
    case V2SFmode: return gen_aarch64_rsqrtsv2sf;
    case V4SFmode: return gen_aarch64_rsqrtsv4sf;
    default: gcc_unreachable ();
  }
}
/* Emit instruction sequence to compute either the approximate square root
   or its approximate reciprocal, depending on the flag RECP, and return
   whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
{
  machine_mode mode = GET_MODE (dst);

  if (GET_MODE_INNER (mode) == HFmode)
    return false;

  machine_mode mmsk
    = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
		       GET_MODE_NUNITS (mode));
  if (!recp)
    {
      if (!(flag_mlow_precision_sqrt
	    || (aarch64_tune_params.approx_modes->sqrt
		& AARCH64_APPROX_MODE (mode))))
	return false;

      if (flag_finite_math_only
	  || flag_trapping_math
	  || !flag_unsafe_math_optimizations
	  || optimize_function_for_size_p (cfun))
	return false;
    }
  else
    /* Caller assumes we cannot fail.  */
    gcc_assert (use_rsqrt_p (mode));

  rtx xmsk = gen_reg_rtx (mmsk);
  if (!recp)
    /* When calculating the approximate square root, compare the
       argument with 0.0 and create a mask.  */
    emit_insn (gen_rtx_SET (xmsk,
			    gen_rtx_NEG (mmsk,
					 gen_rtx_EQ (mmsk, src,
						     CONST0_RTX (mode)))));

  /* Estimate the approximate reciprocal square root.  */
  rtx xdst = gen_reg_rtx (mode);
  emit_insn ((*get_rsqrte_type (mode)) (xdst, src));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance
     while sacrificing the accuracy.  */
  if ((recp && flag_mrecip_low_precision_sqrt)
      || (!recp && flag_mlow_precision_sqrt))
    iterations--;

  /* Iterate over the series to calculate the approximate reciprocal square
     root.  */
  rtx x1 = gen_reg_rtx (mode);
  while (iterations--)
    {
      rtx x2 = gen_reg_rtx (mode);
      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));

      emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));

      if (iterations > 0)
	emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
    }

  if (!recp)
    {
      /* Qualify the approximate reciprocal square root when the argument is
	 0.0 by squashing the intermediary result to 0.0.  */
      rtx xtmp = gen_reg_rtx (mmsk);
      emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
					gen_rtx_SUBREG (mmsk, xdst, 0)));
      emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));

      /* Calculate the approximate square root.  */
      emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
    }

  /* Finalize the approximation.  */
  emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));

  return true;
}
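
/* The series above is the Newton-Raphson iteration for 1/sqrt(a):
   x_{n+1} = x_n * (3 - a * x_n^2) / 2, where FRSQRTE provides the initial
   estimate x_0 and each FRSQRTS step computes (3 - a * x_n^2) / 2 from the
   argument and the squared estimate.  For the non-reciprocal case the
   converged estimate is multiplied by the argument at the end, since
   sqrt(a) = a * (1/sqrt(a)).  */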
typedef rtx (*recpe_type) (rtx, rtx);

/* Select reciprocal initial estimate insn depending on machine mode.  */

static recpe_type
get_recpe_type (machine_mode mode)
{
  switch (mode)
  {
    case SFmode:   return (gen_aarch64_frecpesf);
    case V2SFmode: return (gen_aarch64_frecpev2sf);
    case V4SFmode: return (gen_aarch64_frecpev4sf);
    case DFmode:   return (gen_aarch64_frecpedf);
    case V2DFmode: return (gen_aarch64_frecpev2df);
    default: gcc_unreachable ();
  }
}

typedef rtx (*recps_type) (rtx, rtx, rtx);

/* Select reciprocal series step insn depending on machine mode.  */

static recps_type
get_recps_type (machine_mode mode)
{
  switch (mode)
  {
    case SFmode:   return (gen_aarch64_frecpssf);
    case V2SFmode: return (gen_aarch64_frecpsv2sf);
    case V4SFmode: return (gen_aarch64_frecpsv4sf);
    case DFmode:   return (gen_aarch64_frecpsdf);
    case V2DFmode: return (gen_aarch64_frecpsv2df);
    default: gcc_unreachable ();
  }
}
/* Emit the instruction sequence to compute the approximation for the division
   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
{
  machine_mode mode = GET_MODE (quo);

  if (GET_MODE_INNER (mode) == HFmode)
    return false;

  bool use_approx_division_p = (flag_mlow_precision_div
				|| (aarch64_tune_params.approx_modes->division
				    & AARCH64_APPROX_MODE (mode)));

  if (!flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun)
      || !use_approx_division_p)
    return false;

  /* Estimate the approximate reciprocal.  */
  rtx xrcp = gen_reg_rtx (mode);
  emit_insn ((*get_recpe_type (mode)) (xrcp, den));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance,
     while sacrificing the accuracy.  */
  if (flag_mlow_precision_div)
    iterations--;

  /* Iterate over the series to calculate the approximate reciprocal.  */
  rtx xtmp = gen_reg_rtx (mode);
  while (iterations--)
    {
      emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));

      if (iterations > 0)
	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
    }

  if (num != CONST1_RTX (mode))
    {
      /* As the approximate reciprocal of DEN is already calculated, only
	 calculate the approximate division when NUM is not 1.0.  */
      rtx xnum = force_reg (mode, num);
      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
    }

  /* Finalize the approximation.  */
  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
  return true;
}
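
/* Likewise, the reciprocal series is the Newton-Raphson iteration
   x_{n+1} = x_n * (2 - d * x_n): FRECPE supplies the initial estimate of
   1/d and each FRECPS step computes (2 - d * x_n), so once the estimate has
   converged the quotient is obtained as num * (1/den).  */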
/* Return the number of instructions that can be issued per cycle.  */
static int
aarch64_sched_issue_rate (void)
{
  return aarch64_tune_params.issue_rate;
}

static int
aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
{
  int issue_rate = aarch64_sched_issue_rate ();

  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
}

/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
   autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
   has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */

static int
aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
						   int ready_index)
{
  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
}
/* Vectorizer cost model target hooks.  */

/* Implement targetm.vectorize.builtin_vectorization_cost.  */
static int
aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
				    tree vectype,
				    int misalign ATTRIBUTE_UNUSED)
{
  unsigned elements;
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  bool fp = false;

  if (vectype != NULL)
    fp = FLOAT_TYPE_P (vectype);

  switch (type_of_cost)
    {
      case scalar_stmt:
	return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;

      case scalar_load:
	return costs->scalar_load_cost;

      case scalar_store:
	return costs->scalar_store_cost;

      case vector_stmt:
	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;

      case vector_load:
	return costs->vec_align_load_cost;

      case vector_store:
	return costs->vec_store_cost;

      case vec_to_scalar:
	return costs->vec_to_scalar_cost;

      case scalar_to_vec:
	return costs->scalar_to_vec_cost;

      case unaligned_load:
	return costs->vec_unalign_load_cost;

      case unaligned_store:
	return costs->vec_unalign_store_cost;

      case cond_branch_taken:
	return costs->cond_taken_branch_cost;

      case cond_branch_not_taken:
	return costs->cond_not_taken_branch_cost;

      case vec_perm:
	return costs->vec_permute_cost;

      case vec_promote_demote:
	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;

      case vec_construct:
	elements = TYPE_VECTOR_SUBPARTS (vectype);
	return elements / 2 + 1;

      default:
	gcc_unreachable ();
    }
}
/* Implement targetm.vectorize.add_stmt_cost.  */
static unsigned
aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
		       struct _stmt_vec_info *stmt_info, int misalign,
		       enum vect_cost_model_location where)
{
  unsigned *cost = (unsigned *) data;
  unsigned retval = 0;

  if (flag_vect_cost_model)
    {
      tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
      int stmt_cost =
	    aarch64_builtin_vectorization_cost (kind, vectype, misalign);

      /* Statements in an inner loop relative to the loop being
	 vectorized are weighted more heavily.  The value here is
	 arbitrary and could potentially be improved with analysis.  */
      if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
	count *= 50; /*  FIXME  */

      retval = (unsigned) (count * stmt_cost);
      cost[where] += retval;
    }

  return retval;
}
static void initialize_aarch64_code_model (struct gcc_options *);

/* Parse the TO_PARSE string and put the architecture struct that it
   selects into RES and the architectural features into ISA_FLAGS.
   Return an aarch64_parse_opt_result describing the parse result.
   If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */

static enum aarch64_parse_opt_result
aarch64_parse_arch (const char *to_parse, const struct processor **res,
		    unsigned long *isa_flags)
{
  char *ext;
  const struct processor *arch;
  char *str = (char *) alloca (strlen (to_parse) + 1);
  size_t len;

  strcpy (str, to_parse);

  ext = strchr (str, '+');

  if (ext != NULL)
    len = ext - str;
  else
    len = strlen (str);

  if (len == 0)
    return AARCH64_PARSE_MISSING_ARG;

  /* Loop through the list of supported ARCHes to find a match.  */
  for (arch = all_architectures; arch->name != NULL; arch++)
    {
      if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
	{
	  unsigned long isa_temp = arch->flags;

	  if (ext != NULL)
	    {
	      /* TO_PARSE string contains at least one extension.  */
	      enum aarch64_parse_opt_result ext_res
		= aarch64_parse_extension (ext, &isa_temp);

	      if (ext_res != AARCH64_PARSE_OK)
		return ext_res;
	    }
	  /* Extension parsing was successful.  Confirm the result
	     arch and ISA flags.  */
	  *res = arch;
	  *isa_flags = isa_temp;
	  return AARCH64_PARSE_OK;
	}
    }

  /* ARCH name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;
}
/* Parse the TO_PARSE string and put the result tuning in RES and the
   architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
   describing the parse result.  If there is an error parsing, RES and
   ISA_FLAGS are left unchanged.  */

static enum aarch64_parse_opt_result
aarch64_parse_cpu (const char *to_parse, const struct processor **res,
		   unsigned long *isa_flags)
{
  char *ext;
  const struct processor *cpu;
  char *str = (char *) alloca (strlen (to_parse) + 1);
  size_t len;

  strcpy (str, to_parse);

  ext = strchr (str, '+');

  if (ext != NULL)
    len = ext - str;
  else
    len = strlen (str);

  if (len == 0)
    return AARCH64_PARSE_MISSING_ARG;

  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)
    {
      if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
	{
	  unsigned long isa_temp = cpu->flags;

	  if (ext != NULL)
	    {
	      /* TO_PARSE string contains at least one extension.  */
	      enum aarch64_parse_opt_result ext_res
		= aarch64_parse_extension (ext, &isa_temp);

	      if (ext_res != AARCH64_PARSE_OK)
		return ext_res;
	    }
	  /* Extension parsing was successful.  Confirm the result
	     cpu and ISA flags.  */
	  *res = cpu;
	  *isa_flags = isa_temp;
	  return AARCH64_PARSE_OK;
	}
    }

  /* CPU name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;
}
/* Parse the TO_PARSE string and put the cpu it selects into RES.
   Return an aarch64_parse_opt_result describing the parse result.
   If the parsing fails the RES does not change.  */

static enum aarch64_parse_opt_result
aarch64_parse_tune (const char *to_parse, const struct processor **res)
{
  const struct processor *cpu;
  char *str = (char *) alloca (strlen (to_parse) + 1);

  strcpy (str, to_parse);

  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)
    {
      if (strcmp (cpu->name, str) == 0)
	{
	  *res = cpu;
	  return AARCH64_PARSE_OK;
	}
    }

  /* CPU name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;
}
/* Parse TOKEN, which has length LENGTH to see if it is an option
   described in FLAG.  If it is, return the index bit for that fusion type.
   If not, error (printing OPTION_NAME) and return zero.  */

static unsigned int
aarch64_parse_one_option_token (const char *token,
				size_t length,
				const struct aarch64_flag_desc *flag,
				const char *option_name)
{
  for (; flag->name != NULL; flag++)
    {
      if (length == strlen (flag->name)
	  && !strncmp (flag->name, token, length))
	return flag->flag;
    }

  error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
  return 0;
}
/* Parse OPTION which is a '.'-separated list of flags to enable.
   FLAGS gives the list of flags we understand, INITIAL_STATE gives any
   default state we inherit from the CPU tuning structures.  OPTION_NAME
   gives the top-level option we are parsing in the -moverride string,
   for use in error messages.  */

static unsigned int
aarch64_parse_boolean_options (const char *option,
			       const struct aarch64_flag_desc *flags,
			       unsigned int initial_state,
			       const char *option_name)
{
  const char separator = '.';
  const char* specs = option;
  const char* ntoken = option;
  unsigned int found_flags = initial_state;

  while ((ntoken = strchr (specs, separator)))
    {
      size_t token_length = ntoken - specs;
      unsigned token_ops = aarch64_parse_one_option_token (specs,
							   token_length,
							   flags,
							   option_name);
      /* If we find "none" (or, for simplicity's sake, an error) anywhere
	 in the token stream, reset the supported operations.  So:

	  adrp+add.cmp+branch.none.adrp+add

	 would have the result of turning on only adrp+add fusion.  */
      if (!token_ops)
	found_flags = 0;

      found_flags |= token_ops;
      specs = ++ntoken;
    }

  /* We ended with a separator, print something.  */
  if (!(*specs))
    {
      error ("%s string ill-formed\n", option_name);
      return 0;
    }

  /* We still have one more token to parse.  */
  size_t token_length = strlen (specs);
  unsigned token_ops = aarch64_parse_one_option_token (specs,
						       token_length,
						       flags,
						       option_name);
  if (!token_ops)
    found_flags = 0;

  found_flags |= token_ops;
  return found_flags;
}
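
/* For example, given OPTION_NAME "fuse=" and the string
   "adrp+add.cmp+branch", the loop above consumes "adrp+add" up to the '.'
   and then the trailing "cmp+branch" token, OR-ing both fusion bits into
   FOUND_FLAGS on top of the initial state inherited from the CPU tuning
   structure.  */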
/* Support for overriding instruction fusion.  */

static void
aarch64_parse_fuse_string (const char *fuse_string,
			   struct tune_params *tune)
{
  tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
						     aarch64_fusible_pairs,
						     tune->fusible_ops,
						     "fuse=");
}

/* Support for overriding other tuning flags.  */

static void
aarch64_parse_tune_string (const char *tune_string,
			   struct tune_params *tune)
{
  tune->extra_tuning_flags
    = aarch64_parse_boolean_options (tune_string,
				     aarch64_tuning_flags,
				     tune->extra_tuning_flags,
				     "tune=");
}
/* Parse TOKEN, which has length LENGTH to see if it is a tuning option
   we understand.  If it is, extract the option string and handoff to
   the appropriate function.  */

void
aarch64_parse_one_override_token (const char* token,
				  size_t length,
				  struct tune_params *tune)
{
  const struct aarch64_tuning_override_function *fn
    = aarch64_tuning_override_functions;

  const char *option_part = strchr (token, '=');
  if (!option_part)
    {
      error ("tuning string missing in option (%s)", token);
      return;
    }

  /* Get the length of the option name.  */
  length = option_part - token;
  /* Skip the '=' to get to the option string.  */
  option_part++;

  for (; fn->name != NULL; fn++)
    {
      if (!strncmp (fn->name, token, length))
	{
	  fn->parse_override (option_part, tune);
	  return;
	}
    }

  error ("unknown tuning option (%s)",token);
  return;
}
/* A checking mechanism for the implementation of the tls size.  */

static void
initialize_aarch64_tls_size (struct gcc_options *opts)
{
  if (aarch64_tls_size == 0)
    aarch64_tls_size = 24;

  switch (opts->x_aarch64_cmodel_var)
    {
    case AARCH64_CMODEL_TINY:
      /* Both the default and maximum TLS size allowed under tiny is 1M which
	 needs two instructions to address, so we clamp the size to 24.  */
      if (aarch64_tls_size > 24)
	aarch64_tls_size = 24;
      break;
    case AARCH64_CMODEL_SMALL:
      /* The maximum TLS size allowed under small is 4G.  */
      if (aarch64_tls_size > 32)
	aarch64_tls_size = 32;
      break;
    case AARCH64_CMODEL_LARGE:
      /* The maximum TLS size allowed under large is 16E.
	 FIXME: 16E should be 64bit, we only support 48bit offset now.  */
      if (aarch64_tls_size > 48)
	aarch64_tls_size = 48;
      break;
    default:
      gcc_unreachable ();
    }

  return;
}
/* Parse STRING looking for options in the format:
     string	:: option:string
     option	:: name=substring
     substring	:: defined by option.  */

static void
aarch64_parse_override_string (const char* input_string,
			       struct tune_params* tune)
{
  const char separator = ':';
  size_t string_length = strlen (input_string) + 1;
  char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
  char *string = string_root;
  strncpy (string, input_string, string_length);
  string[string_length - 1] = '\0';

  char* ntoken = string;

  while ((ntoken = strchr (string, separator)))
    {
      size_t token_length = ntoken - string;
      /* Make this substring look like a string.  */
      *ntoken = '\0';
      aarch64_parse_one_override_token (string, token_length, tune);
      string = ++ntoken;
    }

  /* One last option to parse.  */
  aarch64_parse_one_override_token (string, strlen (string), tune);
  free (string_root);
}
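
/* As a sketch (not from the original sources), a command line option such as
   -moverride=fuse=adrp+add:tune=<flags> is split at each ':' into the
   tokens "fuse=adrp+add" and "tune=<flags>", and each token is then
   dispatched on the text before its '=' to the matching parse_override
   handler registered in aarch64_tuning_override_functions.  */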
static void
aarch64_override_options_after_change_1 (struct gcc_options *opts)
{
  /* The logic here is that if we are disabling all frame pointer generation
     then we do not need to disable leaf frame pointer generation as a
     separate operation.  But if we are *only* disabling leaf frame pointer
     generation then we set flag_omit_frame_pointer to true, but in
     aarch64_frame_pointer_required we return false only for leaf functions.

     PR 70044: We have to be careful about being called multiple times for the
     same function.  Once we have decided to set flag_omit_frame_pointer just
     so that we can omit leaf frame pointers, we must then not interpret a
     second call as meaning that all frame pointer generation should be
     omitted.  We do this by setting flag_omit_frame_pointer to a special,
     reserved value.  */
  if (opts->x_flag_omit_frame_pointer == 2)
    opts->x_flag_omit_frame_pointer = 0;

  if (opts->x_flag_omit_frame_pointer)
    opts->x_flag_omit_leaf_frame_pointer = false;
  else if (opts->x_flag_omit_leaf_frame_pointer)
    opts->x_flag_omit_frame_pointer = 2;

  /* If not optimizing for size, set the default
     alignment to what the target wants.  */
  if (!opts->x_optimize_size)
    {
      if (opts->x_align_loops <= 0)
	opts->x_align_loops = aarch64_tune_params.loop_align;
      if (opts->x_align_jumps <= 0)
	opts->x_align_jumps = aarch64_tune_params.jump_align;
      if (opts->x_align_functions <= 0)
	opts->x_align_functions = aarch64_tune_params.function_align;
    }

  /* We default to no pc-relative literal loads.  */

  aarch64_pcrelative_literal_loads = false;

  /* If -mpc-relative-literal-loads is set on the command line, this
     implies that the user asked for PC relative literal loads.  */
  if (opts->x_pcrelative_literal_loads == 1)
    aarch64_pcrelative_literal_loads = true;

  /* This is PR70113.  When building the Linux kernel with
     CONFIG_ARM64_ERRATUM_843419, support for relocations
     R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
     removed from the kernel to avoid loading objects with possibly
     offending sequences.  Without -mpc-relative-literal-loads we would
     generate such relocations, preventing the kernel build from
     succeeding.  */
  if (opts->x_pcrelative_literal_loads == 2
      && TARGET_FIX_ERR_A53_843419)
    aarch64_pcrelative_literal_loads = true;

  /* In the tiny memory model it makes no sense to disallow PC relative
     literal pool loads.  */
  if (aarch64_cmodel == AARCH64_CMODEL_TINY
      || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
    aarch64_pcrelative_literal_loads = true;

  /* When enabling the lower precision Newton series for the square root, also
     enable it for the reciprocal square root, since the latter is an
     intermediary step for the former.  */
  if (flag_mlow_precision_sqrt)
    flag_mrecip_low_precision_sqrt = true;
}
/* 'Unpack' up the internal tuning structs and update the options
    in OPTS.  The caller must have set up selected_tune and selected_arch
    as all the other target-specific codegen decisions are
    derived from them.  */

void
aarch64_override_options_internal (struct gcc_options *opts)
{
  aarch64_tune_flags = selected_tune->flags;
  aarch64_tune = selected_tune->sched_core;
  /* Make a copy of the tuning parameters attached to the core, which
     we may later overwrite.  */
  aarch64_tune_params = *(selected_tune->tune);
  aarch64_architecture_version = selected_arch->architecture_version;

  if (opts->x_aarch64_override_tune_string)
    aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
				   &aarch64_tune_params);

  /* This target defaults to strict volatile bitfields.  */
  if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
    opts->x_flag_strict_volatile_bitfields = 1;

  initialize_aarch64_code_model (opts);
  initialize_aarch64_tls_size (opts);

  int queue_depth = 0;
  switch (aarch64_tune_params.autoprefetcher_model)
    {
      case tune_params::AUTOPREFETCHER_OFF:
	queue_depth = -1;
	break;
      case tune_params::AUTOPREFETCHER_WEAK:
	queue_depth = 0;
	break;
      case tune_params::AUTOPREFETCHER_STRONG:
	queue_depth = max_insn_queue_index + 1;
	break;
      default:
	gcc_unreachable ();
    }

  /* We don't mind passing in global_options_set here as we don't use
     the *options_set structs anyway.  */
  maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
			 queue_depth,
			 opts->x_param_values,
			 global_options_set.x_param_values);

  /* Set up parameters to be used in prefetching algorithm.  Do not
     override the defaults unless we are tuning for a core we have
     researched values for.  */
  if (aarch64_tune_params.prefetch->num_slots > 0)
    maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
			   aarch64_tune_params.prefetch->num_slots,
			   opts->x_param_values,
			   global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
    maybe_set_param_value (PARAM_L1_CACHE_SIZE,
			   aarch64_tune_params.prefetch->l1_cache_size,
			   opts->x_param_values,
			   global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
    maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
			   aarch64_tune_params.prefetch->l1_cache_line_size,
			   opts->x_param_values,
			   global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
    maybe_set_param_value (PARAM_L2_CACHE_SIZE,
			   aarch64_tune_params.prefetch->l2_cache_size,
			   opts->x_param_values,
			   global_options_set.x_param_values);

  /* Enable sw prefetching at specified optimization level for
     CPUS that have prefetch.  Lower optimization level threshold by 1
     when profiling is enabled.  */
  if (opts->x_flag_prefetch_loop_arrays < 0
      && !opts->x_optimize_size
      && aarch64_tune_params.prefetch->default_opt_level >= 0
      && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
    opts->x_flag_prefetch_loop_arrays = 1;

  aarch64_override_options_after_change_1 (opts);
}
/* Print a hint with a suggestion for a core or architecture name that
   most closely resembles what the user passed in STR.  ARCH is true if
   the user is asking for an architecture name.  ARCH is false if the user
   is asking for a core name.  */

static void
aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
{
  auto_vec<const char *> candidates;
  const struct processor *entry = arch ? all_architectures : all_cores;
  for (; entry->name != NULL; entry++)
    candidates.safe_push (entry->name);
  char *s;
  const char *hint = candidates_list_and_hint (str, s, candidates);
  inform (input_location, "valid arguments are: %s;"
	  " did you mean %qs?", s, hint);
}

/* Print a hint with a suggestion for a core name that most closely resembles
   what the user passed in STR.  */

static void
aarch64_print_hint_for_core (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, false);
}

/* Print a hint with a suggestion for an architecture name that most closely
   resembles what the user passed in STR.  */

static void
aarch64_print_hint_for_arch (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, true);
}
/* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
   specified in STR and throw errors if appropriate.  Put the results if
   they are valid in RES and ISA_FLAGS.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mcpu (const char *str, const struct processor **res,
		       unsigned long *isa_flags)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, res, isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing cpu name in %<-mcpu=%s%>", str);
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for -mcpu", str);
	aarch64_print_hint_for_core (str);
	break;
      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid feature modifier in %<-mcpu=%s%>", str);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}

/* Validate a command-line -march option.  Parse the arch and extensions
   (if any) specified in STR and throw errors if appropriate.  Put the
   results, if they are valid, in RES and ISA_FLAGS.  Return whether the
   option is valid.  */

static bool
aarch64_validate_march (const char *str, const struct processor **res,
			unsigned long *isa_flags)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, res, isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing arch name in %<-march=%s%>", str);
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for -march", str);
	aarch64_print_hint_for_arch (str);
	break;
      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid feature modifier in %<-march=%s%>", str);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}

/* Validate a command-line -mtune option.  Parse the cpu
   specified in STR and throw errors if appropriate.  Put the
   result, if it is valid, in RES.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mtune (const char *str, const struct processor **res)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, res);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing cpu name in %<-mtune=%s%>", str);
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for -mtune", str);
	aarch64_print_hint_for_core (str);
	break;
      default:
	gcc_unreachable ();
    }
  return false;
}
9078 /* Return the CPU corresponding to the enum CPU.
9079 If it doesn't specify a cpu, return the default. */
9081 static const struct processor
*
9082 aarch64_get_tune_cpu (enum aarch64_processor cpu
)
9084 if (cpu
!= aarch64_none
)
9085 return &all_cores
[cpu
];
9087 /* The & 0x3f is to extract the bottom 6 bits that encode the
9088 default cpu as selected by the --with-cpu GCC configure option
9090 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9091 flags mechanism should be reworked to make it more sane. */
9092 return &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
9095 /* Return the architecture corresponding to the enum ARCH.
9096 If it doesn't specify a valid architecture, return the default. */
9098 static const struct processor
*
9099 aarch64_get_arch (enum aarch64_arch arch
)
9101 if (arch
!= aarch64_no_arch
)
9102 return &all_architectures
[arch
];
9104 const struct processor
*cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
9106 return &all_architectures
[cpu
->arch
];
9109 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9110 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9111 tuning structs. In particular it must set selected_tune and
9112 aarch64_isa_flags that define the available ISA features and tuning
9113 decisions. It must also set selected_arch as this will be used to
9114 output the .arch asm tags for each function. */
9117 aarch64_override_options (void)
9119 unsigned long cpu_isa
= 0;
9120 unsigned long arch_isa
= 0;
9121 aarch64_isa_flags
= 0;
9123 bool valid_cpu
= true;
9124 bool valid_tune
= true;
9125 bool valid_arch
= true;
9127 selected_cpu
= NULL
;
9128 selected_arch
= NULL
;
9129 selected_tune
= NULL
;
9131 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9132 If either of -march or -mtune is given, they override their
9133 respective component of -mcpu. */
9134 if (aarch64_cpu_string
)
9135 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
9138 if (aarch64_arch_string
)
9139 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
9142 if (aarch64_tune_string
)
9143 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
9145 /* If the user did not specify a processor, choose the default
9146 one for them. This will be the CPU set during configuration using
9147 --with-cpu, otherwise it is "generic". */
9152 selected_cpu
= &all_cores
[selected_arch
->ident
];
9153 aarch64_isa_flags
= arch_isa
;
9154 explicit_arch
= selected_arch
->arch
;
9158 /* Get default configure-time CPU. */
9159 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
9160 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
9164 explicit_tune_core
= selected_tune
->ident
;
9166 /* If both -mcpu and -march are specified check that they are architecturally
9167 compatible, warn if they're not and prefer the -march ISA flags. */
9168 else if (selected_arch
)
9170 if (selected_arch
->arch
!= selected_cpu
->arch
)
9172 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9173 all_architectures
[selected_cpu
->arch
].name
,
9174 selected_arch
->name
);
9176 aarch64_isa_flags
= arch_isa
;
9177 explicit_arch
= selected_arch
->arch
;
9178 explicit_tune_core
= selected_tune
? selected_tune
->ident
9179 : selected_cpu
->ident
;
9183 /* -mcpu but no -march. */
9184 aarch64_isa_flags
= cpu_isa
;
9185 explicit_tune_core
= selected_tune
? selected_tune
->ident
9186 : selected_cpu
->ident
;
9187 gcc_assert (selected_cpu
);
9188 selected_arch
= &all_architectures
[selected_cpu
->arch
];
9189 explicit_arch
= selected_arch
->arch
;
9192 /* Set the arch as well as we will need it when outputing
9193 the .arch directive in assembly. */
9196 gcc_assert (selected_cpu
);
9197 selected_arch
= &all_architectures
[selected_cpu
->arch
];
9201 selected_tune
= selected_cpu
;
9203 #ifndef HAVE_AS_MABI_OPTION
9204 /* The compiler may have been configured with 2.23.* binutils, which does
9205 not have support for ILP32. */
9207 error ("Assembler does not support -mabi=ilp32");
9210 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
9211 sorry ("Return address signing is only supported for -mabi=lp64");
9213 /* Make sure we properly set up the explicit options. */
9214 if ((aarch64_cpu_string
&& valid_cpu
)
9215 || (aarch64_tune_string
&& valid_tune
))
9216 gcc_assert (explicit_tune_core
!= aarch64_none
);
9218 if ((aarch64_cpu_string
&& valid_cpu
)
9219 || (aarch64_arch_string
&& valid_arch
))
9220 gcc_assert (explicit_arch
!= aarch64_no_arch
);
9222 aarch64_override_options_internal (&global_options
);
9224 /* Save these options as the default ones in case we push and pop them later
9225 while processing functions with potential target attributes. */
9226 target_option_default_node
= target_option_current_node
9227 = build_target_option_node (&global_options
);
9230 /* Implement targetm.override_options_after_change. */
9233 aarch64_override_options_after_change (void)
9235 aarch64_override_options_after_change_1 (&global_options
);
9238 static struct machine_function
*
9239 aarch64_init_machine_status (void)
9241 struct machine_function
*machine
;
9242 machine
= ggc_cleared_alloc
<machine_function
> ();
9247 aarch64_init_expanders (void)
9249 init_machine_status
= aarch64_init_machine_status
;
9252 /* A checking mechanism for the implementation of the various code models. */
9254 initialize_aarch64_code_model (struct gcc_options
*opts
)
9256 if (opts
->x_flag_pic
)
9258 switch (opts
->x_aarch64_cmodel_var
)
9260 case AARCH64_CMODEL_TINY
:
9261 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
9263 case AARCH64_CMODEL_SMALL
:
9264 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9265 aarch64_cmodel
= (flag_pic
== 2
9266 ? AARCH64_CMODEL_SMALL_PIC
9267 : AARCH64_CMODEL_SMALL_SPIC
);
9269 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
9272 case AARCH64_CMODEL_LARGE
:
9273 sorry ("code model %qs with -f%s", "large",
9274 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
9281 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
9284 /* Implement TARGET_OPTION_SAVE. */
9287 aarch64_option_save (struct cl_target_option
*ptr
, struct gcc_options
*opts
)
9289 ptr
->x_aarch64_override_tune_string
= opts
->x_aarch64_override_tune_string
;
9292 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9293 using the information saved in PTR. */
9296 aarch64_option_restore (struct gcc_options
*opts
, struct cl_target_option
*ptr
)
9298 opts
->x_explicit_tune_core
= ptr
->x_explicit_tune_core
;
9299 selected_tune
= aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
9300 opts
->x_explicit_arch
= ptr
->x_explicit_arch
;
9301 selected_arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
9302 opts
->x_aarch64_override_tune_string
= ptr
->x_aarch64_override_tune_string
;
9304 aarch64_override_options_internal (opts
);
9307 /* Implement TARGET_OPTION_PRINT. */
9310 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
9312 const struct processor
*cpu
9313 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
9314 unsigned long isa_flags
= ptr
->x_aarch64_isa_flags
;
9315 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
9316 std::string extension
9317 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
9319 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
9320 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
9321 arch
->name
, extension
.c_str ());
9324 static GTY(()) tree aarch64_previous_fndecl
;
9327 aarch64_reset_previous_fndecl (void)
9329 aarch64_previous_fndecl
= NULL
;
9332 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9333 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9334 make sure optab availability predicates are recomputed when necessary. */
9337 aarch64_save_restore_target_globals (tree new_tree
)
9339 if (TREE_TARGET_GLOBALS (new_tree
))
9340 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
9341 else if (new_tree
== target_option_default_node
)
9342 restore_target_globals (&default_target_globals
);
9344 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
9347 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9348 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9349 of the function, if such exists. This function may be called multiple
9350 times on a single function so use aarch64_previous_fndecl to avoid
9351 setting up identical state. */
9354 aarch64_set_current_function (tree fndecl
)
9356 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
9359 tree old_tree
= (aarch64_previous_fndecl
9360 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
9363 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
9365 /* If current function has no attributes but the previous one did,
9366 use the default node. */
9367 if (!new_tree
&& old_tree
)
9368 new_tree
= target_option_default_node
;
9370 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9371 the default have been handled by aarch64_save_restore_target_globals from
9372 aarch64_pragma_target_parse. */
9373 if (old_tree
== new_tree
)
9376 aarch64_previous_fndecl
= fndecl
;
9378 /* First set the target options. */
9379 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
9381 aarch64_save_restore_target_globals (new_tree
);
9384 /* Enum describing the various ways we can handle attributes.
9385 In many cases we can reuse the generic option handling machinery. */
9387 enum aarch64_attr_opt_type
9389 aarch64_attr_mask
, /* Attribute should set a bit in target_flags. */
9390 aarch64_attr_bool
, /* Attribute sets or unsets a boolean variable. */
9391 aarch64_attr_enum
, /* Attribute sets an enum variable. */
9392 aarch64_attr_custom
/* Attribute requires a custom handling function. */
9395 /* All the information needed to handle a target attribute.
9396 NAME is the name of the attribute.
9397 ATTR_TYPE specifies the type of behavior of the attribute as described
9398 in the definition of enum aarch64_attr_opt_type.
9399 ALLOW_NEG is true if the attribute supports a "no-" form.
9400 HANDLER is the function that takes the attribute string and whether
9401 it is a pragma or attribute and handles the option. It is needed only
9402 when the ATTR_TYPE is aarch64_attr_custom.
9403 OPT_NUM is the enum specifying the option that the attribute modifies.
9404 This is needed for attributes that mirror the behavior of a command-line
9405 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9406 aarch64_attr_enum. */
9408 struct aarch64_attribute_info
9411 enum aarch64_attr_opt_type attr_type
;
9413 bool (*handler
) (const char *, const char *);
9414 enum opt_code opt_num
;
9417 /* Handle the ARCH_STR argument to the arch= target attribute.
9418 PRAGMA_OR_ATTR is used in potential error messages. */
9421 aarch64_handle_attr_arch (const char *str
, const char *pragma_or_attr
)
9423 const struct processor
*tmp_arch
= NULL
;
9424 enum aarch64_parse_opt_result parse_res
9425 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
);
9427 if (parse_res
== AARCH64_PARSE_OK
)
9429 gcc_assert (tmp_arch
);
9430 selected_arch
= tmp_arch
;
9431 explicit_arch
= selected_arch
->arch
;
9437 case AARCH64_PARSE_MISSING_ARG
:
9438 error ("missing architecture name in 'arch' target %s", pragma_or_attr
);
9440 case AARCH64_PARSE_INVALID_ARG
:
9441 error ("unknown value %qs for 'arch' target %s", str
, pragma_or_attr
);
9442 aarch64_print_hint_for_arch (str
);
9444 case AARCH64_PARSE_INVALID_FEATURE
:
9445 error ("invalid feature modifier %qs for 'arch' target %s",
9446 str
, pragma_or_attr
);
9455 /* Handle the argument CPU_STR to the cpu= target attribute.
9456 PRAGMA_OR_ATTR is used in potential error messages. */
9459 aarch64_handle_attr_cpu (const char *str
, const char *pragma_or_attr
)
9461 const struct processor
*tmp_cpu
= NULL
;
9462 enum aarch64_parse_opt_result parse_res
9463 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
);
9465 if (parse_res
== AARCH64_PARSE_OK
)
9467 gcc_assert (tmp_cpu
);
9468 selected_tune
= tmp_cpu
;
9469 explicit_tune_core
= selected_tune
->ident
;
9471 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
9472 explicit_arch
= selected_arch
->arch
;
9478 case AARCH64_PARSE_MISSING_ARG
:
9479 error ("missing cpu name in 'cpu' target %s", pragma_or_attr
);
9481 case AARCH64_PARSE_INVALID_ARG
:
9482 error ("unknown value %qs for 'cpu' target %s", str
, pragma_or_attr
);
9483 aarch64_print_hint_for_core (str
);
9485 case AARCH64_PARSE_INVALID_FEATURE
:
9486 error ("invalid feature modifier %qs for 'cpu' target %s",
9487 str
, pragma_or_attr
);
9496 /* Handle the argument STR to the tune= target attribute.
9497 PRAGMA_OR_ATTR is used in potential error messages. */
9500 aarch64_handle_attr_tune (const char *str
, const char *pragma_or_attr
)
9502 const struct processor
*tmp_tune
= NULL
;
9503 enum aarch64_parse_opt_result parse_res
9504 = aarch64_parse_tune (str
, &tmp_tune
);
9506 if (parse_res
== AARCH64_PARSE_OK
)
9508 gcc_assert (tmp_tune
);
9509 selected_tune
= tmp_tune
;
9510 explicit_tune_core
= selected_tune
->ident
;
9516 case AARCH64_PARSE_INVALID_ARG
:
9517 error ("unknown value %qs for 'tune' target %s", str
, pragma_or_attr
);
9518 aarch64_print_hint_for_core (str
);
9527 /* Parse an architecture extensions target attribute string specified in STR.
9528 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9529 if successful. Update aarch64_isa_flags to reflect the ISA features
9531 PRAGMA_OR_ATTR is used in potential error messages. */
9534 aarch64_handle_attr_isa_flags (char *str
, const char *pragma_or_attr
)
9536 enum aarch64_parse_opt_result parse_res
;
9537 unsigned long isa_flags
= aarch64_isa_flags
;
9539 /* We allow "+nothing" in the beginning to clear out all architectural
9540 features if the user wants to handpick specific features. */
9541 if (strncmp ("+nothing", str
, 8) == 0)
9547 parse_res
= aarch64_parse_extension (str
, &isa_flags
);
9549 if (parse_res
== AARCH64_PARSE_OK
)
9551 aarch64_isa_flags
= isa_flags
;
9557 case AARCH64_PARSE_MISSING_ARG
:
9558 error ("missing feature modifier in target %s %qs",
9559 pragma_or_attr
, str
);
9562 case AARCH64_PARSE_INVALID_FEATURE
:
9563 error ("invalid feature modifier in target %s %qs",
9564 pragma_or_attr
, str
);
9574 /* The target attributes that we support. On top of these we also support just
9575 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9576 handled explicitly in aarch64_process_one_target_attr. */
9578 static const struct aarch64_attribute_info aarch64_attributes
[] =
9580 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
9581 OPT_mgeneral_regs_only
},
9582 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
9583 OPT_mfix_cortex_a53_835769
},
9584 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
9585 OPT_mfix_cortex_a53_843419
},
9586 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
9587 { "strict-align", aarch64_attr_mask
, false, NULL
, OPT_mstrict_align
},
9588 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
9589 OPT_momit_leaf_frame_pointer
},
9590 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
9591 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
9593 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
9594 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
9596 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
9597 OPT_msign_return_address_
},
9598 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
9601 /* Parse ARG_STR which contains the definition of one target attribute.
9602 Show appropriate errors if any or return true if the attribute is valid.
9603 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9604 we're processing a target attribute or pragma. */
9607 aarch64_process_one_target_attr (char *arg_str
, const char* pragma_or_attr
)
9609 bool invert
= false;
9611 size_t len
= strlen (arg_str
);
9615 error ("malformed target %s", pragma_or_attr
);
9619 char *str_to_check
= (char *) alloca (len
+ 1);
9620 strcpy (str_to_check
, arg_str
);
9622 /* Skip leading whitespace. */
9623 while (*str_to_check
== ' ' || *str_to_check
== '\t')
9626 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9627 It is easier to detect and handle it explicitly here rather than going
9628 through the machinery for the rest of the target attributes in this
9630 if (*str_to_check
== '+')
9631 return aarch64_handle_attr_isa_flags (str_to_check
, pragma_or_attr
);
9633 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
9638 char *arg
= strchr (str_to_check
, '=');
9640 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9641 and point ARG to "foo". */
9647 const struct aarch64_attribute_info
*p_attr
;
9649 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
9651 /* If the names don't match up, or the user has given an argument
9652 to an attribute that doesn't accept one, or didn't give an argument
9653 to an attribute that expects one, fail to match. */
9654 if (strcmp (str_to_check
, p_attr
->name
) != 0)
9658 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
9659 || p_attr
->attr_type
== aarch64_attr_enum
;
9661 if (attr_need_arg_p
^ (arg
!= NULL
))
9663 error ("target %s %qs does not accept an argument",
9664 pragma_or_attr
, str_to_check
);
9668 /* If the name matches but the attribute does not allow "no-" versions
9669 then we can't match. */
9670 if (invert
&& !p_attr
->allow_neg
)
9672 error ("target %s %qs does not allow a negated form",
9673 pragma_or_attr
, str_to_check
);
9677 switch (p_attr
->attr_type
)
9679 /* Has a custom handler registered.
9680 For example, cpu=, arch=, tune=. */
9681 case aarch64_attr_custom
:
9682 gcc_assert (p_attr
->handler
);
9683 if (!p_attr
->handler (arg
, pragma_or_attr
))
9687 /* Either set or unset a boolean option. */
9688 case aarch64_attr_bool
:
9690 struct cl_decoded_option decoded
;
9692 generate_option (p_attr
->opt_num
, NULL
, !invert
,
9693 CL_TARGET
, &decoded
);
9694 aarch64_handle_option (&global_options
, &global_options_set
,
9695 &decoded
, input_location
);
9698 /* Set or unset a bit in the target_flags. aarch64_handle_option
9699 should know what mask to apply given the option number. */
9700 case aarch64_attr_mask
:
9702 struct cl_decoded_option decoded
;
9703 /* We only need to specify the option number.
9704 aarch64_handle_option will know which mask to apply. */
9705 decoded
.opt_index
= p_attr
->opt_num
;
9706 decoded
.value
= !invert
;
9707 aarch64_handle_option (&global_options
, &global_options_set
,
9708 &decoded
, input_location
);
9711 /* Use the option setting machinery to set an option to an enum. */
9712 case aarch64_attr_enum
:
9717 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
9721 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
9722 NULL
, DK_UNSPECIFIED
, input_location
,
9727 error ("target %s %s=%s is not valid",
9728 pragma_or_attr
, str_to_check
, arg
);
9737 /* If we reached here we either have found an attribute and validated
9738 it or didn't match any. If we matched an attribute but its arguments
9739 were malformed we will have returned false already. */
9743 /* Count how many times the character C appears in
9744 NULL-terminated string STR. */
9747 num_occurences_in_str (char c
, char *str
)
9749 unsigned int res
= 0;
9750 while (*str
!= '\0')
9761 /* Parse the tree in ARGS that contains the target attribute information
9762 and update the global target options space. PRAGMA_OR_ATTR is a string
9763 to be used in error messages, specifying whether this is processing
9764 a target attribute or a target pragma. */
9767 aarch64_process_target_attr (tree args
, const char* pragma_or_attr
)
9769 if (TREE_CODE (args
) == TREE_LIST
)
9773 tree head
= TREE_VALUE (args
);
9776 if (!aarch64_process_target_attr (head
, pragma_or_attr
))
9779 args
= TREE_CHAIN (args
);
9785 if (TREE_CODE (args
) != STRING_CST
)
9787 error ("attribute %<target%> argument not a string");
9791 size_t len
= strlen (TREE_STRING_POINTER (args
));
9792 char *str_to_check
= (char *) alloca (len
+ 1);
9793 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
9797 error ("malformed target %s value", pragma_or_attr
);
9801 /* Used to catch empty spaces between commas i.e.
9802 attribute ((target ("attr1,,attr2"))). */
9803 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
9805 /* Handle multiple target attributes separated by ','. */
9806 char *token
= strtok (str_to_check
, ",");
9808 unsigned int num_attrs
= 0;
9812 if (!aarch64_process_one_target_attr (token
, pragma_or_attr
))
9814 error ("target %s %qs is invalid", pragma_or_attr
, token
);
9818 token
= strtok (NULL
, ",");
9821 if (num_attrs
!= num_commas
+ 1)
9823 error ("malformed target %s list %qs",
9824 pragma_or_attr
, TREE_STRING_POINTER (args
));
9831 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9832 process attribute ((target ("..."))). */
9835 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
9837 struct cl_target_option cur_target
;
9840 tree new_target
, new_optimize
;
9841 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
9843 /* If what we're processing is the current pragma string then the
9844 target option node is already stored in target_option_current_node
9845 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9846 having to re-parse the string. This is especially useful to keep
9847 arm_neon.h compile times down since that header contains a lot
9848 of intrinsics enclosed in pragmas. */
9849 if (!existing_target
&& args
== current_target_pragma
)
9851 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
9854 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
9856 old_optimize
= build_optimization_node (&global_options
);
9857 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
9859 /* If the function changed the optimization levels as well as setting
9860 target options, start with the optimizations specified. */
9861 if (func_optimize
&& func_optimize
!= old_optimize
)
9862 cl_optimization_restore (&global_options
,
9863 TREE_OPTIMIZATION (func_optimize
));
9865 /* Save the current target options to restore at the end. */
9866 cl_target_option_save (&cur_target
, &global_options
);
9868 /* If fndecl already has some target attributes applied to it, unpack
9869 them so that we add this attribute on top of them, rather than
9870 overwriting them. */
9871 if (existing_target
)
9873 struct cl_target_option
*existing_options
9874 = TREE_TARGET_OPTION (existing_target
);
9876 if (existing_options
)
9877 cl_target_option_restore (&global_options
, existing_options
);
9880 cl_target_option_restore (&global_options
,
9881 TREE_TARGET_OPTION (target_option_current_node
));
9884 ret
= aarch64_process_target_attr (args
, "attribute");
9886 /* Set up any additional state. */
9889 aarch64_override_options_internal (&global_options
);
9890 /* Initialize SIMD builtins if we haven't already.
9891 Set current_target_pragma to NULL for the duration so that
9892 the builtin initialization code doesn't try to tag the functions
9893 being built with the attributes specified by any current pragma, thus
9894 going into an infinite recursion. */
9897 tree saved_current_target_pragma
= current_target_pragma
;
9898 current_target_pragma
= NULL
;
9899 aarch64_init_simd_builtins ();
9900 current_target_pragma
= saved_current_target_pragma
;
9902 new_target
= build_target_option_node (&global_options
);
9907 new_optimize
= build_optimization_node (&global_options
);
9911 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
9913 if (old_optimize
!= new_optimize
)
9914 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
9917 cl_target_option_restore (&global_options
, &cur_target
);
9919 if (old_optimize
!= new_optimize
)
9920 cl_optimization_restore (&global_options
,
9921 TREE_OPTIMIZATION (old_optimize
));
9925 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9926 tri-bool options (yes, no, don't care) and the default value is
9927 DEF, determine whether to reject inlining. */
9930 aarch64_tribools_ok_for_inlining_p (int caller
, int callee
,
9931 int dont_care
, int def
)
9933 /* If the callee doesn't care, always allow inlining. */
9934 if (callee
== dont_care
)
9937 /* If the caller doesn't care, always allow inlining. */
9938 if (caller
== dont_care
)
9941 /* Otherwise, allow inlining if either the callee and caller values
9942 agree, or if the callee is using the default value. */
9943 return (callee
== caller
|| callee
== def
);
9946 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9947 to inline CALLEE into CALLER based on target-specific info.
9948 Make sure that the caller and callee have compatible architectural
9949 features. Then go through the other possible target attributes
9950 and see if they can block inlining. Try not to reject always_inline
9951 callees unless they are incompatible architecturally. */
9954 aarch64_can_inline_p (tree caller
, tree callee
)
9956 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
9957 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
9959 /* If callee has no option attributes, then it is ok to inline. */
9963 struct cl_target_option
*caller_opts
9964 = TREE_TARGET_OPTION (caller_tree
? caller_tree
9965 : target_option_default_node
);
9967 struct cl_target_option
*callee_opts
= TREE_TARGET_OPTION (callee_tree
);
9970 /* Callee's ISA flags should be a subset of the caller's. */
9971 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
9972 != callee_opts
->x_aarch64_isa_flags
)
9975 /* Allow non-strict aligned functions inlining into strict
9977 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
9978 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
9979 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
9980 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
9983 bool always_inline
= lookup_attribute ("always_inline",
9984 DECL_ATTRIBUTES (callee
));
9986 /* If the architectural features match up and the callee is always_inline
9987 then the other attributes don't matter. */
9991 if (caller_opts
->x_aarch64_cmodel_var
9992 != callee_opts
->x_aarch64_cmodel_var
)
9995 if (caller_opts
->x_aarch64_tls_dialect
9996 != callee_opts
->x_aarch64_tls_dialect
)
9999 /* Honour explicit requests to workaround errata. */
10000 if (!aarch64_tribools_ok_for_inlining_p (
10001 caller_opts
->x_aarch64_fix_a53_err835769
,
10002 callee_opts
->x_aarch64_fix_a53_err835769
,
10003 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
10006 if (!aarch64_tribools_ok_for_inlining_p (
10007 caller_opts
->x_aarch64_fix_a53_err843419
,
10008 callee_opts
->x_aarch64_fix_a53_err843419
,
10009 2, TARGET_FIX_ERR_A53_843419
))
10012 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10013 caller and calle and they don't match up, reject inlining. */
10014 if (!aarch64_tribools_ok_for_inlining_p (
10015 caller_opts
->x_flag_omit_leaf_frame_pointer
,
10016 callee_opts
->x_flag_omit_leaf_frame_pointer
,
10020 /* If the callee has specific tuning overrides, respect them. */
10021 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
10022 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
10025 /* If the user specified tuning override strings for the
10026 caller and callee and they don't match up, reject inlining.
10027 We just do a string compare here, we don't analyze the meaning
10028 of the string, as it would be too costly for little gain. */
10029 if (callee_opts
->x_aarch64_override_tune_string
10030 && caller_opts
->x_aarch64_override_tune_string
10031 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
10032 caller_opts
->x_aarch64_override_tune_string
) != 0))
10038 /* Return true if SYMBOL_REF X binds locally. */
10041 aarch64_symbol_binds_local_p (const_rtx x
)
10043 return (SYMBOL_REF_DECL (x
)
10044 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
10045 : SYMBOL_REF_LOCAL_P (x
));
10048 /* Return true if SYMBOL_REF X is thread local */
10050 aarch64_tls_symbol_p (rtx x
)
10052 if (! TARGET_HAVE_TLS
)
10055 if (GET_CODE (x
) != SYMBOL_REF
)
10058 return SYMBOL_REF_TLS_MODEL (x
) != 0;
10061 /* Classify a TLS symbol into one of the TLS kinds. */
10062 enum aarch64_symbol_type
10063 aarch64_classify_tls_symbol (rtx x
)
10065 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
10069 case TLS_MODEL_GLOBAL_DYNAMIC
:
10070 case TLS_MODEL_LOCAL_DYNAMIC
:
10071 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
10073 case TLS_MODEL_INITIAL_EXEC
:
10074 switch (aarch64_cmodel
)
10076 case AARCH64_CMODEL_TINY
:
10077 case AARCH64_CMODEL_TINY_PIC
:
10078 return SYMBOL_TINY_TLSIE
;
10080 return SYMBOL_SMALL_TLSIE
;
10083 case TLS_MODEL_LOCAL_EXEC
:
10084 if (aarch64_tls_size
== 12)
10085 return SYMBOL_TLSLE12
;
10086 else if (aarch64_tls_size
== 24)
10087 return SYMBOL_TLSLE24
;
10088 else if (aarch64_tls_size
== 32)
10089 return SYMBOL_TLSLE32
;
10090 else if (aarch64_tls_size
== 48)
10091 return SYMBOL_TLSLE48
;
10093 gcc_unreachable ();
10095 case TLS_MODEL_EMULATED
:
10096 case TLS_MODEL_NONE
:
10097 return SYMBOL_FORCE_TO_MEM
;
10100 gcc_unreachable ();
10104 /* Return the method that should be used to access SYMBOL_REF or
10107 enum aarch64_symbol_type
10108 aarch64_classify_symbol (rtx x
, rtx offset
)
10110 if (GET_CODE (x
) == LABEL_REF
)
10112 switch (aarch64_cmodel
)
10114 case AARCH64_CMODEL_LARGE
:
10115 return SYMBOL_FORCE_TO_MEM
;
10117 case AARCH64_CMODEL_TINY_PIC
:
10118 case AARCH64_CMODEL_TINY
:
10119 return SYMBOL_TINY_ABSOLUTE
;
10121 case AARCH64_CMODEL_SMALL_SPIC
:
10122 case AARCH64_CMODEL_SMALL_PIC
:
10123 case AARCH64_CMODEL_SMALL
:
10124 return SYMBOL_SMALL_ABSOLUTE
;
10127 gcc_unreachable ();
10131 if (GET_CODE (x
) == SYMBOL_REF
)
10133 if (aarch64_tls_symbol_p (x
))
10134 return aarch64_classify_tls_symbol (x
);
10136 switch (aarch64_cmodel
)
10138 case AARCH64_CMODEL_TINY
:
10139 /* When we retrieve symbol + offset address, we have to make sure
10140 the offset does not cause overflow of the final address. But
10141 we have no way of knowing the address of symbol at compile time
10142 so we can't accurately say if the distance between the PC and
10143 symbol + offset is outside the addressible range of +/-1M in the
10144 TINY code model. So we rely on images not being greater than
10145 1M and cap the offset at 1M and anything beyond 1M will have to
10146 be loaded using an alternative mechanism. Furthermore if the
10147 symbol is a weak reference to something that isn't known to
10148 resolve to a symbol in this module, then force to memory. */
10149 if ((SYMBOL_REF_WEAK (x
)
10150 && !aarch64_symbol_binds_local_p (x
))
10151 || INTVAL (offset
) < -1048575 || INTVAL (offset
) > 1048575)
10152 return SYMBOL_FORCE_TO_MEM
;
10153 return SYMBOL_TINY_ABSOLUTE
;
10155 case AARCH64_CMODEL_SMALL
:
10156 /* Same reasoning as the tiny code model, but the offset cap here is
10158 if ((SYMBOL_REF_WEAK (x
)
10159 && !aarch64_symbol_binds_local_p (x
))
10160 || !IN_RANGE (INTVAL (offset
), HOST_WIDE_INT_C (-4294967263),
10161 HOST_WIDE_INT_C (4294967264)))
10162 return SYMBOL_FORCE_TO_MEM
;
10163 return SYMBOL_SMALL_ABSOLUTE
;
10165 case AARCH64_CMODEL_TINY_PIC
:
10166 if (!aarch64_symbol_binds_local_p (x
))
10167 return SYMBOL_TINY_GOT
;
10168 return SYMBOL_TINY_ABSOLUTE
;
10170 case AARCH64_CMODEL_SMALL_SPIC
:
10171 case AARCH64_CMODEL_SMALL_PIC
:
10172 if (!aarch64_symbol_binds_local_p (x
))
10173 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
10174 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
10175 return SYMBOL_SMALL_ABSOLUTE
;
10177 case AARCH64_CMODEL_LARGE
:
10178 /* This is alright even in PIC code as the constant
10179 pool reference is always PC relative and within
10180 the same translation unit. */
10181 if (CONSTANT_POOL_ADDRESS_P (x
))
10182 return SYMBOL_SMALL_ABSOLUTE
;
10184 return SYMBOL_FORCE_TO_MEM
;
10187 gcc_unreachable ();
10191 /* By default push everything into the constant pool. */
10192 return SYMBOL_FORCE_TO_MEM
;
10196 aarch64_constant_address_p (rtx x
)
10198 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
10202 aarch64_legitimate_pic_operand_p (rtx x
)
10204 if (GET_CODE (x
) == SYMBOL_REF
10205 || (GET_CODE (x
) == CONST
10206 && GET_CODE (XEXP (x
, 0)) == PLUS
10207 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
10213 /* Return true if X holds either a quarter-precision or
10214 floating-point +0.0 constant. */
10216 aarch64_valid_floating_const (machine_mode mode
, rtx x
)
10218 if (!CONST_DOUBLE_P (x
))
10221 if (aarch64_float_const_zero_rtx_p (x
))
10224 /* We only handle moving 0.0 to a TFmode register. */
10225 if (!(mode
== SFmode
|| mode
== DFmode
))
10228 return aarch64_float_const_representable_p (x
);
10232 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
10234 /* Do not allow vector struct mode constants. We could support
10235 0 and -1 easily, but they need support in aarch64-simd.md. */
10236 if (TARGET_SIMD
&& aarch64_vect_struct_mode_p (mode
))
10239 /* This could probably go away because
10240 we now decompose CONST_INTs according to expand_mov_immediate. */
10241 if ((GET_CODE (x
) == CONST_VECTOR
10242 && aarch64_simd_valid_immediate (x
, mode
, false, NULL
))
10243 || CONST_INT_P (x
) || aarch64_valid_floating_const (mode
, x
))
10244 return !targetm
.cannot_force_const_mem (mode
, x
);
10246 if (GET_CODE (x
) == HIGH
10247 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
10250 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10251 so spilling them is better than rematerialization. */
10252 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
10255 return aarch64_constant_address_p (x
);
10259 aarch64_load_tp (rtx target
)
10262 || GET_MODE (target
) != Pmode
10263 || !register_operand (target
, Pmode
))
10264 target
= gen_reg_rtx (Pmode
);
10266 /* Can return in any reg. */
10267 emit_insn (gen_aarch64_load_tp_hard (target
));
10271 /* On AAPCS systems, this is the "struct __va_list". */
10272 static GTY(()) tree va_list_type
;
10274 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10275 Return the type to use as __builtin_va_list.
10277 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10289 aarch64_build_builtin_va_list (void)
10292 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
10294 /* Create the type. */
10295 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
10296 /* Give it the required name. */
10297 va_list_name
= build_decl (BUILTINS_LOCATION
,
10299 get_identifier ("__va_list"),
10301 DECL_ARTIFICIAL (va_list_name
) = 1;
10302 TYPE_NAME (va_list_type
) = va_list_name
;
10303 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
10305 /* Create the fields. */
10306 f_stack
= build_decl (BUILTINS_LOCATION
,
10307 FIELD_DECL
, get_identifier ("__stack"),
10309 f_grtop
= build_decl (BUILTINS_LOCATION
,
10310 FIELD_DECL
, get_identifier ("__gr_top"),
10312 f_vrtop
= build_decl (BUILTINS_LOCATION
,
10313 FIELD_DECL
, get_identifier ("__vr_top"),
10315 f_groff
= build_decl (BUILTINS_LOCATION
,
10316 FIELD_DECL
, get_identifier ("__gr_offs"),
10317 integer_type_node
);
10318 f_vroff
= build_decl (BUILTINS_LOCATION
,
10319 FIELD_DECL
, get_identifier ("__vr_offs"),
10320 integer_type_node
);
10322 /* Tell tree-stdarg pass about our internal offset fields.
10323 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
10324 purpose to identify whether the code is updating va_list internal
10325 offset fields through irregular way. */
10326 va_list_gpr_counter_field
= f_groff
;
10327 va_list_fpr_counter_field
= f_vroff
;
10329 DECL_ARTIFICIAL (f_stack
) = 1;
10330 DECL_ARTIFICIAL (f_grtop
) = 1;
10331 DECL_ARTIFICIAL (f_vrtop
) = 1;
10332 DECL_ARTIFICIAL (f_groff
) = 1;
10333 DECL_ARTIFICIAL (f_vroff
) = 1;
10335 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
10336 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
10337 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
10338 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
10339 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
10341 TYPE_FIELDS (va_list_type
) = f_stack
;
10342 DECL_CHAIN (f_stack
) = f_grtop
;
10343 DECL_CHAIN (f_grtop
) = f_vrtop
;
10344 DECL_CHAIN (f_vrtop
) = f_groff
;
10345 DECL_CHAIN (f_groff
) = f_vroff
;
10347 /* Compute its layout. */
10348 layout_type (va_list_type
);
10350 return va_list_type
;
10353 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10355 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
10357 const CUMULATIVE_ARGS
*cum
;
10358 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
10359 tree stack
, grtop
, vrtop
, groff
, vroff
;
10361 int gr_save_area_size
= cfun
->va_list_gpr_size
;
10362 int vr_save_area_size
= cfun
->va_list_fpr_size
;
10365 cum
= &crtl
->args
.info
;
10366 if (cfun
->va_list_gpr_size
)
10367 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
10368 cfun
->va_list_gpr_size
);
10369 if (cfun
->va_list_fpr_size
)
10370 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
10371 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
10375 gcc_assert (cum
->aapcs_nvrn
== 0);
10376 vr_save_area_size
= 0;
10379 f_stack
= TYPE_FIELDS (va_list_type_node
);
10380 f_grtop
= DECL_CHAIN (f_stack
);
10381 f_vrtop
= DECL_CHAIN (f_grtop
);
10382 f_groff
= DECL_CHAIN (f_vrtop
);
10383 f_vroff
= DECL_CHAIN (f_groff
);
10385 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
10387 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
10389 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
10391 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
10393 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
10396 /* Emit code to initialize STACK, which points to the next varargs stack
10397 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10398 by named arguments. STACK is 8-byte aligned. */
10399 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
10400 if (cum
->aapcs_stack_size
> 0)
10401 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
10402 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
10403 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10405 /* Emit code to initialize GRTOP, the top of the GR save area.
10406 virtual_incoming_args_rtx should have been 16 byte aligned. */
10407 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
10408 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
10409 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10411 /* Emit code to initialize VRTOP, the top of the VR save area.
10412 This address is gr_save_area_bytes below GRTOP, rounded
10413 down to the next 16-byte boundary. */
10414 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
10415 vr_offset
= ROUND_UP (gr_save_area_size
,
10416 STACK_BOUNDARY
/ BITS_PER_UNIT
);
10419 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
10420 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
10421 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10423 /* Emit code to initialize GROFF, the offset from GRTOP of the
10424 next GPR argument. */
10425 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
10426 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
10427 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10429 /* Likewise emit code to initialize VROFF, the offset from FTOP
10430 of the next VR argument. */
10431 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
10432 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
10433 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10436 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10439 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
10440 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
10444 bool is_ha
; /* is HFA or HVA. */
10445 bool dw_align
; /* double-word align. */
10446 machine_mode ag_mode
= VOIDmode
;
10450 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
10451 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
10452 HOST_WIDE_INT size
, rsize
, adjust
, align
;
10453 tree t
, u
, cond1
, cond2
;
10455 indirect_p
= pass_by_reference (NULL
, TYPE_MODE (type
), type
, false);
10457 type
= build_pointer_type (type
);
10459 mode
= TYPE_MODE (type
);
10461 f_stack
= TYPE_FIELDS (va_list_type_node
);
10462 f_grtop
= DECL_CHAIN (f_stack
);
10463 f_vrtop
= DECL_CHAIN (f_grtop
);
10464 f_groff
= DECL_CHAIN (f_vrtop
);
10465 f_vroff
= DECL_CHAIN (f_groff
);
10467 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
10468 f_stack
, NULL_TREE
);
10469 size
= int_size_in_bytes (type
);
10470 align
= aarch64_function_arg_alignment (mode
, type
) / BITS_PER_UNIT
;
10474 if (aarch64_vfp_is_call_or_return_candidate (mode
,
10480 /* TYPE passed in fp/simd registers. */
10482 aarch64_err_no_fpadvsimd (mode
, "varargs");
10484 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
10485 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
10486 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
10487 unshare_expr (valist
), f_vroff
, NULL_TREE
);
10489 rsize
= nregs
* UNITS_PER_VREG
;
10493 if (BYTES_BIG_ENDIAN
&& GET_MODE_SIZE (ag_mode
) < UNITS_PER_VREG
)
10494 adjust
= UNITS_PER_VREG
- GET_MODE_SIZE (ag_mode
);
10496 else if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
10497 && size
< UNITS_PER_VREG
)
10499 adjust
= UNITS_PER_VREG
- size
;
10504 /* TYPE passed in general registers. */
10505 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
10506 unshare_expr (valist
), f_grtop
, NULL_TREE
);
10507 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
10508 unshare_expr (valist
), f_groff
, NULL_TREE
);
10509 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
10510 nregs
= rsize
/ UNITS_PER_WORD
;
10515 if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
10516 && size
< UNITS_PER_WORD
)
10518 adjust
= UNITS_PER_WORD
- size
;
10522 /* Get a local temporary for the field value. */
10523 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
10525 /* Emit code to branch if off >= 0. */
10526 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
10527 build_int_cst (TREE_TYPE (off
), 0));
10528 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
10532 /* Emit: offs = (offs + 15) & -16. */
10533 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
10534 build_int_cst (TREE_TYPE (off
), 15));
10535 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
10536 build_int_cst (TREE_TYPE (off
), -16));
10537 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
10542 /* Update ap.__[g|v]r_offs */
10543 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
10544 build_int_cst (TREE_TYPE (off
), rsize
));
10545 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
10549 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
10551 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10552 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
10553 build_int_cst (TREE_TYPE (f_off
), 0));
10554 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
10556 /* String up: make sure the assignment happens before the use. */
10557 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
10558 COND_EXPR_ELSE (cond1
) = t
;
10560 /* Prepare the trees handling the argument that is passed on the stack;
10561 the top level node will store in ON_STACK. */
10562 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
10565 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10566 t
= fold_convert (intDI_type_node
, arg
);
10567 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
10568 build_int_cst (TREE_TYPE (t
), 15));
10569 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
10570 build_int_cst (TREE_TYPE (t
), -16));
10571 t
= fold_convert (TREE_TYPE (arg
), t
);
10572 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
10576 /* Advance ap.__stack */
10577 t
= fold_convert (intDI_type_node
, arg
);
10578 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
10579 build_int_cst (TREE_TYPE (t
), size
+ 7));
10580 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
10581 build_int_cst (TREE_TYPE (t
), -8));
10582 t
= fold_convert (TREE_TYPE (arg
), t
);
10583 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
10584 /* String up roundup and advance. */
10586 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
10587 /* String up with arg */
10588 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
10589 /* Big-endianness related address adjustment. */
10590 if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
10591 && size
< UNITS_PER_WORD
)
10593 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
10594 size_int (UNITS_PER_WORD
- size
));
10595 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
10598 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
10599 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
10601 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10604 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
10605 build_int_cst (TREE_TYPE (off
), adjust
));
10607 t
= fold_convert (sizetype
, t
);
10608 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
10612 /* type ha; // treat as "struct {ftype field[n];}"
10613 ... [computing offs]
10614 for (i = 0; i <nregs; ++i, offs += 16)
10615 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10618 tree tmp_ha
, field_t
, field_ptr_t
;
10620 /* Declare a local variable. */
10621 tmp_ha
= create_tmp_var_raw (type
, "ha");
10622 gimple_add_tmp_var (tmp_ha
);
10624 /* Establish the base type. */
10628 field_t
= float_type_node
;
10629 field_ptr_t
= float_ptr_type_node
;
10632 field_t
= double_type_node
;
10633 field_ptr_t
= double_ptr_type_node
;
10636 field_t
= long_double_type_node
;
10637 field_ptr_t
= long_double_ptr_type_node
;
10640 field_t
= aarch64_fp16_type_node
;
10641 field_ptr_t
= aarch64_fp16_ptr_type_node
;
10646 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
10647 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
10648 field_ptr_t
= build_pointer_type (field_t
);
10655 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
10656 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
10658 t
= fold_convert (field_ptr_t
, addr
);
10659 t
= build2 (MODIFY_EXPR
, field_t
,
10660 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
10661 build1 (INDIRECT_REF
, field_t
, t
));
10663 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10664 for (i
= 1; i
< nregs
; ++i
)
10666 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
10667 u
= fold_convert (field_ptr_t
, addr
);
10668 u
= build2 (MODIFY_EXPR
, field_t
,
10669 build2 (MEM_REF
, field_t
, tmp_ha
,
10670 build_int_cst (field_ptr_t
,
10672 int_size_in_bytes (field_t
)))),
10673 build1 (INDIRECT_REF
, field_t
, u
));
10674 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
10677 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
10678 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
10681 COND_EXPR_ELSE (cond2
) = t
;
10682 addr
= fold_convert (build_pointer_type (type
), cond1
);
10683 addr
= build_va_arg_indirect_ref (addr
);
10686 addr
= build_va_arg_indirect_ref (addr
);
10691 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10694 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
, machine_mode mode
,
10695 tree type
, int *pretend_size ATTRIBUTE_UNUSED
,
10698 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
10699 CUMULATIVE_ARGS local_cum
;
10700 int gr_saved
= cfun
->va_list_gpr_size
;
10701 int vr_saved
= cfun
->va_list_fpr_size
;
10703 /* The caller has advanced CUM up to, but not beyond, the last named
10704 argument. Advance a local copy of CUM past the last "real" named
10705 argument, to find out how many registers are left over. */
10707 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), mode
, type
, true);
10709 /* Found out how many registers we need to save.
10710 Honor tree-stdvar analysis results. */
10711 if (cfun
->va_list_gpr_size
)
10712 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
10713 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
10714 if (cfun
->va_list_fpr_size
)
10715 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
10716 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
10720 gcc_assert (local_cum
.aapcs_nvrn
== 0);
10730 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10731 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
10732 - gr_saved
* UNITS_PER_WORD
);
10733 mem
= gen_frame_mem (BLKmode
, ptr
);
10734 set_mem_alias_set (mem
, get_varargs_alias_set ());
10736 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
10741 /* We can't use move_block_from_reg, because it will use
10742 the wrong mode, storing D regs only. */
10743 machine_mode mode
= TImode
;
10744 int off
, i
, vr_start
;
10746 /* Set OFF to the offset from virtual_incoming_args_rtx of
10747 the first vector register. The VR save area lies below
10748 the GR one, and is aligned to 16 bytes. */
10749 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
10750 STACK_BOUNDARY
/ BITS_PER_UNIT
);
10751 off
-= vr_saved
* UNITS_PER_VREG
;
10753 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
10754 for (i
= 0; i
< vr_saved
; ++i
)
10758 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
10759 mem
= gen_frame_mem (mode
, ptr
);
10760 set_mem_alias_set (mem
, get_varargs_alias_set ());
10761 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
10762 off
+= UNITS_PER_VREG
;
10767 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10768 any complication of having crtl->args.pretend_args_size changed. */
10769 cfun
->machine
->frame
.saved_varargs_size
10770 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
10771 STACK_BOUNDARY
/ BITS_PER_UNIT
)
10772 + vr_saved
* UNITS_PER_VREG
);
10776 aarch64_conditional_register_usage (void)
10781 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
10784 call_used_regs
[i
] = 1;
10789 /* Walk down the type tree of TYPE counting consecutive base elements.
10790 If *MODEP is VOIDmode, then set it to the first valid floating point
10791 type. If a non-floating point type is found, or if a floating point
10792 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10793 otherwise return the count in the sub-tree. */
10795 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
10798 HOST_WIDE_INT size
;
10800 switch (TREE_CODE (type
))
10803 mode
= TYPE_MODE (type
);
10804 if (mode
!= DFmode
&& mode
!= SFmode
10805 && mode
!= TFmode
&& mode
!= HFmode
)
10808 if (*modep
== VOIDmode
)
10811 if (*modep
== mode
)
10817 mode
= TYPE_MODE (TREE_TYPE (type
));
10818 if (mode
!= DFmode
&& mode
!= SFmode
10819 && mode
!= TFmode
&& mode
!= HFmode
)
10822 if (*modep
== VOIDmode
)
10825 if (*modep
== mode
)
10831 /* Use V2SImode and V4SImode as representatives of all 64-bit
10832 and 128-bit vector types. */
10833 size
= int_size_in_bytes (type
);
10846 if (*modep
== VOIDmode
)
10849 /* Vector modes are considered to be opaque: two vectors are
10850 equivalent for the purposes of being homogeneous aggregates
10851 if they are the same size. */
10852 if (*modep
== mode
)
10860 tree index
= TYPE_DOMAIN (type
);
10862 /* Can't handle incomplete types nor sizes that are not
10864 if (!COMPLETE_TYPE_P (type
)
10865 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
10868 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
10871 || !TYPE_MAX_VALUE (index
)
10872 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
10873 || !TYPE_MIN_VALUE (index
)
10874 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
10878 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
10879 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
10881 /* There must be no padding. */
10882 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
10894 /* Can't handle incomplete types nor sizes that are not
10896 if (!COMPLETE_TYPE_P (type
)
10897 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
10900 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
10902 if (TREE_CODE (field
) != FIELD_DECL
)
10905 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
10908 count
+= sub_count
;
10911 /* There must be no padding. */
10912 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
10919 case QUAL_UNION_TYPE
:
10921 /* These aren't very interesting except in a degenerate case. */
10926 /* Can't handle incomplete types nor sizes that are not
10928 if (!COMPLETE_TYPE_P (type
)
10929 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
10932 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
10934 if (TREE_CODE (field
) != FIELD_DECL
)
10937 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
10940 count
= count
> sub_count
? count
: sub_count
;
10943 /* There must be no padding. */
10944 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
10957 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10958 type as described in AAPCS64 \S 4.1.2.
10960 See the comment above aarch64_composite_type_p for the notes on MODE. */
10963 aarch64_short_vector_p (const_tree type
,
10966 HOST_WIDE_INT size
= -1;
10968 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
10969 size
= int_size_in_bytes (type
);
10970 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
10971 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
10972 size
= GET_MODE_SIZE (mode
);
10974 return (size
== 8 || size
== 16);
10977 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10978 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10979 array types. The C99 floating-point complex types are also considered
10980 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10981 types, which are GCC extensions and out of the scope of AAPCS64, are
10982 treated as composite types here as well.
10984 Note that MODE itself is not sufficient in determining whether a type
10985 is such a composite type or not. This is because
10986 stor-layout.c:compute_record_mode may have already changed the MODE
10987 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10988 structure with only one field may have its MODE set to the mode of the
10989 field. Also an integer mode whose size matches the size of the
10990 RECORD_TYPE type may be used to substitute the original mode
10991 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10992 solely relied on. */
10995 aarch64_composite_type_p (const_tree type
,
10998 if (aarch64_short_vector_p (type
, mode
))
11001 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
11004 if (mode
== BLKmode
11005 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
11006 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
/* Return TRUE if an argument, whose type is described by TYPE and MODE,
   shall be passed or returned in simd/fp register(s) (providing these
   parameter passing registers are available).

   Upon successful return, *COUNT returns the number of needed registers,
   *BASE_MODE returns the mode of the individual register and when IS_HA
   is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
   floating-point aggregate or a homogeneous short-vector aggregate.  */
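/* Illustrative sketch (an assumption added for exposition, not from the
   original source): for an argument of type

     struct hfa { float x; float y; float z; };

   this function would return true with *count == 3, *base_mode == SFmode
   and *is_ha == true, so the aggregate can be passed in three consecutive
   S registers when they are available.  */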
static bool
aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
					 const_tree type,
					 machine_mode *base_mode,
					 int *count,
					 bool *is_ha)
{
  machine_mode new_mode = VOIDmode;
  bool composite_p = aarch64_composite_type_p (type, mode);

  if (is_ha != NULL) *is_ha = false;

  if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
      || aarch64_short_vector_p (type, mode))
    {
      *count = 1;
      new_mode = mode;
    }
  else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
    {
      if (is_ha != NULL) *is_ha = true;
      *count = 2;
      new_mode = GET_MODE_INNER (mode);
    }
  else if (type && composite_p)
    {
      int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);

      if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
	{
	  if (is_ha != NULL) *is_ha = true;
	  *count = ag_count;
	}
      else
	return false;
    }
  else
    return false;

  *base_mode = new_mode;
  return true;
}
/* Implement TARGET_STRUCT_VALUE_RTX.  */

static rtx
aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
			  int incoming ATTRIBUTE_UNUSED)
{
  return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
}
/* Implements target hook vector_mode_supported_p.  */
static bool
aarch64_vector_mode_supported_p (machine_mode mode)
{
  if (TARGET_SIMD
      && (mode == V4SImode  || mode == V8HImode
	  || mode == V16QImode || mode == V2DImode
	  || mode == V2SImode  || mode == V4HImode
	  || mode == V8QImode || mode == V2SFmode
	  || mode == V4SFmode || mode == V2DFmode
	  || mode == V4HFmode || mode == V8HFmode
	  || mode == V1DFmode))
    return true;

  return false;
}
11090 /* Return appropriate SIMD container
11091 for MODE within a vector of WIDTH bits. */
11092 static machine_mode
11093 aarch64_simd_container_mode (machine_mode mode
, unsigned width
)
11095 gcc_assert (width
== 64 || width
== 128);
/* Return 128-bit container as the preferred SIMD mode for MODE.  */
static machine_mode
aarch64_preferred_simd_mode (machine_mode mode)
{
  return aarch64_simd_container_mode (mode, 128);
}

/* Return the bitmask of possible vector sizes for the vectorizer
   to iterate over.  */
static unsigned int
aarch64_autovectorize_vector_sizes (void)
{
  return (16 | 8);
}
/* Implement TARGET_MANGLE_TYPE.  */

static const char *
aarch64_mangle_type (const_tree type)
{
  /* The AArch64 ABI documents say that "__va_list" has to be
     mangled as if it is in the "std" namespace.  */
  if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
    return "St9__va_list";

  /* Half-precision float.  */
  if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
    return "Dh";

  /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
     builtin types.  */
  if (TYPE_NAME (type) != NULL)
    return aarch64_mangle_builtin_type (type);

  /* Use the default mangling.  */
  return NULL;
}
/* Find the first rtx_insn before insn that will generate an assembly
   instruction.  */

static rtx_insn *
aarch64_prev_real_insn (rtx_insn *insn)
{
  if (!insn)
    return NULL;

  do
    {
      insn = prev_real_insn (insn);
    }
  while (insn && recog_memoized (insn) < 0);

  return insn;
}
static bool
is_madd_op (enum attr_type t1)
{
  unsigned int i;
  /* A number of these may be AArch32 only.  */
  enum attr_type mlatypes[] = {
    TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
    TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
    TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
  };

  for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
    {
      if (t1 == mlatypes[i])
	return true;
    }

  return false;
}
/* Check if there is a register dependency between a load and the insn
   for which we hold recog_data.  */

static bool
dep_between_memop_and_curr (rtx memop)
{
  rtx load_reg;
  int opno;

  gcc_assert (GET_CODE (memop) == SET);

  if (!REG_P (SET_DEST (memop)))
    return false;

  load_reg = SET_DEST (memop);
  for (opno = 1; opno < recog_data.n_operands; opno++)
    {
      rtx operand = recog_data.operand[opno];
      if (REG_P (operand)
	  && reg_overlap_mentioned_p (load_reg, operand))
	return true;
    }

  return false;
}
/* When working around the Cortex-A53 erratum 835769,
   given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
   instruction and has a preceding memory instruction such that a NOP
   should be inserted between them.  */

bool
aarch64_madd_needs_nop (rtx_insn* insn)
{
  enum attr_type attr_type;
  rtx_insn *prev;
  rtx body;

  if (!TARGET_FIX_ERR_A53_835769)
    return false;

  if (!INSN_P (insn) || recog_memoized (insn) < 0)
    return false;

  attr_type = get_attr_type (insn);
  if (!is_madd_op (attr_type))
    return false;

  prev = aarch64_prev_real_insn (insn);
  /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
     Restore recog state to INSN to avoid state corruption.  */
  extract_constrain_insn_cached (insn);

  if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
    return false;

  body = single_set (prev);

  /* If the previous insn is a memory op and there is no dependency between
     it and the DImode madd, emit a NOP between them.  If body is NULL then we
     have a complex memory operation, probably a load/store pair.
     Be conservative for now and emit a NOP.  */
  if (GET_MODE (recog_data.operand[0]) == DImode
      && (!body || !dep_between_memop_and_curr (body)))
    return true;

  return false;
}
/* Implement FINAL_PRESCAN_INSN.  */

void
aarch64_final_prescan_insn (rtx_insn *insn)
{
  if (aarch64_madd_needs_nop (insn))
    fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
}

/* Return the equivalent letter for size.  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
/* Return true iff x is a uniform vector of floating-point
   constants, and the constant can be represented in
   quarter-precision form.  Note, as aarch64_float_const_representable
   rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
static bool
aarch64_vect_float_const_representable_p (rtx x)
{
  rtx elt;
  return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
	  && const_vec_duplicate_p (x, &elt)
	  && aarch64_float_const_representable_p (elt));
}
11323 /* Return true for valid and false for invalid. */
11325 aarch64_simd_valid_immediate (rtx op
, machine_mode mode
, bool inverse
,
11326 struct simd_immediate_info
*info
)
11328 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11330 for (i = 0; i < idx; i += (STRIDE)) \
11335 immtype = (CLASS); \
11336 elsize = (ELSIZE); \
11337 eshift = (SHIFT); \
11342 unsigned int i
, elsize
= 0, idx
= 0, n_elts
= CONST_VECTOR_NUNITS (op
);
11343 unsigned int innersize
= GET_MODE_UNIT_SIZE (mode
);
11344 unsigned char bytes
[16];
11345 int immtype
= -1, matches
;
11346 unsigned int invmask
= inverse
? 0xff : 0;
11349 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
11351 if (! (aarch64_simd_imm_zero_p (op
, mode
)
11352 || aarch64_vect_float_const_representable_p (op
)))
11357 info
->value
= CONST_VECTOR_ELT (op
, 0);
11358 info
->element_width
= GET_MODE_BITSIZE (GET_MODE (info
->value
));
11366 /* Splat vector constant out into a byte vector. */
11367 for (i
= 0; i
< n_elts
; i
++)
11369 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11370 it must be laid out in the vector register in reverse order. */
11371 rtx el
= CONST_VECTOR_ELT (op
, BYTES_BIG_ENDIAN
? (n_elts
- 1 - i
) : i
);
11372 unsigned HOST_WIDE_INT elpart
;
11374 gcc_assert (CONST_INT_P (el
));
11375 elpart
= INTVAL (el
);
11377 for (unsigned int byte
= 0; byte
< innersize
; byte
++)
11379 bytes
[idx
++] = (elpart
& 0xff) ^ invmask
;
11380 elpart
>>= BITS_PER_UNIT
;
11385 /* Sanity check. */
11386 gcc_assert (idx
== GET_MODE_SIZE (mode
));
11390 CHECK (4, 32, 0, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0
11391 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 0, 0);
11393 CHECK (4, 32, 1, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
11394 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
11396 CHECK (4, 32, 2, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
11397 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
11399 CHECK (4, 32, 3, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
11400 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == bytes
[3], 24, 0);
11402 CHECK (2, 16, 4, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0, 0, 0);
11404 CHECK (2, 16, 5, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1], 8, 0);
11406 CHECK (4, 32, 6, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff
11407 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 0, 1);
11409 CHECK (4, 32, 7, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
11410 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
11412 CHECK (4, 32, 8, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
11413 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
11415 CHECK (4, 32, 9, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
11416 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == bytes
[3], 24, 1);
11418 CHECK (2, 16, 10, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff, 0, 1);
11420 CHECK (2, 16, 11, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1], 8, 1);
11422 CHECK (4, 32, 12, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
11423 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
11425 CHECK (4, 32, 13, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
11426 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
11428 CHECK (4, 32, 14, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
11429 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
11431 CHECK (4, 32, 15, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
11432 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
11434 CHECK (1, 8, 16, bytes
[i
] == bytes
[0], 0, 0);
11436 CHECK (1, 64, 17, (bytes
[i
] == 0 || bytes
[i
] == 0xff)
11437 && bytes
[i
] == bytes
[(i
+ 8) % idx
], 0, 0);
11446 info
->element_width
= elsize
;
11447 info
->mvn
= emvn
!= 0;
11448 info
->shift
= eshift
;
11450 unsigned HOST_WIDE_INT imm
= 0;
11452 if (immtype
>= 12 && immtype
<= 15)
11455 /* Un-invert bytes of recognized vector, if necessary. */
11457 for (i
= 0; i
< idx
; i
++)
11458 bytes
[i
] ^= invmask
;
11462 /* FIXME: Broken on 32-bit H_W_I hosts. */
11463 gcc_assert (sizeof (HOST_WIDE_INT
) == 8);
11465 for (i
= 0; i
< 8; i
++)
11466 imm
|= (unsigned HOST_WIDE_INT
) (bytes
[i
] ? 0xff : 0)
11467 << (i
* BITS_PER_UNIT
);
11470 info
->value
= GEN_INT (imm
);
11474 for (i
= 0; i
< elsize
/ BITS_PER_UNIT
; i
++)
11475 imm
|= (unsigned HOST_WIDE_INT
) bytes
[i
] << (i
* BITS_PER_UNIT
);
11477 /* Construct 'abcdefgh' because the assembler cannot handle
11478 generic constants. */
11481 imm
= (imm
>> info
->shift
) & 0xff;
11482 info
->value
= GEN_INT (imm
);
/* Check if immediate shift constants are within range.  */
bool
aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
{
  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
  if (left)
    return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
  else
    return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
}

/* Return true if X is a uniform vector where all elements
   are either the floating-point constant 0.0 or the
   integer constant 0.  */
bool
aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
{
  return x == CONST0_RTX (mode);
}
/* Return the bitmask CONST_INT to select the bits required by a zero extract
   operation of width WIDTH at bit position POS.  */
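/* Worked example (illustrative, not from the original source): for a zero
   extract of WIDTH = 8 bits at POS = 16, the computation below yields
   ((1 << 8) - 1) << 16 == 0xff0000, i.e. a mask covering exactly the byte
   being extracted.  */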
rtx
aarch64_mask_from_zextract_ops (rtx width, rtx pos)
{
  gcc_assert (CONST_INT_P (width));
  gcc_assert (CONST_INT_P (pos));

  unsigned HOST_WIDE_INT mask
    = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
  return GEN_INT (mask << UINTVAL (pos));
}
11526 aarch64_simd_imm_scalar_p (rtx x
, machine_mode mode ATTRIBUTE_UNUSED
)
11528 HOST_WIDE_INT imm
= INTVAL (x
);
11531 for (i
= 0; i
< 8; i
++)
11533 unsigned int byte
= imm
& 0xff;
11534 if (byte
!= 0xff && byte
!= 0)
11543 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
11545 if (GET_CODE (x
) == HIGH
11546 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
11549 if (CONST_INT_P (x
))
11552 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
11555 return aarch64_classify_symbolic_expression (x
)
11556 == SYMBOL_TINY_ABSOLUTE
;
/* Return a const_int vector of VAL.  */
rtx
aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
{
  int nunits = GET_MODE_NUNITS (mode);
  rtvec v = rtvec_alloc (nunits);
  int i;

  rtx cache = GEN_INT (val);

  for (i = 0; i < nunits; i++)
    RTVEC_ELT (v, i) = cache;

  return gen_rtx_CONST_VECTOR (mode, v);
}
/* Check OP is a legal scalar immediate for the MOVI instruction.  */

bool
aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
{
  machine_mode vmode;

  gcc_assert (!VECTOR_MODE_P (mode));
  vmode = aarch64_preferred_simd_mode (mode);
  rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
  return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
}
/* Construct and return a PARALLEL RTX vector with elements numbering the
   lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
   the vector - from the perspective of the architecture.  This does not
   line up with GCC's perspective on lane numbers, so we end up with
   different masks depending on our target endian-ness.  The diagram
   below may help.  We must draw the distinction when building masks
   which select one half of the vector.  An instruction selecting
   architectural low-lanes for a big-endian target must be described using
   a mask selecting GCC high-lanes.

                 Big-Endian             Little-Endian

GCC             0   1   2   3           3   2   1   0
              | x | x | x | x |       | x | x | x | x |
Architecture    3   2   1   0           3   2   1   0

Low Mask:         { 2, 3 }                { 0, 1 }
High Mask:        { 0, 1 }                { 2, 3 }
*/
rtx
aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
{
  int nunits = GET_MODE_NUNITS (mode);
  rtvec v = rtvec_alloc (nunits / 2);
  int high_base = nunits / 2;
  int low_base = 0;
  int base;
  rtx t1;
  int i;

  if (BYTES_BIG_ENDIAN)
    base = high ? low_base : high_base;
  else
    base = high ? high_base : low_base;

  for (i = 0; i < nunits / 2; i++)
    RTVEC_ELT (v, i) = GEN_INT (base + i);

  t1 = gen_rtx_PARALLEL (mode, v);
  return t1;
}
11631 /* Check OP for validity as a PARALLEL RTX vector with elements
11632 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11633 from the perspective of the architecture. See the diagram above
11634 aarch64_simd_vect_par_cnst_half for more details. */
11637 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
11640 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, high
);
11641 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
11642 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
11645 if (!VECTOR_MODE_P (mode
))
11648 if (count_op
!= count_ideal
)
11651 for (i
= 0; i
< count_ideal
; i
++)
11653 rtx elt_op
= XVECEXP (op
, 0, i
);
11654 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
11656 if (!CONST_INT_P (elt_op
)
11657 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
11663 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11664 HIGH (exclusive). */
11666 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
11669 HOST_WIDE_INT lane
;
11670 gcc_assert (CONST_INT_P (operand
));
11671 lane
= INTVAL (operand
);
11673 if (lane
< low
|| lane
>= high
)
11676 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
11678 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
11682 /* Return TRUE if OP is a valid vector addressing mode. */
11684 aarch64_simd_mem_operand_p (rtx op
)
11686 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
11687 || REG_P (XEXP (op
, 0)));
11690 /* Emit a register copy from operand to operand, taking care not to
11691 early-clobber source registers in the process.
11693 COUNT is the number of components into which the copy needs to be
11696 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
11697 unsigned int count
)
11700 int rdest
= REGNO (operands
[0]);
11701 int rsrc
= REGNO (operands
[1]);
11703 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
11705 for (i
= 0; i
< count
; i
++)
11706 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
11707 gen_rtx_REG (mode
, rsrc
+ i
));
11709 for (i
= 0; i
< count
; i
++)
11710 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
11711 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
11714 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11715 one of VSTRUCT modes: OI, CI, or XI. */
11717 aarch64_simd_attr_length_rglist (machine_mode mode
)
11719 return (GET_MODE_SIZE (mode
) / UNITS_PER_VREG
) * 4;
/* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
   alignment of a vector to 128 bits.  */
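/* Example (added for exposition, not from the original source): a 64-bit
   vector type keeps its natural 64-bit alignment, while a 256-bit GNU
   vector type is capped at 128-bit alignment by the MIN below, as AAPCS64
   requires.  */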
static HOST_WIDE_INT
aarch64_simd_vector_alignment (const_tree type)
{
  HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
  return MIN (align, 128);
}
11731 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11733 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
11738 /* We guarantee alignment for vectors up to 128-bits. */
11739 if (tree_int_cst_compare (TYPE_SIZE (type
),
11740 bitsize_int (BIGGEST_ALIGNMENT
)) > 0)
11743 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11747 /* Return true if the vector misalignment factor is supported by the
11750 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
11751 const_tree type
, int misalignment
,
11754 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
11756 /* Return if movmisalign pattern is not supported for this mode. */
11757 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
11760 if (misalignment
== -1)
11762 /* Misalignment factor is unknown at compile time but we know
11763 it's word aligned. */
11764 if (aarch64_simd_vector_alignment_reachable (type
, is_packed
))
11766 int element_size
= TREE_INT_CST_LOW (TYPE_SIZE (type
));
11768 if (element_size
!= 64)
11774 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
11778 /* If VALS is a vector constant that can be loaded into a register
11779 using DUP, generate instructions to do so and return an RTX to
11780 assign to the register. Otherwise return NULL_RTX. */
11782 aarch64_simd_dup_constant (rtx vals
)
11784 machine_mode mode
= GET_MODE (vals
);
11785 machine_mode inner_mode
= GET_MODE_INNER (mode
);
11788 if (!const_vec_duplicate_p (vals
, &x
))
11791 /* We can load this constant by using DUP and a constant in a
11792 single ARM register. This will be cheaper than a vector
11794 x
= copy_to_mode_reg (inner_mode
, x
);
11795 return gen_rtx_VEC_DUPLICATE (mode
, x
);
/* Generate code to load VALS, which is a PARALLEL containing only
   constants (for vec_init) or CONST_VECTOR, efficiently into a
   register.  Returns an RTX to copy into the register, or NULL_RTX
   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
11804 aarch64_simd_make_constant (rtx vals
)
11806 machine_mode mode
= GET_MODE (vals
);
11808 rtx const_vec
= NULL_RTX
;
11809 int n_elts
= GET_MODE_NUNITS (mode
);
11813 if (GET_CODE (vals
) == CONST_VECTOR
)
11815 else if (GET_CODE (vals
) == PARALLEL
)
11817 /* A CONST_VECTOR must contain only CONST_INTs and
11818 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11819 Only store valid constants in a CONST_VECTOR. */
11820 for (i
= 0; i
< n_elts
; ++i
)
11822 rtx x
= XVECEXP (vals
, 0, i
);
11823 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
11826 if (n_const
== n_elts
)
11827 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
11830 gcc_unreachable ();
  if (const_vec != NULL_RTX
      && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
    /* Load using MOVI/MVNI.  */
    return const_vec;
  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
    /* Loaded using DUP.  */
    return const_dup;
  else if (const_vec != NULL_RTX)
    /* Load from constant pool.  We cannot take advantage of single-cycle
       LD1 because we need a PC-relative addressing mode.  */
    return const_vec;
  else
    /* A PARALLEL containing something not valid inside CONST_VECTOR.
       We cannot construct an initializer.  */
    return NULL_RTX;
}
11849 /* Expand a vector initialisation sequence, such that TARGET is
11850 initialised to contain VALS. */
11853 aarch64_expand_vector_init (rtx target
, rtx vals
)
11855 machine_mode mode
= GET_MODE (target
);
11856 machine_mode inner_mode
= GET_MODE_INNER (mode
);
11857 /* The number of vector elements. */
11858 int n_elts
= GET_MODE_NUNITS (mode
);
11859 /* The number of vector elements which are not constant. */
11861 rtx any_const
= NULL_RTX
;
11862 /* The first element of vals. */
11863 rtx v0
= XVECEXP (vals
, 0, 0);
11864 bool all_same
= true;
11866 /* Count the number of variable elements to initialise. */
11867 for (int i
= 0; i
< n_elts
; ++i
)
11869 rtx x
= XVECEXP (vals
, 0, i
);
11870 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
11875 all_same
&= rtx_equal_p (x
, v0
);
11878 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11879 how best to handle this. */
11882 rtx constant
= aarch64_simd_make_constant (vals
);
11883 if (constant
!= NULL_RTX
)
11885 emit_move_insn (target
, constant
);
11890 /* Splat a single non-constant element if we can. */
11893 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
11894 aarch64_emit_move (target
, gen_rtx_VEC_DUPLICATE (mode
, x
));
11898 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
11899 gcc_assert (icode
!= CODE_FOR_nothing
);
  /* If there are only variable elements, try to optimize
     the insertion using dup for the most common element
     followed by insertions.  */

  /* The algorithm will fill matches[*][0] with the earliest matching element,
     and matches[X][1] with the count of duplicate elements (if X is the
     earliest element which has duplicates).  */
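  /* Worked example (illustrative, not from the original source): for the
     V4SI initialiser { a, b, a, a } the loop below produces
     matches[0] = { 0, 3 }, matches[1] = { 1, 1 }, matches[2] = { 0, 0 } and
     matches[3] = { 0, 0 }, so element 0 ("a") is chosen as the DUP value
     and only element 1 ("b") needs an explicit lane insert.  */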
11909 if (n_var
== n_elts
&& n_elts
<= 16)
11911 int matches
[16][2] = {0};
11912 for (int i
= 0; i
< n_elts
; i
++)
11914 for (int j
= 0; j
<= i
; j
++)
11916 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
11924 int maxelement
= 0;
11926 for (int i
= 0; i
< n_elts
; i
++)
11927 if (matches
[i
][1] > maxv
)
11930 maxv
= matches
[i
][1];
11933 /* Create a duplicate of the most common element. */
11934 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
11935 aarch64_emit_move (target
, gen_rtx_VEC_DUPLICATE (mode
, x
));
11937 /* Insert the rest. */
11938 for (int i
= 0; i
< n_elts
; i
++)
11940 rtx x
= XVECEXP (vals
, 0, i
);
11941 if (matches
[i
][0] == maxelement
)
11943 x
= copy_to_mode_reg (inner_mode
, x
);
11944 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
11949 /* Initialise a vector which is part-variable. We want to first try
11950 to build those lanes which are constant in the most efficient way we
11952 if (n_var
!= n_elts
)
11954 rtx copy
= copy_rtx (vals
);
11956 /* Load constant part of vector. We really don't care what goes into the
11957 parts we will overwrite, but we're more likely to be able to load the
11958 constant efficiently if it has fewer, larger, repeating parts
11959 (see aarch64_simd_valid_immediate). */
11960 for (int i
= 0; i
< n_elts
; i
++)
11962 rtx x
= XVECEXP (vals
, 0, i
);
11963 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
11965 rtx subst
= any_const
;
11966 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
11968 /* Look in the copied vector, as more elements are const. */
11969 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
11970 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
11976 XVECEXP (copy
, 0, i
) = subst
;
11978 aarch64_expand_vector_init (target
, copy
);
11981 /* Insert the variable lanes directly. */
11982 for (int i
= 0; i
< n_elts
; i
++)
11984 rtx x
= XVECEXP (vals
, 0, i
);
11985 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
11987 x
= copy_to_mode_reg (inner_mode
, x
);
11988 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
  return
    (!SHIFT_COUNT_TRUNCATED
     || aarch64_vector_mode_supported_p (mode)
     || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
}
12001 /* Select a format to encode pointers in exception handling data. */
12003 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
12006 switch (aarch64_cmodel
)
12008 case AARCH64_CMODEL_TINY
:
12009 case AARCH64_CMODEL_TINY_PIC
:
12010 case AARCH64_CMODEL_SMALL
:
12011 case AARCH64_CMODEL_SMALL_PIC
:
12012 case AARCH64_CMODEL_SMALL_SPIC
:
12013 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
12015 type
= DW_EH_PE_sdata4
;
12018 /* No assumptions here. 8-byte relocs required. */
12019 type
= DW_EH_PE_sdata8
;
12022 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
12025 /* The last .arch and .tune assembly strings that we printed. */
12026 static std::string aarch64_last_printed_arch_string
;
12027 static std::string aarch64_last_printed_tune_string
;
12029 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12030 by the function fndecl. */
12033 aarch64_declare_function_name (FILE *stream
, const char* name
,
12036 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
12038 struct cl_target_option
*targ_options
;
12040 targ_options
= TREE_TARGET_OPTION (target_parts
);
12042 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
12043 gcc_assert (targ_options
);
12045 const struct processor
*this_arch
12046 = aarch64_get_arch (targ_options
->x_explicit_arch
);
12048 unsigned long isa_flags
= targ_options
->x_aarch64_isa_flags
;
12049 std::string extension
12050 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
12052 /* Only update the assembler .arch string if it is distinct from the last
12053 such string we printed. */
12054 std::string to_print
= this_arch
->name
+ extension
;
12055 if (to_print
!= aarch64_last_printed_arch_string
)
12057 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
12058 aarch64_last_printed_arch_string
= to_print
;
12061 /* Print the cpu name we're tuning for in the comments, might be
12062 useful to readers of the generated asm. Do it only when it changes
12063 from function to function and verbose assembly is requested. */
12064 const struct processor
*this_tune
12065 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
12067 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
12069 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
12071 aarch64_last_printed_tune_string
= this_tune
->name
;
12074 /* Don't forget the type directive for ELF. */
12075 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
12076 ASM_OUTPUT_LABEL (stream
, name
);
12079 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12082 aarch64_start_file (void)
12084 struct cl_target_option
*default_options
12085 = TREE_TARGET_OPTION (target_option_default_node
);
12087 const struct processor
*default_arch
12088 = aarch64_get_arch (default_options
->x_explicit_arch
);
12089 unsigned long default_isa_flags
= default_options
->x_aarch64_isa_flags
;
12090 std::string extension
12091 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
12092 default_arch
->flags
);
12094 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
12095 aarch64_last_printed_tune_string
= "";
12096 asm_fprintf (asm_out_file
, "\t.arch %s\n",
12097 aarch64_last_printed_arch_string
.c_str ());
12099 default_file_start ();
/* Emit load exclusive.  */

static void
aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
			     rtx mem, rtx model_rtx)
{
  rtx (*gen) (rtx, rtx, rtx);

  switch (mode)
    {
    case QImode: gen = gen_aarch64_load_exclusiveqi; break;
    case HImode: gen = gen_aarch64_load_exclusivehi; break;
    case SImode: gen = gen_aarch64_load_exclusivesi; break;
    case DImode: gen = gen_aarch64_load_exclusivedi; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (rval, mem, model_rtx));
}
/* Emit store exclusive.  */

static void
aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
			      rtx rval, rtx mem, rtx model_rtx)
{
  rtx (*gen) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case QImode: gen = gen_aarch64_store_exclusiveqi; break;
    case HImode: gen = gen_aarch64_store_exclusivehi; break;
    case SImode: gen = gen_aarch64_store_exclusivesi; break;
    case DImode: gen = gen_aarch64_store_exclusivedi; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (bval, rval, mem, model_rtx));
}
/* Mark the previous jump instruction as unlikely.  */

static void
aarch64_emit_unlikely_jump (rtx insn)
{
  int very_unlikely = REG_BR_PROB_BASE / 100 - 1;

  rtx_insn *jump = emit_jump_insn (insn);
  add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
}
12155 /* Expand a compare and swap pattern. */
12158 aarch64_expand_compare_and_swap (rtx operands
[])
12160 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
;
12161 machine_mode mode
, cmp_mode
;
12162 typedef rtx (*gen_cas_fn
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
12165 const gen_cas_fn split_cas
[] =
12167 gen_aarch64_compare_and_swapqi
,
12168 gen_aarch64_compare_and_swaphi
,
12169 gen_aarch64_compare_and_swapsi
,
12170 gen_aarch64_compare_and_swapdi
12172 const gen_cas_fn atomic_cas
[] =
12174 gen_aarch64_compare_and_swapqi_lse
,
12175 gen_aarch64_compare_and_swaphi_lse
,
12176 gen_aarch64_compare_and_swapsi_lse
,
12177 gen_aarch64_compare_and_swapdi_lse
12180 bval
= operands
[0];
12181 rval
= operands
[1];
12183 oldval
= operands
[3];
12184 newval
= operands
[4];
12185 is_weak
= operands
[5];
12186 mod_s
= operands
[6];
12187 mod_f
= operands
[7];
12188 mode
= GET_MODE (mem
);
12191 /* Normally the succ memory model must be stronger than fail, but in the
12192 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12193 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12195 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
12196 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
12197 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
12203 /* For short modes, we're going to perform the comparison in SImode,
12204 so do the zero-extension now. */
12206 rval
= gen_reg_rtx (SImode
);
12207 oldval
= convert_modes (SImode
, mode
, oldval
, true);
12208 /* Fall through. */
12212 /* Force the value into a register if needed. */
12213 if (!aarch64_plus_operand (oldval
, mode
))
12214 oldval
= force_reg (cmp_mode
, oldval
);
12218 gcc_unreachable ();
12223 case QImode
: idx
= 0; break;
12224 case HImode
: idx
= 1; break;
12225 case SImode
: idx
= 2; break;
12226 case DImode
: idx
= 3; break;
12228 gcc_unreachable ();
12231 gen
= atomic_cas
[idx
];
12233 gen
= split_cas
[idx
];
12235 emit_insn (gen (rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
));
12237 if (mode
== QImode
|| mode
== HImode
)
12238 emit_move_insn (operands
[1], gen_lowpart (mode
, rval
));
12240 x
= gen_rtx_REG (CCmode
, CC_REGNUM
);
12241 x
= gen_rtx_EQ (SImode
, x
, const0_rtx
);
12242 emit_insn (gen_rtx_SET (bval
, x
));
/* Test whether the target supports using an atomic load-operate instruction.
   CODE is the operation and AFTER is TRUE if the data in memory after the
   operation should be returned and FALSE if the data before the operation
   should be returned.  Returns FALSE if the operation isn't supported by the
   architecture.  */

bool
aarch64_atomic_ldop_supported_p (enum rtx_code code)
/* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
   sequence implementing an atomic operation.  */

static void
aarch64_emit_post_barrier (enum memmodel model)
{
  const enum memmodel base_model = memmodel_base (model);

  if (is_mm_sync (model)
      && (base_model == MEMMODEL_ACQUIRE
	  || base_model == MEMMODEL_ACQ_REL
	  || base_model == MEMMODEL_SEQ_CST))
    {
      emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
    }
}
12288 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12289 for the data in memory. EXPECTED is the value expected to be in memory.
12290 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12291 is the memory ordering to use. */
12294 aarch64_gen_atomic_cas (rtx rval
, rtx mem
,
12295 rtx expected
, rtx desired
,
12298 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
12301 mode
= GET_MODE (mem
);
12305 case QImode
: gen
= gen_aarch64_atomic_casqi
; break;
12306 case HImode
: gen
= gen_aarch64_atomic_cashi
; break;
12307 case SImode
: gen
= gen_aarch64_atomic_cassi
; break;
12308 case DImode
: gen
= gen_aarch64_atomic_casdi
; break;
12310 gcc_unreachable ();
12313 /* Move the expected value into the CAS destination register. */
12314 emit_insn (gen_rtx_SET (rval
, expected
));
12316 /* Emit the CAS. */
12317 emit_insn (gen (rval
, mem
, desired
, model
));
12319 /* Compare the expected value with the value loaded by the CAS, to establish
12320 whether the swap was made. */
12321 aarch64_gen_compare_reg (EQ
, rval
, expected
);
12324 /* Split a compare and swap pattern. */
12327 aarch64_split_compare_and_swap (rtx operands
[])
12329 rtx rval
, mem
, oldval
, newval
, scratch
;
12332 rtx_code_label
*label1
, *label2
;
12334 enum memmodel model
;
12337 rval
= operands
[0];
12339 oldval
= operands
[2];
12340 newval
= operands
[3];
12341 is_weak
= (operands
[4] != const0_rtx
);
12342 model_rtx
= operands
[5];
12343 scratch
= operands
[7];
12344 mode
= GET_MODE (mem
);
12345 model
= memmodel_from_int (INTVAL (model_rtx
));
12347 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12350 LD[A]XR rval, [mem]
12352 ST[L]XR scratch, newval, [mem]
12353 CBNZ scratch, .label1
12356 bool strong_zero_p
= !is_weak
&& oldval
== const0_rtx
;
12361 label1
= gen_label_rtx ();
12362 emit_label (label1
);
12364 label2
= gen_label_rtx ();
12366 /* The initial load can be relaxed for a __sync operation since a final
12367 barrier will be emitted to stop code hoisting. */
12368 if (is_mm_sync (model
))
12369 aarch64_emit_load_exclusive (mode
, rval
, mem
,
12370 GEN_INT (MEMMODEL_RELAXED
));
12372 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
12376 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
12377 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12378 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
12379 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
12383 cond
= aarch64_gen_compare_reg (NE
, rval
, oldval
);
12384 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
12385 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12386 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
12387 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
12390 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
12394 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
12395 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12396 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
12397 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
12401 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
12402 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
12403 emit_insn (gen_rtx_SET (cond
, x
));
12406 emit_label (label2
);
12407 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12408 to set the condition flags. If this is not used it will be removed by
12412 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
12413 x
= gen_rtx_COMPARE (CCmode
, rval
, const0_rtx
);
12414 emit_insn (gen_rtx_SET (cond
, x
));
12416 /* Emit any final barrier needed for a __sync operation. */
12417 if (is_mm_sync (model
))
12418 aarch64_emit_post_barrier (model
);
/* Emit a BIC instruction.  */

static void
aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
{
  rtx shift_rtx = GEN_INT (shift);
  rtx (*gen) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
    case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (dst, s2, shift_rtx, s1));
}
/* Emit an atomic swap.  */

static void
aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
			  rtx mem, rtx model)
{
  rtx (*gen) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case QImode: gen = gen_aarch64_atomic_swpqi; break;
    case HImode: gen = gen_aarch64_atomic_swphi; break;
    case SImode: gen = gen_aarch64_atomic_swpsi; break;
    case DImode: gen = gen_aarch64_atomic_swpdi; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (dst, mem, value, model));
}
12461 /* Operations supported by aarch64_emit_atomic_load_op. */
12463 enum aarch64_atomic_load_op_code
12465 AARCH64_LDOP_PLUS
, /* A + B */
12466 AARCH64_LDOP_XOR
, /* A ^ B */
12467 AARCH64_LDOP_OR
, /* A | B */
12468 AARCH64_LDOP_BIC
/* A & ~B */
12471 /* Emit an atomic load-operate. */
12474 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code
,
12475 machine_mode mode
, rtx dst
, rtx src
,
12476 rtx mem
, rtx model
)
12478 typedef rtx (*aarch64_atomic_load_op_fn
) (rtx
, rtx
, rtx
, rtx
);
12479 const aarch64_atomic_load_op_fn plus
[] =
12481 gen_aarch64_atomic_loadaddqi
,
12482 gen_aarch64_atomic_loadaddhi
,
12483 gen_aarch64_atomic_loadaddsi
,
12484 gen_aarch64_atomic_loadadddi
12486 const aarch64_atomic_load_op_fn eor
[] =
12488 gen_aarch64_atomic_loadeorqi
,
12489 gen_aarch64_atomic_loadeorhi
,
12490 gen_aarch64_atomic_loadeorsi
,
12491 gen_aarch64_atomic_loadeordi
12493 const aarch64_atomic_load_op_fn ior
[] =
12495 gen_aarch64_atomic_loadsetqi
,
12496 gen_aarch64_atomic_loadsethi
,
12497 gen_aarch64_atomic_loadsetsi
,
12498 gen_aarch64_atomic_loadsetdi
12500 const aarch64_atomic_load_op_fn bic
[] =
12502 gen_aarch64_atomic_loadclrqi
,
12503 gen_aarch64_atomic_loadclrhi
,
12504 gen_aarch64_atomic_loadclrsi
,
12505 gen_aarch64_atomic_loadclrdi
12507 aarch64_atomic_load_op_fn gen
;
12512 case QImode
: idx
= 0; break;
12513 case HImode
: idx
= 1; break;
12514 case SImode
: idx
= 2; break;
12515 case DImode
: idx
= 3; break;
12517 gcc_unreachable ();
12522 case AARCH64_LDOP_PLUS
: gen
= plus
[idx
]; break;
12523 case AARCH64_LDOP_XOR
: gen
= eor
[idx
]; break;
12524 case AARCH64_LDOP_OR
: gen
= ior
[idx
]; break;
12525 case AARCH64_LDOP_BIC
: gen
= bic
[idx
]; break;
12527 gcc_unreachable ();
12530 emit_insn (gen (dst
, mem
, src
, model
));
12533 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12534 location to store the data read from memory. OUT_RESULT is the location to
12535 store the result of the operation. MEM is the memory location to read and
12536 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12537 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12541 aarch64_gen_atomic_ldop (enum rtx_code code
, rtx out_data
, rtx out_result
,
12542 rtx mem
, rtx value
, rtx model_rtx
)
12544 machine_mode mode
= GET_MODE (mem
);
12545 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
12546 const bool short_mode
= (mode
< SImode
);
12547 aarch64_atomic_load_op_code ldop_code
;
12552 out_data
= gen_lowpart (mode
, out_data
);
12555 out_result
= gen_lowpart (mode
, out_result
);
12557 /* Make sure the value is in a register, putting it into a destination
12558 register if it needs to be manipulated. */
12559 if (!register_operand (value
, mode
)
12560 || code
== AND
|| code
== MINUS
)
12562 src
= out_result
? out_result
: out_data
;
12563 emit_move_insn (src
, gen_lowpart (mode
, value
));
12567 gcc_assert (register_operand (src
, mode
));
12569 /* Preprocess the data for the operation as necessary. If the operation is
12570 a SET then emit a swap instruction and finish. */
12574 aarch64_emit_atomic_swap (mode
, out_data
, src
, mem
, model_rtx
);
12578 /* Negate the value and treat it as a PLUS. */
12582 /* Resize the value if necessary. */
12584 src
= gen_lowpart (wmode
, src
);
12586 neg_src
= gen_rtx_NEG (wmode
, src
);
12587 emit_insn (gen_rtx_SET (src
, neg_src
));
12590 src
= gen_lowpart (mode
, src
);
12592 /* Fall-through. */
12594 ldop_code
= AARCH64_LDOP_PLUS
;
12598 ldop_code
= AARCH64_LDOP_OR
;
12602 ldop_code
= AARCH64_LDOP_XOR
;
12609 /* Resize the value if necessary. */
12611 src
= gen_lowpart (wmode
, src
);
12613 not_src
= gen_rtx_NOT (wmode
, src
);
12614 emit_insn (gen_rtx_SET (src
, not_src
));
12617 src
= gen_lowpart (mode
, src
);
12619 ldop_code
= AARCH64_LDOP_BIC
;
12623 /* The operation can't be done with atomic instructions. */
12624 gcc_unreachable ();
12627 aarch64_emit_atomic_load_op (ldop_code
, mode
, out_data
, src
, mem
, model_rtx
);
12629 /* If necessary, calculate the data in memory after the update by redoing the
12630 operation from values in registers. */
12636 src
= gen_lowpart (wmode
, src
);
12637 out_data
= gen_lowpart (wmode
, out_data
);
12638 out_result
= gen_lowpart (wmode
, out_result
);
12647 x
= gen_rtx_PLUS (wmode
, out_data
, src
);
12650 x
= gen_rtx_IOR (wmode
, out_data
, src
);
12653 x
= gen_rtx_XOR (wmode
, out_data
, src
);
12656 aarch64_emit_bic (wmode
, out_result
, out_data
, src
, 0);
12659 gcc_unreachable ();
12662 emit_set_insn (out_result
, x
);
12667 /* Split an atomic operation. */
12670 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
12671 rtx value
, rtx model_rtx
, rtx cond
)
12673 machine_mode mode
= GET_MODE (mem
);
12674 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
12675 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
12676 const bool is_sync
= is_mm_sync (model
);
12677 rtx_code_label
*label
;
12680 /* Split the atomic operation into a sequence. */
12681 label
= gen_label_rtx ();
12682 emit_label (label
);
12685 new_out
= gen_lowpart (wmode
, new_out
);
12687 old_out
= gen_lowpart (wmode
, old_out
);
12690 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
12692 /* The initial load can be relaxed for a __sync operation since a final
12693 barrier will be emitted to stop code hoisting. */
12695 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
12696 GEN_INT (MEMMODEL_RELAXED
));
12698 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
12707 x
= gen_rtx_AND (wmode
, old_out
, value
);
12708 emit_insn (gen_rtx_SET (new_out
, x
));
12709 x
= gen_rtx_NOT (wmode
, new_out
);
12710 emit_insn (gen_rtx_SET (new_out
, x
));
12714 if (CONST_INT_P (value
))
12716 value
= GEN_INT (-INTVAL (value
));
12719 /* Fall through. */
12722 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
12723 emit_insn (gen_rtx_SET (new_out
, x
));
12727 aarch64_emit_store_exclusive (mode
, cond
, mem
,
12728 gen_lowpart (mode
, new_out
), model_rtx
);
12730 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
12731 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12732 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
12733 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
12735 /* Emit any final barrier needed for a __sync operation. */
12737 aarch64_emit_post_barrier (model
);
static void
aarch64_init_libfuncs (void)
{
  /* Half-precision float operations.  The compiler handles all operations
     with NULL libfuncs by converting to SFmode.  */

  /* Conversions.  */
  set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
  set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");

  /* Arithmetic.  */
  set_optab_libfunc (add_optab, HFmode, NULL);
  set_optab_libfunc (sdiv_optab, HFmode, NULL);
  set_optab_libfunc (smul_optab, HFmode, NULL);
  set_optab_libfunc (neg_optab, HFmode, NULL);
  set_optab_libfunc (sub_optab, HFmode, NULL);

  /* Comparisons.  */
  set_optab_libfunc (eq_optab, HFmode, NULL);
  set_optab_libfunc (ne_optab, HFmode, NULL);
  set_optab_libfunc (lt_optab, HFmode, NULL);
  set_optab_libfunc (le_optab, HFmode, NULL);
  set_optab_libfunc (ge_optab, HFmode, NULL);
  set_optab_libfunc (gt_optab, HFmode, NULL);
  set_optab_libfunc (unord_optab, HFmode, NULL);
}
12767 /* Target hook for c_mode_for_suffix. */
12768 static machine_mode
12769 aarch64_c_mode_for_suffix (char suffix
)
/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
   by:

   (-1)^s * (n/16) * 2^r

   Where:
     's' is the sign bit.
     'n' is an integer in the range 16 <= n <= 31.
     'r' is an integer in the range -3 <= r <= 4.  */
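/* Worked example (added for exposition, not from the original source):
   1.75 can be written as (-1)^0 * (28/16) * 2^0, with s = 0, n = 28 and
   r = 0 all in range, so it is representable and can be materialised with
   an FMOV immediate (roughly "fmov d0, #1.75"); a value like 0.1 has no
   such decomposition and must be loaded some other way.  */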
12789 /* Return true iff X can be represented by a quarter-precision
12790 floating point immediate operand X. Note, we cannot represent 0.0. */
12792 aarch64_float_const_representable_p (rtx x
)
12794 /* This represents our current view of how many bits
12795 make up the mantissa. */
12796 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
12798 unsigned HOST_WIDE_INT mantissa
, mask
;
12799 REAL_VALUE_TYPE r
, m
;
12802 if (!CONST_DOUBLE_P (x
))
12805 /* We don't support HFmode constants yet. */
12806 if (GET_MODE (x
) == VOIDmode
|| GET_MODE (x
) == HFmode
)
12809 r
= *CONST_DOUBLE_REAL_VALUE (x
);
12811 /* We cannot represent infinities, NaNs or +/-zero. We won't
12812 know if we have +zero until we analyse the mantissa, but we
12813 can reject the other invalid values. */
12814 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
12815 || REAL_VALUE_MINUS_ZERO (r
))
12818 /* Extract exponent. */
12819 r
= real_value_abs (&r
);
12820 exponent
= REAL_EXP (&r
);
12822 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12823 highest (sign) bit, with a fixed binary point at bit point_pos.
12824 m1 holds the low part of the mantissa, m2 the high part.
12825 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12826 bits for the mantissa, this can fail (low bits will be lost). */
12827 real_ldexp (&m
, &r
, point_pos
- exponent
);
12828 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
12830 /* If the low part of the mantissa has bits set we cannot represent
12832 if (w
.ulow () != 0)
12834 /* We have rejected the lower HOST_WIDE_INT, so update our
12835 understanding of how many bits lie in the mantissa and
12836 look only at the high HOST_WIDE_INT. */
12837 mantissa
= w
.elt (1);
12838 point_pos
-= HOST_BITS_PER_WIDE_INT
;
12840 /* We can only represent values with a mantissa of the form 1.xxxx. */
12841 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
12842 if ((mantissa
& mask
) != 0)
12845 /* Having filtered unrepresentable values, we may now remove all
12846 but the highest 5 bits. */
12847 mantissa
>>= point_pos
- 5;
12849 /* We cannot represent the value 0.0, so reject it. This is handled
12854 /* Then, as bit 4 is always set, we can mask it off, leaving
12855 the mantissa in the range [0, 15]. */
12856 mantissa
&= ~(1 << 4);
12857 gcc_assert (mantissa
<= 15);
12859 /* GCC internally does not use IEEE754-like encoding (where normalized
12860 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12861 Our mantissa values are shifted 4 places to the left relative to
12862 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12863 by 5 places to correct for GCC's representation. */
12864 exponent
= 5 - exponent
;
12866 return (exponent
>= 0 && exponent
<= 7);
12870 aarch64_output_simd_mov_immediate (rtx const_vector
,
12875 static char templ
[40];
12876 const char *mnemonic
;
12877 const char *shift_op
;
12878 unsigned int lane_count
= 0;
12881 struct simd_immediate_info info
= { NULL_RTX
, 0, 0, false, false };
  /* This will return true to show const_vector is legal for use as either
     an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
     also update INFO to show how the immediate should be generated.  */
12886 is_valid
= aarch64_simd_valid_immediate (const_vector
, mode
, false, &info
);
12887 gcc_assert (is_valid
);
12889 element_char
= sizetochar (info
.element_width
);
12890 lane_count
= width
/ info
.element_width
;
12892 mode
= GET_MODE_INNER (mode
);
12893 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12895 gcc_assert (info
.shift
== 0 && ! info
.mvn
);
12896 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12897 move immediate path. */
12898 if (aarch64_float_const_zero_rtx_p (info
.value
))
12899 info
.value
= GEN_INT (0);
12902 const unsigned int buf_size
= 20;
12903 char float_buf
[buf_size
] = {'\0'};
12904 real_to_decimal_for_mode (float_buf
,
12905 CONST_DOUBLE_REAL_VALUE (info
.value
),
12906 buf_size
, buf_size
, 1, mode
);
12908 if (lane_count
== 1)
12909 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
12911 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
12912 lane_count
, element_char
, float_buf
);
12917 mnemonic
= info
.mvn
? "mvni" : "movi";
12918 shift_op
= info
.msl
? "msl" : "lsl";
12920 gcc_assert (CONST_INT_P (info
.value
));
12921 if (lane_count
== 1)
12922 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
12923 mnemonic
, UINTVAL (info
.value
));
12924 else if (info
.shift
)
12925 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12926 ", %s %d", mnemonic
, lane_count
, element_char
,
12927 UINTVAL (info
.value
), shift_op
, info
.shift
);
12929 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
,
12930 mnemonic
, lane_count
, element_char
, UINTVAL (info
.value
));
char*
aarch64_output_scalar_simd_mov_immediate (rtx immediate,
					  machine_mode mode)
{
  machine_mode vmode;

  gcc_assert (!VECTOR_MODE_P (mode));
  vmode = aarch64_simd_container_mode (mode, 64);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
  return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
}
12946 /* Split operands into moves from op[1] + op[2] into op[0]. */
12949 aarch64_split_combinev16qi (rtx operands
[3])
12951 unsigned int dest
= REGNO (operands
[0]);
12952 unsigned int src1
= REGNO (operands
[1]);
12953 unsigned int src2
= REGNO (operands
[2]);
12954 machine_mode halfmode
= GET_MODE (operands
[1]);
12955 unsigned int halfregs
= HARD_REGNO_NREGS (src1
, halfmode
);
12956 rtx destlo
, desthi
;
12958 gcc_assert (halfmode
== V16QImode
);
12960 if (src1
== dest
&& src2
== dest
+ halfregs
)
12962 /* No-op move. Can't split to nothing; emit something. */
12963 emit_note (NOTE_INSN_DELETED
);
12967 /* Preserve register attributes for variable tracking. */
12968 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
12969 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
12970 GET_MODE_SIZE (halfmode
));
12972 /* Special case of reversed high/low parts. */
12973 if (reg_overlap_mentioned_p (operands
[2], destlo
)
12974 && reg_overlap_mentioned_p (operands
[1], desthi
))
12976 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
12977 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
12978 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
12980 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
12982 /* Try to avoid unnecessary moves if part of the result
12983 is in the right place already. */
12985 emit_move_insn (destlo
, operands
[1]);
12986 if (src2
!= dest
+ halfregs
)
12987 emit_move_insn (desthi
, operands
[2]);
12991 if (src2
!= dest
+ halfregs
)
12992 emit_move_insn (desthi
, operands
[2]);
12994 emit_move_insn (destlo
, operands
[1]);
/* vec_perm support.  */

#define MAX_VECT_LEN 16

struct expand_vec_perm_d
{
  rtx target, op0, op1;
  unsigned char perm[MAX_VECT_LEN];
  machine_mode vmode;
  unsigned char nelt;
  bool one_vector_p;
  bool testing_p;
};
13012 /* Generate a variable permutation. */
13015 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
13017 machine_mode vmode
= GET_MODE (target
);
13018 bool one_vector_p
= rtx_equal_p (op0
, op1
);
13020 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
13021 gcc_checking_assert (GET_MODE (op0
) == vmode
);
13022 gcc_checking_assert (GET_MODE (op1
) == vmode
);
13023 gcc_checking_assert (GET_MODE (sel
) == vmode
);
13024 gcc_checking_assert (TARGET_SIMD
);
13028 if (vmode
== V8QImode
)
13030 /* Expand the argument to a V16QI mode by duplicating it. */
13031 rtx pair
= gen_reg_rtx (V16QImode
);
13032 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
13033 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
13037 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
13044 if (vmode
== V8QImode
)
13046 pair
= gen_reg_rtx (V16QImode
);
13047 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
13048 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
13052 pair
= gen_reg_rtx (OImode
);
13053 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
13054 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
13060 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
13062 machine_mode vmode
= GET_MODE (target
);
13063 unsigned int nelt
= GET_MODE_NUNITS (vmode
);
13064 bool one_vector_p
= rtx_equal_p (op0
, op1
);
13067 /* The TBL instruction does not use a modulo index, so we must take care
13068 of that ourselves. */
13069 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
13070 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
13071 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
13073 /* For big-endian, we also need to reverse the index within the vector
13074 (but not which vector). */
13075 if (BYTES_BIG_ENDIAN
)
13077 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13079 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
13080 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
13081 NULL
, 0, OPTAB_LIB_WIDEN
);
13083 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
/* Recognize patterns suitable for the TRN instructions.  */
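/* Example (illustrative, not from the original source): for V4SI inputs
   op0 = { a0, a1, a2, a3 } and op1 = { b0, b1, b2, b3 }, TRN1 selects
   { a0, b0, a2, b2 } (perm indices { 0, 4, 2, 6 }) and TRN2 selects
   { a1, b1, a3, b3 } (perm indices { 1, 5, 3, 7 }), which is the shape
   the checks below look for.  */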
13088 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
13090 unsigned int i
, odd
, mask
, nelt
= d
->nelt
;
13091 rtx out
, in0
, in1
, x
;
13092 rtx (*gen
) (rtx
, rtx
, rtx
);
13093 machine_mode vmode
= d
->vmode
;
13095 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
13098 /* Note that these are little-endian tests.
13099 We correct for big-endian later. */
13100 if (d
->perm
[0] == 0)
13102 else if (d
->perm
[0] == 1)
13106 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
13108 for (i
= 0; i
< nelt
; i
+= 2)
13110 if (d
->perm
[i
] != i
+ odd
)
13112 if (d
->perm
[i
+ 1] != ((i
+ nelt
+ odd
) & mask
))
13122 if (BYTES_BIG_ENDIAN
)
13124 x
= in0
, in0
= in1
, in1
= x
;
13133 case V16QImode
: gen
= gen_aarch64_trn2v16qi
; break;
13134 case V8QImode
: gen
= gen_aarch64_trn2v8qi
; break;
13135 case V8HImode
: gen
= gen_aarch64_trn2v8hi
; break;
13136 case V4HImode
: gen
= gen_aarch64_trn2v4hi
; break;
13137 case V4SImode
: gen
= gen_aarch64_trn2v4si
; break;
13138 case V2SImode
: gen
= gen_aarch64_trn2v2si
; break;
13139 case V2DImode
: gen
= gen_aarch64_trn2v2di
; break;
13140 case V4HFmode
: gen
= gen_aarch64_trn2v4hf
; break;
13141 case V8HFmode
: gen
= gen_aarch64_trn2v8hf
; break;
13142 case V4SFmode
: gen
= gen_aarch64_trn2v4sf
; break;
13143 case V2SFmode
: gen
= gen_aarch64_trn2v2sf
; break;
13144 case V2DFmode
: gen
= gen_aarch64_trn2v2df
; break;
13153 case V16QImode
: gen
= gen_aarch64_trn1v16qi
; break;
13154 case V8QImode
: gen
= gen_aarch64_trn1v8qi
; break;
13155 case V8HImode
: gen
= gen_aarch64_trn1v8hi
; break;
13156 case V4HImode
: gen
= gen_aarch64_trn1v4hi
; break;
13157 case V4SImode
: gen
= gen_aarch64_trn1v4si
; break;
13158 case V2SImode
: gen
= gen_aarch64_trn1v2si
; break;
13159 case V2DImode
: gen
= gen_aarch64_trn1v2di
; break;
13160 case V4HFmode
: gen
= gen_aarch64_trn1v4hf
; break;
13161 case V8HFmode
: gen
= gen_aarch64_trn1v8hf
; break;
13162 case V4SFmode
: gen
= gen_aarch64_trn1v4sf
; break;
13163 case V2SFmode
: gen
= gen_aarch64_trn1v2sf
; break;
13164 case V2DFmode
: gen
= gen_aarch64_trn1v2df
; break;
13170 emit_insn (gen (out
, in0
, in1
));
/* Recognize patterns suitable for the UZP instructions.  */
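/* Example (illustrative, not from the original source): for V4SI inputs
   op0 = { a0, a1, a2, a3 } and op1 = { b0, b1, b2, b3 }, UZP1 gathers the
   even-numbered lanes { a0, a2, b0, b2 } (perm indices { 0, 2, 4, 6 }) and
   UZP2 the odd-numbered lanes { a1, a3, b1, b3 } (perm indices
   { 1, 3, 5, 7 }).  */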
13176 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
13178 unsigned int i
, odd
, mask
, nelt
= d
->nelt
;
13179 rtx out
, in0
, in1
, x
;
13180 rtx (*gen
) (rtx
, rtx
, rtx
);
13181 machine_mode vmode
= d
->vmode
;
13183 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
13186 /* Note that these are little-endian tests.
13187 We correct for big-endian later. */
13188 if (d
->perm
[0] == 0)
13190 else if (d
->perm
[0] == 1)
13194 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
13196 for (i
= 0; i
< nelt
; i
++)
13198 unsigned elt
= (i
* 2 + odd
) & mask
;
13199 if (d
->perm
[i
] != elt
)
13209 if (BYTES_BIG_ENDIAN
)
13211 x
= in0
, in0
= in1
, in1
= x
;
13220 case V16QImode
: gen
= gen_aarch64_uzp2v16qi
; break;
13221 case V8QImode
: gen
= gen_aarch64_uzp2v8qi
; break;
13222 case V8HImode
: gen
= gen_aarch64_uzp2v8hi
; break;
13223 case V4HImode
: gen
= gen_aarch64_uzp2v4hi
; break;
13224 case V4SImode
: gen
= gen_aarch64_uzp2v4si
; break;
13225 case V2SImode
: gen
= gen_aarch64_uzp2v2si
; break;
13226 case V2DImode
: gen
= gen_aarch64_uzp2v2di
; break;
13227 case V4HFmode
: gen
= gen_aarch64_uzp2v4hf
; break;
13228 case V8HFmode
: gen
= gen_aarch64_uzp2v8hf
; break;
13229 case V4SFmode
: gen
= gen_aarch64_uzp2v4sf
; break;
13230 case V2SFmode
: gen
= gen_aarch64_uzp2v2sf
; break;
13231 case V2DFmode
: gen
= gen_aarch64_uzp2v2df
; break;
13240 case V16QImode
: gen
= gen_aarch64_uzp1v16qi
; break;
13241 case V8QImode
: gen
= gen_aarch64_uzp1v8qi
; break;
13242 case V8HImode
: gen
= gen_aarch64_uzp1v8hi
; break;
13243 case V4HImode
: gen
= gen_aarch64_uzp1v4hi
; break;
13244 case V4SImode
: gen
= gen_aarch64_uzp1v4si
; break;
13245 case V2SImode
: gen
= gen_aarch64_uzp1v2si
; break;
13246 case V2DImode
: gen
= gen_aarch64_uzp1v2di
; break;
13247 case V4HFmode
: gen
= gen_aarch64_uzp1v4hf
; break;
13248 case V8HFmode
: gen
= gen_aarch64_uzp1v8hf
; break;
13249 case V4SFmode
: gen
= gen_aarch64_uzp1v4sf
; break;
13250 case V2SFmode
: gen
= gen_aarch64_uzp1v2sf
; break;
13251 case V2DFmode
: gen
= gen_aarch64_uzp1v2df
; break;
13257 emit_insn (gen (out
, in0
, in1
));
/* Recognize patterns suitable for the ZIP instructions.  */

static bool
aarch64_evpc_zip (struct expand_vec_perm_d *d)
{
  unsigned int i, high, mask, nelt = d->nelt;
  rtx out, in0, in1, x;
  rtx (*gen) (rtx, rtx, rtx);
  machine_mode vmode = d->vmode;

  if (GET_MODE_UNIT_SIZE (vmode) > 8)
    return false;

  /* Note that these are little-endian tests.
     We correct for big-endian later.  */
  high = nelt / 2;
  if (d->perm[0] == high)
    /* Success!  */ ;
  else if (d->perm[0] == 0)
    high = 0;
  else
    return false;
  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);

  for (i = 0; i < nelt / 2; i++)
    {
      unsigned elt = (i + high) & mask;
      if (d->perm[i * 2] != elt)
	return false;
      elt = (elt + nelt) & mask;
      if (d->perm[i * 2 + 1] != elt)
	return false;
    }

  /* Success!  */
  if (d->testing_p)
    return true;

  in0 = d->op0;
  in1 = d->op1;
  if (BYTES_BIG_ENDIAN)
    {
      x = in0, in0 = in1, in1 = x;
      high = !high;
    }
  out = d->target;

  if (high)
    switch (vmode)
      {
      case V16QImode: gen = gen_aarch64_zip2v16qi; break;
      case V8QImode: gen = gen_aarch64_zip2v8qi; break;
      case V8HImode: gen = gen_aarch64_zip2v8hi; break;
      case V4HImode: gen = gen_aarch64_zip2v4hi; break;
      case V4SImode: gen = gen_aarch64_zip2v4si; break;
      case V2SImode: gen = gen_aarch64_zip2v2si; break;
      case V2DImode: gen = gen_aarch64_zip2v2di; break;
      case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
      case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
      case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
      case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
      case V2DFmode: gen = gen_aarch64_zip2v2df; break;
      default:
	return false;
      }
  else
    switch (vmode)
      {
      case V16QImode: gen = gen_aarch64_zip1v16qi; break;
      case V8QImode: gen = gen_aarch64_zip1v8qi; break;
      case V8HImode: gen = gen_aarch64_zip1v8hi; break;
      case V4HImode: gen = gen_aarch64_zip1v4hi; break;
      case V4SImode: gen = gen_aarch64_zip1v4si; break;
      case V2SImode: gen = gen_aarch64_zip1v2si; break;
      case V2DImode: gen = gen_aarch64_zip1v2di; break;
      case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
      case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
      case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
      case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
      case V2DFmode: gen = gen_aarch64_zip1v2df; break;
      default:
	return false;
      }

  emit_insn (gen (out, in0, in1));
  return true;
}
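/* For intuition: with inputs { a0, a1, a2, a3 } and { b0, b1, b2, b3 },
   UZP1/UZP2 select the even/odd numbered elements of the concatenation
   ({ a0, a2, b0, b2 } and { a1, a3, b1, b3 }), while ZIP1/ZIP2 interleave
   the low/high halves ({ a0, b0, a1, b1 } and { a2, b2, a3, b3 }).  The
   index checks in the two recognizers above encode exactly these shapes.  */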
/* Recognize patterns for the EXT insn.  */

static bool
aarch64_evpc_ext (struct expand_vec_perm_d *d)
{
  unsigned int i, nelt = d->nelt;
  rtx (*gen) (rtx, rtx, rtx, rtx);
  rtx offset;

  unsigned int location = d->perm[0]; /* Always < nelt.  */

  /* Check if the extracted indices are increasing by one.  */
  for (i = 1; i < nelt; i++)
    {
      unsigned int required = location + i;
      if (d->one_vector_p)
	{
	  /* We'll pass the same vector in twice, so allow indices to wrap.  */
	  required &= (nelt - 1);
	}
      if (d->perm[i] != required)
	return false;
    }

  switch (d->vmode)
    {
    case V16QImode: gen = gen_aarch64_extv16qi; break;
    case V8QImode: gen = gen_aarch64_extv8qi; break;
    case V4HImode: gen = gen_aarch64_extv4hi; break;
    case V8HImode: gen = gen_aarch64_extv8hi; break;
    case V2SImode: gen = gen_aarch64_extv2si; break;
    case V4SImode: gen = gen_aarch64_extv4si; break;
    case V4HFmode: gen = gen_aarch64_extv4hf; break;
    case V8HFmode: gen = gen_aarch64_extv8hf; break;
    case V2SFmode: gen = gen_aarch64_extv2sf; break;
    case V4SFmode: gen = gen_aarch64_extv4sf; break;
    case V2DImode: gen = gen_aarch64_extv2di; break;
    case V2DFmode: gen = gen_aarch64_extv2df; break;
    default:
      return false;
    }

  /* Success!  */
  if (d->testing_p)
    return true;

  /* The case where (location == 0) is a no-op for both big- and little-endian,
     and is removed by the mid-end at optimization levels -O1 and higher.  */

  if (BYTES_BIG_ENDIAN && (location != 0))
    {
      /* After setup, we want the high elements of the first vector (stored
	 at the LSB end of the register), and the low elements of the second
	 vector (stored at the MSB end of the register).  So swap.  */
      std::swap (d->op0, d->op1);
      /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
      location = nelt - location;
    }

  offset = GEN_INT (location);
  emit_insn (gen (d->target, d->op0, d->op1, offset));
  return true;
}
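/* For intuition: EXT with element offset N returns elements N, N+1, ... of
   the concatenation op0:op1.  For V4SImode inputs { a0, a1, a2, a3 } and
   { b0, b1, b2, b3 }, location == 1 selects { a1, a2, a3, b0 }, i.e.
   perm == { 1, 2, 3, 4 }, which is the "increasing by one" pattern tested
   above.  */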
/* Recognize patterns for the REV insns.  */

static bool
aarch64_evpc_rev (struct expand_vec_perm_d *d)
{
  unsigned int i, j, diff, nelt = d->nelt;
  rtx (*gen) (rtx, rtx);

  if (!d->one_vector_p)
    return false;

  diff = d->perm[0];
  switch (diff)
    {
    case 7:
      switch (d->vmode)
	{
	case V16QImode: gen = gen_aarch64_rev64v16qi; break;
	case V8QImode: gen = gen_aarch64_rev64v8qi; break;
	default:
	  return false;
	}
      break;
    case 3:
      switch (d->vmode)
	{
	case V16QImode: gen = gen_aarch64_rev32v16qi; break;
	case V8QImode: gen = gen_aarch64_rev32v8qi; break;
	case V8HImode: gen = gen_aarch64_rev64v8hi; break;
	case V4HImode: gen = gen_aarch64_rev64v4hi; break;
	default:
	  return false;
	}
      break;
    case 1:
      switch (d->vmode)
	{
	case V16QImode: gen = gen_aarch64_rev16v16qi; break;
	case V8QImode: gen = gen_aarch64_rev16v8qi; break;
	case V8HImode: gen = gen_aarch64_rev32v8hi; break;
	case V4HImode: gen = gen_aarch64_rev32v4hi; break;
	case V4SImode: gen = gen_aarch64_rev64v4si; break;
	case V2SImode: gen = gen_aarch64_rev64v2si; break;
	case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
	case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
	case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
	case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
	default:
	  return false;
	}
      break;
    default:
      return false;
    }

  for (i = 0; i < nelt; i += diff + 1)
    for (j = 0; j <= diff; j += 1)
      {
	/* This is guaranteed to be true as the value of diff
	   is 7, 3, 1 and we should have enough elements in the
	   queue to generate this.  Getting a vector mask with a
	   value of diff other than these values implies that
	   something is wrong by the time we get here.  */
	gcc_assert (i + j < nelt);
	if (d->perm[i + j] != i + diff - j)
	  return false;
      }

  /* Success!  */
  if (d->testing_p)
    return true;

  emit_insn (gen (d->target, d->op0));
  return true;
}
static bool
aarch64_evpc_dup (struct expand_vec_perm_d *d)
{
  rtx (*gen) (rtx, rtx, rtx);
  rtx out = d->target;
  rtx in0;
  machine_mode vmode = d->vmode;
  unsigned int i, elt, nelt = d->nelt;
  rtx lane;

  elt = d->perm[0];
  for (i = 1; i < nelt; i++)
    {
      if (elt != d->perm[i])
	return false;
    }

  /* The generic preparation in aarch64_expand_vec_perm_const_1
     swaps the operand order and the permute indices if it finds
     d->perm[0] to be in the second operand.  Thus, we can always
     use d->op0 and need not do any extra arithmetic to get the
     correct lane number.  */
  in0 = d->op0;
  lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */

  switch (vmode)
    {
    case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
    case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
    case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
    case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
    case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
    case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
    case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
    case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
    case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
    case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
    case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
    case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
    default:
      return false;
    }

  emit_insn (gen (out, in0, lane));
  return true;
}
static bool
aarch64_evpc_tbl (struct expand_vec_perm_d *d)
{
  rtx rperm[MAX_VECT_LEN], sel;
  machine_mode vmode = d->vmode;
  unsigned int i, nelt = d->nelt;

  if (d->testing_p)
    return true;

  /* Generic code will try constant permutation twice.  Once with the
     original mode and again with the elements lowered to QImode.
     So wait and don't do the selector expansion ourselves.  */
  if (vmode != V8QImode && vmode != V16QImode)
    return false;

  for (i = 0; i < nelt; ++i)
    {
      int nunits = GET_MODE_NUNITS (vmode);

      /* If big-endian and two vectors we end up with a weird mixed-endian
	 mode on NEON.  Reverse the index within each word but not the word
	 itself.  */
      rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
					   : d->perm[i]);
    }
  sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
  sel = force_reg (vmode, sel);

  aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
  return true;
}
static bool
aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* The pattern matching functions above are written to look for a small
     number to begin the sequence (0, 1, N/2).  If we begin with an index
     from the second operand, we can swap the operands.  */
  if (d->perm[0] >= d->nelt)
    {
      unsigned i, nelt = d->nelt;

      gcc_assert (nelt == (nelt & -nelt));
      for (i = 0; i < nelt; ++i)
	d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */

      std::swap (d->op0, d->op1);
    }

  if (TARGET_SIMD)
    {
      if (aarch64_evpc_rev (d))
	return true;
      else if (aarch64_evpc_ext (d))
	return true;
      else if (aarch64_evpc_dup (d))
	return true;
      else if (aarch64_evpc_zip (d))
	return true;
      else if (aarch64_evpc_uzp (d))
	return true;
      else if (aarch64_evpc_trn (d))
	return true;
      return aarch64_evpc_tbl (d);
    }

  return false;
}
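/* Worked example of the operand swap above: for nelt == 4 and
   perm == { 5, 6, 7, 4 }, every index lies in the second vector, so
   XORing each index with nelt gives { 1, 2, 3, 0 } and op0/op1 are
   swapped.  The evpc_* recognizers therefore only ever see sequences
   whose first index lies in the first operand.  */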
/* Expand a vec_perm_const pattern.  */

bool
aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
{
  struct expand_vec_perm_d d;
  int i, nelt, which;

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = GET_MODE (target);
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.testing_p = false;

  for (i = which = 0; i < nelt; ++i)
    {
      rtx e = XVECEXP (sel, 0, i);
      int ei = INTVAL (e) & (2 * nelt - 1);
      which |= (ei < nelt ? 1 : 2);
      d.perm[i] = ei;
    }

  switch (which)
    {
    default:
      gcc_unreachable ();

    case 3:
      d.one_vector_p = false;
      if (!rtx_equal_p (op0, op1))
	break;

      /* The elements of PERM do not suggest that only the first operand
	 is used, but both operands are identical.  Allow easier matching
	 of the permutation by folding the permutation into the single
	 input vector.  */
      /* Fall Through.  */
    case 2:
      for (i = 0; i < nelt; ++i)
	d.perm[i] &= nelt - 1;
      d.op0 = op1;
      d.one_vector_p = true;
      break;

    case 1:
      d.op1 = op0;
      d.one_vector_p = true;
      break;
    }

  return aarch64_expand_vec_perm_const_1 (&d);
}
static bool
aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
				     const unsigned char *sel)
{
  struct expand_vec_perm_d d;
  unsigned int i, nelt, which;
  bool ret;

  d.vmode = vmode;
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.testing_p = true;
  memcpy (d.perm, sel, nelt);

  /* Calculate whether all elements are in one vector.  */
  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = d.perm[i];
      gcc_assert (e < 2 * nelt);
      which |= (e < nelt ? 1 : 2);
    }

  /* If all elements are from the second vector, reindex as if from the
     first vector.  */
  if (which == 2)
    for (i = 0; i < nelt; ++i)
      d.perm[i] -= nelt;

  /* Check whether the mask can be applied to a single vector.  */
  d.one_vector_p = (which != 3);

  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
  d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
  if (!d.one_vector_p)
    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

  start_sequence ();
  ret = aarch64_expand_vec_perm_const_1 (&d);
  end_sequence ();

  return ret;
}
rtx
aarch64_reverse_mask (machine_mode mode)
{
  /* We have to reverse each vector because we don't have
     a permuted load that can reverse-load according to ABI rules.  */
  rtx mask;
  rtvec v = rtvec_alloc (16);
  int i, j;
  int nunits = GET_MODE_NUNITS (mode);
  int usize = GET_MODE_UNIT_SIZE (mode);

  gcc_assert (BYTES_BIG_ENDIAN);
  gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));

  for (i = 0; i < nunits; i++)
    for (j = 0; j < usize; j++)
      RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
  mask = gen_rtx_CONST_VECTOR (V16QImode, v);
  return force_reg (V16QImode, mask);
}
/* Implement MODES_TIEABLE_P.  In principle we should always return true.
   However due to issues with register allocation it is preferable to avoid
   tying integer scalar and FP scalar modes.  Executing integer operations
   in general registers is better than treating them as scalar vector
   operations.  This reduces latency and avoids redundant int<->FP moves.
   So tie modes if they are either the same class, or vector modes with
   other vector modes, vector structs or any scalar mode.  */

bool
aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
  if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
    return true;

  /* We specifically want to allow elements of "structure" modes to
     be tieable to the structure.  This more general condition allows
     other rarer situations too.  */
  if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
    return true;

  /* Also allow any scalar modes with vectors.  */
  if (aarch64_vector_mode_supported_p (mode1)
      || aarch64_vector_mode_supported_p (mode2))
    return true;

  return false;
}
/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */

static rtx
aarch64_move_pointer (rtx pointer, int amount)
{
  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);

  return adjust_automodify_address (pointer, GET_MODE (pointer),
				    next, amount);
}

/* Return a new RTX holding the result of moving POINTER forward by the
   size of the mode it points to.  */

static rtx
aarch64_progress_pointer (rtx pointer)
{
  HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));

  return aarch64_move_pointer (pointer, amount);
}
/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
   MODE bytes.  */

static void
aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
					      machine_mode mode)
{
  rtx reg = gen_reg_rtx (mode);

  /* "Cast" the pointers to the correct mode.  */
  *src = adjust_address (*src, mode, 0);
  *dst = adjust_address (*dst, mode, 0);
  /* Emit the memcpy.  */
  emit_move_insn (reg, *src);
  emit_move_insn (*dst, reg);
  /* Move the pointers forward.  */
  *src = aarch64_progress_pointer (*src);
  *dst = aarch64_progress_pointer (*dst);
}
/* Expand movmem, as if from a __builtin_memcpy.  Return true if
   we succeed, otherwise return false.  */

bool
aarch64_expand_movmem (rtx *operands)
{
  unsigned int n;
  rtx dst = operands[0];
  rtx src = operands[1];
  rtx base;
  bool speed_p = !optimize_function_for_size_p (cfun);

  /* When optimizing for size, give a better estimate of the length of a
     memcpy call, but use the default otherwise.  */
  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;

  /* We can't do anything smart if the amount to copy is not constant.  */
  if (!CONST_INT_P (operands[2]))
    return false;

  n = UINTVAL (operands[2]);

  /* Try to keep the number of instructions low.  For cases below 16 bytes we
     need to make at most two moves.  For cases above 16 bytes it will be one
     move for each 16 byte chunk, then at most two additional moves.  */
  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
    return false;

  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
  dst = adjust_automodify_address (dst, VOIDmode, base, 0);

  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
  src = adjust_automodify_address (src, VOIDmode, base, 0);

  /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
     1-byte chunk.  */
  if (n < 4)
    {
      if (n >= 2)
	aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
      if (n & 1)
	aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
      return true;
    }

  /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
     4-byte chunk, partially overlapping with the previously copied chunk.  */
  if (n < 8)
    {
      aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
      if (n > 4)
	{
	  int move = n - 8;

	  src = aarch64_move_pointer (src, move);
	  dst = aarch64_move_pointer (dst, move);
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
	}
      return true;
    }

  /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
     them, then (if applicable) an 8-byte chunk.  */
  while (n >= 8)
    {
      if (n / 16)
	{
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
	  n -= 16;
	}
      else
	{
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
	  n -= 8;
	}
    }

  /* Finish the final bytes of the copy.  We can always do this in one
     instruction.  We either copy the exact amount we need, or partially
     overlap with the previous chunk we copied and copy 8-bytes.  */
  if (n == 0)
    return true;
  else if (n == 1)
    aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
  else if (n == 2)
    aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
  else if (n == 4)
    aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
  else if (n == 3)
    {
      src = aarch64_move_pointer (src, -1);
      dst = aarch64_move_pointer (dst, -1);
      aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
    }
  else
    {
      int move = n - 8;

      src = aarch64_move_pointer (src, move);
      dst = aarch64_move_pointer (dst, move);
      aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
    }

  return true;
}
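/* Worked example, following the scheme described in the comments above:
   a 6-byte copy is one 4-byte load/store of bytes 0-3 followed by a
   second 4-byte load/store of bytes 2-5, deliberately overlapping
   bytes 2-3 rather than issuing separate 2-byte and 1-byte copies.
   A 30-byte copy is a 16-byte chunk (bytes 0-15), an 8-byte chunk
   (bytes 16-23), then a final 8-byte chunk covering bytes 22-29 which
   again overlaps the tail.  */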
/* Split a DImode store of a CONST_INT SRC to MEM DST as two
   SImode stores.  Handle the case when the constant has identical
   bottom and top halves.  This is beneficial when the two stores can be
   merged into an STP and we avoid synthesising potentially expensive
   immediates twice.  Return true if such a split is possible.  */

bool
aarch64_split_dimode_const_store (rtx dst, rtx src)
{
  rtx lo = gen_lowpart (SImode, src);
  rtx hi = gen_highpart_mode (SImode, DImode, src);

  bool size_p = optimize_function_for_size_p (cfun);

  if (!rtx_equal_p (lo, hi))
    return false;

  unsigned int orig_cost
    = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
  unsigned int lo_cost
    = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);

  /* We want to transform:
     MOV	x1, 49370
     MOVK	x1, 0x140, lsl 16
     MOVK	x1, 0xc0da, lsl 32
     MOVK	x1, 0x140, lsl 48
     STR	x1, [x2]
   into:
     MOV	w1, 49370
     MOVK	w1, 0x140, lsl 16
     STP	w1, w1, [x2]
   So we want to perform this only when we save two instructions
   or more.  When optimizing for size, however, accept any code size
   savings we can.  */
  if (size_p && orig_cost <= lo_cost)
    return false;

  if (!size_p
      && (orig_cost <= lo_cost + 1))
    return false;

  rtx mem_lo = adjust_address (dst, SImode, 0);
  if (!aarch64_mem_pair_operand (mem_lo, SImode))
    return false;

  rtx tmp_reg = gen_reg_rtx (SImode);
  aarch64_expand_mov_immediate (tmp_reg, lo);
  rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
  /* Don't emit an explicit store pair as this may not be always profitable.
     Let the sched-fusion logic decide whether to merge them.  */
  emit_move_insn (mem_lo, tmp_reg);
  emit_move_insn (mem_hi, tmp_reg);

  return true;
}
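/* Worked example: the constant from the comment above is
   0x0140c0da0140c0da, whose 32-bit halves are both 0x0140c0da, so one
   32-bit immediate build plus two adjacent SImode stores (fusable into
   an STP) replaces a four-instruction 64-bit immediate sequence plus an
   STR.  A constant such as 0x0000000100000002 has differing halves and
   is left alone.  */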
/* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */

static unsigned HOST_WIDE_INT
aarch64_asan_shadow_offset (void)
{
  return (HOST_WIDE_INT_1 << 36);
}

static bool
aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
					unsigned int align,
					enum by_pieces_operation op,
					bool speed_p)
{
  /* STORE_BY_PIECES can be used when copying a constant string, but
     in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
     For now we always fail this and let the move_by_pieces code copy
     the string from read-only memory.  */
  if (op == STORE_BY_PIECES)
    return false;

  return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
}
13997 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
13998 int code
, tree treeop0
, tree treeop1
)
14000 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
14002 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
14004 struct expand_operand ops
[4];
14007 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
14009 op_mode
= GET_MODE (op0
);
14010 if (op_mode
== VOIDmode
)
14011 op_mode
= GET_MODE (op1
);
14019 icode
= CODE_FOR_cmpsi
;
14024 icode
= CODE_FOR_cmpdi
;
14029 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
14030 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
14035 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
14036 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
14044 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
14045 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
14051 *prep_seq
= get_insns ();
14054 create_fixed_operand (&ops
[0], op0
);
14055 create_fixed_operand (&ops
[1], op1
);
14058 if (!maybe_expand_insn (icode
, 2, ops
))
14063 *gen_seq
= get_insns ();
14066 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
14067 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
14071 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
14072 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
14074 rtx op0
, op1
, target
;
14075 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
14076 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
14078 struct expand_operand ops
[6];
14081 push_to_sequence (*prep_seq
);
14082 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
14084 op_mode
= GET_MODE (op0
);
14085 if (op_mode
== VOIDmode
)
14086 op_mode
= GET_MODE (op1
);
14094 icode
= CODE_FOR_ccmpsi
;
14099 icode
= CODE_FOR_ccmpdi
;
14104 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
14105 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
14110 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
14111 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
14119 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
14120 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
14126 *prep_seq
= get_insns ();
14129 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
14130 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
14132 if (bit_code
!= AND
)
14134 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
14135 GET_MODE (XEXP (prev
, 0))),
14136 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
14137 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
14140 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
14141 create_fixed_operand (&ops
[1], target
);
14142 create_fixed_operand (&ops
[2], op0
);
14143 create_fixed_operand (&ops
[3], op1
);
14144 create_fixed_operand (&ops
[4], prev
);
14145 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
14147 push_to_sequence (*gen_seq
);
14148 if (!maybe_expand_insn (icode
, 6, ops
))
14154 *gen_seq
= get_insns ();
14157 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
14160 #undef TARGET_GEN_CCMP_FIRST
14161 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14163 #undef TARGET_GEN_CCMP_NEXT
14164 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14166 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14167 instruction fusion of some sort. */
14170 aarch64_macro_fusion_p (void)
14172 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
14176 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14177 should be kept together during scheduling. */
14180 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
14183 rtx prev_set
= single_set (prev
);
14184 rtx curr_set
= single_set (curr
);
14185 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14186 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
14188 if (!aarch64_macro_fusion_p ())
14191 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
14193 /* We are trying to match:
14194 prev (mov) == (set (reg r0) (const_int imm16))
14195 curr (movk) == (set (zero_extract (reg r0)
14198 (const_int imm16_1)) */
14200 set_dest
= SET_DEST (curr_set
);
14202 if (GET_CODE (set_dest
) == ZERO_EXTRACT
14203 && CONST_INT_P (SET_SRC (curr_set
))
14204 && CONST_INT_P (SET_SRC (prev_set
))
14205 && CONST_INT_P (XEXP (set_dest
, 2))
14206 && INTVAL (XEXP (set_dest
, 2)) == 16
14207 && REG_P (XEXP (set_dest
, 0))
14208 && REG_P (SET_DEST (prev_set
))
14209 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
14215 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
14218 /* We're trying to match:
14219 prev (adrp) == (set (reg r1)
14220 (high (symbol_ref ("SYM"))))
14221 curr (add) == (set (reg r0)
14223 (symbol_ref ("SYM"))))
14224 Note that r0 need not necessarily be the same as r1, especially
14225 during pre-regalloc scheduling. */
14227 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
14228 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
14230 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
14231 && REG_P (XEXP (SET_SRC (curr_set
), 0))
14232 && REGNO (XEXP (SET_SRC (curr_set
), 0))
14233 == REGNO (SET_DEST (prev_set
))
14234 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
14235 XEXP (SET_SRC (curr_set
), 1)))
14240 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
14243 /* We're trying to match:
14244 prev (movk) == (set (zero_extract (reg r0)
14247 (const_int imm16_1))
14248 curr (movk) == (set (zero_extract (reg r0)
14251 (const_int imm16_2)) */
14253 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
14254 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
14255 && REG_P (XEXP (SET_DEST (prev_set
), 0))
14256 && REG_P (XEXP (SET_DEST (curr_set
), 0))
14257 && REGNO (XEXP (SET_DEST (prev_set
), 0))
14258 == REGNO (XEXP (SET_DEST (curr_set
), 0))
14259 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
14260 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
14261 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
14262 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
14263 && CONST_INT_P (SET_SRC (prev_set
))
14264 && CONST_INT_P (SET_SRC (curr_set
)))
14268 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
14270 /* We're trying to match:
14271 prev (adrp) == (set (reg r0)
14272 (high (symbol_ref ("SYM"))))
14273 curr (ldr) == (set (reg r1)
14274 (mem (lo_sum (reg r0)
14275 (symbol_ref ("SYM")))))
14277 curr (ldr) == (set (reg r1)
14280 (symbol_ref ("SYM")))))) */
14281 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
14282 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
14284 rtx curr_src
= SET_SRC (curr_set
);
14286 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
14287 curr_src
= XEXP (curr_src
, 0);
14289 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
14290 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
14291 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
14292 == REGNO (SET_DEST (prev_set
))
14293 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
14294 XEXP (SET_SRC (prev_set
), 0)))
14299 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC
)
14300 && aarch_crypto_can_dual_issue (prev
, curr
))
14303 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
14304 && any_condjump_p (curr
))
14306 enum attr_type prev_type
= get_attr_type (prev
);
14308 unsigned int condreg1
, condreg2
;
14310 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
14311 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
14313 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
14315 && modified_in_p (cc_reg_1
, prev
))
	  /* FIXME: this misses some instructions which are considered
	     simple arithmetic for ThunderX.  Simple shifts are missed
	     here.  */
14319 if (prev_type
== TYPE_ALUS_SREG
14320 || prev_type
== TYPE_ALUS_IMM
14321 || prev_type
== TYPE_LOGICS_REG
14322 || prev_type
== TYPE_LOGICS_IMM
)
14327 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
14328 && any_condjump_p (curr
))
14330 /* We're trying to match:
14331 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14332 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14334 (label_ref ("SYM"))
14336 if (SET_DEST (curr_set
) == (pc_rtx
)
14337 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
14338 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
14339 && REG_P (SET_DEST (prev_set
))
14340 && REGNO (SET_DEST (prev_set
))
14341 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
14343 /* Fuse ALU operations followed by conditional branch instruction. */
14344 switch (get_attr_type (prev
))
14347 case TYPE_ALU_SREG
:
14350 case TYPE_ADCS_REG
:
14351 case TYPE_ADCS_IMM
:
14352 case TYPE_LOGIC_REG
:
14353 case TYPE_LOGIC_IMM
:
14357 case TYPE_SHIFT_REG
:
14358 case TYPE_SHIFT_IMM
:
/* Return true iff the instruction fusion described by OP is enabled.  */

bool
aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
{
  return (aarch64_tune_params.fusible_ops & op) != 0;
}
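/* In assembly terms, the fusible pairs handled above correspond to
   sequences such as:

     mov	x0, 0x1234
     movk	x0, 0x5678, lsl 16	(MOV/MOVK)

     adrp	x1, sym
     add	x1, x1, :lo12:sym	(ADRP/ADD)

     adrp	x2, sym
     ldr	x3, [x2, :lo12:sym]	(ADRP/LDR)

     cmp	w4, 5
     b.ne	target			(CMP/BRANCH)

   Cores that set the corresponding AARCH64_FUSE_* flags can issue such
   adjacent pairs as a single fused macro-op, which is why the scheduler
   keeps them together.  */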
/* If MEM is in the form of [base+offset], extract the two parts
   of address and set to BASE and OFFSET, otherwise return false
   after clearing BASE and OFFSET.  */

static bool
extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
{
  rtx addr;

  gcc_assert (MEM_P (mem));

  addr = XEXP (mem, 0);

  if (REG_P (addr))
    {
      *base = addr;
      *offset = const0_rtx;
      return true;
    }

  if (GET_CODE (addr) == PLUS
      && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
    {
      *base = XEXP (addr, 0);
      *offset = XEXP (addr, 1);
      return true;
    }

  *base = NULL_RTX;
  *offset = NULL_RTX;

  return false;
}
14415 /* Types for scheduling fusion. */
14416 enum sched_fusion_type
14418 SCHED_FUSION_NONE
= 0,
14419 SCHED_FUSION_LD_SIGN_EXTEND
,
14420 SCHED_FUSION_LD_ZERO_EXTEND
,
14426 /* If INSN is a load or store of address in the form of [base+offset],
14427 extract the two parts and set to BASE and OFFSET. Return scheduling
14428 fusion type this INSN is. */
14430 static enum sched_fusion_type
14431 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
14434 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
14436 gcc_assert (INSN_P (insn
));
14437 x
= PATTERN (insn
);
14438 if (GET_CODE (x
) != SET
)
14439 return SCHED_FUSION_NONE
;
14442 dest
= SET_DEST (x
);
14444 machine_mode dest_mode
= GET_MODE (dest
);
14446 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
14447 return SCHED_FUSION_NONE
;
14449 if (GET_CODE (src
) == SIGN_EXTEND
)
14451 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
14452 src
= XEXP (src
, 0);
14453 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
14454 return SCHED_FUSION_NONE
;
14456 else if (GET_CODE (src
) == ZERO_EXTEND
)
14458 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
14459 src
= XEXP (src
, 0);
14460 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
14461 return SCHED_FUSION_NONE
;
14464 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
14465 extract_base_offset_in_addr (src
, base
, offset
);
14466 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
14468 fusion
= SCHED_FUSION_ST
;
14469 extract_base_offset_in_addr (dest
, base
, offset
);
14472 return SCHED_FUSION_NONE
;
14474 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
14475 fusion
= SCHED_FUSION_NONE
;
/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.

   Currently we only support fusing ldr and str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, other kinds of instruction fusion can be added by returning
   different priorities.

   It's important that irrelevant instructions get the largest FUSION_PRI.  */
14490 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
14491 int *fusion_pri
, int *pri
)
14495 enum sched_fusion_type fusion
;
14497 gcc_assert (INSN_P (insn
));
14500 fusion
= fusion_load_store (insn
, &base
, &offset
);
14501 if (fusion
== SCHED_FUSION_NONE
)
14508 /* Set FUSION_PRI according to fusion type and base register. */
14509 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
14511 /* Calculate PRI. */
14514 /* INSN with smaller offset goes first. */
14515 off_val
= (int)(INTVAL (offset
));
14517 tmp
-= (off_val
& 0xfffff);
14519 tmp
+= ((- off_val
) & 0xfffff);
/* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
   Adjust priority of sha1h instructions so they are scheduled before
   other SHA1 instructions.  */

static int
aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
{
  rtx x = PATTERN (insn);

  if (GET_CODE (x) == SET)
    {
      x = SET_SRC (x);

      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
	return priority + 10;
    }

  return priority;
}
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp.  LOAD is true if they are load instructions.
   MODE is the mode of memory operands.  */

bool
aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
				machine_mode mode)
{
  HOST_WIDE_INT offval_1, offval_2, msize;
  enum reg_class rclass_1, rclass_2;
  rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;

  if (load)
    {
      mem_1 = operands[1];
      mem_2 = operands[3];
      reg_1 = operands[0];
      reg_2 = operands[2];
      gcc_assert (REG_P (reg_1) && REG_P (reg_2));
      if (REGNO (reg_1) == REGNO (reg_2))
	return false;
    }
  else
    {
      mem_1 = operands[0];
      mem_2 = operands[2];
      reg_1 = operands[1];
      reg_2 = operands[3];
    }

  /* The mems cannot be volatile.  */
  if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
    return false;

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 byte.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
    return false;

  /* Check if the addresses are in the form of [base+offset].  */
  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
  if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
  if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
    return false;

  /* Check if the bases are same.  */
  if (!rtx_equal_p (base_1, base_2))
    return false;

  offval_1 = INTVAL (offset_1);
  offval_2 = INTVAL (offset_2);
  msize = GET_MODE_SIZE (mode);
  /* Check if the offsets are consecutive.  */
  if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
    return false;

  /* Check if the addresses are clobbered by load.  */
  if (load)
    {
      if (reg_mentioned_p (reg_1, mem_1))
	return false;

      /* In increasing order, the last load can clobber the address.  */
      if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
	return false;
    }

  if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
    rclass_1 = FP_REGS;
  else
    rclass_1 = GENERAL_REGS;

  if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
    rclass_2 = FP_REGS;
  else
    rclass_2 = GENERAL_REGS;

  /* Check if the registers are of same class.  */
  if (rclass_1 != rclass_2)
    return false;

  return true;
}
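/* Worked example: the two loads

     ldr	w0, [x2]
     ldr	w1, [x2, 4]

   satisfy the checks above (same base, consecutive 4-byte offsets, both
   destinations in GENERAL_REGS), so the peepholes can rewrite them as

     ldp	w0, w1, [x2]

   whereas a pair that mixes a W register with an S register, or whose
   offsets are not adjacent, is rejected.  */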
14635 /* Given OPERANDS of consecutive load/store, check if we can merge
14636 them into ldp/stp by adjusting the offset. LOAD is true if they
14637 are load instructions. MODE is the mode of memory operands.
14639 Given below consecutive stores:
14641 str w1, [xb, 0x100]
14642 str w1, [xb, 0x104]
14643 str w1, [xb, 0x108]
14644 str w1, [xb, 0x10c]
14646 Though the offsets are out of the range supported by stp, we can
14647 still pair them after adjusting the offset, like:
14649 add scratch, xb, 0x100
14650 stp w1, w1, [scratch]
14651 stp w1, w1, [scratch, 0x8]
14653 The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */
14657 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
14660 enum reg_class rclass_1
, rclass_2
, rclass_3
, rclass_4
;
14661 HOST_WIDE_INT offval_1
, offval_2
, offval_3
, offval_4
, msize
;
14662 rtx mem_1
, mem_2
, mem_3
, mem_4
, reg_1
, reg_2
, reg_3
, reg_4
;
14663 rtx base_1
, base_2
, base_3
, base_4
, offset_1
, offset_2
, offset_3
, offset_4
;
14667 reg_1
= operands
[0];
14668 mem_1
= operands
[1];
14669 reg_2
= operands
[2];
14670 mem_2
= operands
[3];
14671 reg_3
= operands
[4];
14672 mem_3
= operands
[5];
14673 reg_4
= operands
[6];
14674 mem_4
= operands
[7];
14675 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
)
14676 && REG_P (reg_3
) && REG_P (reg_4
));
14677 if (REGNO (reg_1
) == REGNO (reg_2
) || REGNO (reg_3
) == REGNO (reg_4
))
14682 mem_1
= operands
[0];
14683 reg_1
= operands
[1];
14684 mem_2
= operands
[2];
14685 reg_2
= operands
[3];
14686 mem_3
= operands
[4];
14687 reg_3
= operands
[5];
14688 mem_4
= operands
[6];
14689 reg_4
= operands
[7];
  /* Skip if memory operand is by itself valid for ldp/stp.  */
14692 if (!MEM_P (mem_1
) || aarch64_mem_pair_operand (mem_1
, mode
))
14695 /* The mems cannot be volatile. */
14696 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
)
14697 || MEM_VOLATILE_P (mem_3
) ||MEM_VOLATILE_P (mem_4
))
14700 /* Check if the addresses are in the form of [base+offset]. */
14701 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
14702 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
14704 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
14705 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
14707 extract_base_offset_in_addr (mem_3
, &base_3
, &offset_3
);
14708 if (base_3
== NULL_RTX
|| offset_3
== NULL_RTX
)
14710 extract_base_offset_in_addr (mem_4
, &base_4
, &offset_4
);
14711 if (base_4
== NULL_RTX
|| offset_4
== NULL_RTX
)
14714 /* Check if the bases are same. */
14715 if (!rtx_equal_p (base_1
, base_2
)
14716 || !rtx_equal_p (base_2
, base_3
)
14717 || !rtx_equal_p (base_3
, base_4
))
14720 offval_1
= INTVAL (offset_1
);
14721 offval_2
= INTVAL (offset_2
);
14722 offval_3
= INTVAL (offset_3
);
14723 offval_4
= INTVAL (offset_4
);
14724 msize
= GET_MODE_SIZE (mode
);
14725 /* Check if the offsets are consecutive. */
14726 if ((offval_1
!= (offval_2
+ msize
)
14727 || offval_1
!= (offval_3
+ msize
* 2)
14728 || offval_1
!= (offval_4
+ msize
* 3))
14729 && (offval_4
!= (offval_3
+ msize
)
14730 || offval_4
!= (offval_2
+ msize
* 2)
14731 || offval_4
!= (offval_1
+ msize
* 3)))
14734 /* Check if the addresses are clobbered by load. */
14737 if (reg_mentioned_p (reg_1
, mem_1
)
14738 || reg_mentioned_p (reg_2
, mem_2
)
14739 || reg_mentioned_p (reg_3
, mem_3
))
14742 /* In increasing order, the last load can clobber the address. */
14743 if (offval_1
> offval_2
&& reg_mentioned_p (reg_4
, mem_4
))
14747 /* If we have SImode and slow unaligned ldp,
14748 check the alignment to be at least 8 byte. */
14750 && (aarch64_tune_params
.extra_tuning_flags
14751 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
14753 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
14756 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
14757 rclass_1
= FP_REGS
;
14759 rclass_1
= GENERAL_REGS
;
14761 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
14762 rclass_2
= FP_REGS
;
14764 rclass_2
= GENERAL_REGS
;
14766 if (REG_P (reg_3
) && FP_REGNUM_P (REGNO (reg_3
)))
14767 rclass_3
= FP_REGS
;
14769 rclass_3
= GENERAL_REGS
;
14771 if (REG_P (reg_4
) && FP_REGNUM_P (REGNO (reg_4
)))
14772 rclass_4
= FP_REGS
;
14774 rclass_4
= GENERAL_REGS
;
14776 /* Check if the registers are of same class. */
14777 if (rclass_1
!= rclass_2
|| rclass_2
!= rclass_3
|| rclass_3
!= rclass_4
)
14783 /* Given OPERANDS of consecutive load/store, this function pairs them
14784 into ldp/stp after adjusting the offset. It depends on the fact
14785 that addresses of load/store instructions are in increasing order.
14786 MODE is the mode of memory operands. CODE is the rtl operator
14787 which should be applied to all memory operands, it's SIGN_EXTEND,
14788 ZERO_EXTEND or UNKNOWN. */
14791 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
14792 machine_mode mode
, RTX_CODE code
)
14794 rtx base
, offset
, t1
, t2
;
14795 rtx mem_1
, mem_2
, mem_3
, mem_4
;
14796 HOST_WIDE_INT off_val
, abs_off
, adj_off
, new_off
, stp_off_limit
, msize
;
14800 mem_1
= operands
[1];
14801 mem_2
= operands
[3];
14802 mem_3
= operands
[5];
14803 mem_4
= operands
[7];
14807 mem_1
= operands
[0];
14808 mem_2
= operands
[2];
14809 mem_3
= operands
[4];
14810 mem_4
= operands
[6];
14811 gcc_assert (code
== UNKNOWN
);
14814 extract_base_offset_in_addr (mem_1
, &base
, &offset
);
14815 gcc_assert (base
!= NULL_RTX
&& offset
!= NULL_RTX
);
14817 /* Adjust offset thus it can fit in ldp/stp instruction. */
14818 msize
= GET_MODE_SIZE (mode
);
14819 stp_off_limit
= msize
* 0x40;
14820 off_val
= INTVAL (offset
);
14821 abs_off
= (off_val
< 0) ? -off_val
: off_val
;
14822 new_off
= abs_off
% stp_off_limit
;
14823 adj_off
= abs_off
- new_off
;
14825 /* Further adjust to make sure all offsets are OK. */
14826 if ((new_off
+ msize
* 2) >= stp_off_limit
)
14828 adj_off
+= stp_off_limit
;
14829 new_off
-= stp_off_limit
;
14832 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14833 if (adj_off
>= 0x1000)
14838 adj_off
= -adj_off
;
14839 new_off
= -new_off
;
14842 /* Create new memory references. */
14843 mem_1
= change_address (mem_1
, VOIDmode
,
14844 plus_constant (DImode
, operands
[8], new_off
));
14846 /* Check if the adjusted address is OK for ldp/stp. */
14847 if (!aarch64_mem_pair_operand (mem_1
, mode
))
14850 msize
= GET_MODE_SIZE (mode
);
14851 mem_2
= change_address (mem_2
, VOIDmode
,
14852 plus_constant (DImode
,
14855 mem_3
= change_address (mem_3
, VOIDmode
,
14856 plus_constant (DImode
,
14858 new_off
+ msize
* 2));
14859 mem_4
= change_address (mem_4
, VOIDmode
,
14860 plus_constant (DImode
,
14862 new_off
+ msize
* 3));
14864 if (code
== ZERO_EXTEND
)
14866 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
14867 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
14868 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
14869 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
14871 else if (code
== SIGN_EXTEND
)
14873 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
14874 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
14875 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
14876 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
14881 operands
[1] = mem_1
;
14882 operands
[3] = mem_2
;
14883 operands
[5] = mem_3
;
14884 operands
[7] = mem_4
;
14888 operands
[0] = mem_1
;
14889 operands
[2] = mem_2
;
14890 operands
[4] = mem_3
;
14891 operands
[6] = mem_4
;
14894 /* Emit adjusting instruction. */
14895 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, adj_off
)));
14896 /* Emit ldp/stp instructions. */
14897 t1
= gen_rtx_SET (operands
[0], operands
[1]);
14898 t2
= gen_rtx_SET (operands
[2], operands
[3]);
14899 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
14900 t1
= gen_rtx_SET (operands
[4], operands
[5]);
14901 t2
= gen_rtx_SET (operands
[6], operands
[7]);
14902 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
/* Return 1 if pseudo register should be created and used to hold
   GOT address for PIC code.  */

static bool
aarch64_use_pseudo_pic_reg (void)
{
  return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
}

/* Implement TARGET_UNSPEC_MAY_TRAP_P.  */

static int
aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
{
  switch (XINT (x, 1))
    {
    case UNSPEC_GOTSMALLPIC:
    case UNSPEC_GOTSMALLPIC28K:
    case UNSPEC_GOTTINYPIC:
      return 0;
    default:
      break;
    }

  return default_unspec_may_trap_p (x, flags);
}
/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */

int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}

/* If X is a vector of equal CONST_DOUBLE values and that value is
   Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */

int
aarch64_vec_fpconst_pow_of_2 (rtx x)
{
  if (GET_CODE (x) != CONST_VECTOR)
    return -1;

  if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
    return -1;

  int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
  if (firstval <= 0)
    return -1;

  for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
    if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
      return -1;

  return firstval;
}
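/* Examples: 4.0 yields 2 and 1.0 yields 0, while 6.0, 0.75 and -8.0 all
   yield -1 (not a positive power of two).  The vector variant requires
   every lane to hold the same such constant, e.g. a V2DF of
   { 16.0, 16.0 } yields 4.  */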
/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
   to float.

   __fp16 always promotes through this hook.
   _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
   through the generic excess precision logic rather than here.  */

static tree
aarch64_promoted_type (const_tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t)
      && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
    return float_type_node;

  return NULL_TREE;
}

/* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */

static bool
aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
			   optimization_type opt_type)
{
  switch (op)
    {
    case rsqrt_optab:
      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);

    default:
      return true;
    }
}

/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_libgcc_floating_mode_supported_p (mode));
}

/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_scalar_mode_supported_p (machine_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_scalar_mode_supported_p (mode));
}
/* Set the value of FLT_EVAL_METHOD.
   ISO/IEC TS 18661-3 defines two values that we'd like to make use of:

    0: evaluate all operations and constants, whose semantic type has at
       most the range and precision of type float, to the range and
       precision of float; evaluate all other operations and constants to
       the range and precision of the semantic type;

    N, where _FloatN is a supported interchange floating type:
       evaluate all operations and constants, whose semantic type has at
       most the range and precision of _FloatN type, to the range and
       precision of the _FloatN type; evaluate all other operations and
       constants to the range and precision of the semantic type;

   If we have the ARMv8.2-A extensions then we support _Float16 in native
   precision, so we should set this to 16.  Otherwise, we support the type,
   but want to evaluate expressions in float precision, so set this to
   0.  */

static enum flt_eval_method
aarch64_excess_precision (enum excess_precision_type type)
{
  switch (type)
    {
    case EXCESS_PRECISION_TYPE_FAST:
    case EXCESS_PRECISION_TYPE_STANDARD:
      /* We can calculate either in 16-bit range and precision or
	 32-bit range and precision.  Make that decision based on whether
	 we have native support for the ARMv8.2-A 16-bit floating-point
	 instructions or not.  */
      return (TARGET_FP_F16INST
	      ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
	      : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
    case EXCESS_PRECISION_TYPE_IMPLICIT:
      return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
    default:
      gcc_unreachable ();
    }
  return FLT_EVAL_METHOD_UNPREDICTABLE;
}
/* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
   scheduled for speculative execution.  Reject the long-running division
   and square-root instructions.  */

static bool
aarch64_sched_can_speculate_insn (rtx_insn *insn)
{
  switch (get_attr_type (insn))
    {
    case TYPE_SDIV:
    case TYPE_UDIV:
    case TYPE_FDIVS:
    case TYPE_FDIVD:
    case TYPE_FSQRTS:
    case TYPE_FSQRTD:
    case TYPE_NEON_FP_SQRT_S:
    case TYPE_NEON_FP_SQRT_D:
    case TYPE_NEON_FP_SQRT_S_Q:
    case TYPE_NEON_FP_SQRT_D_Q:
    case TYPE_NEON_FP_DIV_S:
    case TYPE_NEON_FP_DIV_D:
    case TYPE_NEON_FP_DIV_S_Q:
    case TYPE_NEON_FP_DIV_D_Q:
      return false;
    default:
      return true;
    }
}
15104 /* Target-specific selftests. */
15108 namespace selftest
{
15110 /* Selftest for the RTL loader.
15111 Verify that the RTL loader copes with a dump from
15112 print_rtx_function. This is essentially just a test that class
15113 function_reader can handle a real dump, but it also verifies
15114 that lookup_reg_by_dump_name correctly handles hard regs.
15115 The presence of hard reg names in the dump means that the test is
15116 target-specific, hence it is in this file. */
15119 aarch64_test_loading_full_dump ()
15121 rtl_dump_test
t (SELFTEST_LOCATION
, locate_file ("aarch64/times-two.rtl"));
15123 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun
->decl
)));
15125 rtx_insn
*insn_1
= get_insn_by_uid (1);
15126 ASSERT_EQ (NOTE
, GET_CODE (insn_1
));
15128 rtx_insn
*insn_15
= get_insn_by_uid (15);
15129 ASSERT_EQ (INSN
, GET_CODE (insn_15
));
15130 ASSERT_EQ (USE
, GET_CODE (PATTERN (insn_15
)));
15132 /* Verify crtl->return_rtx. */
15133 ASSERT_EQ (REG
, GET_CODE (crtl
->return_rtx
));
15134 ASSERT_EQ (0, REGNO (crtl
->return_rtx
));
15135 ASSERT_EQ (SImode
, GET_MODE (crtl
->return_rtx
));
15138 /* Run all target-specific selftests. */
15141 aarch64_run_selftests (void)
15143 aarch64_test_loading_full_dump ();
15146 } // namespace selftest
15148 #endif /* #if CHECKING_P */
15150 #undef TARGET_ADDRESS_COST
15151 #define TARGET_ADDRESS_COST aarch64_address_cost
/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
15157 #undef TARGET_ALIGN_ANON_BITFIELD
15158 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15160 #undef TARGET_ASM_ALIGNED_DI_OP
15161 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15163 #undef TARGET_ASM_ALIGNED_HI_OP
15164 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15166 #undef TARGET_ASM_ALIGNED_SI_OP
15167 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15169 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15170 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15171 hook_bool_const_tree_hwi_hwi_const_tree_true
15173 #undef TARGET_ASM_FILE_START
15174 #define TARGET_ASM_FILE_START aarch64_start_file
15176 #undef TARGET_ASM_OUTPUT_MI_THUNK
15177 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15179 #undef TARGET_ASM_SELECT_RTX_SECTION
15180 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15182 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15183 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15185 #undef TARGET_BUILD_BUILTIN_VA_LIST
15186 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15188 #undef TARGET_CALLEE_COPIES
15189 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15191 #undef TARGET_CAN_ELIMINATE
15192 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15194 #undef TARGET_CAN_INLINE_P
15195 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15197 #undef TARGET_CANNOT_FORCE_CONST_MEM
15198 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15200 #undef TARGET_CASE_VALUES_THRESHOLD
15201 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15203 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15204 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15206 /* Only the least significant bit is used for initialization guard
15208 #undef TARGET_CXX_GUARD_MASK_BIT
15209 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15211 #undef TARGET_C_MODE_FOR_SUFFIX
15212 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15214 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15215 #undef TARGET_DEFAULT_TARGET_FLAGS
15216 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15219 #undef TARGET_CLASS_MAX_NREGS
15220 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15222 #undef TARGET_BUILTIN_DECL
15223 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15225 #undef TARGET_BUILTIN_RECIPROCAL
15226 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15228 #undef TARGET_C_EXCESS_PRECISION
15229 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15231 #undef TARGET_EXPAND_BUILTIN
15232 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15234 #undef TARGET_EXPAND_BUILTIN_VA_START
15235 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15237 #undef TARGET_FOLD_BUILTIN
15238 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15240 #undef TARGET_FUNCTION_ARG
15241 #define TARGET_FUNCTION_ARG aarch64_function_arg
15243 #undef TARGET_FUNCTION_ARG_ADVANCE
15244 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15246 #undef TARGET_FUNCTION_ARG_BOUNDARY
15247 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15249 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15250 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15252 #undef TARGET_FUNCTION_VALUE
15253 #define TARGET_FUNCTION_VALUE aarch64_function_value
15255 #undef TARGET_FUNCTION_VALUE_REGNO_P
15256 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15258 #undef TARGET_FRAME_POINTER_REQUIRED
15259 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15261 #undef TARGET_GIMPLE_FOLD_BUILTIN
15262 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15264 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15265 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15267 #undef TARGET_INIT_BUILTINS
15268 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15270 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15271 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15272 aarch64_ira_change_pseudo_allocno_class
15274 #undef TARGET_LEGITIMATE_ADDRESS_P
15275 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15277 #undef TARGET_LEGITIMATE_CONSTANT_P
15278 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15280 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15281 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15282 aarch64_legitimize_address_displacement
15284 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15285 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15287 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15288 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15289 aarch64_libgcc_floating_mode_supported_p
15291 #undef TARGET_MANGLE_TYPE
15292 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15294 #undef TARGET_MEMORY_MOVE_COST
15295 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15297 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15298 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15300 #undef TARGET_MUST_PASS_IN_STACK
15301 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15303 /* This target hook should return true if accesses to volatile bitfields
15304 should use the narrowest mode possible. It should return false if these
15305 accesses should use the bitfield container type. */
15306 #undef TARGET_NARROW_VOLATILE_BITFIELD
15307 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv
/* Section anchor support. */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned. */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095
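/* Illustrative only: a section anchor lets several nearby objects be
   addressed as small offsets from one shared symbol, so the compiler can
   materialise the anchor address once and reuse it.  Schematically
   (hand-written, not compiler output; ANCHOR and OFFSET are placeholders),
   with OFFSET constrained to the [-256, 4095] window set by the two
   macros above:

	adrp	x0, ANCHOR		// page containing the anchor
	add	x0, x0, :lo12:ANCHOR	// full anchor address
	ldr	w1, [x0, OFFSET]	// object at ANCHOR + OFFSET  */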
#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support. */

#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  aarch64_vectorize_vec_perm_const_ok

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
  aarch64_use_by_pieces_infrastructure_p

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
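/* A minimal illustrative sketch, not used by the port and kept out of
   the build with #if 0: the value 4 (1 << 2) above means bit 2 of a
   function "pointer" is the marker the generic descriptor machinery
   tests at run time to tell a descriptor for a nested function from an
   ordinary code address.  The helper name below is made up for this
   example. */
#if 0
static int
points_to_descriptor_example (unsigned long fn_addr)
{
  return (fn_addr & 4) != 0;	/* Bit 2 set: FN_ADDR names a descriptor. */
}
#endif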
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"