/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2021 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#define INCLUDE_STRING
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "attribs.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "cgraph.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "calls.h"
#include "varasm.h"
#include "output.h"
#include "flags.h"
#include "explow.h"
#include "expr.h"
#include "reload.h"
#include "langhooks.h"
#include "opts.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "dumpfile.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "cfgrtl.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"
#include "intl.h"
#include "expmed.h"
#include "function-abi.h"

/* This file should be included last.  */
#include "target-def.h"

/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)

/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN, INDEX, PTRUE };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
                       insn_type = MOV, modifier_type = LSL,
                       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);
  simd_immediate_info (scalar_int_mode, aarch64_svpattern);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  union
  {
    /* For MOV and MVN.  */
    struct
    {
      /* The value of each element.  */
      rtx value;

      /* The kind of shift modifier to use, and the number of bits to shift.
         This is (LSL, 0) if no shift is needed.  */
      modifier_type modifier;
      unsigned int shift;
    } mov;

    /* For INDEX.  */
    struct
    {
      /* The value of the first element and the step to be added for each
         subsequent element.  */
      rtx base, step;
    } index;

    /* For PTRUE.  */
    aarch64_svpattern pattern;
  } u;
};

/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), insn (MOV)
{
  u.mov.value = value_in;
  u.mov.modifier = LSL;
  u.mov.shift = 0;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
                       unsigned HOST_WIDE_INT value_in,
                       insn_type insn_in, modifier_type modifier_in,
                       unsigned int shift_in)
  : elt_mode (elt_mode_in), insn (insn_in)
{
  u.mov.value = gen_int_mode (value_in, elt_mode_in);
  u.mov.modifier = modifier_in;
  u.mov.shift = shift_in;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to BASE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
  : elt_mode (elt_mode_in), insn (INDEX)
{
  u.index.base = base_in;
  u.index.step = step_in;
}

/* Construct a predicate that controls elements of mode ELT_MODE_IN
   and has PTRUE pattern PATTERN_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
                       aarch64_svpattern pattern_in)
  : elt_mode (elt_mode_in), insn (PTRUE)
{
  u.pattern = pattern_in;
}

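/* As an illustration of how these constructors are meant to be used (the
   concrete values here are only an example): the Advanced SIMD immediate
   behind "movi v0.4s, #0x12, lsl #8" would be described as
   simd_immediate_info (SImode, 0x12, MOV, LSL, 8), while an SVE
   "index z0.s, #1, #2" constant would use the (mode, base, step)
   constructor with base 1 and step 2.  */
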
namespace {

/* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64.  */
class pure_scalable_type_info
{
public:
  /* Represents the result of analyzing a type.  All values are nonzero,
     in the possibly forlorn hope that accidental conversions to bool
     trigger a warning.  */
  enum analysis_result
  {
    /* The type does not have an ABI identity; i.e. it doesn't contain
       at least one object whose type is a Fundamental Data Type.  */
    NO_ABI_IDENTITY = 1,

    /* The type is definitely a Pure Scalable Type.  */
    IS_PST,

    /* The type is definitely not a Pure Scalable Type.  */
    ISNT_PST,

    /* It doesn't matter for PCS purposes whether the type is a Pure
       Scalable Type or not, since the type will be handled the same
       way regardless.

       Specifically, this means that if the type is a Pure Scalable Type,
       there aren't enough argument registers to hold it, and so it will
       need to be passed or returned in memory.  If the type isn't a
       Pure Scalable Type, it's too big to be passed or returned in core
       or SIMD&FP registers, and so again will need to go in memory.  */
    DOESNT_MATTER
  };

  /* Aggregates of 17 bytes or more are normally passed and returned
     in memory, so aggregates of that size can safely be analyzed as
     DOESNT_MATTER.  We need to be able to collect enough pieces to
     represent a PST that is smaller than that.  Since predicates are
     2 bytes in size for -msve-vector-bits=128, that means we need to be
     able to store at least 8 pieces.

     We also need to be able to store enough pieces to represent
     a single vector in each vector argument register and a single
     predicate in each predicate argument register.  This means that
     we need at least 12 pieces.  */
  static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
#if __cplusplus >= 201103L
  static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
#endif

  /* Describes one piece of a PST.  Each piece is one of:

     - a single Scalable Vector Type (SVT)
     - a single Scalable Predicate Type (SPT)
     - a PST containing 2, 3 or 4 SVTs, with no padding

     It either represents a single built-in type or a PST formed from
     multiple homogeneous built-in types.  */
  struct piece
  {
    rtx get_rtx (unsigned int, unsigned int) const;

    /* The number of vector and predicate registers that the piece
       occupies.  One of the two is always zero.  */
    unsigned int num_zr;
    unsigned int num_pr;

    /* The mode of the registers described above.  */
    machine_mode mode;

    /* If this piece is formed from multiple homogeneous built-in types,
       this is the mode of the built-in types, otherwise it is MODE.  */
    machine_mode orig_mode;

    /* The offset in bytes of the piece from the start of the type.  */
    poly_uint64_pod offset;
  };

  /* Divides types analyzed as IS_PST into individual pieces.  The pieces
     are in memory order.  */
  auto_vec<piece, MAX_PIECES> pieces;

  unsigned int num_zr () const;
  unsigned int num_pr () const;

  rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;

  analysis_result analyze (const_tree);
  bool analyze_registers (const_tree);

private:
  analysis_result analyze_array (const_tree);
  analysis_result analyze_record (const_tree);
  void add_piece (const piece &);
};
}

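/* Purely as an example of the classification above: an ACLE structure
   such as

     struct pst { svfloat32_t vec; svbool_t pred; };

   is a Pure Scalable Type, so analyze is expected to return IS_PST and
   record two pieces (one needing a single Z register, one needing a
   single P register), whereas adding, say, an int member would make the
   type ISNT_PST.  */
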
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
#endif

static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_return_in_memory_1 (const_tree);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
                                                     const_tree,
                                                     machine_mode *, int *,
                                                     bool *, bool);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
                                                         const_tree type,
                                                         int misalignment,
                                                         bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
                                            aarch64_addr_query_type);
static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);

/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
uint64_t aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;

#define BRANCH_PROTECT_STR_MAX 255
char *accepted_branch_protection_string = NULL;

static enum aarch64_parse_opt_result
aarch64_parse_branch_protection (const char*, char**);

/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};

/* Tuning parameters.  */

static const struct cpu_addrcost_table generic_addrcost_table =
{
  {
    1, /* hi */
    0, /* si */
    0, /* di */
    1, /* ti */
  },
  0, /* pre_modify */
  0, /* post_modify */
  0, /* register_offset */
  0, /* register_sextend */
  0, /* register_zextend */
  0 /* imm_offset */
};

static const struct cpu_addrcost_table exynosm1_addrcost_table =
{
  {
    0, /* hi */
    0, /* si */
    0, /* di */
    2, /* ti */
  },
  0, /* pre_modify */
  0, /* post_modify */
  1, /* register_offset */
  1, /* register_sextend */
  2, /* register_zextend */
  0, /* imm_offset */
};

static const struct cpu_addrcost_table xgene1_addrcost_table =
{
  {
    1, /* hi */
    0, /* si */
    0, /* di */
    1, /* ti */
  },
  1, /* pre_modify */
  1, /* post_modify */
  0, /* register_offset */
  1, /* register_sextend */
  1, /* register_zextend */
  0, /* imm_offset */
};

static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
{
  {
    1, /* hi */
    1, /* si */
    1, /* di */
    2, /* ti */
  },
  0, /* pre_modify */
  0, /* post_modify */
  2, /* register_offset */
  3, /* register_sextend */
  3, /* register_zextend */
  0, /* imm_offset */
};

static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
{
  {
    1, /* hi */
    1, /* si */
    1, /* di */
    2, /* ti */
  },
  0, /* pre_modify */
  0, /* post_modify */
  2, /* register_offset */
  3, /* register_sextend */
  3, /* register_zextend */
  0, /* imm_offset */
};

static const struct cpu_addrcost_table tsv110_addrcost_table =
{
  {
    1, /* hi */
    0, /* si */
    0, /* di */
    1, /* ti */
  },
  0, /* pre_modify */
  0, /* post_modify */
  0, /* register_offset */
  1, /* register_sextend */
  1, /* register_zextend */
  0, /* imm_offset */
};

static const struct cpu_addrcost_table qdf24xx_addrcost_table =
{
  {
    1, /* hi */
    1, /* si */
    1, /* di */
    2, /* ti */
  },
  1, /* pre_modify */
  1, /* post_modify */
  3, /* register_offset */
  3, /* register_sextend */
  3, /* register_zextend */
  2, /* imm_offset */
};

static const struct cpu_regmove_cost generic_regmove_cost =
{
  1, /* GP2GP */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP */
  5, /* FP2GP */
  2 /* FP2FP */
};

static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
  1, /* GP2GP */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP */
  5, /* FP2GP */
  2 /* FP2FP */
};

static const struct cpu_regmove_cost cortexa53_regmove_cost =
{
  1, /* GP2GP */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP */
  5, /* FP2GP */
  2 /* FP2FP */
};

static const struct cpu_regmove_cost exynosm1_regmove_cost =
{
  1, /* GP2GP */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost (actual, 4 and 9).  */
  9, /* GP2FP */
  9, /* FP2GP */
  1 /* FP2FP */
};

static const struct cpu_regmove_cost thunderx_regmove_cost =
{
  2, /* GP2GP */
  2, /* GP2FP */
  6, /* FP2GP */
  4 /* FP2FP */
};

static const struct cpu_regmove_cost xgene1_regmove_cost =
{
  1, /* GP2GP */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  8, /* GP2FP */
  8, /* FP2GP */
  2 /* FP2FP */
};

static const struct cpu_regmove_cost qdf24xx_regmove_cost =
{
  2, /* GP2GP */
  /* Avoid the use of int<->fp moves for spilling.  */
  6, /* GP2FP */
  6, /* FP2GP */
  4 /* FP2FP */
};

static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
{
  1, /* GP2GP */
  /* Avoid the use of int<->fp moves for spilling.  */
  5, /* GP2FP */
  6, /* FP2GP */
  3, /* FP2FP */
};

static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
{
  1, /* GP2GP */
  /* Avoid the use of int<->fp moves for spilling.  */
  4, /* GP2FP */
  5, /* FP2GP */
  4 /* FP2FP */
};

static const struct cpu_regmove_cost tsv110_regmove_cost =
{
  1, /* GP2GP */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  2, /* GP2FP */
  3, /* FP2GP */
  2 /* FP2FP */
};

/* Generic costs for Advanced SIMD vector operations.  */
static const advsimd_vec_cost generic_advsimd_vector_cost =
{
  1, /* int_stmt_cost */
  1, /* fp_stmt_cost */
  2, /* permute_cost */
  2, /* vec_to_scalar_cost */
  1, /* scalar_to_vec_cost */
  1, /* align_load_cost */
  1, /* unalign_load_cost */
  1, /* unalign_store_cost */
  1 /* store_cost */
};

/* Generic costs for SVE vector operations.  */
static const sve_vec_cost generic_sve_vector_cost =
{
  1, /* int_stmt_cost */
  1, /* fp_stmt_cost */
  2, /* permute_cost */
  2, /* vec_to_scalar_cost */
  1, /* scalar_to_vec_cost */
  1, /* align_load_cost */
  1, /* unalign_load_cost */
  1, /* unalign_store_cost */
  1 /* store_cost */
};

/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost generic_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  1, /* scalar_load_cost */
  1, /* scalar_store_cost */
  3, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &generic_advsimd_vector_cost, /* advsimd */
  &generic_sve_vector_cost /* sve */
};

static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
{
  1, /* int_stmt_cost */
  3, /* fp_stmt_cost */
  2, /* permute_cost */
  1, /* vec_to_scalar_cost */
  1, /* scalar_to_vec_cost */
  1, /* align_load_cost */
  1, /* unalign_load_cost */
  1, /* unalign_store_cost */
  1 /* store_cost */
};

/* QDF24XX costs for vector insn classes.  */
static const struct cpu_vector_cost qdf24xx_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  1, /* scalar_load_cost */
  1, /* scalar_store_cost */
  3, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &qdf24xx_advsimd_vector_cost, /* advsimd */
  NULL /* sve */
};


static const advsimd_vec_cost thunderx_advsimd_vector_cost =
{
  4, /* int_stmt_cost */
  1, /* fp_stmt_cost */
  4, /* permute_cost */
  2, /* vec_to_scalar_cost */
  2, /* scalar_to_vec_cost */
  3, /* align_load_cost */
  5, /* unalign_load_cost */
  5, /* unalign_store_cost */
  1 /* store_cost */
};

/* ThunderX costs for vector insn classes.  */
static const struct cpu_vector_cost thunderx_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  3, /* scalar_load_cost */
  1, /* scalar_store_cost */
  3, /* cond_taken_branch_cost */
  3, /* cond_not_taken_branch_cost */
  &thunderx_advsimd_vector_cost, /* advsimd */
  NULL /* sve */
};

static const advsimd_vec_cost tsv110_advsimd_vector_cost =
{
  2, /* int_stmt_cost */
  2, /* fp_stmt_cost */
  2, /* permute_cost */
  3, /* vec_to_scalar_cost */
  2, /* scalar_to_vec_cost */
  5, /* align_load_cost */
  5, /* unalign_load_cost */
  1, /* unalign_store_cost */
  1 /* store_cost */
};

static const struct cpu_vector_cost tsv110_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  1, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &tsv110_advsimd_vector_cost, /* advsimd */
  NULL, /* sve */
};

static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
{
  2, /* int_stmt_cost */
  2, /* fp_stmt_cost */
  3, /* permute_cost */
  8, /* vec_to_scalar_cost */
  8, /* scalar_to_vec_cost */
  4, /* align_load_cost */
  4, /* unalign_load_cost */
  1, /* unalign_store_cost */
  1 /* store_cost */
};

/* Cortex-A57 costs for vector insn classes.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  1, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &cortexa57_advsimd_vector_cost, /* advsimd */
  NULL /* sve */
};

static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
{
  3, /* int_stmt_cost */
  3, /* fp_stmt_cost */
  3, /* permute_cost */
  3, /* vec_to_scalar_cost */
  3, /* scalar_to_vec_cost */
  5, /* align_load_cost */
  5, /* unalign_load_cost */
  1, /* unalign_store_cost */
  1 /* store_cost */
};

static const struct cpu_vector_cost exynosm1_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  1, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &exynosm1_advsimd_vector_cost, /* advsimd */
  NULL /* sve */
};

static const advsimd_vec_cost xgene1_advsimd_vector_cost =
{
  2, /* int_stmt_cost */
  2, /* fp_stmt_cost */
  2, /* permute_cost */
  4, /* vec_to_scalar_cost */
  4, /* scalar_to_vec_cost */
  10, /* align_load_cost */
  10, /* unalign_load_cost */
  2, /* unalign_store_cost */
  2 /* store_cost */
};

/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost xgene1_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  2, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &xgene1_advsimd_vector_cost, /* advsimd */
  NULL /* sve */
};

static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
{
  4, /* int_stmt_cost */
  5, /* fp_stmt_cost */
  10, /* permute_cost */
  6, /* vec_to_scalar_cost */
  5, /* scalar_to_vec_cost */
  4, /* align_load_cost */
  4, /* unalign_load_cost */
  1, /* unalign_store_cost */
  1 /* store_cost */
};

/* Costs for vector insn classes for Vulcan.  */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  6, /* scalar_fp_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  2, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &thunderx2t99_advsimd_vector_cost, /* advsimd */
  NULL /* sve */
};

static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
{
  5, /* int_stmt_cost */
  5, /* fp_stmt_cost */
  10, /* permute_cost */
  5, /* vec_to_scalar_cost */
  5, /* scalar_to_vec_cost */
  4, /* align_load_cost */
  4, /* unalign_load_cost */
  4, /* unalign_store_cost */
  4 /* store_cost */
};

static const struct cpu_vector_cost thunderx3t110_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  5, /* scalar_fp_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  2, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &thunderx3t110_advsimd_vector_cost, /* advsimd */
  NULL /* sve */
};


/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
{
  1,  /* Predictable.  */
  3   /* Unpredictable.  */
};

/* Generic approximation modes.  */
static const cpu_approx_modes generic_approx_modes =
{
  AARCH64_APPROX_NONE, /* division */
  AARCH64_APPROX_NONE, /* sqrt */
  AARCH64_APPROX_NONE  /* recip_sqrt */
};

/* Approximation modes for Exynos M1.  */
static const cpu_approx_modes exynosm1_approx_modes =
{
  AARCH64_APPROX_NONE, /* division */
  AARCH64_APPROX_ALL,  /* sqrt */
  AARCH64_APPROX_ALL   /* recip_sqrt */
};

/* Approximation modes for X-Gene 1.  */
static const cpu_approx_modes xgene1_approx_modes =
{
  AARCH64_APPROX_NONE, /* division */
  AARCH64_APPROX_NONE, /* sqrt */
  AARCH64_APPROX_ALL   /* recip_sqrt */
};

/* Generic prefetch settings (which disable prefetch).  */
static const cpu_prefetch_tune generic_prefetch_tune =
{
  0, /* num_slots */
  -1, /* l1_cache_size */
  -1, /* l1_cache_line_size */
  -1, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  -1 /* default_opt_level */
};

static const cpu_prefetch_tune exynosm1_prefetch_tune =
{
  0, /* num_slots */
  -1, /* l1_cache_size */
  64, /* l1_cache_line_size */
  -1, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  -1 /* default_opt_level */
};

static const cpu_prefetch_tune qdf24xx_prefetch_tune =
{
  4, /* num_slots */
  32, /* l1_cache_size */
  64, /* l1_cache_line_size */
  512, /* l2_cache_size */
  false, /* prefetch_dynamic_strides */
  2048, /* minimum_stride */
  3 /* default_opt_level */
};

static const cpu_prefetch_tune thunderxt88_prefetch_tune =
{
  8, /* num_slots */
  32, /* l1_cache_size */
  128, /* l1_cache_line_size */
  16*1024, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  3 /* default_opt_level */
};

static const cpu_prefetch_tune thunderx_prefetch_tune =
{
  8, /* num_slots */
  32, /* l1_cache_size */
  128, /* l1_cache_line_size */
  -1, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  -1 /* default_opt_level */
};

static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
{
  8, /* num_slots */
  32, /* l1_cache_size */
  64, /* l1_cache_line_size */
  256, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  -1 /* default_opt_level */
};

static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
{
  8, /* num_slots */
  32, /* l1_cache_size */
  64, /* l1_cache_line_size */
  256, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  -1 /* default_opt_level */
};

static const cpu_prefetch_tune tsv110_prefetch_tune =
{
  0, /* num_slots */
  64, /* l1_cache_size */
  64, /* l1_cache_line_size */
  512, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  -1 /* default_opt_level */
};

static const cpu_prefetch_tune xgene1_prefetch_tune =
{
  8, /* num_slots */
  32, /* l1_cache_size */
  64, /* l1_cache_line_size */
  256, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  -1 /* default_opt_level */
};

static const cpu_prefetch_tune a64fx_prefetch_tune =
{
  8, /* num_slots */
  64, /* l1_cache_size */
  256, /* l1_cache_line_size */
  32768, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  -1 /* default_opt_level */
};

static const struct tune_params generic_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  2, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
  "16:12", /* function_align.  */
  "4", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa35_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  1, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  "16", /* function_align.  */
  "4", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa53_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  2, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  "16", /* function_align.  */
  "4", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa57_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  3, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
  "16", /* function_align.  */
  "4", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa72_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  3, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
  "16", /* function_align.  */
  "4", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa73_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost.  */
  2, /* issue_rate.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  "16", /* function_align.  */
  "4", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};



static const struct tune_params exynosm1_tunings =
{
  &exynosm1_extra_costs,
  &exynosm1_addrcost_table,
  &exynosm1_regmove_cost,
  &exynosm1_vector_cost,
  &generic_branch_cost,
  &exynosm1_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  3, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
  "4", /* function_align.  */
  "4", /* jump_align.  */
  "4", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  48, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &exynosm1_prefetch_tune
};

static const struct tune_params thunderxt88_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  6, /* memmov_cost */
  2, /* issue_rate */
  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
  "8", /* function_align.  */
  "8", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags.  */
  &thunderxt88_prefetch_tune
};

static const struct tune_params thunderx_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  6, /* memmov_cost */
  2, /* issue_rate */
  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
  "8", /* function_align.  */
  "8", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags.  */
  &thunderx_prefetch_tune
};

static const struct tune_params tsv110_tunings =
{
  &tsv110_extra_costs,
  &tsv110_addrcost_table,
  &tsv110_regmove_cost,
  &tsv110_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  4, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
  "16", /* function_align.  */
  "4", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &tsv110_prefetch_tune
};

static const struct tune_params xgene1_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  6, /* memmov_cost */
  4, /* issue_rate */
  AARCH64_FUSE_NOTHING, /* fusible_ops */
  "16", /* function_align.  */
  "16", /* jump_align.  */
  "16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  17, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags.  */
  &xgene1_prefetch_tune
};

static const struct tune_params emag_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  SVE_NOT_IMPLEMENTED,
  6, /* memmov_cost */
  4, /* issue_rate */
  AARCH64_FUSE_NOTHING, /* fusible_ops */
  "16", /* function_align.  */
  "16", /* jump_align.  */
  "16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  17, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags.  */
  &xgene1_prefetch_tune
};

static const struct tune_params qdf24xx_tunings =
{
  &qdf24xx_extra_costs,
  &qdf24xx_addrcost_table,
  &qdf24xx_regmove_cost,
  &qdf24xx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  4, /* issue_rate */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
  "16", /* function_align.  */
  "8", /* jump_align.  */
  "16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
  &qdf24xx_prefetch_tune
};

/* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
   for now.  */
static const struct tune_params saphira_tunings =
{
  &generic_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  4, /* issue_rate */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
  "16", /* function_align.  */
  "8", /* jump_align.  */
  "16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params thunderx2t99_tunings =
{
  &thunderx2t99_extra_costs,
  &thunderx2t99_addrcost_table,
  &thunderx2t99_regmove_cost,
  &thunderx2t99_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost.  */
  4, /* issue_rate.  */
  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
  "16", /* function_align.  */
  "8", /* jump_align.  */
  "16", /* loop_align.  */
  3, /* int_reassoc_width.  */
  2, /* fp_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &thunderx2t99_prefetch_tune
};

static const struct tune_params thunderx3t110_tunings =
{
  &thunderx3t110_extra_costs,
  &thunderx3t110_addrcost_table,
  &thunderx3t110_regmove_cost,
  &thunderx3t110_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost.  */
  6, /* issue_rate.  */
  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
  "16", /* function_align.  */
  "8", /* jump_align.  */
  "16", /* loop_align.  */
  3, /* int_reassoc_width.  */
  2, /* fp_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &thunderx3t110_prefetch_tune
};

static const struct tune_params neoversen1_tunings =
{
  &cortexa76_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  3, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
  "32:16", /* function_align.  */
  "4", /* jump_align.  */
  "32:16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params neoversev1_tunings =
{
  &cortexa76_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_256, /* sve_width */
  4, /* memmov_cost */
  3, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
  "32:16", /* function_align.  */
  "4", /* jump_align.  */
  "32:16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params neoversen2_tunings =
{
  &cortexa76_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_128, /* sve_width */
  4, /* memmov_cost */
  3, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
  "32:16", /* function_align.  */
  "4", /* jump_align.  */
  "32:16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params a64fx_tunings =
{
  &generic_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_512, /* sve_width */
  4, /* memmov_cost */
  7, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
  "32", /* function_align.  */
  "16", /* jump_align.  */
  "32", /* loop_align.  */
  4, /* int_reassoc_width.  */
  2, /* fp_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &a64fx_prefetch_tune
};

/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char* name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);
static void aarch64_parse_sve_width_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { "sve_width", aarch64_parse_sve_width_string },
  { NULL, NULL }
};

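/* These callbacks back the -moverride command-line option: a string along
   the lines of -moverride=tune=...,fuse=...,sve_width=256 is split into
   "name=value" pairs and each pair is dispatched to the matching
   parse_override hook above (the individual fuse and tune names come from
   aarch64-fusion-pairs.def and aarch64-tuning-flags.def).  */
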
/* A processor implementing AArch64.  */
struct processor
{
  const char *const name;
  enum aarch64_processor ident;
  enum aarch64_processor sched_core;
  enum aarch64_arch arch;
  unsigned architecture_version;
  const uint64_t flags;
  const struct tune_params *const tune;
};

/* Architectures implementing AArch64.  */
static const struct processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
   all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
   FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
   AARCH64_FL_FOR_ARCH8, &generic_tunings},
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};


/* Target specification.  These are populated by the -march, -mtune, -mcpu
   handling code or by target attributes.  */
static const struct processor *selected_arch;
static const struct processor *selected_cpu;
static const struct processor *selected_tune;

enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;

/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;

/* Check whether an 'aarch64_vector_pcs' attribute is valid.  */

static tree
handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
                                     int, bool *no_add_attrs)
{
  /* Since we set fn_type_req to true, the caller should have checked
     this for us.  */
  gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
  switch ((arm_pcs) fntype_abi (*node).id ())
    {
    case ARM_PCS_AAPCS64:
    case ARM_PCS_SIMD:
      return NULL_TREE;

    case ARM_PCS_SVE:
      error ("the %qE attribute cannot be applied to an SVE function type",
             name);
      *no_add_attrs = true;
      return NULL_TREE;

    case ARM_PCS_TLSDESC:
    case ARM_PCS_UNKNOWN:
      break;
    }
  gcc_unreachable ();
}

/* Table of machine attributes.  */
static const struct attribute_spec aarch64_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  { "aarch64_vector_pcs", 0, 0, false, true, true, true,
    handle_aarch64_vector_pcs_attribute, NULL },
  { "arm_sve_vector_bits", 1, 1, false, true, false, true,
    aarch64_sve::handle_arm_sve_vector_bits_attribute,
    NULL },
  { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
  { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
  { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
  { NULL, 0, 0, false, false, false, false, NULL, NULL }
};

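/* As an illustration of the first entry from the user's side, a declaration
   such as

     void f (float *x) __attribute__ ((aarch64_vector_pcs));

   requests the vector PCS variant (ARM_PCS_SIMD) that the handler above
   accepts, while SVE function types are rejected.  */
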
#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)

/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
{
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;
};

typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))

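/* The XOR works because the condition codes above are laid out in
   complementary pairs: for example AARCH64_EQ (0) ^ 1 is AARCH64_NE (1)
   and AARCH64_GE (10) ^ 1 is AARCH64_LT (11).  */
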
struct aarch64_branch_protect_type
{
  /* The type's name that the user passes to the branch-protection option
     string.  */
  const char* name;
  /* Function to handle the protection type and set global variables.
     First argument is the string token corresponding with this type and the
     second argument is the next token in the option string.
     Return values:
     * AARCH64_PARSE_OK: Handling was successful.
     * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
       should print an error.
     * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
       own error.  */
  enum aarch64_parse_opt_result (*handler)(char*, char*);
  /* A list of types that can follow this type in the option string.  */
  const aarch64_branch_protect_type* subtypes;
  unsigned int num_subtypes;
};

static enum aarch64_parse_opt_result
aarch64_handle_no_branch_protection (char* str, char* rest)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
  aarch64_enable_bti = 0;
  if (rest)
    {
      error ("unexpected %<%s%> after %<%s%>", rest, str);
      return AARCH64_PARSE_INVALID_FEATURE;
    }
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_standard_branch_protection (char* str, char* rest)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
  aarch64_enable_bti = 1;
  if (rest)
    {
      error ("unexpected %<%s%> after %<%s%>", rest, str);
      return AARCH64_PARSE_INVALID_FEATURE;
    }
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
                                   char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
                             char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
                              char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_key = AARCH64_KEY_B;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
                               char* rest ATTRIBUTE_UNUSED)
{
  aarch64_enable_bti = 1;
  return AARCH64_PARSE_OK;
}

static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
  { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
  { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
  { NULL, NULL, NULL, 0 }
};

static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
  { "none", aarch64_handle_no_branch_protection, NULL, 0 },
  { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
  { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
    ARRAY_SIZE (aarch64_pac_ret_subtypes) },
  { "bti", aarch64_handle_bti_protection, NULL, 0 },
  { NULL, NULL, NULL, 0 }
};

43e9d192
IB
1712/* The condition codes of the processor, and the inverse function. */
1713static const char * const aarch64_condition_codes[] =
1714{
1715 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1716 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1717};
1718
57d6f4d0
RS
1719/* The preferred condition codes for SVE conditions. */
1720static const char *const aarch64_sve_condition_codes[] =
1721{
1722 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1723 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1724};
1725
0b1fe8cf
RS
1726/* Return the assembly token for svpattern value VALUE. */
1727
1728static const char *
1729svpattern_token (enum aarch64_svpattern pattern)
1730{
1731 switch (pattern)
1732 {
1733#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1734 AARCH64_FOR_SVPATTERN (CASE)
1735#undef CASE
1736 case AARCH64_NUM_SVPATTERNS:
1737 break;
1738 }
1739 gcc_unreachable ();
1740}
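/* Illustrative note (not part of the original source): each expansion of the
   CASE macro above has the shape

     case AARCH64_SV_ALL: return "all";

   so the switch maps every aarch64_svpattern enumerator to the assembly
   token used in instructions such as PTRUE.  */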
1741
38e62001
RS
1742/* Return the location of a piece that is known to be passed or returned
1743 in registers. FIRST_ZR is the first unused vector argument register
1744 and FIRST_PR is the first unused predicate argument register. */
1745
1746rtx
1747pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
1748 unsigned int first_pr) const
1749{
1750 gcc_assert (VECTOR_MODE_P (mode)
1751 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
1752 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
1753
1754 if (num_zr > 0 && num_pr == 0)
1755 return gen_rtx_REG (mode, first_zr);
1756
1757 if (num_zr == 0 && num_pr == 1)
1758 return gen_rtx_REG (mode, first_pr);
1759
1760 gcc_unreachable ();
1761}
1762
1763/* Return the total number of vector registers required by the PST. */
1764
1765unsigned int
1766pure_scalable_type_info::num_zr () const
1767{
1768 unsigned int res = 0;
1769 for (unsigned int i = 0; i < pieces.length (); ++i)
1770 res += pieces[i].num_zr;
1771 return res;
1772}
1773
1774/* Return the total number of predicate registers required by the PST. */
1775
1776unsigned int
1777pure_scalable_type_info::num_pr () const
1778{
1779 unsigned int res = 0;
1780 for (unsigned int i = 0; i < pieces.length (); ++i)
1781 res += pieces[i].num_pr;
1782 return res;
1783}
1784
1785/* Return the location of a PST that is known to be passed or returned
1786 in registers. FIRST_ZR is the first unused vector argument register
1787 and FIRST_PR is the first unused predicate argument register. */
1788
1789rtx
1790pure_scalable_type_info::get_rtx (machine_mode mode,
1791 unsigned int first_zr,
1792 unsigned int first_pr) const
1793{
1794 /* Try to return a single REG if possible. This leads to better
1795 code generation; it isn't required for correctness. */
1796 if (mode == pieces[0].mode)
1797 {
1798 gcc_assert (pieces.length () == 1);
1799 return pieces[0].get_rtx (first_zr, first_pr);
1800 }
1801
1802 /* Build up a PARALLEL that contains the individual pieces. */
1803 rtvec rtxes = rtvec_alloc (pieces.length ());
1804 for (unsigned int i = 0; i < pieces.length (); ++i)
1805 {
1806 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
1807 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
1808 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
1809 first_zr += pieces[i].num_zr;
1810 first_pr += pieces[i].num_pr;
1811 }
1812 return gen_rtx_PARALLEL (mode, rtxes);
1813}
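/* Illustrative sketch (not part of the original source): when no single
   mode covers the whole PST, the PARALLEL built above pairs each piece's
   register with its byte offset, e.g. for a PST made of two SVE vectors
   passed in Z0 and Z1:

     (parallel [(expr_list (reg z0) (const_int 0))
                (expr_list (reg z1) (offset of the second vector))])  */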
1814
1815/* Analyze whether TYPE is a Pure Scalable Type according to the rules
1816 in the AAPCS64. */
1817
1818pure_scalable_type_info::analysis_result
1819pure_scalable_type_info::analyze (const_tree type)
1820{
1821 /* Prevent accidental reuse. */
1822 gcc_assert (pieces.is_empty ());
1823
1824 /* No code will be generated for erroneous types, so we won't establish
1825 an ABI mapping. */
1826 if (type == error_mark_node)
1827 return NO_ABI_IDENTITY;
1828
1829 /* Zero-sized types disappear in the language->ABI mapping. */
1830 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1831 return NO_ABI_IDENTITY;
1832
1833 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
1834 piece p = {};
1835 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
1836 {
1837 machine_mode mode = TYPE_MODE_RAW (type);
1838 gcc_assert (VECTOR_MODE_P (mode)
1839 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
1840
1841 p.mode = p.orig_mode = mode;
1842 add_piece (p);
1843 return IS_PST;
1844 }
1845
1846 /* Check for user-defined PSTs. */
1847 if (TREE_CODE (type) == ARRAY_TYPE)
1848 return analyze_array (type);
1849 if (TREE_CODE (type) == RECORD_TYPE)
1850 return analyze_record (type);
1851
1852 return ISNT_PST;
1853}
1854
1855/* Analyze a type that is known not to be passed or returned in memory.
1856 Return true if it has an ABI identity and is a Pure Scalable Type. */
1857
1858bool
1859pure_scalable_type_info::analyze_registers (const_tree type)
1860{
1861 analysis_result result = analyze (type);
1862 gcc_assert (result != DOESNT_MATTER);
1863 return result == IS_PST;
1864}
1865
1866/* Subroutine of analyze for handling ARRAY_TYPEs. */
1867
1868pure_scalable_type_info::analysis_result
1869pure_scalable_type_info::analyze_array (const_tree type)
1870{
1871 /* Analyze the element type. */
1872 pure_scalable_type_info element_info;
1873 analysis_result result = element_info.analyze (TREE_TYPE (type));
1874 if (result != IS_PST)
1875 return result;
1876
1877 /* An array of unknown, flexible or variable length will be passed and
1878 returned by reference whatever we do. */
1879 tree nelts_minus_one = array_type_nelts (type);
1880 if (!tree_fits_uhwi_p (nelts_minus_one))
1881 return DOESNT_MATTER;
1882
1883 /* Likewise if the array is constant-sized but too big to be interesting.
1884 The double checks against MAX_PIECES are to protect against overflow. */
1885 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
1886 if (count > MAX_PIECES)
1887 return DOESNT_MATTER;
1888 count += 1;
1889 if (count * element_info.pieces.length () > MAX_PIECES)
1890 return DOESNT_MATTER;
1891
1892 /* The above checks should have weeded out elements of unknown size. */
1893 poly_uint64 element_bytes;
1894 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
1895 gcc_unreachable ();
1896
1897 /* Build up the list of individual vectors and predicates. */
1898 gcc_assert (!element_info.pieces.is_empty ());
1899 for (unsigned int i = 0; i < count; ++i)
1900 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
1901 {
1902 piece p = element_info.pieces[j];
1903 p.offset += i * element_bytes;
1904 add_piece (p);
1905 }
1906 return IS_PST;
1907}
1908
1909/* Subroutine of analyze for handling RECORD_TYPEs. */
1910
1911pure_scalable_type_info::analysis_result
1912pure_scalable_type_info::analyze_record (const_tree type)
1913{
1914 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1915 {
1916 if (TREE_CODE (field) != FIELD_DECL)
1917 continue;
1918
1919 /* Zero-sized fields disappear in the language->ABI mapping. */
1920 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
1921 continue;
1922
1923 /* All fields with an ABI identity must be PSTs for the record as
1924 a whole to be a PST. If any individual field is too big to be
1925 interesting then the record is too. */
1926 pure_scalable_type_info field_info;
1927 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
1928 if (subresult == NO_ABI_IDENTITY)
1929 continue;
1930 if (subresult != IS_PST)
1931 return subresult;
1932
1933 /* Since all previous fields are PSTs, we ought to be able to track
1934 the field offset using poly_ints. */
1935 tree bitpos = bit_position (field);
1936 gcc_assert (poly_int_tree_p (bitpos));
1937
1938 /* For the same reason, it shouldn't be possible to create a PST field
1939 whose offset isn't byte-aligned. */
1940 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
1941 BITS_PER_UNIT);
1942
1943 /* Punt if the record is too big to be interesting. */
1944 poly_uint64 bytepos;
1945 if (!wide_bytepos.to_uhwi (&bytepos)
1946 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
1947 return DOESNT_MATTER;
1948
1949 /* Add the individual vectors and predicates in the field to the
1950 record's list. */
1951 gcc_assert (!field_info.pieces.is_empty ());
1952 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
1953 {
1954 piece p = field_info.pieces[i];
1955 p.offset += bytepos;
1956 add_piece (p);
1957 }
1958 }
1959 /* Empty structures disappear in the language->ABI mapping. */
1960 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
1961}
1962
1963/* Add P to the list of pieces in the type. */
1964
1965void
1966pure_scalable_type_info::add_piece (const piece &p)
1967{
1968 /* Try to fold the new piece into the previous one to form a
1969 single-mode PST. For example, if we see three consecutive vectors
1970 of the same mode, we can represent them using the corresponding
1971 3-tuple mode.
1972
1973 This is purely an optimization. */
1974 if (!pieces.is_empty ())
1975 {
1976 piece &prev = pieces.last ();
1977 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
1978 unsigned int nelems1, nelems2;
1979 if (prev.orig_mode == p.orig_mode
1980 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
1981 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
1982 GET_MODE_NUNITS (p.orig_mode), &nelems1)
1983 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
1984 GET_MODE_NUNITS (p.orig_mode), &nelems2)
1985 && targetm.array_mode (p.orig_mode,
1986 nelems1 + nelems2).exists (&prev.mode))
1987 {
1988 prev.num_zr += p.num_zr;
1989 prev.num_pr += p.num_pr;
1990 return;
1991 }
1992 }
1993 pieces.quick_push (p);
1994}
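/* Illustrative example (not part of the original source): if the previous
   piece is a single VNx16QImode vector at offset 0 and the new piece is
   another VNx16QImode vector that starts exactly where the first one ends,
   the two are folded into one VNx32QImode (2-vector tuple) piece rather
   than being recorded separately.  */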
1995
1996/* Return true if at least one possible value of type TYPE includes at
1997 least one object of Pure Scalable Type, in the sense of the AAPCS64.
1998
1999 This is a relatively expensive test for some types, so it should
2000 generally be made as late as possible. */
2001
2002static bool
2003aarch64_some_values_include_pst_objects_p (const_tree type)
2004{
2005 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
2006 return false;
2007
2008 if (aarch64_sve::builtin_type_p (type))
2009 return true;
2010
2011 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
2012 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
2013
2014 if (RECORD_OR_UNION_TYPE_P (type))
2015 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
2016 if (TREE_CODE (field) == FIELD_DECL
2017 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
2018 return true;
2019
2020 return false;
2021}
2022
002ffd3c
RS
2023/* Return the descriptor of the SIMD ABI. */
2024
2025static const predefined_function_abi &
2026aarch64_simd_abi (void)
2027{
2028 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
2029 if (!simd_abi.initialized_p ())
2030 {
2031 HARD_REG_SET full_reg_clobbers
2032 = default_function_abi.full_reg_clobbers ();
2033 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
2034 if (FP_SIMD_SAVED_REGNUM_P (regno))
2035 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2036 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
2037 }
2038 return simd_abi;
2039}
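/* Illustrative note (not part of the original source): the effect of the
   loop above is that a call using this ABI (aarch64_vector_pcs) is treated
   as preserving the low 128 bits of V8-V23, whereas the base PCS only
   guarantees the low 64 bits of V8-V15.  */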
2040
c600df9a
RS
2041/* Return the descriptor of the SVE PCS. */
2042
2043static const predefined_function_abi &
2044aarch64_sve_abi (void)
2045{
2046 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
2047 if (!sve_abi.initialized_p ())
2048 {
2049 HARD_REG_SET full_reg_clobbers
2050 = default_function_abi.full_reg_clobbers ();
2051 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
2052 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
cb26919c 2053 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
c600df9a
RS
2054 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2055 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
2056 }
2057 return sve_abi;
2058}
2059
74b27d8e
RS
2060/* If X is an UNSPEC_SALT_ADDR expression, return the address that it
2061 wraps, otherwise return X itself. */
2062
2063static rtx
2064strip_salt (rtx x)
2065{
2066 rtx search = x;
2067 if (GET_CODE (search) == CONST)
2068 search = XEXP (search, 0);
2069 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
2070 x = XVECEXP (search, 0, 0);
2071 return x;
2072}
2073
2074/* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
2075 expression. */
2076
2077static rtx
2078strip_offset_and_salt (rtx addr, poly_int64 *offset)
2079{
2080 return strip_salt (strip_offset (addr, offset));
2081}
2082
973d2e01
TP
2083/* Generate code to enable conditional branches in functions over 1 MiB. */
2084const char *
2085aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
2086 const char * branch_format)
2087{
2088 rtx_code_label * tmp_label = gen_label_rtx ();
2089 char label_buf[256];
2090 char buffer[128];
2091 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
2092 CODE_LABEL_NUMBER (tmp_label));
2093 const char *label_ptr = targetm.strip_name_encoding (label_buf);
2094 rtx dest_label = operands[pos_label];
2095 operands[pos_label] = tmp_label;
2096
2097 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
2098 output_asm_insn (buffer, operands);
2099
2100 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
2101 operands[pos_label] = dest_label;
2102 output_asm_insn (buffer, operands);
2103 return "";
2104}
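/* Illustrative example (not part of the original source): callers pass the
   inverted short-range branch in BRANCH_FORMAT, so for an out-of-range
   conditional branch the routine above emits a sequence along the lines of

     tbz x0, #63, .Llocal   // inverted test, short range
     b .Lfar_target         // unconditional branch, +/-128 MiB range
   .Llocal:

   where the label names are illustrative only.  */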
2105
261fb553 2106void
fc29dfc9 2107aarch64_err_no_fpadvsimd (machine_mode mode)
261fb553 2108{
261fb553 2109 if (TARGET_GENERAL_REGS_ONLY)
fc29dfc9
SE
2110 if (FLOAT_MODE_P (mode))
2111 error ("%qs is incompatible with the use of floating-point types",
2112 "-mgeneral-regs-only");
2113 else
2114 error ("%qs is incompatible with the use of vector types",
2115 "-mgeneral-regs-only");
261fb553 2116 else
fc29dfc9
SE
2117 if (FLOAT_MODE_P (mode))
2118 error ("%qs feature modifier is incompatible with the use of"
2119 " floating-point types", "+nofp");
2120 else
2121 error ("%qs feature modifier is incompatible with the use of"
2122 " vector types", "+nofp");
261fb553
AL
2123}
2124
c0e0174b
RS
2125/* Report when we try to do something that requires SVE when SVE is disabled.
2126 This is an error of last resort and isn't very high-quality. It usually
2127 involves attempts to measure the vector length in some way. */
2128static void
2129aarch64_report_sve_required (void)
2130{
2131 static bool reported_p = false;
2132
2133 /* Avoid reporting a slew of messages for a single oversight. */
2134 if (reported_p)
2135 return;
2136
2137 error ("this operation requires the SVE ISA extension");
2138 inform (input_location, "you can enable SVE using the command-line"
2139 " option %<-march%>, or by using the %<target%>"
2140 " attribute or pragma");
2141 reported_p = true;
2142}
2143
183bfdaf
RS
2144/* Return true if REGNO is P0-P15 or one of the special FFR-related
2145 registers. */
2146inline bool
2147pr_or_ffr_regnum_p (unsigned int regno)
2148{
2149 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
2150}
2151
c64f7d37 2152/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
2eb2847e
WD
2153 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
2154 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
2155 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
2156 and GENERAL_REGS is lower than the memory cost (in this case the best class
 2157 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
2158 cost results in bad allocations with many redundant int<->FP moves which
2159 are expensive on various cores.
2160 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
2161 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
2162 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
2163 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
31e2b5a3
WD
2164 The result of this is that it is no longer inefficient to have a higher
2165 memory move cost than the register move cost.
2166*/
c64f7d37
WD
2167
2168static reg_class_t
31e2b5a3
WD
2169aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
2170 reg_class_t best_class)
c64f7d37 2171{
b8506a8a 2172 machine_mode mode;
c64f7d37 2173
67e5c59a
RS
2174 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
2175 || !reg_class_subset_p (FP_REGS, allocno_class))
c64f7d37
WD
2176 return allocno_class;
2177
67e5c59a
RS
2178 if (!reg_class_subset_p (GENERAL_REGS, best_class)
2179 || !reg_class_subset_p (FP_REGS, best_class))
31e2b5a3
WD
2180 return best_class;
2181
c64f7d37
WD
2182 mode = PSEUDO_REGNO_MODE (regno);
2183 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
2184}
2185
26e0ff94 2186static unsigned int
b8506a8a 2187aarch64_min_divisions_for_recip_mul (machine_mode mode)
26e0ff94 2188{
50093a33 2189 if (GET_MODE_UNIT_SIZE (mode) == 4)
b175b679
JG
2190 return aarch64_tune_params.min_div_recip_mul_sf;
2191 return aarch64_tune_params.min_div_recip_mul_df;
26e0ff94
WD
2192}
2193
b5b33e11 2194/* Return the reassociation width of treeop OPC with mode MODE. */
cee66c68 2195static int
b5b33e11 2196aarch64_reassociation_width (unsigned opc, machine_mode mode)
cee66c68
WD
2197{
2198 if (VECTOR_MODE_P (mode))
b175b679 2199 return aarch64_tune_params.vec_reassoc_width;
cee66c68 2200 if (INTEGRAL_MODE_P (mode))
b175b679 2201 return aarch64_tune_params.int_reassoc_width;
b5b33e11
WD
2202 /* Avoid reassociating floating point addition so we emit more FMAs. */
2203 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
b175b679 2204 return aarch64_tune_params.fp_reassoc_width;
cee66c68
WD
2205 return 1;
2206}
2207
43e9d192
IB
2208/* Provide a mapping from gcc register numbers to dwarf register numbers. */
2209unsigned
2210aarch64_dbx_register_number (unsigned regno)
2211{
2212 if (GP_REGNUM_P (regno))
2213 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
2214 else if (regno == SP_REGNUM)
2215 return AARCH64_DWARF_SP;
2216 else if (FP_REGNUM_P (regno))
2217 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
43cacb12
RS
2218 else if (PR_REGNUM_P (regno))
2219 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
2220 else if (regno == VG_REGNUM)
2221 return AARCH64_DWARF_VG;
43e9d192
IB
2222
2223 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
2224 equivalent DWARF register. */
2225 return DWARF_FRAME_REGISTERS;
2226}
2227
d29f7dd5
RS
2228/* If X is a CONST_DOUBLE, return its bit representation as a constant
2229 integer, otherwise return X unmodified. */
2230static rtx
2231aarch64_bit_representation (rtx x)
2232{
2233 if (CONST_DOUBLE_P (x))
2234 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
2235 return x;
2236}
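/* Illustrative example (not part of the original source): for the DFmode
   constant 1.0, the function above returns (const_int 0x3ff0000000000000),
   i.e. the IEEE-754 double encoding reinterpreted as an integer.  */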
2237
43cacb12
RS
2238/* Return true if MODE is any of the Advanced SIMD structure modes. */
2239static bool
2240aarch64_advsimd_struct_mode_p (machine_mode mode)
2241{
2242 return (TARGET_SIMD
2243 && (mode == OImode || mode == CImode || mode == XImode));
2244}
2245
2246/* Return true if MODE is an SVE predicate mode. */
2247static bool
2248aarch64_sve_pred_mode_p (machine_mode mode)
2249{
2250 return (TARGET_SVE
2251 && (mode == VNx16BImode
2252 || mode == VNx8BImode
2253 || mode == VNx4BImode
2254 || mode == VNx2BImode));
2255}
2256
2257/* Three mutually-exclusive flags describing a vector or predicate type. */
2258const unsigned int VEC_ADVSIMD = 1;
2259const unsigned int VEC_SVE_DATA = 2;
2260const unsigned int VEC_SVE_PRED = 4;
2261/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
2262 a structure of 2, 3 or 4 vectors. */
2263const unsigned int VEC_STRUCT = 8;
550a3380
RS
2264/* Can be used in combination with VEC_SVE_DATA to indicate that the
2265 vector has fewer significant bytes than a full SVE vector. */
2266const unsigned int VEC_PARTIAL = 16;
43cacb12
RS
2267/* Useful combinations of the above. */
2268const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
2269const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
2270
2271/* Return a set of flags describing the vector properties of mode MODE.
2272 Ignore modes that are not supported by the current target. */
2273static unsigned int
2274aarch64_classify_vector_mode (machine_mode mode)
2275{
2276 if (aarch64_advsimd_struct_mode_p (mode))
2277 return VEC_ADVSIMD | VEC_STRUCT;
2278
2279 if (aarch64_sve_pred_mode_p (mode))
2280 return VEC_SVE_PRED;
2281
806f69cd
RS
2282 /* Make the decision based on the mode's enum value rather than its
2283 properties, so that we keep the correct classification regardless
2284 of -msve-vector-bits. */
2285 switch (mode)
43cacb12 2286 {
550a3380
RS
2287 /* Partial SVE QI vectors. */
2288 case E_VNx2QImode:
2289 case E_VNx4QImode:
2290 case E_VNx8QImode:
2291 /* Partial SVE HI vectors. */
2292 case E_VNx2HImode:
2293 case E_VNx4HImode:
2294 /* Partial SVE SI vector. */
2295 case E_VNx2SImode:
cc68f7c2
RS
2296 /* Partial SVE HF vectors. */
2297 case E_VNx2HFmode:
2298 case E_VNx4HFmode:
6c3ce63b
RS
2299 /* Partial SVE BF vectors. */
2300 case E_VNx2BFmode:
2301 case E_VNx4BFmode:
cc68f7c2
RS
2302 /* Partial SVE SF vector. */
2303 case E_VNx2SFmode:
550a3380
RS
2304 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
2305
806f69cd
RS
2306 case E_VNx16QImode:
2307 case E_VNx8HImode:
2308 case E_VNx4SImode:
2309 case E_VNx2DImode:
02fcd8ac 2310 case E_VNx8BFmode:
806f69cd
RS
2311 case E_VNx8HFmode:
2312 case E_VNx4SFmode:
2313 case E_VNx2DFmode:
2314 return TARGET_SVE ? VEC_SVE_DATA : 0;
2315
2316 /* x2 SVE vectors. */
2317 case E_VNx32QImode:
2318 case E_VNx16HImode:
2319 case E_VNx8SImode:
2320 case E_VNx4DImode:
02fcd8ac 2321 case E_VNx16BFmode:
806f69cd
RS
2322 case E_VNx16HFmode:
2323 case E_VNx8SFmode:
2324 case E_VNx4DFmode:
2325 /* x3 SVE vectors. */
2326 case E_VNx48QImode:
2327 case E_VNx24HImode:
2328 case E_VNx12SImode:
2329 case E_VNx6DImode:
02fcd8ac 2330 case E_VNx24BFmode:
806f69cd
RS
2331 case E_VNx24HFmode:
2332 case E_VNx12SFmode:
2333 case E_VNx6DFmode:
2334 /* x4 SVE vectors. */
2335 case E_VNx64QImode:
2336 case E_VNx32HImode:
2337 case E_VNx16SImode:
2338 case E_VNx8DImode:
02fcd8ac 2339 case E_VNx32BFmode:
806f69cd
RS
2340 case E_VNx32HFmode:
2341 case E_VNx16SFmode:
2342 case E_VNx8DFmode:
2343 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
2344
2345 /* 64-bit Advanced SIMD vectors. */
2346 case E_V8QImode:
2347 case E_V4HImode:
2348 case E_V2SImode:
2349 /* ...E_V1DImode doesn't exist. */
2350 case E_V4HFmode:
abbe1ed2 2351 case E_V4BFmode:
806f69cd
RS
2352 case E_V2SFmode:
2353 case E_V1DFmode:
2354 /* 128-bit Advanced SIMD vectors. */
2355 case E_V16QImode:
2356 case E_V8HImode:
2357 case E_V4SImode:
2358 case E_V2DImode:
2359 case E_V8HFmode:
abbe1ed2 2360 case E_V8BFmode:
806f69cd
RS
2361 case E_V4SFmode:
2362 case E_V2DFmode:
2363 return TARGET_SIMD ? VEC_ADVSIMD : 0;
2364
2365 default:
2366 return 0;
43cacb12 2367 }
43cacb12
RS
2368}
2369
2370/* Return true if MODE is any of the data vector modes, including
2371 structure modes. */
43e9d192 2372static bool
43cacb12 2373aarch64_vector_data_mode_p (machine_mode mode)
43e9d192 2374{
43cacb12 2375 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
43e9d192
IB
2376}
2377
5c38705d
RS
2378/* Return true if MODE is any form of SVE mode, including predicates,
2379 vectors and structures. */
2380bool
2381aarch64_sve_mode_p (machine_mode mode)
2382{
2383 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
2384}
2385
43cacb12
RS
2386/* Return true if MODE is an SVE data vector mode; either a single vector
2387 or a structure of vectors. */
43e9d192 2388static bool
43cacb12 2389aarch64_sve_data_mode_p (machine_mode mode)
43e9d192 2390{
43cacb12 2391 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
43e9d192
IB
2392}
2393
550a3380
RS
2394/* Return the number of defined bytes in one constituent vector of
2395 SVE mode MODE, which has vector flags VEC_FLAGS. */
2396static poly_int64
2397aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
2398{
2399 if (vec_flags & VEC_PARTIAL)
2400 /* A single partial vector. */
2401 return GET_MODE_SIZE (mode);
2402
2403 if (vec_flags & VEC_SVE_DATA)
2404 /* A single vector or a tuple. */
2405 return BYTES_PER_SVE_VECTOR;
2406
2407 /* A single predicate. */
2408 gcc_assert (vec_flags & VEC_SVE_PRED);
2409 return BYTES_PER_SVE_PRED;
2410}
2411
9f4cbab8
RS
2412/* Implement target hook TARGET_ARRAY_MODE. */
2413static opt_machine_mode
2414aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
2415{
2416 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
2417 && IN_RANGE (nelems, 2, 4))
2418 return mode_for_vector (GET_MODE_INNER (mode),
2419 GET_MODE_NUNITS (mode) * nelems);
2420
2421 return opt_machine_mode ();
2422}
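/* Illustrative example (not part of the original source): for a full SVE
   vector mode such as VNx4SImode, a request for a 3-element array yields
   the 3-vector tuple mode, i.e.

     aarch64_array_mode (VNx4SImode, 3) == VNx12SImode

   while for other modes the function expresses no preference.  */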
2423
43e9d192
IB
2424/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
2425static bool
ef4bddc2 2426aarch64_array_mode_supported_p (machine_mode mode,
43e9d192
IB
2427 unsigned HOST_WIDE_INT nelems)
2428{
2429 if (TARGET_SIMD
635e66fe
AL
2430 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
2431 || AARCH64_VALID_SIMD_DREG_MODE (mode))
43e9d192
IB
2432 && (nelems >= 2 && nelems <= 4))
2433 return true;
2434
2435 return false;
2436}
2437
cc68f7c2
RS
2438/* MODE is some form of SVE vector mode. For data modes, return the number
2439 of vector register bits that each element of MODE occupies, such as 64
2440 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
2441 in a 64-bit container). For predicate modes, return the number of
2442 data bits controlled by each significant predicate bit. */
2443
2444static unsigned int
2445aarch64_sve_container_bits (machine_mode mode)
2446{
2447 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2448 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
2449 ? BITS_PER_SVE_VECTOR
2450 : GET_MODE_BITSIZE (mode));
2451 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
2452}
2453
43cacb12
RS
2454/* Return the SVE predicate mode to use for elements that have
2455 ELEM_NBYTES bytes, if such a mode exists. */
2456
2457opt_machine_mode
2458aarch64_sve_pred_mode (unsigned int elem_nbytes)
2459{
2460 if (TARGET_SVE)
2461 {
2462 if (elem_nbytes == 1)
2463 return VNx16BImode;
2464 if (elem_nbytes == 2)
2465 return VNx8BImode;
2466 if (elem_nbytes == 4)
2467 return VNx4BImode;
2468 if (elem_nbytes == 8)
2469 return VNx2BImode;
2470 }
2471 return opt_machine_mode ();
2472}
2473
cc68f7c2
RS
2474/* Return the SVE predicate mode that should be used to control
2475 SVE mode MODE. */
2476
2477machine_mode
2478aarch64_sve_pred_mode (machine_mode mode)
2479{
2480 unsigned int bits = aarch64_sve_container_bits (mode);
2481 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
2482}
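/* Illustrative examples (not part of the original source): a full
   VNx4SImode vector uses 32-bit containers, so it is governed by
   VNx4BImode; the partial vector VNx2SImode keeps each 32-bit element in a
   64-bit container, so it is governed by VNx2BImode instead.  */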
2483
43cacb12
RS
2484/* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
2485
2486static opt_machine_mode
10116ec1 2487aarch64_get_mask_mode (machine_mode mode)
43cacb12 2488{
10116ec1
RS
2489 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2490 if (vec_flags & VEC_SVE_DATA)
cc68f7c2 2491 return aarch64_sve_pred_mode (mode);
43cacb12 2492
10116ec1 2493 return default_get_mask_mode (mode);
43cacb12
RS
2494}
2495
d7a09c44
RS
2496/* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
2497
624d0f07 2498opt_machine_mode
d7a09c44
RS
2499aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
2500{
2501 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
2502 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
2503 machine_mode mode;
2504 FOR_EACH_MODE_IN_CLASS (mode, mclass)
2505 if (inner_mode == GET_MODE_INNER (mode)
2506 && known_eq (nunits, GET_MODE_NUNITS (mode))
2507 && aarch64_sve_data_mode_p (mode))
2508 return mode;
2509 return opt_machine_mode ();
2510}
2511
1044fa32
RS
2512/* Return the integer element mode associated with SVE mode MODE. */
2513
2514static scalar_int_mode
2515aarch64_sve_element_int_mode (machine_mode mode)
2516{
cc68f7c2
RS
2517 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2518 ? BITS_PER_SVE_VECTOR
2519 : GET_MODE_BITSIZE (mode));
2520 unsigned int elt_bits = vector_element_size (vector_bits,
1044fa32
RS
2521 GET_MODE_NUNITS (mode));
2522 return int_mode_for_size (elt_bits, 0).require ();
2523}
2524
cc68f7c2
RS
2525/* Return an integer element mode that contains exactly
2526 aarch64_sve_container_bits (MODE) bits. This is wider than
2527 aarch64_sve_element_int_mode if MODE is a partial vector,
2528 otherwise it's the same. */
2529
2530static scalar_int_mode
2531aarch64_sve_container_int_mode (machine_mode mode)
2532{
2533 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
2534}
2535
d7a09c44 2536/* Return the integer vector mode associated with SVE mode MODE.
d083ee47 2537 Unlike related_int_vector_mode, this can handle the case in which
d7a09c44
RS
2538 MODE is a predicate (and thus has a different total size). */
2539
624d0f07 2540machine_mode
d7a09c44
RS
2541aarch64_sve_int_mode (machine_mode mode)
2542{
2543 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
2544 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
2545}
2546
74166aab
RS
2547/* Implement TARGET_VECTORIZE_RELATED_MODE. */
2548
2549static opt_machine_mode
2550aarch64_vectorize_related_mode (machine_mode vector_mode,
2551 scalar_mode element_mode,
2552 poly_uint64 nunits)
2553{
2554 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
2555
cc68f7c2
RS
2556 /* If we're operating on SVE vectors, try to return an SVE mode. */
2557 poly_uint64 sve_nunits;
2558 if ((vec_flags & VEC_SVE_DATA)
2559 && multiple_p (BYTES_PER_SVE_VECTOR,
2560 GET_MODE_SIZE (element_mode), &sve_nunits))
2561 {
2562 machine_mode sve_mode;
2563 if (maybe_ne (nunits, 0U))
2564 {
2565 /* Try to find a full or partial SVE mode with exactly
2566 NUNITS units. */
2567 if (multiple_p (sve_nunits, nunits)
2568 && aarch64_sve_data_mode (element_mode,
2569 nunits).exists (&sve_mode))
2570 return sve_mode;
2571 }
2572 else
2573 {
2574 /* Take the preferred number of units from the number of bytes
2575 that fit in VECTOR_MODE. We always start by "autodetecting"
2576 a full vector mode with preferred_simd_mode, so vectors
2577 chosen here will also be full vector modes. Then
2578 autovectorize_vector_modes tries smaller starting modes
2579 and thus smaller preferred numbers of units. */
2580 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
2581 if (aarch64_sve_data_mode (element_mode,
2582 sve_nunits).exists (&sve_mode))
2583 return sve_mode;
2584 }
2585 }
2586
74166aab
RS
2587 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
2588 if ((vec_flags & VEC_ADVSIMD)
2589 && known_eq (nunits, 0U)
2590 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
2591 && maybe_ge (GET_MODE_BITSIZE (element_mode)
2592 * GET_MODE_NUNITS (vector_mode), 128U))
2593 {
2594 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
2595 if (VECTOR_MODE_P (res))
2596 return res;
2597 }
2598
2599 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2600}
2601
b41d1f6e
RS
2602/* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
2603 prefer to use the first arithmetic operand as the else value if
2604 the else value doesn't matter, since that exactly matches the SVE
2605 destructive merging form. For ternary operations we could either
2606 pick the first operand and use FMAD-like instructions or the last
2607 operand and use FMLA-like instructions; the latter seems more
2608 natural. */
6a86928d
RS
2609
2610static tree
b41d1f6e 2611aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
6a86928d 2612{
b41d1f6e 2613 return nops == 3 ? ops[2] : ops[0];
6a86928d
RS
2614}
2615
c43f4279 2616/* Implement TARGET_HARD_REGNO_NREGS. */
43e9d192 2617
c43f4279 2618static unsigned int
ef4bddc2 2619aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
43e9d192 2620{
6a70badb
RS
2621 /* ??? Logically we should only need to provide a value when
2622 HARD_REGNO_MODE_OK says that the combination is valid,
2623 but at the moment we need to handle all modes. Just ignore
2624 any runtime parts for registers that can't store them. */
2625 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
43e9d192
IB
2626 switch (aarch64_regno_regclass (regno))
2627 {
2628 case FP_REGS:
2629 case FP_LO_REGS:
163b1f6a 2630 case FP_LO8_REGS:
550a3380
RS
2631 {
2632 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2633 if (vec_flags & VEC_SVE_DATA)
2634 return exact_div (GET_MODE_SIZE (mode),
2635 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2636 return CEIL (lowest_size, UNITS_PER_VREG);
2637 }
43cacb12
RS
2638 case PR_REGS:
2639 case PR_LO_REGS:
2640 case PR_HI_REGS:
183bfdaf
RS
2641 case FFR_REGS:
2642 case PR_AND_FFR_REGS:
43cacb12 2643 return 1;
43e9d192 2644 default:
6a70badb 2645 return CEIL (lowest_size, UNITS_PER_WORD);
43e9d192
IB
2646 }
2647 gcc_unreachable ();
2648}
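/* Illustrative examples (not part of the original source): with this
   definition a TImode value needs two general registers (CEIL (16, 8)), a
   single SVE vector such as VNx16QImode needs one FP register, and an SVE
   2-vector tuple such as VNx32QImode needs two.  */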
2649
f939c3e6 2650/* Implement TARGET_HARD_REGNO_MODE_OK. */
43e9d192 2651
f939c3e6 2652static bool
ef4bddc2 2653aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
43e9d192
IB
2654{
2655 if (GET_MODE_CLASS (mode) == MODE_CC)
2656 return regno == CC_REGNUM;
2657
43cacb12
RS
2658 if (regno == VG_REGNUM)
2659 /* This must have the same size as _Unwind_Word. */
2660 return mode == DImode;
2661
2662 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2663 if (vec_flags & VEC_SVE_PRED)
183bfdaf 2664 return pr_or_ffr_regnum_p (regno);
43cacb12 2665
183bfdaf
RS
2666 if (pr_or_ffr_regnum_p (regno))
2667 return false;
43cacb12 2668
9259db42
YZ
2669 if (regno == SP_REGNUM)
2670 /* The purpose of comparing with ptr_mode is to support the
2671 global register variable associated with the stack pointer
2672 register via the syntax of asm ("wsp") in ILP32. */
2673 return mode == Pmode || mode == ptr_mode;
2674
2675 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
43e9d192
IB
2676 return mode == Pmode;
2677
563cc649
RH
2678 if (GP_REGNUM_P (regno))
2679 {
aa1a2795
RS
2680 if (vec_flags & VEC_ANY_SVE)
2681 return false;
563cc649
RH
2682 if (known_le (GET_MODE_SIZE (mode), 8))
2683 return true;
aa1a2795 2684 if (known_le (GET_MODE_SIZE (mode), 16))
563cc649
RH
2685 return (regno & 1) == 0;
2686 }
2687 else if (FP_REGNUM_P (regno))
43e9d192 2688 {
43cacb12 2689 if (vec_flags & VEC_STRUCT)
4edd6298 2690 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
43e9d192 2691 else
43cacb12 2692 return !VECTOR_MODE_P (mode) || vec_flags != 0;
43e9d192
IB
2693 }
2694
f939c3e6 2695 return false;
43e9d192
IB
2696}
2697
c600df9a
RS
2698/* Return true if a function with type FNTYPE returns its value in
2699 SVE vector or predicate registers. */
2700
2701static bool
2702aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2703{
c600df9a 2704 tree return_type = TREE_TYPE (fntype);
38e62001
RS
2705
2706 pure_scalable_type_info pst_info;
2707 switch (pst_info.analyze (return_type))
2708 {
2709 case pure_scalable_type_info::IS_PST:
2710 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2711 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2712
2713 case pure_scalable_type_info::DOESNT_MATTER:
2714 gcc_assert (aarch64_return_in_memory_1 (return_type));
2715 return false;
2716
2717 case pure_scalable_type_info::NO_ABI_IDENTITY:
2718 case pure_scalable_type_info::ISNT_PST:
2719 return false;
2720 }
2721 gcc_unreachable ();
c600df9a
RS
2722}
2723
2724/* Return true if a function with type FNTYPE takes arguments in
2725 SVE vector or predicate registers. */
2726
2727static bool
2728aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2729{
2730 CUMULATIVE_ARGS args_so_far_v;
2731 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2732 NULL_TREE, 0, true);
2733 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2734
2735 for (tree chain = TYPE_ARG_TYPES (fntype);
2736 chain && chain != void_list_node;
2737 chain = TREE_CHAIN (chain))
2738 {
2739 tree arg_type = TREE_VALUE (chain);
2740 if (arg_type == error_mark_node)
2741 return false;
2742
2743 function_arg_info arg (arg_type, /*named=*/true);
2744 apply_pass_by_reference_rules (&args_so_far_v, arg);
38e62001
RS
2745 pure_scalable_type_info pst_info;
2746 if (pst_info.analyze_registers (arg.type))
2747 {
2748 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2749 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2750 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2751 return true;
2752 }
c600df9a
RS
2753
2754 targetm.calls.function_arg_advance (args_so_far, arg);
2755 }
2756 return false;
2757}
2758
002ffd3c
RS
2759/* Implement TARGET_FNTYPE_ABI. */
2760
2761static const predefined_function_abi &
2762aarch64_fntype_abi (const_tree fntype)
2763{
2764 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2765 return aarch64_simd_abi ();
c600df9a
RS
2766
2767 if (aarch64_returns_value_in_sve_regs_p (fntype)
2768 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2769 return aarch64_sve_abi ();
2770
002ffd3c
RS
2771 return default_function_abi;
2772}
2773
482b2b43
RS
2774/* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2775
2776static bool
2777aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2778{
2779 return (aarch64_sve::builtin_type_p (type1)
2780 == aarch64_sve::builtin_type_p (type2));
2781}
2782
c600df9a 2783/* Return true if we should emit CFI for register REGNO. */
a0d0b980
SE
2784
2785static bool
c600df9a 2786aarch64_emit_cfi_for_reg_p (unsigned int regno)
a0d0b980 2787{
c600df9a
RS
2788 return (GP_REGNUM_P (regno)
2789 || !default_function_abi.clobbers_full_reg_p (regno));
a0d0b980
SE
2790}
2791
c600df9a 2792/* Return the mode we should use to save and restore register REGNO. */
a0d0b980
SE
2793
2794static machine_mode
c600df9a 2795aarch64_reg_save_mode (unsigned int regno)
a0d0b980 2796{
c600df9a
RS
2797 if (GP_REGNUM_P (regno))
2798 return DImode;
2799
2800 if (FP_REGNUM_P (regno))
2801 switch (crtl->abi->id ())
2802 {
2803 case ARM_PCS_AAPCS64:
2804 /* Only the low 64 bits are saved by the base PCS. */
2805 return DFmode;
2806
2807 case ARM_PCS_SIMD:
2808 /* The vector PCS saves the low 128 bits (which is the full
2809 register on non-SVE targets). */
2810 return TFmode;
2811
2812 case ARM_PCS_SVE:
2813 /* Use vectors of DImode for registers that need frame
 2814 information, so that the first 64 bits of the save slot
2815 are always the equivalent of what storing D<n> would give. */
2816 if (aarch64_emit_cfi_for_reg_p (regno))
2817 return VNx2DImode;
2818
2819 /* Use vectors of bytes otherwise, so that the layout is
2820 endian-agnostic, and so that we can use LDR and STR for
2821 big-endian targets. */
2822 return VNx16QImode;
2823
2824 case ARM_PCS_TLSDESC:
2825 case ARM_PCS_UNKNOWN:
2826 break;
2827 }
2828
2829 if (PR_REGNUM_P (regno))
2830 /* Save the full predicate register. */
2831 return VNx16BImode;
2832
2833 gcc_unreachable ();
a0d0b980
SE
2834}
2835
5a5a3bc5 2836/* Implement TARGET_INSN_CALLEE_ABI. */
b3650d40 2837
5a5a3bc5
RS
2838const predefined_function_abi &
2839aarch64_insn_callee_abi (const rtx_insn *insn)
b3650d40 2840{
08cc4d92
RS
2841 rtx pat = PATTERN (insn);
2842 gcc_assert (GET_CODE (pat) == PARALLEL);
2843 rtx unspec = XVECEXP (pat, 0, 1);
2844 gcc_assert (GET_CODE (unspec) == UNSPEC
2845 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2846 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
b3650d40
SE
2847}
2848
80ec73f4
RS
2849/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2850 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2851 clobbers the top 64 bits when restoring the bottom 64 bits. */
2852
2853static bool
6ee2cc70
RS
2854aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2855 unsigned int regno,
473574ee 2856 machine_mode mode)
80ec73f4 2857{
c600df9a 2858 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
51051f47 2859 {
51051f47
RS
2860 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2861 unsigned int nregs = hard_regno_nregs (regno, mode);
2862 if (nregs > 1)
2863 per_register_size = exact_div (per_register_size, nregs);
bb6ce448
RS
2864 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2865 return maybe_gt (per_register_size, 16);
2866 return maybe_gt (per_register_size, 8);
51051f47
RS
2867 }
2868 return false;
473574ee
SE
2869}
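/* Illustrative examples (not part of the original source): under the base
   PCS (ARM_PCS_AAPCS64), a V-register holding a TFmode value is considered
   partially clobbered (16 bytes > 8), whereas one holding a DFmode value is
   not; under ARM_PCS_SIMD the threshold is 16 bytes, so TFmode survives but
   larger per-register sizes do not.  */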
2870
43cacb12
RS
2871/* Implement REGMODE_NATURAL_SIZE. */
2872poly_uint64
2873aarch64_regmode_natural_size (machine_mode mode)
2874{
2875 /* The natural size for SVE data modes is one SVE data vector,
2876 and similarly for predicates. We can't independently modify
2877 anything smaller than that. */
2878 /* ??? For now, only do this for variable-width SVE registers.
2879 Doing it for constant-sized registers breaks lower-subreg.c. */
2880 /* ??? And once that's fixed, we should probably have similar
2881 code for Advanced SIMD. */
2882 if (!aarch64_sve_vg.is_constant ())
2883 {
2884 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2885 if (vec_flags & VEC_SVE_PRED)
2886 return BYTES_PER_SVE_PRED;
2887 if (vec_flags & VEC_SVE_DATA)
2888 return BYTES_PER_SVE_VECTOR;
2889 }
2890 return UNITS_PER_WORD;
2891}
2892
73d9ac6a 2893/* Implement HARD_REGNO_CALLER_SAVE_MODE. */
ef4bddc2 2894machine_mode
43cacb12
RS
2895aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2896 machine_mode mode)
2897{
2898 /* The predicate mode determines which bits are significant and
2899 which are "don't care". Decreasing the number of lanes would
2900 lose data while increasing the number of lanes would make bits
2901 unnecessarily significant. */
2902 if (PR_REGNUM_P (regno))
2903 return mode;
6a70badb
RS
2904 if (known_ge (GET_MODE_SIZE (mode), 4))
2905 return mode;
73d9ac6a 2906 else
6a70badb 2907 return SImode;
73d9ac6a
IB
2908}
2909
231c52ae
ST
2910/* Return true if I's bits are consecutive ones from the MSB. */
2911bool
2912aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2913{
2914 return exact_log2 (-i) != HOST_WIDE_INT_M1;
2915}
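/* Illustrative examples (not part of the original source): for
   i = 0xffff000000000000, -i is 0x0001000000000000, a power of two, so the
   function above returns true; for i = 0x0ff0000000000000 the ones do not
   start at the MSB, -i is not a power of two, and the result is false.  */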
2916
58e17cf8
RS
2917/* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2918 that strcpy from constants will be faster. */
2919
2920static HOST_WIDE_INT
2921aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2922{
2923 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2924 return MAX (align, BITS_PER_WORD);
2925 return align;
2926}
2927
43e9d192
IB
2928/* Return true if calls to DECL should be treated as
2929 long-calls (ie called via a register). */
2930static bool
2931aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2932{
2933 return false;
2934}
2935
2936/* Return true if calls to symbol-ref SYM should be treated as
2937 long-calls (ie called via a register). */
2938bool
2939aarch64_is_long_call_p (rtx sym)
2940{
2941 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2942}
2943
b60d63cb
JW
2944/* Return true if calls to symbol-ref SYM should not go through
2945 plt stubs. */
2946
2947bool
2948aarch64_is_noplt_call_p (rtx sym)
2949{
2950 const_tree decl = SYMBOL_REF_DECL (sym);
2951
2952 if (flag_pic
2953 && decl
2954 && (!flag_plt
2955 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2956 && !targetm.binds_local_p (decl))
2957 return true;
2958
2959 return false;
2960}
2961
43e9d192
IB
2962/* Emit an insn that's a simple single-set. Both the operands must be
2963 known to be valid. */
827ab47a 2964inline static rtx_insn *
43e9d192
IB
2965emit_set_insn (rtx x, rtx y)
2966{
f7df4a84 2967 return emit_insn (gen_rtx_SET (x, y));
43e9d192
IB
2968}
2969
2970/* X and Y are two things to compare using CODE. Emit the compare insn and
 2971 return the rtx for the CC register in the proper mode. */
2972rtx
2973aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2974{
4a2095eb
RH
2975 machine_mode cmp_mode = GET_MODE (x);
2976 machine_mode cc_mode;
2977 rtx cc_reg;
43e9d192 2978
4a2095eb
RH
2979 if (cmp_mode == TImode)
2980 {
2981 gcc_assert (code == NE);
2982
2983 cc_mode = CCmode;
2984 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2985
2986 rtx x_lo = operand_subword (x, 0, 0, TImode);
2987 rtx y_lo = operand_subword (y, 0, 0, TImode);
2988 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2989
2990 rtx x_hi = operand_subword (x, 1, 0, TImode);
2991 rtx y_hi = operand_subword (y, 1, 0, TImode);
865257c4
RS
2992 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2993 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2994 GEN_INT (AARCH64_EQ)));
4a2095eb
RH
2995 }
2996 else
2997 {
2998 cc_mode = SELECT_CC_MODE (code, x, y);
2999 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3000 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
3001 }
43e9d192
IB
3002 return cc_reg;
3003}
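/* Illustrative example (not part of the original source): for a TImode
   inequality test with X in {x0, x1} and Y in {x2, x3}, the RTL emitted
   above corresponds to roughly

     cmp x0, x2
     ccmp x1, x3, #0, eq

   i.e. compare the low halves, then conditionally compare the high halves,
   leaving a CC result that is "ne" iff the 128-bit values differ.  */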
3004
d400fda3
RH
3005/* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
3006
3007static rtx
3008aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
3009 machine_mode y_mode)
3010{
3011 if (y_mode == E_QImode || y_mode == E_HImode)
3012 {
3013 if (CONST_INT_P (y))
df562b12
JJ
3014 {
3015 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
3016 y_mode = SImode;
3017 }
d400fda3
RH
3018 else
3019 {
3020 rtx t, cc_reg;
3021 machine_mode cc_mode;
3022
3023 t = gen_rtx_ZERO_EXTEND (SImode, y);
3024 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
3025 cc_mode = CC_SWPmode;
3026 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3027 emit_set_insn (cc_reg, t);
3028 return cc_reg;
3029 }
3030 }
3031
846f78d4
PK
3032 if (!aarch64_plus_operand (y, y_mode))
3033 y = force_reg (y_mode, y);
3034
d400fda3
RH
3035 return aarch64_gen_compare_reg (code, x, y);
3036}
3037
43e9d192
IB
3038/* Build the SYMBOL_REF for __tls_get_addr. */
3039
3040static GTY(()) rtx tls_get_addr_libfunc;
3041
3042rtx
3043aarch64_tls_get_addr (void)
3044{
3045 if (!tls_get_addr_libfunc)
3046 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
3047 return tls_get_addr_libfunc;
3048}
3049
3050/* Return the TLS model to use for ADDR. */
3051
3052static enum tls_model
3053tls_symbolic_operand_type (rtx addr)
3054{
3055 enum tls_model tls_kind = TLS_MODEL_NONE;
74b27d8e
RS
3056 poly_int64 offset;
3057 addr = strip_offset_and_salt (addr, &offset);
3793ecc1 3058 if (SYMBOL_REF_P (addr))
43e9d192
IB
3059 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
3060
3061 return tls_kind;
3062}
3063
3064/* We'll allow lo_sum's in addresses in our legitimate addresses
3065 so that combine would take care of combining addresses where
3066 necessary, but for generation purposes, we'll generate the address
3067 as :
3068 RTL Absolute
3069 tmp = hi (symbol_ref); adrp x1, foo
3070 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
3071 nop
3072
3073 PIC TLS
3074 adrp x1, :got:foo adrp tmp, :tlsgd:foo
3075 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
3076 bl __tls_get_addr
3077 nop
3078
3079 Load TLS symbol, depending on TLS mechanism and TLS access model.
3080
3081 Global Dynamic - Traditional TLS:
3082 adrp tmp, :tlsgd:imm
3083 add dest, tmp, #:tlsgd_lo12:imm
3084 bl __tls_get_addr
3085
3086 Global Dynamic - TLS Descriptors:
3087 adrp dest, :tlsdesc:imm
3088 ldr tmp, [dest, #:tlsdesc_lo12:imm]
3089 add dest, dest, #:tlsdesc_lo12:imm
3090 blr tmp
3091 mrs tp, tpidr_el0
3092 add dest, dest, tp
3093
3094 Initial Exec:
3095 mrs tp, tpidr_el0
3096 adrp tmp, :gottprel:imm
3097 ldr dest, [tmp, #:gottprel_lo12:imm]
3098 add dest, dest, tp
3099
3100 Local Exec:
3101 mrs tp, tpidr_el0
0699caae
RL
3102 add t0, tp, #:tprel_hi12:imm, lsl #12
3103 add t0, t0, #:tprel_lo12_nc:imm
43e9d192
IB
3104*/
3105
3106static void
3107aarch64_load_symref_appropriately (rtx dest, rtx imm,
3108 enum aarch64_symbol_type type)
3109{
3110 switch (type)
3111 {
3112 case SYMBOL_SMALL_ABSOLUTE:
3113 {
28514dda 3114 /* In ILP32, the mode of dest can be either SImode or DImode. */
43e9d192 3115 rtx tmp_reg = dest;
ef4bddc2 3116 machine_mode mode = GET_MODE (dest);
28514dda
YZ
3117
3118 gcc_assert (mode == Pmode || mode == ptr_mode);
3119
43e9d192 3120 if (can_create_pseudo_p ())
28514dda 3121 tmp_reg = gen_reg_rtx (mode);
43e9d192 3122
28514dda 3123 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
43e9d192
IB
3124 emit_insn (gen_add_losym (dest, tmp_reg, imm));
3125 return;
3126 }
3127
a5350ddc 3128 case SYMBOL_TINY_ABSOLUTE:
f7df4a84 3129 emit_insn (gen_rtx_SET (dest, imm));
a5350ddc
CSS
3130 return;
3131
1b1e81f8
JW
3132 case SYMBOL_SMALL_GOT_28K:
3133 {
3134 machine_mode mode = GET_MODE (dest);
3135 rtx gp_rtx = pic_offset_table_rtx;
53021678
JW
3136 rtx insn;
3137 rtx mem;
1b1e81f8
JW
3138
3139 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
 3140 here before rtl expansion. Tree IVOPT will generate rtl patterns to
 3141 decide rtx costs, in which case pic_offset_table_rtx is not
 3142 initialized. In that case there is no need to generate the first adrp
026c3cfd 3143 instruction, as the final cost for global variable access is
1b1e81f8
JW
3144 one instruction. */
3145 if (gp_rtx != NULL)
3146 {
 3147 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we are
 3148 using the page base as the GOT base, the first page may be wasted;
 3149 in the worst case there is only 28K of space for the GOT).
3150
 3151 The generated instruction sequence for accessing a global variable
3152 is:
3153
a3957742 3154 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1b1e81f8
JW
3155
 3156 Only one instruction is needed. But we must initialize
 3157 pic_offset_table_rtx properly. We generate an initialization insn for
 3158 every global access, and allow CSE to remove all the redundant ones.
3159
 3160 The final instruction sequence will look like the following
 3161 when accessing multiple global variables.
3162
a3957742 3163 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1b1e81f8 3164
a3957742
JW
3165 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
3166 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
3167 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
3168 ... */
1b1e81f8
JW
3169
3170 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
3171 crtl->uses_pic_offset_table = 1;
3172 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
3173
3174 if (mode != GET_MODE (gp_rtx))
4ba8f0a3
AP
3175 gp_rtx = gen_lowpart (mode, gp_rtx);
3176
1b1e81f8
JW
3177 }
3178
3179 if (mode == ptr_mode)
3180 {
3181 if (mode == DImode)
53021678 3182 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1b1e81f8 3183 else
53021678
JW
3184 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
3185
3186 mem = XVECEXP (SET_SRC (insn), 0, 0);
1b1e81f8
JW
3187 }
3188 else
3189 {
3190 gcc_assert (mode == Pmode);
53021678
JW
3191
3192 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
3193 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1b1e81f8
JW
3194 }
3195
53021678
JW
3196 /* The operand is expected to be MEM. Whenever the related insn
 3197 pattern changes, the above code which calculates mem should be
3198 updated. */
3793ecc1 3199 gcc_assert (MEM_P (mem));
53021678
JW
3200 MEM_READONLY_P (mem) = 1;
3201 MEM_NOTRAP_P (mem) = 1;
3202 emit_insn (insn);
1b1e81f8
JW
3203 return;
3204 }
3205
6642bdb4 3206 case SYMBOL_SMALL_GOT_4G:
43e9d192 3207 {
28514dda
YZ
3208 /* In ILP32, the mode of dest can be either SImode or DImode,
3209 while the got entry is always of SImode size. The mode of
3210 dest depends on how dest is used: if dest is assigned to a
3211 pointer (e.g. in the memory), it has SImode; it may have
 3212 DImode if dest is dereferenced to access the memory.
3213 This is why we have to handle three different ldr_got_small
3214 patterns here (two patterns for ILP32). */
53021678
JW
3215
3216 rtx insn;
3217 rtx mem;
43e9d192 3218 rtx tmp_reg = dest;
ef4bddc2 3219 machine_mode mode = GET_MODE (dest);
28514dda 3220
43e9d192 3221 if (can_create_pseudo_p ())
28514dda
YZ
3222 tmp_reg = gen_reg_rtx (mode);
3223
3224 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
3225 if (mode == ptr_mode)
3226 {
3227 if (mode == DImode)
53021678 3228 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
28514dda 3229 else
53021678
JW
3230 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
3231
3232 mem = XVECEXP (SET_SRC (insn), 0, 0);
28514dda
YZ
3233 }
3234 else
3235 {
3236 gcc_assert (mode == Pmode);
53021678
JW
3237
3238 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
3239 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
28514dda
YZ
3240 }
3241
3793ecc1 3242 gcc_assert (MEM_P (mem));
53021678
JW
3243 MEM_READONLY_P (mem) = 1;
3244 MEM_NOTRAP_P (mem) = 1;
3245 emit_insn (insn);
43e9d192
IB
3246 return;
3247 }
3248
3249 case SYMBOL_SMALL_TLSGD:
3250 {
5d8a22a5 3251 rtx_insn *insns;
87ca615a
AP
3252 /* The return type of __tls_get_addr is the C pointer type
3253 so use ptr_mode. */
3254 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
3255 rtx tmp_reg = dest;
3256
3257 if (GET_MODE (dest) != ptr_mode)
3258 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
43e9d192
IB
3259
3260 start_sequence ();
87ca615a 3261 if (ptr_mode == SImode)
23b88fda
N
3262 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
3263 else
3264 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
43e9d192
IB
3265 insns = get_insns ();
3266 end_sequence ();
3267
3268 RTL_CONST_CALL_P (insns) = 1;
87ca615a
AP
3269 emit_libcall_block (insns, tmp_reg, result, imm);
3270 /* Convert back to the mode of the dest adding a zero_extend
3271 from SImode (ptr_mode) to DImode (Pmode). */
3272 if (dest != tmp_reg)
3273 convert_move (dest, tmp_reg, true);
43e9d192
IB
3274 return;
3275 }
3276
3277 case SYMBOL_SMALL_TLSDESC:
3278 {
ef4bddc2 3279 machine_mode mode = GET_MODE (dest);
621ad2de 3280 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
43e9d192
IB
3281 rtx tp;
3282
621ad2de
AP
3283 gcc_assert (mode == Pmode || mode == ptr_mode);
3284
2876a13f
JW
3285 /* In ILP32, the got entry is always of SImode size. Unlike
3286 small GOT, the dest is fixed at reg 0. */
3287 if (TARGET_ILP32)
3288 emit_insn (gen_tlsdesc_small_si (imm));
621ad2de 3289 else
2876a13f 3290 emit_insn (gen_tlsdesc_small_di (imm));
43e9d192 3291 tp = aarch64_load_tp (NULL);
621ad2de
AP
3292
3293 if (mode != Pmode)
3294 tp = gen_lowpart (mode, tp);
3295
2876a13f 3296 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
241dbd9d
QZ
3297 if (REG_P (dest))
3298 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
43e9d192
IB
3299 return;
3300 }
3301
79496620 3302 case SYMBOL_SMALL_TLSIE:
43e9d192 3303 {
621ad2de
AP
3304 /* In ILP32, the mode of dest can be either SImode or DImode,
3305 while the got entry is always of SImode size. The mode of
3306 dest depends on how dest is used: if dest is assigned to a
3307 pointer (e.g. in the memory), it has SImode; it may have
 3308 DImode if dest is dereferenced to access the memory.
3309 This is why we have to handle three different tlsie_small
3310 patterns here (two patterns for ILP32). */
ef4bddc2 3311 machine_mode mode = GET_MODE (dest);
621ad2de 3312 rtx tmp_reg = gen_reg_rtx (mode);
43e9d192 3313 rtx tp = aarch64_load_tp (NULL);
621ad2de
AP
3314
3315 if (mode == ptr_mode)
3316 {
3317 if (mode == DImode)
3318 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3319 else
3320 {
3321 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3322 tp = gen_lowpart (mode, tp);
3323 }
3324 }
3325 else
3326 {
3327 gcc_assert (mode == Pmode);
3328 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3329 }
3330
f7df4a84 3331 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
241dbd9d
QZ
3332 if (REG_P (dest))
3333 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
43e9d192
IB
3334 return;
3335 }
3336
cbf5629e 3337 case SYMBOL_TLSLE12:
d18ba284 3338 case SYMBOL_TLSLE24:
cbf5629e
JW
3339 case SYMBOL_TLSLE32:
3340 case SYMBOL_TLSLE48:
43e9d192 3341 {
cbf5629e 3342 machine_mode mode = GET_MODE (dest);
43e9d192 3343 rtx tp = aarch64_load_tp (NULL);
e6f7f0e9 3344
cbf5629e
JW
3345 if (mode != Pmode)
3346 tp = gen_lowpart (mode, tp);
3347
3348 switch (type)
3349 {
3350 case SYMBOL_TLSLE12:
3351 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3352 (dest, tp, imm));
3353 break;
3354 case SYMBOL_TLSLE24:
3355 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3356 (dest, tp, imm));
3357 break;
3358 case SYMBOL_TLSLE32:
3359 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3360 (dest, imm));
3361 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3362 (dest, dest, tp));
3363 break;
3364 case SYMBOL_TLSLE48:
3365 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3366 (dest, imm));
3367 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3368 (dest, dest, tp));
3369 break;
3370 default:
3371 gcc_unreachable ();
3372 }
e6f7f0e9 3373
241dbd9d
QZ
3374 if (REG_P (dest))
3375 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
43e9d192
IB
3376 return;
3377 }
3378
87dd8ab0 3379 case SYMBOL_TINY_GOT:
d91480de
D
3380 {
3381 rtx insn;
3382 machine_mode mode = GET_MODE (dest);
3383
3384 if (mode == ptr_mode)
3385 insn = gen_ldr_got_tiny (mode, dest, imm);
3386 else
3387 {
3388 gcc_assert (mode == Pmode);
3389 insn = gen_ldr_got_tiny_sidi (dest, imm);
3390 }
3391
3392 emit_insn (insn);
3393 return;
3394 }
87dd8ab0 3395
5ae7caad
JW
3396 case SYMBOL_TINY_TLSIE:
3397 {
3398 machine_mode mode = GET_MODE (dest);
3399 rtx tp = aarch64_load_tp (NULL);
3400
3401 if (mode == ptr_mode)
3402 {
3403 if (mode == DImode)
3404 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3405 else
3406 {
3407 tp = gen_lowpart (mode, tp);
3408 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3409 }
3410 }
3411 else
3412 {
3413 gcc_assert (mode == Pmode);
3414 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3415 }
3416
241dbd9d
QZ
3417 if (REG_P (dest))
3418 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
5ae7caad
JW
3419 return;
3420 }
3421
43e9d192
IB
3422 default:
3423 gcc_unreachable ();
3424 }
3425}
3426
3427/* Emit a move from SRC to DEST. Assume that the move expanders can
3428 handle all moves if !can_create_pseudo_p (). The distinction is
3429 important because, unlike emit_move_insn, the move expanders know
3430 how to force Pmode objects into the constant pool even when the
3431 constant pool address is not itself legitimate. */
3432static rtx
3433aarch64_emit_move (rtx dest, rtx src)
3434{
3435 return (can_create_pseudo_p ()
3436 ? emit_move_insn (dest, src)
3437 : emit_move_insn_1 (dest, src));
3438}
3439
f22d7973
RS
3440/* Apply UNOPTAB to OP and store the result in DEST. */
3441
3442static void
3443aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3444{
3445 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3446 if (dest != tmp)
3447 emit_move_insn (dest, tmp);
3448}
3449
3450/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3451
3452static void
3453aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3454{
3455 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3456 OPTAB_DIRECT);
3457 if (dest != tmp)
3458 emit_move_insn (dest, tmp);
3459}
3460
030d03b8
RE
3461/* Split a 128-bit move operation into two 64-bit move operations,
3462 taking care to handle partial overlap of register to register
3463 copies. Special cases are needed when moving between GP regs and
3464 FP regs. SRC can be a register, constant or memory; DST a register
3465 or memory. If either operand is memory it must not have any side
3466 effects. */
43e9d192
IB
3467void
3468aarch64_split_128bit_move (rtx dst, rtx src)
3469{
030d03b8
RE
3470 rtx dst_lo, dst_hi;
3471 rtx src_lo, src_hi;
43e9d192 3472
ef4bddc2 3473 machine_mode mode = GET_MODE (dst);
12dc6974 3474
030d03b8
RE
3475 gcc_assert (mode == TImode || mode == TFmode);
3476 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3477 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
43e9d192
IB
3478
3479 if (REG_P (dst) && REG_P (src))
3480 {
030d03b8
RE
3481 int src_regno = REGNO (src);
3482 int dst_regno = REGNO (dst);
43e9d192 3483
030d03b8 3484 /* Handle FP <-> GP regs. */
43e9d192
IB
3485 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3486 {
030d03b8
RE
3487 src_lo = gen_lowpart (word_mode, src);
3488 src_hi = gen_highpart (word_mode, src);
3489
0016d8d9
RS
3490 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3491 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
030d03b8 3492 return;
43e9d192
IB
3493 }
3494 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3495 {
030d03b8
RE
3496 dst_lo = gen_lowpart (word_mode, dst);
3497 dst_hi = gen_highpart (word_mode, dst);
3498
0016d8d9
RS
3499 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3500 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
030d03b8 3501 return;
43e9d192 3502 }
43e9d192
IB
3503 }
3504
030d03b8
RE
3505 dst_lo = gen_lowpart (word_mode, dst);
3506 dst_hi = gen_highpart (word_mode, dst);
3507 src_lo = gen_lowpart (word_mode, src);
3508 src_hi = gen_highpart_mode (word_mode, mode, src);
3509
3510 /* At most one pairing may overlap. */
3511 if (reg_overlap_mentioned_p (dst_lo, src_hi))
3512 {
3513 aarch64_emit_move (dst_hi, src_hi);
3514 aarch64_emit_move (dst_lo, src_lo);
3515 }
3516 else
3517 {
3518 aarch64_emit_move (dst_lo, src_lo);
3519 aarch64_emit_move (dst_hi, src_hi);
3520 }
43e9d192
IB
3521}
3522
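/* Editorial sketch (not part of GCC): the ordering rule above, shown on
   plain 64-bit halves held in a hypothetical register array.  If the
   register that will receive the low half of DST is the same register
   that currently holds the high half of SRC, copying low-then-high
   would clobber the source before it is read, so the copy is emitted
   high-first in that case.  */
static void
example_split_copy (unsigned long long regs[], int dst_lo, int dst_hi,
                    int src_lo, int src_hi)
{
  if (dst_lo == src_hi)         /* At most one pairing may overlap.  */
    {
      regs[dst_hi] = regs[src_hi];
      regs[dst_lo] = regs[src_lo];
    }
  else
    {
      regs[dst_lo] = regs[src_lo];
      regs[dst_hi] = regs[src_hi];
    }
}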
d4f9e819
RS
3523/* Return true if we should split a move from 128-bit value SRC
3524 to 128-bit register DEST. */
3525
43e9d192
IB
3526bool
3527aarch64_split_128bit_move_p (rtx dst, rtx src)
3528{
d4f9e819
RS
3529 if (FP_REGNUM_P (REGNO (dst)))
3530 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
3531 /* All moves to GPRs need to be split. */
3532 return true;
43e9d192
IB
3533}
3534
8b033a8a
SN
3535/* Split a complex SIMD combine. */
3536
3537void
3538aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
3539{
ef4bddc2
RS
3540 machine_mode src_mode = GET_MODE (src1);
3541 machine_mode dst_mode = GET_MODE (dst);
8b033a8a
SN
3542
3543 gcc_assert (VECTOR_MODE_P (dst_mode));
a977dc0c
MC
3544 gcc_assert (register_operand (dst, dst_mode)
3545 && register_operand (src1, src_mode)
3546 && register_operand (src2, src_mode));
8b033a8a 3547
0016d8d9 3548 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
a977dc0c 3549 return;
8b033a8a
SN
3550}
3551
fd4842cd
SN
3552/* Split a complex SIMD move. */
3553
3554void
3555aarch64_split_simd_move (rtx dst, rtx src)
3556{
ef4bddc2
RS
3557 machine_mode src_mode = GET_MODE (src);
3558 machine_mode dst_mode = GET_MODE (dst);
fd4842cd
SN
3559
3560 gcc_assert (VECTOR_MODE_P (dst_mode));
3561
3562 if (REG_P (dst) && REG_P (src))
3563 {
3564 gcc_assert (VECTOR_MODE_P (src_mode));
0016d8d9 3565 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
fd4842cd
SN
3566 }
3567}
3568
ef22810a
RH
3569bool
3570aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3571 machine_mode ymode, rtx y)
3572{
3573 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3574 gcc_assert (r != NULL);
3575 return rtx_equal_p (x, r);
3576}
ef22810a 3577
678faefc
RS
3578/* Return TARGET if it is nonnull and a register of mode MODE.
3579 Otherwise, return a fresh register of mode MODE if we can,
3580 or TARGET reinterpreted as MODE if we can't. */
3581
3582static rtx
3583aarch64_target_reg (rtx target, machine_mode mode)
3584{
3585 if (target && REG_P (target) && GET_MODE (target) == mode)
3586 return target;
3587 if (!can_create_pseudo_p ())
3588 {
3589 gcc_assert (target);
3590 return gen_lowpart (mode, target);
3591 }
3592 return gen_reg_rtx (mode);
3593}
3594
3595/* Return a register that contains the constant in BUILDER, given that
3596 the constant is a legitimate move operand. Use TARGET as the register
3597 if it is nonnull and convenient. */
3598
3599static rtx
3600aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3601{
3602 rtx src = builder.build ();
3603 target = aarch64_target_reg (target, GET_MODE (src));
3604 emit_insn (gen_rtx_SET (target, src));
3605 return target;
3606}
3607
43e9d192 3608static rtx
ef4bddc2 3609aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
43e9d192
IB
3610{
3611 if (can_create_pseudo_p ())
e18b4a81 3612 return force_reg (mode, value);
43e9d192
IB
3613 else
3614 {
f5470a77
RS
3615 gcc_assert (x);
3616 aarch64_emit_move (x, value);
43e9d192
IB
3617 return x;
3618 }
3619}
3620
0b1fe8cf
RS
3621/* Return true if predicate value X is a constant in which every element
3622 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
3623 value, i.e. as a predicate in which all bits are significant. */
3624
3625static bool
3626aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3627{
3628 if (GET_CODE (x) != CONST_VECTOR)
3629 return false;
3630
3631 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3632 GET_MODE_NUNITS (GET_MODE (x)));
3633 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3634 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3635 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3636
3637 unsigned int nelts = const_vector_encoded_nelts (x);
3638 for (unsigned int i = 0; i < nelts; ++i)
3639 {
3640 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3641 if (!CONST_INT_P (elt))
3642 return false;
3643
3644 builder.quick_push (elt);
3645 for (unsigned int j = 1; j < factor; ++j)
3646 builder.quick_push (const0_rtx);
3647 }
3648 builder.finalize ();
3649 return true;
3650}
3651
3652/* BUILDER contains a predicate constant of mode VNx16BI. Return the
3653 widest predicate element size it can have (that is, the largest size
3654 for which each element would still be 0 or 1). */
3655
3656unsigned int
3657aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3658{
3659 /* Start with the most optimistic assumption: that we only need
3660 one bit per pattern. This is what we will use if only the first
3661 bit in each pattern is ever set. */
3662 unsigned int mask = GET_MODE_SIZE (DImode);
3663 mask |= builder.npatterns ();
3664
3665 /* Look for set bits. */
3666 unsigned int nelts = builder.encoded_nelts ();
3667 for (unsigned int i = 1; i < nelts; ++i)
3668 if (INTVAL (builder.elt (i)) != 0)
3669 {
3670 if (i & 1)
3671 return 1;
3672 mask |= i;
3673 }
3674 return mask & -mask;
3675}
3676
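/* Editorial sketch (not part of GCC): the "mask & -mask" idiom above on
   plain integers.  OR-ing together 8 (the byte width of a .D element),
   the number of patterns and the position of every set bit, then
   isolating the lowest set bit of the result, gives the largest
   power-of-two element size for which every set bit still falls on an
   element boundary.  The names below are illustrative only.  */
static unsigned int
example_widest_elt_size (unsigned int npatterns,
                         const unsigned int *set_bit_positions,
                         unsigned int count)
{
  unsigned int mask = 8;        /* Never wider than a 64-bit element.  */
  mask |= npatterns;
  for (unsigned int i = 0; i < count; ++i)
    mask |= set_bit_positions[i];
  return mask & -mask;
}
/* E.g. with 16 patterns, set positions {0, 4, 8} give 4, so a .S
   predicate element would still see each element as 0 or 1.  */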
624d0f07
RS
3677/* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3678 return that predicate mode, otherwise return opt_machine_mode (). */
3679
3680opt_machine_mode
3681aarch64_ptrue_all_mode (rtx x)
3682{
3683 gcc_assert (GET_MODE (x) == VNx16BImode);
3684 if (GET_CODE (x) != CONST_VECTOR
3685 || !CONST_VECTOR_DUPLICATE_P (x)
3686 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3687 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3688 return opt_machine_mode ();
3689
3690 unsigned int nelts = const_vector_encoded_nelts (x);
3691 for (unsigned int i = 1; i < nelts; ++i)
3692 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3693 return opt_machine_mode ();
3694
3695 return aarch64_sve_pred_mode (nelts);
3696}
3697
0b1fe8cf
RS
3698/* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3699 that the constant would have with predicate element size ELT_SIZE
3700 (ignoring the upper bits in each element) and return:
3701
3702 * -1 if all bits are set
3703 * N if the predicate has N leading set bits followed by all clear bits
3704 * 0 if the predicate does not have any of these forms. */
3705
3706int
3707aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3708 unsigned int elt_size)
3709{
3710 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3711 followed by set bits. */
3712 if (builder.nelts_per_pattern () == 3)
3713 return 0;
3714
3715 /* Skip over leading set bits. */
3716 unsigned int nelts = builder.encoded_nelts ();
3717 unsigned int i = 0;
3718 for (; i < nelts; i += elt_size)
3719 if (INTVAL (builder.elt (i)) == 0)
3720 break;
3721 unsigned int vl = i / elt_size;
3722
3723 /* Check for the all-true case. */
3724 if (i == nelts)
3725 return -1;
3726
3727 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3728 repeating pattern of set bits followed by clear bits. */
3729 if (builder.nelts_per_pattern () != 2)
3730 return 0;
3731
3732 /* We have a "foreground" value and a duplicated "background" value.
3733 If the background might repeat and the last set bit belongs to it,
3734 we might have set bits followed by clear bits followed by set bits. */
3735 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3736 return 0;
3737
3738 /* Make sure that the rest are all clear. */
3739 for (; i < nelts; i += elt_size)
3740 if (INTVAL (builder.elt (i)) != 0)
3741 return 0;
3742
3743 return vl;
3744}
3745
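/* Editorial sketch (not part of GCC): the same classification applied
   to a plain array of predicate bits sampled at ELT_SIZE intervals.
   It returns -1 if every sampled bit is set, N if the first N sampled
   bits are set and the rest are clear, and 0 for any other shape.
   This ignores the encoding subtleties (npatterns/nelts_per_pattern)
   that the real function has to reason about.  */
static int
example_partial_ptrue_length (const unsigned char *bits, unsigned int nbits,
                              unsigned int elt_size)
{
  unsigned int i = 0;
  for (; i < nbits; i += elt_size)      /* Skip over leading set bits.  */
    if (bits[i] == 0)
      break;
  unsigned int vl = i / elt_size;
  if (i >= nbits)
    return -1;                          /* All-true.  */
  for (; i < nbits; i += elt_size)      /* The rest must be clear.  */
    if (bits[i] != 0)
      return 0;
  return vl;
}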
3746/* See if there is an svpattern that encodes an SVE predicate of mode
3747 PRED_MODE in which the first VL bits are set and the rest are clear.
3748 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3749 A VL of -1 indicates an all-true vector. */
3750
3751aarch64_svpattern
3752aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3753{
3754 if (vl < 0)
3755 return AARCH64_SV_ALL;
3756
3757 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3758 return AARCH64_NUM_SVPATTERNS;
3759
3760 if (vl >= 1 && vl <= 8)
3761 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3762
3763 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3764 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3765
3766 int max_vl;
3767 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3768 {
3769 if (vl == (max_vl / 3) * 3)
3770 return AARCH64_SV_MUL3;
3771 /* These would only trigger for non-power-of-2 lengths. */
3772 if (vl == (max_vl & -4))
3773 return AARCH64_SV_MUL4;
3774 if (vl == (1 << floor_log2 (max_vl)))
3775 return AARCH64_SV_POW2;
3776 if (vl == max_vl)
3777 return AARCH64_SV_ALL;
3778 }
3779 return AARCH64_NUM_SVPATTERNS;
3780}
3781
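/* Editorial sketch (not part of GCC): the same VL-to-pattern mapping for
   the case in which the number of elements MAX_VL is a compile-time
   constant greater than zero.  The function name and the string return
   values are hypothetical; NULL stands for AARCH64_NUM_SVPATTERNS
   ("no single pattern matches").  */
static const char *
example_svpattern_name (int vl, int max_vl)
{
  static const char *const small_vl[8]
    = { "vl1", "vl2", "vl3", "vl4", "vl5", "vl6", "vl7", "vl8" };
  static const char *const pow2_vl[5]
    = { "vl16", "vl32", "vl64", "vl128", "vl256" };
  if (vl < 0)
    return "all";
  if (vl > max_vl)
    return NULL;
  if (vl >= 1 && vl <= 8)
    return small_vl[vl - 1];
  if (vl >= 16 && vl <= 256 && (vl & (vl - 1)) == 0)
    return pow2_vl[__builtin_ctz (vl) - 4];
  if (vl == (max_vl / 3) * 3)
    return "mul3";
  if (vl == (max_vl & -4))
    return "mul4";
  if (vl == (1 << (31 - __builtin_clz (max_vl))))
    return "pow2";
  if (vl == max_vl)
    return "all";
  return NULL;
}
/* E.g. with max_vl == 32, vl 30 maps to "mul3", vl 32 to "vl32", and
   vl 28 matches no single pattern.  */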
34467289
RS
3782/* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3783 bits has the lowest bit set and the upper bits clear. This is the
3784 VNx16BImode equivalent of a PTRUE for controlling elements of
3785 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3786 all bits are significant, even the upper zeros. */
3787
3788rtx
3789aarch64_ptrue_all (unsigned int elt_size)
3790{
3791 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3792 builder.quick_push (const1_rtx);
3793 for (unsigned int i = 1; i < elt_size; ++i)
3794 builder.quick_push (const0_rtx);
3795 return builder.build ();
3796}
3797
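/* Editorial sketch (not part of GCC): the bit layout produced above,
   written out one byte per predicate bit.  For ELT_SIZE == 4 this is
   { 1, 0, 0, 0, 1, 0, 0, 0, ... }: a PTRUE controlling 4-byte elements,
   but viewed as a VNx16BI value in which the zero bits are significant
   too.  */
static void
example_ptrue_all_bits (unsigned char *bits, unsigned int nbits,
                        unsigned int elt_size)
{
  for (unsigned int i = 0; i < nbits; ++i)
    bits[i] = (i % elt_size == 0);
}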
16de3637
RS
3798/* Return an all-true predicate register of mode MODE. */
3799
3800rtx
3801aarch64_ptrue_reg (machine_mode mode)
3802{
3803 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
678faefc
RS
3804 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3805 return gen_lowpart (mode, reg);
16de3637
RS
3806}
3807
e7053b0c
RS
3808/* Return an all-false predicate register of mode MODE. */
3809
3810rtx
3811aarch64_pfalse_reg (machine_mode mode)
3812{
3813 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
678faefc
RS
3814 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3815 return gen_lowpart (mode, reg);
3816}
3817
00fa90d9
RS
3818/* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3819 for it. PRED2[0] is the predicate for the instruction whose result
3820 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3821 for it. Return true if we can prove that the two predicates are
3822 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3823 with PRED1[0] without changing behavior. */
3824
3825bool
3826aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3827{
3828 machine_mode mode = GET_MODE (pred1[0]);
3829 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3830 && mode == GET_MODE (pred2[0])
3831 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3832 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3833
3834 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3835 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3836 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3837 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3838 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3839}
3840
3841/* Emit a comparison CMP between OP1 and OP2, both of which have mode
3842 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3843 Use TARGET as the target register if nonnull and convenient. */
3844
3845static rtx
3846aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3847 machine_mode data_mode, rtx op1, rtx op2)
3848{
3849 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3850 expand_operand ops[5];
3851 create_output_operand (&ops[0], target, pred_mode);
3852 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3853 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3854 create_input_operand (&ops[3], op1, data_mode);
3855 create_input_operand (&ops[4], op2, data_mode);
3856 expand_insn (icode, 5, ops);
3857 return ops[0].value;
3858}
3859
678faefc
RS
3860/* Use a comparison to convert integer vector SRC into MODE, which is
3861 the corresponding SVE predicate mode. Use TARGET for the result
3862 if it's nonnull and convenient. */
3863
624d0f07 3864rtx
678faefc
RS
3865aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3866{
3867 machine_mode src_mode = GET_MODE (src);
00fa90d9
RS
3868 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3869 src, CONST0_RTX (src_mode));
e7053b0c
RS
3870}
3871
624d0f07
RS
3872/* Return the assembly token for svprfop value PRFOP. */
3873
3874static const char *
3875svprfop_token (enum aarch64_svprfop prfop)
3876{
3877 switch (prfop)
3878 {
3879#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3880 AARCH64_FOR_SVPRFOP (CASE)
3881#undef CASE
3882 case AARCH64_NUM_SVPRFOPS:
3883 break;
3884 }
3885 gcc_unreachable ();
3886}
3887
3888/* Return the assembly string for an SVE prefetch operation with
3889 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3890 and that SUFFIX is the format for the remaining operands. */
3891
3892char *
3893aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3894 const char *suffix)
3895{
3896 static char buffer[128];
3897 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3898 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3899 mnemonic, svprfop_token (prfop), suffix);
3900 gcc_assert (written < sizeof (buffer));
3901 return buffer;
3902}
3903
3904/* Check whether we can calculate the number of elements in PATTERN
3905 at compile time, given that there are NELTS_PER_VQ elements per
3906 128-bit block. Return the value if so, otherwise return -1. */
3907
3908HOST_WIDE_INT
3909aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3910{
3911 unsigned int vl, const_vg;
3912 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3913 vl = 1 + (pattern - AARCH64_SV_VL1);
3914 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3915 vl = 16 << (pattern - AARCH64_SV_VL16);
3916 else if (aarch64_sve_vg.is_constant (&const_vg))
3917 {
3918 /* There are two vector granules per quadword. */
3919 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3920 switch (pattern)
3921 {
3922 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3923 case AARCH64_SV_MUL4: return nelts & -4;
3924 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3925 case AARCH64_SV_ALL: return nelts;
3926 default: gcc_unreachable ();
3927 }
3928 }
3929 else
3930 return -1;
3931
3932 /* There are two vector granules per quadword. */
3933 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3934 if (known_le (vl, nelts_all))
3935 return vl;
3936
3937 /* Requesting more elements than are available results in a PFALSE. */
3938 if (known_gt (vl, nelts_all))
3939 return 0;
3940
3941 return -1;
3942}
3943
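/* Editorial worked example (not part of GCC): with 256-bit vectors
   (const_vg == 4) and 4-byte elements (nelts_per_vq == 4) there are
   (4 / 2) * 4 == 8 elements, so POW2, MUL4 and ALL all fold to 8 and
   MUL3 folds to 6, while VL1..VL8 fold to themselves and VL16 upwards
   fold to 0 because they request more elements than exist.  */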
43cacb12
RS
3944/* Return true if we can move VALUE into a register using a single
3945 CNT[BHWD] instruction. */
3946
3947static bool
3948aarch64_sve_cnt_immediate_p (poly_int64 value)
3949{
3950 HOST_WIDE_INT factor = value.coeffs[0];
3951 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3952 return (value.coeffs[1] == factor
3953 && IN_RANGE (factor, 2, 16 * 16)
3954 && (factor & 1) == 0
3955 && factor <= 16 * (factor & -factor));
3956}
3957
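/* Editorial sketch (not part of GCC): the factor test above on a plain
   integer.  FACTOR is the number of elements per 128-bit block times a
   multiplier in [1, 16], so it is acceptable exactly when it is even,
   at most 256, and no more than 16 times its lowest set bit (which
   guarantees that some element count in {2, 4, 8, 16} leaves a
   multiplier of at most 16).  */
static int
example_cntbhwd_factor_ok (long long factor)
{
  return (factor >= 2
          && factor <= 16 * 16
          && (factor & 1) == 0
          && factor <= 16 * (factor & -factor));
}
/* E.g. 2 (CNTD), 16 (CNTB) and 96 (CNTB, MUL #6) pass, while 34 fails
   because 34 > 16 * (34 & -34) == 32.  */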
3958/* Likewise for rtx X. */
3959
3960bool
3961aarch64_sve_cnt_immediate_p (rtx x)
3962{
3963 poly_int64 value;
3964 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3965}
3966
3967/* Return the asm string for an instruction with a CNT-like vector size
3968 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3969 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3970 first part of the operands template (the part that comes before the
139df05a
RS
3971 vector size itself). PATTERN is the pattern to use. FACTOR is the
3972 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3973 in each quadword. If it is zero, we can use any element size. */
43cacb12
RS
3974
3975static char *
3976aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
139df05a 3977 aarch64_svpattern pattern,
43cacb12
RS
3978 unsigned int factor,
3979 unsigned int nelts_per_vq)
3980{
139df05a 3981 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
43cacb12
RS
3982
3983 if (nelts_per_vq == 0)
3984 /* There is some overlap in the ranges of the four CNT instructions.
3985 Here we always use the smallest possible element size, so that the
3986 multiplier is 1 wherever possible. */
3987 nelts_per_vq = factor & -factor;
3988 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3989 gcc_assert (IN_RANGE (shift, 1, 4));
3990 char suffix = "dwhb"[shift - 1];
3991
3992 factor >>= shift;
3993 unsigned int written;
139df05a 3994 if (pattern == AARCH64_SV_ALL && factor == 1)
43cacb12
RS
3995 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3996 prefix, suffix, operands);
139df05a
RS
3997 else if (factor == 1)
3998 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3999 prefix, suffix, operands, svpattern_token (pattern));
43cacb12 4000 else
139df05a
RS
4001 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
4002 prefix, suffix, operands, svpattern_token (pattern),
4003 factor);
43cacb12
RS
4004 gcc_assert (written < sizeof (buffer));
4005 return buffer;
4006}
4007
4008/* Return the asm string for an instruction with a CNT-like vector size
4009 operand (a vector pattern followed by a multiplier in the range [1, 16]).
4010 PREFIX is the mnemonic without the size suffix and OPERANDS is the
4011 first part of the operands template (the part that comes before the
4012 vector size itself). X is the value of the vector size operand,
139df05a
RS
4013 as a polynomial integer rtx; we need to convert this into an "all"
4014 pattern with a multiplier. */
43cacb12
RS
4015
4016char *
4017aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
4018 rtx x)
4019{
4020 poly_int64 value = rtx_to_poly_int64 (x);
4021 gcc_assert (aarch64_sve_cnt_immediate_p (value));
139df05a 4022 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
43cacb12
RS
4023 value.coeffs[1], 0);
4024}
4025
624d0f07
RS
4026/* Return the asm string for an instruction with a CNT-like vector size
4027 operand (a vector pattern followed by a multiplier in the range [1, 16]).
4028 PREFIX is the mnemonic without the size suffix and OPERANDS is the
4029 first part of the operands template (the part that comes before the
4030 vector size itself). CNT_PAT[0..2] are the operands of the
4031 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
4032
4033char *
4034aarch64_output_sve_cnt_pat_immediate (const char *prefix,
4035 const char *operands, rtx *cnt_pat)
4036{
4037 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
4038 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
4039 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
4040 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
4041 factor, nelts_per_vq);
4042}
4043
0fdc30bc
RS
4044/* Return true if we can add X using a single SVE INC or DEC instruction. */
4045
4046bool
4047aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
4048{
4049 poly_int64 value;
4050 return (poly_int_rtx_p (x, &value)
4051 && (aarch64_sve_cnt_immediate_p (value)
4052 || aarch64_sve_cnt_immediate_p (-value)));
4053}
4054
4055/* Return the asm string for adding SVE INC/DEC immediate OFFSET to
4056 operand 0. */
4057
4058char *
4059aarch64_output_sve_scalar_inc_dec (rtx offset)
4060{
4061 poly_int64 offset_value = rtx_to_poly_int64 (offset);
4062 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
4063 if (offset_value.coeffs[1] > 0)
139df05a 4064 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
0fdc30bc
RS
4065 offset_value.coeffs[1], 0);
4066 else
139df05a 4067 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
0fdc30bc
RS
4068 -offset_value.coeffs[1], 0);
4069}
4070
43cacb12
RS
4071/* Return true if we can add VALUE to a register using a single ADDVL
4072 or ADDPL instruction. */
4073
4074static bool
4075aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
4076{
4077 HOST_WIDE_INT factor = value.coeffs[0];
4078 if (factor == 0 || value.coeffs[1] != factor)
4079 return false;
4080 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
4081 and a value of 16 is one vector width. */
4082 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
4083 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
4084}
4085
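/* Editorial sketch (not part of GCC): the range test above on a plain
   integer.  FACTOR counts multiples of VG / 2, so 16 units are one
   vector width (ADDVL) and 2 units are one predicate width (ADDPL);
   both instructions take an immediate in [-32, 31].  */
static int
example_addvl_addpl_ok (long long factor)
{
  if (factor == 0)
    return 0;
  return (((factor & 15) == 0 && factor >= -32 * 16 && factor <= 31 * 16)
          || ((factor & 1) == 0 && factor >= -32 * 2 && factor <= 31 * 2));
}
/* E.g. 48 is "addvl ..., #3", 6 is "addpl ..., #3", and 33 is rejected
   because it is neither a whole number of vectors nor of predicates.  */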
4086/* Likewise for rtx X. */
4087
4088bool
4089aarch64_sve_addvl_addpl_immediate_p (rtx x)
4090{
4091 poly_int64 value;
4092 return (poly_int_rtx_p (x, &value)
4093 && aarch64_sve_addvl_addpl_immediate_p (value));
4094}
4095
0fdc30bc
RS
4096/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
4097 to operand 1 and storing the result in operand 0. */
43cacb12
RS
4098
4099char *
0fdc30bc 4100aarch64_output_sve_addvl_addpl (rtx offset)
43cacb12
RS
4101{
4102 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
4103 poly_int64 offset_value = rtx_to_poly_int64 (offset);
4104 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
4105
43cacb12
RS
4106 int factor = offset_value.coeffs[1];
4107 if ((factor & 15) == 0)
4108 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
4109 else
4110 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
4111 return buffer;
4112}
4113
4114/* Return true if X is a valid immediate for an SVE vector INC or DEC
4115 instruction. If it is, store the number of elements in each vector
4116 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
4117 factor in *FACTOR_OUT (if nonnull). */
4118
4119bool
0fdc30bc
RS
4120aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
4121 unsigned int *nelts_per_vq_out)
43cacb12
RS
4122{
4123 rtx elt;
4124 poly_int64 value;
4125
4126 if (!const_vec_duplicate_p (x, &elt)
4127 || !poly_int_rtx_p (elt, &value))
4128 return false;
4129
4130 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
4131 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
4132 /* There's no vector INCB. */
4133 return false;
4134
4135 HOST_WIDE_INT factor = value.coeffs[0];
4136 if (value.coeffs[1] != factor)
4137 return false;
4138
4139 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
4140 if ((factor % nelts_per_vq) != 0
4141 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
4142 return false;
4143
4144 if (factor_out)
4145 *factor_out = factor;
4146 if (nelts_per_vq_out)
4147 *nelts_per_vq_out = nelts_per_vq;
4148 return true;
4149}
4150
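/* Editorial sketch (not part of GCC): the factor test above for a fixed
   element width.  NELTS_PER_VQ is the number of elements per 128-bit
   block (2, 4 or 8; there is no vector INCB), and the multiplier that
   ends up in the INC/DEC instruction must be in [1, 16].  */
static int
example_vec_inc_dec_ok (long long factor, unsigned int nelts_per_vq)
{
  if (nelts_per_vq != 2 && nelts_per_vq != 4 && nelts_per_vq != 8)
    return 0;
  long long m = factor < 0 ? -factor : factor;
  return (factor % nelts_per_vq == 0
          && m >= nelts_per_vq
          && m <= 16 * (long long) nelts_per_vq);
}
/* E.g. factor 12 with 4 elements per quadword is "incw ..., all, mul #3",
   while factor -8 with 8 elements per quadword is a plain "dech".  */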
4151/* Return true if X is a valid immediate for an SVE vector INC or DEC
4152 instruction. */
4153
4154bool
0fdc30bc 4155aarch64_sve_vector_inc_dec_immediate_p (rtx x)
43cacb12 4156{
0fdc30bc 4157 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
43cacb12
RS
4158}
4159
4160/* Return the asm template for an SVE vector INC or DEC instruction.
4161 OPERANDS gives the operands before the vector count and X is the
4162 value of the vector count operand itself. */
4163
4164char *
0fdc30bc 4165aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
43cacb12
RS
4166{
4167 int factor;
4168 unsigned int nelts_per_vq;
0fdc30bc 4169 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
43cacb12
RS
4170 gcc_unreachable ();
4171 if (factor < 0)
139df05a
RS
4172 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
4173 -factor, nelts_per_vq);
43cacb12 4174 else
139df05a
RS
4175 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
4176 factor, nelts_per_vq);
43cacb12 4177}
43e9d192 4178
82614948
RR
4179static int
4180aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
77e994c9 4181 scalar_int_mode mode)
43e9d192 4182{
43e9d192 4183 int i;
9a4865db
WD
4184 unsigned HOST_WIDE_INT val, val2, mask;
4185 int one_match, zero_match;
4186 int num_insns;
43e9d192 4187
9a4865db
WD
4188 val = INTVAL (imm);
4189
4190 if (aarch64_move_imm (val, mode))
43e9d192 4191 {
82614948 4192 if (generate)
f7df4a84 4193 emit_insn (gen_rtx_SET (dest, imm));
9a4865db 4194 return 1;
43e9d192
IB
4195 }
4196
9de00935
TC
4197 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
4198 (with XXXX non-zero). In that case check to see if the move can be done in
4199 a smaller mode. */
4200 val2 = val & 0xffffffff;
4201 if (mode == DImode
4202 && aarch64_move_imm (val2, SImode)
4203 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
4204 {
4205 if (generate)
4206 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4207
4208 /* Check if we have to emit a second instruction by checking to see
4209 if any of the upper 32 bits of the original DI mode value is set. */
4210 if (val == val2)
4211 return 1;
4212
4213 i = (val >> 48) ? 48 : 32;
4214
4215 if (generate)
4216 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4217 GEN_INT ((val >> i) & 0xffff)));
4218
4219 return 2;
4220 }
4221
9a4865db 4222 if ((val >> 32) == 0 || mode == SImode)
43e9d192 4223 {
82614948
RR
4224 if (generate)
4225 {
9a4865db
WD
4226 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4227 if (mode == SImode)
4228 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4229 GEN_INT ((val >> 16) & 0xffff)));
4230 else
4231 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4232 GEN_INT ((val >> 16) & 0xffff)));
82614948 4233 }
9a4865db 4234 return 2;
43e9d192
IB
4235 }
4236
4237 /* Remaining cases are all for DImode. */
4238
43e9d192 4239 mask = 0xffff;
9a4865db
WD
4240 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4241 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4242 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4243 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
43e9d192 4244
62c8d76c 4245 if (zero_match != 2 && one_match != 2)
43e9d192 4246 {
62c8d76c
WD
4247 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
4248 For a 64-bit bitmask try whether changing 16 bits to all ones or
4249 zeroes creates a valid bitmask. To check any repeated bitmask,
4250 try using 16 bits from the other 32-bit half of val. */
43e9d192 4251
62c8d76c 4252 for (i = 0; i < 64; i += 16, mask <<= 16)
43e9d192 4253 {
62c8d76c
WD
4254 val2 = val & ~mask;
4255 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4256 break;
4257 val2 = val | mask;
4258 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4259 break;
4260 val2 = val2 & ~mask;
4261 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
4262 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4263 break;
43e9d192 4264 }
62c8d76c 4265 if (i != 64)
43e9d192 4266 {
62c8d76c 4267 if (generate)
43e9d192 4268 {
62c8d76c
WD
4269 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4270 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
9a4865db 4271 GEN_INT ((val >> i) & 0xffff)));
43e9d192 4272 }
1312b1ba 4273 return 2;
43e9d192
IB
4274 }
4275 }
4276
9a4865db
WD
4277 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4278 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4279 otherwise skip zero bits. */
2c274197 4280
9a4865db 4281 num_insns = 1;
43e9d192 4282 mask = 0xffff;
9a4865db
WD
4283 val2 = one_match > zero_match ? ~val : val;
4284 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4285
4286 if (generate)
4287 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4288 ? (val | ~(mask << i))
4289 : (val & (mask << i)))));
4290 for (i += 16; i < 64; i += 16)
43e9d192 4291 {
9a4865db
WD
4292 if ((val2 & (mask << i)) == 0)
4293 continue;
4294 if (generate)
4295 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4296 GEN_INT ((val >> i) & 0xffff)));
4297 num_insns ++;
82614948
RR
4298 }
4299
4300 return num_insns;
4301}
4302
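/* Editorial worked example (not part of GCC): for 0x0000123400005678
   the low 32 bits are a single MOVZ and only bits [32, 47] of the high
   half are set, so the second block above emits a "mov" of 0x5678 plus
   one "movk" of 0x1234 at position 32 and returns 2.  A value such as
   0xffffffffffff1234, with three all-ones 16-bit chunks, never reaches
   that code: it is already accepted by the initial aarch64_move_imm
   test as a single MOVN.  */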
c0bb5bc5
WD
4303/* Return whether imm is a 128-bit immediate which is simple enough to
4304 expand inline. */
4305bool
4306aarch64_mov128_immediate (rtx imm)
4307{
3793ecc1 4308 if (CONST_INT_P (imm))
c0bb5bc5
WD
4309 return true;
4310
4311 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4312
4313 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4314 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4315
4316 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4317 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4318}
4319
4320
43cacb12
RS
4321/* Return the number of temporary registers that aarch64_add_offset_1
4322 would need to add OFFSET to a register. */
4323
4324static unsigned int
4325aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4326{
4327 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
4328}
4329
f5470a77
RS
4330/* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4331 a non-polynomial OFFSET. MODE is the mode of the addition.
4332 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4333 be set and CFA adjustments added to the generated instructions.
4334
4335 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4336 temporary if register allocation is already complete. This temporary
4337 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4338 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4339 the immediate again.
0100c5f9
RS
4340
4341 Since this function may be used to adjust the stack pointer, we must
4342 ensure that it cannot cause transient stack deallocation (for example
4343 by first incrementing SP and then decrementing when adjusting by a
4344 large immediate). */
4345
4346static void
f5470a77
RS
4347aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4348 rtx src, HOST_WIDE_INT offset, rtx temp1,
4349 bool frame_related_p, bool emit_move_imm)
0100c5f9 4350{
f5470a77
RS
4351 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4352 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4353
42bc589e 4354 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
0100c5f9
RS
4355 rtx_insn *insn;
4356
f5470a77
RS
4357 if (!moffset)
4358 {
4359 if (!rtx_equal_p (dest, src))
4360 {
4361 insn = emit_insn (gen_rtx_SET (dest, src));
4362 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4363 }
4364 return;
4365 }
0100c5f9
RS
4366
4367 /* Single instruction adjustment. */
f5470a77 4368 if (aarch64_uimm12_shift (moffset))
0100c5f9 4369 {
f5470a77 4370 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
0100c5f9
RS
4371 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4372 return;
4373 }
4374
f5470a77
RS
4375 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4376 and either:
4377
4378 a) the offset cannot be loaded by a 16-bit move or
4379 b) there is no spare register into which we can move it. */
4380 if (moffset < 0x1000000
4381 && ((!temp1 && !can_create_pseudo_p ())
4382 || !aarch64_move_imm (moffset, mode)))
0100c5f9 4383 {
f5470a77 4384 HOST_WIDE_INT low_off = moffset & 0xfff;
0100c5f9 4385
f5470a77
RS
4386 low_off = offset < 0 ? -low_off : low_off;
4387 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
0100c5f9 4388 RTX_FRAME_RELATED_P (insn) = frame_related_p;
f5470a77 4389 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
0100c5f9
RS
4390 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4391 return;
4392 }
4393
4394 /* Emit a move immediate if required and an addition/subtraction. */
0100c5f9 4395 if (emit_move_imm)
f5470a77
RS
4396 {
4397 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
7aa605c9
JJ
4398 temp1 = aarch64_force_temporary (mode, temp1,
4399 gen_int_mode (moffset, mode));
f5470a77
RS
4400 }
4401 insn = emit_insn (offset < 0
4402 ? gen_sub3_insn (dest, src, temp1)
4403 : gen_add3_insn (dest, src, temp1));
0100c5f9
RS
4404 if (frame_related_p)
4405 {
4406 RTX_FRAME_RELATED_P (insn) = frame_related_p;
f5470a77
RS
4407 rtx adj = plus_constant (mode, src, offset);
4408 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
0100c5f9
RS
4409 }
4410}
4411
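/* Editorial worked example (not part of GCC): adding 0x123456 to a
   register.  The offset is not a shifted 12-bit immediate and cannot be
   built by a single MOV-immediate, but it is below 2^24, so the second
   case above emits "add dest, src, #0x456" followed by an add of
   #0x123000, rather than a move-immediate plus an add.  */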
43cacb12
RS
4412/* Return the number of temporary registers that aarch64_add_offset
4413 would need to move OFFSET into a register or add OFFSET to a register;
4414 ADD_P is true if we want the latter rather than the former. */
4415
4416static unsigned int
4417aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4418{
4419 /* This follows the same structure as aarch64_add_offset. */
4420 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4421 return 0;
4422
4423 unsigned int count = 0;
4424 HOST_WIDE_INT factor = offset.coeffs[1];
4425 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4426 poly_int64 poly_offset (factor, factor);
4427 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4428 /* Need one register for the ADDVL/ADDPL result. */
4429 count += 1;
4430 else if (factor != 0)
4431 {
4432 factor = abs (factor);
4433 if (factor > 16 * (factor & -factor))
4434 /* Need one register for the CNT result and one for the multiplication
4435 factor. If necessary, the second temporary can be reused for the
4436 constant part of the offset. */
4437 return 2;
4438 /* Need one register for the CNT result (which might then
4439 be shifted). */
4440 count += 1;
4441 }
4442 return count + aarch64_add_offset_1_temporaries (constant);
4443}
4444
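/* Editorial worked example (not part of GCC): for an offset of one
   vector width plus 17 bytes (coefficients {33, 16}), the whole value
   is not an ADDVL/ADDPL immediate, but the VG-based part {16, 16} is,
   so one temporary is needed for the ADDVL result and the remaining
   constant 17 is a single immediate add: the function returns 1.  For
   a VG coefficient of 34, 34 > 16 * (34 & -34), so both a CNT result
   and a multiplication factor are needed and the function returns 2.  */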
4445/* If X can be represented as a poly_int64, return the number
4446 of temporaries that are required to add it to a register.
4447 Return -1 otherwise. */
4448
4449int
4450aarch64_add_offset_temporaries (rtx x)
4451{
4452 poly_int64 offset;
4453 if (!poly_int_rtx_p (x, &offset))
4454 return -1;
4455 return aarch64_offset_temporaries (true, offset);
4456}
4457
f5470a77
RS
4458/* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4459 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4460 be set and CFA adjustments added to the generated instructions.
4461
4462 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4463 temporary if register allocation is already complete. This temporary
43cacb12
RS
4464 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4465 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4466 false to avoid emitting the immediate again.
4467
4468 TEMP2, if nonnull, is a second temporary register that doesn't
4469 overlap either DEST or REG.
f5470a77
RS
4470
4471 Since this function may be used to adjust the stack pointer, we must
4472 ensure that it cannot cause transient stack deallocation (for example
4473 by first incrementing SP and then decrementing when adjusting by a
4474 large immediate). */
4475
4476static void
4477aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
43cacb12
RS
4478 poly_int64 offset, rtx temp1, rtx temp2,
4479 bool frame_related_p, bool emit_move_imm = true)
0100c5f9 4480{
f5470a77
RS
4481 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4482 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
43cacb12
RS
4483 gcc_assert (temp1 == NULL_RTX
4484 || !frame_related_p
4485 || !reg_overlap_mentioned_p (temp1, dest));
4486 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4487
4488 /* Try using ADDVL or ADDPL to add the whole value. */
4489 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4490 {
4491 rtx offset_rtx = gen_int_mode (offset, mode);
4492 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4493 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4494 return;
4495 }
4496
4497 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4498 SVE vector register, over and above the minimum size of 128 bits.
4499 This is equivalent to half the value returned by CNTD with a
4500 vector shape of ALL. */
4501 HOST_WIDE_INT factor = offset.coeffs[1];
4502 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4503
4504 /* Try using ADDVL or ADDPL to add the VG-based part. */
4505 poly_int64 poly_offset (factor, factor);
4506 if (src != const0_rtx
4507 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4508 {
4509 rtx offset_rtx = gen_int_mode (poly_offset, mode);
4510 if (frame_related_p)
4511 {
4512 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4513 RTX_FRAME_RELATED_P (insn) = true;
4514 src = dest;
4515 }
4516 else
4517 {
4518 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4519 src = aarch64_force_temporary (mode, temp1, addr);
4520 temp1 = temp2;
4521 temp2 = NULL_RTX;
4522 }
4523 }
4524 /* Otherwise use a CNT-based sequence. */
4525 else if (factor != 0)
4526 {
4527 /* Use a subtraction if we have a negative factor. */
4528 rtx_code code = PLUS;
4529 if (factor < 0)
4530 {
4531 factor = -factor;
4532 code = MINUS;
4533 }
4534
4535 /* Calculate CNTD * FACTOR / 2. First try to fold the division
4536 into the multiplication. */
4537 rtx val;
4538 int shift = 0;
4539 if (factor & 1)
4540 /* Use a right shift by 1. */
4541 shift = -1;
4542 else
4543 factor /= 2;
4544 HOST_WIDE_INT low_bit = factor & -factor;
4545 if (factor <= 16 * low_bit)
4546 {
4547 if (factor > 16 * 8)
4548 {
4549 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
4550 the value with the minimum multiplier and shift it into
4551 position. */
4552 int extra_shift = exact_log2 (low_bit);
4553 shift += extra_shift;
4554 factor >>= extra_shift;
4555 }
4556 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
4557 }
4558 else
4559 {
7d8bdfa7
RS
4560 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
4561 directly, since that should increase the chances of being
4562 able to use a shift and add sequence. If LOW_BIT itself
4563 is out of range, just use CNTD. */
4564 if (low_bit <= 16 * 8)
4565 factor /= low_bit;
4566 else
4567 low_bit = 1;
4568
4569 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
43cacb12
RS
4570 val = aarch64_force_temporary (mode, temp1, val);
4571
7d8bdfa7
RS
4572 if (can_create_pseudo_p ())
4573 {
4574 rtx coeff1 = gen_int_mode (factor, mode);
4575 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
4576 }
4577 else
43cacb12 4578 {
7d8bdfa7
RS
4579 /* Go back to using a negative multiplication factor if we have
4580 no register from which to subtract. */
4581 if (code == MINUS && src == const0_rtx)
4582 {
4583 factor = -factor;
4584 code = PLUS;
4585 }
4586 rtx coeff1 = gen_int_mode (factor, mode);
4587 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
4588 val = gen_rtx_MULT (mode, val, coeff1);
43cacb12 4589 }
43cacb12
RS
4590 }
4591
4592 if (shift > 0)
4593 {
4594 /* Multiply by 1 << SHIFT. */
4595 val = aarch64_force_temporary (mode, temp1, val);
4596 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
4597 }
4598 else if (shift == -1)
4599 {
4600 /* Divide by 2. */
4601 val = aarch64_force_temporary (mode, temp1, val);
4602 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
4603 }
4604
4605 /* Calculate SRC +/- CNTD * FACTOR / 2. */
4606 if (src != const0_rtx)
4607 {
4608 val = aarch64_force_temporary (mode, temp1, val);
4609 val = gen_rtx_fmt_ee (code, mode, src, val);
4610 }
4611 else if (code == MINUS)
4612 {
4613 val = aarch64_force_temporary (mode, temp1, val);
4614 val = gen_rtx_NEG (mode, val);
4615 }
4616
4617 if (constant == 0 || frame_related_p)
4618 {
4619 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
4620 if (frame_related_p)
4621 {
4622 RTX_FRAME_RELATED_P (insn) = true;
4623 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4624 gen_rtx_SET (dest, plus_constant (Pmode, src,
4625 poly_offset)));
4626 }
4627 src = dest;
4628 if (constant == 0)
4629 return;
4630 }
4631 else
4632 {
4633 src = aarch64_force_temporary (mode, temp1, val);
4634 temp1 = temp2;
4635 temp2 = NULL_RTX;
4636 }
4637
4638 emit_move_imm = true;
4639 }
f5470a77 4640
f5470a77
RS
4641 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4642 frame_related_p, emit_move_imm);
0100c5f9
RS
4643}
4644
43cacb12
RS
4645/* Like aarch64_add_offset, but the offset is given as an rtx rather
4646 than a poly_int64. */
4647
4648void
4649aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4650 rtx offset_rtx, rtx temp1, rtx temp2)
4651{
4652 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4653 temp1, temp2, false);
4654}
4655
f5470a77
RS
4656/* Add DELTA to the stack pointer, marking the instructions frame-related.
4657 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
4658 if TEMP1 already contains abs (DELTA). */
4659
0100c5f9 4660static inline void
43cacb12 4661aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
0100c5f9 4662{
f5470a77 4663 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
43cacb12 4664 temp1, temp2, true, emit_move_imm);
0100c5f9
RS
4665}
4666
f5470a77
RS
4667/* Subtract DELTA from the stack pointer, marking the instructions
4668 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
4669 if nonnull. */
4670
0100c5f9 4671static inline void
cd1bef27
JL
4672aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
4673 bool emit_move_imm = true)
0100c5f9 4674{
f5470a77 4675 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
cd1bef27 4676 temp1, temp2, frame_related_p, emit_move_imm);
0100c5f9 4677}
82614948 4678
43cacb12
RS
4679/* Set DEST to (vec_series BASE STEP). */
4680
4681static void
4682aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
82614948
RR
4683{
4684 machine_mode mode = GET_MODE (dest);
43cacb12
RS
4685 scalar_mode inner = GET_MODE_INNER (mode);
4686
4687 /* Each operand can be a register or an immediate in the range [-16, 15]. */
4688 if (!aarch64_sve_index_immediate_p (base))
4689 base = force_reg (inner, base);
4690 if (!aarch64_sve_index_immediate_p (step))
4691 step = force_reg (inner, step);
4692
4693 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
4694}
82614948 4695
4aeb1ba7
RS
4696/* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4697 register of mode MODE. Use TARGET for the result if it's nonnull
4698 and convenient.
4699
4700 The two vector modes must have the same element mode. The behavior
4701 is to duplicate architectural lane N of SRC into architectural lanes
4702 N + I * STEP of the result. On big-endian targets, architectural
4703 lane 0 of an Advanced SIMD vector is the last element of the vector
4704 in memory layout, so for big-endian targets this operation has the
4705 effect of reversing SRC before duplicating it. Callers need to
4706 account for this. */
43cacb12 4707
4aeb1ba7
RS
4708rtx
4709aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
4710{
4711 machine_mode src_mode = GET_MODE (src);
4712 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
4713 insn_code icode = (BYTES_BIG_ENDIAN
4714 ? code_for_aarch64_vec_duplicate_vq_be (mode)
4715 : code_for_aarch64_vec_duplicate_vq_le (mode));
4716
4717 unsigned int i = 0;
4718 expand_operand ops[3];
4719 create_output_operand (&ops[i++], target, mode);
4720 create_output_operand (&ops[i++], src, src_mode);
4721 if (BYTES_BIG_ENDIAN)
4722 {
4723 /* Create a PARALLEL describing the reversal of SRC. */
4724 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
4725 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
4726 nelts_per_vq - 1, -1);
4727 create_fixed_operand (&ops[i++], sel);
43cacb12 4728 }
4aeb1ba7
RS
4729 expand_insn (icode, i, ops);
4730 return ops[0].value;
4731}
4732
4733/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4734 the memory image into DEST. Return true on success. */
43cacb12 4735
4aeb1ba7
RS
4736static bool
4737aarch64_expand_sve_ld1rq (rtx dest, rtx src)
4738{
4739 src = force_const_mem (GET_MODE (src), src);
43cacb12
RS
4740 if (!src)
4741 return false;
4742
4743 /* Make sure that the address is legitimate. */
4aeb1ba7 4744 if (!aarch64_sve_ld1rq_operand_p (src))
43cacb12
RS
4745 {
4746 rtx addr = force_reg (Pmode, XEXP (src, 0));
4747 src = replace_equiv_address (src, addr);
4748 }
4749
947b1372 4750 machine_mode mode = GET_MODE (dest);
cc68f7c2 4751 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
16de3637 4752 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4aeb1ba7 4753 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
43cacb12
RS
4754 return true;
4755}
4756
4aeb1ba7
RS
4757/* Return a register containing CONST_VECTOR SRC, given that SRC has an
4758 SVE data mode and isn't a legitimate constant. Use TARGET for the
4759 result if convenient.
43cacb12 4760
4aeb1ba7
RS
4761 The returned register can have whatever mode seems most natural
4762 given the contents of SRC. */
4763
4764static rtx
4765aarch64_expand_sve_const_vector (rtx target, rtx src)
43cacb12
RS
4766{
4767 machine_mode mode = GET_MODE (src);
4768 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4769 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4aeb1ba7
RS
4770 scalar_mode elt_mode = GET_MODE_INNER (mode);
4771 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
cc68f7c2
RS
4772 unsigned int container_bits = aarch64_sve_container_bits (mode);
4773 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
4774
4775 if (nelts_per_pattern == 1
4776 && encoded_bits <= 128
4777 && container_bits != elt_bits)
4778 {
4779 /* We have a partial vector mode and a constant whose full-vector
4780 equivalent would occupy a repeating 128-bit sequence. Build that
4781 full-vector equivalent instead, so that we have the option of
4782 using LD1RQ and Advanced SIMD operations. */
4783 unsigned int repeat = container_bits / elt_bits;
4784 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
4785 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
4786 for (unsigned int i = 0; i < npatterns; ++i)
4787 for (unsigned int j = 0; j < repeat; ++j)
4788 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
4789 target = aarch64_target_reg (target, full_mode);
4790 return aarch64_expand_sve_const_vector (target, builder.build ());
4791 }
4aeb1ba7
RS
4792
4793 if (nelts_per_pattern == 1 && encoded_bits == 128)
4794 {
4795 /* The constant is a duplicated quadword but can't be narrowed
4796 beyond a quadword. Get the memory image of the first quadword
4797 as a 128-bit vector and try using LD1RQ to load it from memory.
4798
4799 The effect for both endiannesses is to load memory lane N into
4800 architectural lanes N + I * STEP of the result. On big-endian
4801 targets, the layout of the 128-bit vector in an Advanced SIMD
4802 register would be different from its layout in an SVE register,
4803 but this 128-bit vector is a memory value only. */
4804 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4805 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4806 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4807 return target;
4808 }
4809
4810 if (nelts_per_pattern == 1 && encoded_bits < 128)
4811 {
4812 /* The vector is a repeating sequence of 64 bits or fewer.
4813 See if we can load them using an Advanced SIMD move and then
4814 duplicate it to fill a vector. This is better than using a GPR
4815 move because it keeps everything in the same register file. */
4816 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4817 rtx_vector_builder builder (vq_mode, npatterns, 1);
4818 for (unsigned int i = 0; i < npatterns; ++i)
4819 {
4820 /* We want memory lane N to go into architectural lane N,
4821 so reverse for big-endian targets. The DUP .Q pattern
4822 has a compensating reverse built-in. */
4823 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4824 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4825 }
4826 rtx vq_src = builder.build ();
4827 if (aarch64_simd_valid_immediate (vq_src, NULL))
4828 {
4829 vq_src = force_reg (vq_mode, vq_src);
4830 return aarch64_expand_sve_dupq (target, mode, vq_src);
4831 }
4832
4833 /* Get an integer representation of the repeating part of Advanced
4834 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4835 which for big-endian targets is lane-swapped wrt a normal
4836 Advanced SIMD vector. This means that for both endiannesses,
4837 memory lane N of SVE vector SRC corresponds to architectural
4838 lane N of a register holding VQ_SRC. This in turn means that
4839 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4840 as a single 128-bit value) and thus that memory lane 0 of SRC is
4841 in the lsb of the integer. Duplicating the integer therefore
4842 ensures that memory lane N of SRC goes into architectural lane
4843 N + I * INDEX of the SVE register. */
4844 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4845 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4846 if (elt_value)
4847 {
4848 /* Pretend that we had a vector of INT_MODE to start with. */
4849 elt_mode = int_mode;
4850 mode = aarch64_full_sve_mode (int_mode).require ();
4851
4852 /* If the integer can be moved into a general register by a
4853 single instruction, do that and duplicate the result. */
4854 if (CONST_INT_P (elt_value)
4855 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4856 {
4857 elt_value = force_reg (elt_mode, elt_value);
4858 return expand_vector_broadcast (mode, elt_value);
4859 }
4860 }
4861 else if (npatterns == 1)
4862 /* We're duplicating a single value, but can't do better than
4863 force it to memory and load from there. This handles things
4864 like symbolic constants. */
4865 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
43cacb12 4866
4aeb1ba7 4867 if (elt_value)
8179efe0 4868 {
4aeb1ba7
RS
4869 /* Load the element from memory if we can, otherwise move it into
4870 a register and use a DUP. */
4871 rtx op = force_const_mem (elt_mode, elt_value);
4872 if (!op)
4873 op = force_reg (elt_mode, elt_value);
4874 return expand_vector_broadcast (mode, op);
8179efe0 4875 }
43cacb12
RS
4876 }
4877
4aeb1ba7
RS
4878 /* Try using INDEX. */
4879 rtx base, step;
4880 if (const_vec_series_p (src, &base, &step))
4881 {
4882 aarch64_expand_vec_series (target, base, step);
4883 return target;
4884 }
4885
4886 /* From here on, it's better to force the whole constant to memory
4887 if we can. */
4888 if (GET_MODE_NUNITS (mode).is_constant ())
4889 return NULL_RTX;
4890
43cacb12 4891 /* Expand each pattern individually. */
4aeb1ba7 4892 gcc_assert (npatterns > 1);
43cacb12
RS
4893 rtx_vector_builder builder;
4894 auto_vec<rtx, 16> vectors (npatterns);
4895 for (unsigned int i = 0; i < npatterns; ++i)
4896 {
4897 builder.new_vector (mode, 1, nelts_per_pattern);
4898 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
4899 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
4900 vectors.quick_push (force_reg (mode, builder.build ()));
4901 }
4902
4903 /* Use permutes to interleave the separate vectors. */
4904 while (npatterns > 1)
4905 {
4906 npatterns /= 2;
4907 for (unsigned int i = 0; i < npatterns; ++i)
4908 {
4aeb1ba7 4909 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
43cacb12
RS
4910 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
4911 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
4912 vectors[i] = tmp;
4913 }
4914 }
4aeb1ba7
RS
4915 gcc_assert (vectors[0] == target);
4916 return target;
43cacb12
RS
4917}
4918
678faefc
RS
4919/* Use WHILE to set a predicate register of mode MODE in which the first
4920 VL bits are set and the rest are clear. Use TARGET for the register
4921 if it's nonnull and convenient. */
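/* For example (a sketch only; the register numbers are arbitrary):
   with MODE == VNx16BI and VL == 3 this emits something like

	mov	x0, 3
	whilelo	p0.b, xzr, x0

   which sets the first three .B lanes of p0 and clears the rest.  */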
0b1fe8cf 4922
678faefc
RS
4923static rtx
4924aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
4925 unsigned int vl)
0b1fe8cf
RS
4926{
4927 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
678faefc 4928 target = aarch64_target_reg (target, mode);
6ad9571b 4929 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
624d0f07 4930 target, const0_rtx, limit));
678faefc
RS
4931 return target;
4932}
4933
2803bc3b
RS
4934static rtx
4935aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
4936
4937/* BUILDER is a constant predicate in which the index of every set bit
4938 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4939 by inverting every element at a multiple of ELT_SIZE and EORing the
4940 result with an ELT_SIZE PTRUE.
4941
4942 Return a register that contains the constant on success, otherwise
4943 return null. Use TARGET as the register if it is nonnull and
4944 convenient. */
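/* For instance (purely illustrative): a .S predicate in which every
   element except the first is true is not itself a PTRUE, but its
   inverse has only the first element set and so can be loaded as a
   "ptrue pN.s, vl1"; EORing that with an all-true .S PTRUE then
   recreates the original constant.  */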
4945
4946static rtx
4947aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
4948 unsigned int elt_size)
4949{
4950 /* Invert every element at a multiple of ELT_SIZE, keeping the
4951 other bits zero. */
4952 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
4953 builder.nelts_per_pattern ());
4954 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4955 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
4956 inv_builder.quick_push (const1_rtx);
4957 else
4958 inv_builder.quick_push (const0_rtx);
4959 inv_builder.finalize ();
4960
4961 /* See if we can load the constant cheaply. */
4962 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
4963 if (!inv)
4964 return NULL_RTX;
4965
4966 /* EOR the result with an ELT_SIZE PTRUE. */
4967 rtx mask = aarch64_ptrue_all (elt_size);
4968 mask = force_reg (VNx16BImode, mask);
26bebf57 4969 inv = gen_lowpart (VNx16BImode, inv);
2803bc3b
RS
4970 target = aarch64_target_reg (target, VNx16BImode);
4971 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
4972 return target;
4973}
4974
4975/* BUILDER is a constant predicate in which the index of every set bit
4976 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4977 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4978 register on success, otherwise return null. Use TARGET as the register
4979 if nonnull and convenient. */
4980
4981static rtx
4982aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
4983 unsigned int elt_size,
4984 unsigned int permute_size)
4985{
4986 /* We're going to split the constant into two new constants A and B,
4987 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4988 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4989
4990 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4991 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4992
4993 where _ indicates elements that will be discarded by the permute.
4994
4995 First calculate the ELT_SIZEs for A and B. */
4996 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
4997 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
4998 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
4999 if (INTVAL (builder.elt (i)) != 0)
5000 {
5001 if (i & permute_size)
5002 b_elt_size |= i - permute_size;
5003 else
5004 a_elt_size |= i;
5005 }
5006 a_elt_size &= -a_elt_size;
5007 b_elt_size &= -b_elt_size;
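  /* The "x &= -x" statements above isolate the lowest set bit of each
     value.  For example (values purely illustrative): if the significant
     A elements were at indices 4 and 12, the loop would have left
     A_ELT_SIZE == (8 | 4 | 12) == 12, and 12 & -12 == 4.  */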
5008
5009 /* Now construct the vectors themselves. */
5010 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
5011 builder.nelts_per_pattern ());
5012 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
5013 builder.nelts_per_pattern ());
5014 unsigned int nelts = builder.encoded_nelts ();
5015 for (unsigned int i = 0; i < nelts; ++i)
5016 if (i & (elt_size - 1))
5017 {
5018 a_builder.quick_push (const0_rtx);
5019 b_builder.quick_push (const0_rtx);
5020 }
5021 else if ((i & permute_size) == 0)
5022 {
5023 /* The A and B elements are significant. */
5024 a_builder.quick_push (builder.elt (i));
5025 b_builder.quick_push (builder.elt (i + permute_size));
5026 }
5027 else
5028 {
5029 /* The A and B elements are going to be discarded, so pick whatever
5030 is likely to give a nice constant. We are targeting element
5031 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
5032 with the aim of each being a sequence of ones followed by
5033 a sequence of zeros. So:
5034
5035 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
5036 duplicate the last X_ELT_SIZE element, to extend the
5037 current sequence of ones or zeros.
5038
5039 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
5040 zero, so that the constant really does have X_ELT_SIZE and
5041 not a smaller size. */
5042 if (a_elt_size > permute_size)
5043 a_builder.quick_push (const0_rtx);
5044 else
5045 a_builder.quick_push (a_builder.elt (i - a_elt_size));
5046 if (b_elt_size > permute_size)
5047 b_builder.quick_push (const0_rtx);
5048 else
5049 b_builder.quick_push (b_builder.elt (i - b_elt_size));
5050 }
5051 a_builder.finalize ();
5052 b_builder.finalize ();
5053
5054 /* Try loading A into a register. */
5055 rtx_insn *last = get_last_insn ();
5056 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
5057 if (!a)
5058 return NULL_RTX;
5059
5060 /* Try loading B into a register. */
5061 rtx b = a;
5062 if (a_builder != b_builder)
5063 {
5064 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
5065 if (!b)
5066 {
5067 delete_insns_since (last);
5068 return NULL_RTX;
5069 }
5070 }
5071
5072 /* Emit the TRN1 itself. */
5073 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
5074 target = aarch64_target_reg (target, mode);
5075 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
5076 gen_lowpart (mode, a),
5077 gen_lowpart (mode, b)));
5078 return target;
5079}
5080
678faefc
RS
5081/* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
5082 constant in BUILDER into an SVE predicate register. Return the register
5083 on success, otherwise return null. Use TARGET for the register if
2803bc3b
RS
5084 nonnull and convenient.
5085
5086 ALLOW_RECURSE_P is true if we can use methods that would call this
5087 function recursively. */
678faefc
RS
5088
5089static rtx
2803bc3b
RS
5090aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
5091 bool allow_recurse_p)
678faefc
RS
5092{
5093 if (builder.encoded_nelts () == 1)
5094 /* A PFALSE or a PTRUE .B ALL. */
5095 return aarch64_emit_set_immediate (target, builder);
5096
5097 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
5098 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
5099 {
5100 /* If we can load the constant using PTRUE, use it as-is. */
5101 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
5102 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
5103 return aarch64_emit_set_immediate (target, builder);
5104
5105 /* Otherwise use WHILE to set the first VL bits. */
5106 return aarch64_sve_move_pred_via_while (target, mode, vl);
5107 }
5108
2803bc3b
RS
5109 if (!allow_recurse_p)
5110 return NULL_RTX;
5111
5112 /* Try inverting the vector in element size ELT_SIZE and then EORing
5113 the result with an ELT_SIZE PTRUE. */
5114 if (INTVAL (builder.elt (0)) == 0)
5115 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
5116 elt_size))
5117 return res;
5118
5119 /* Try using TRN1 to permute two simpler constants. */
5120 for (unsigned int i = elt_size; i <= 8; i *= 2)
5121 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
5122 elt_size, i))
5123 return res;
5124
678faefc
RS
5125 return NULL_RTX;
5126}
5127
5128/* Return an SVE predicate register that contains the VNx16BImode
5129 constant in BUILDER, without going through the move expanders.
5130
5131 The returned register can have whatever mode seems most natural
5132 given the contents of BUILDER. Use TARGET for the result if
5133 convenient. */
5134
5135static rtx
5136aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
5137{
5138 /* Try loading the constant using pure predicate operations. */
2803bc3b 5139 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
678faefc
RS
5140 return res;
5141
5142 /* Try forcing the constant to memory. */
5143 if (builder.full_nelts ().is_constant ())
5144 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
5145 {
5146 target = aarch64_target_reg (target, VNx16BImode);
5147 emit_move_insn (target, mem);
5148 return target;
5149 }
5150
5151 /* The last resort is to load the constant as an integer and then
5152 compare it against zero. Use -1 for set bits in order to increase
 5153 	 the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
5154 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
5155 builder.nelts_per_pattern ());
5156 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5157 int_builder.quick_push (INTVAL (builder.elt (i))
5158 ? constm1_rtx : const0_rtx);
5159 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
5160 int_builder.build ());
0b1fe8cf
RS
5161}
5162
4aeb1ba7 5163/* Set DEST to immediate IMM. */
43cacb12
RS
5164
5165void
4aeb1ba7 5166aarch64_expand_mov_immediate (rtx dest, rtx imm)
43cacb12
RS
5167{
5168 machine_mode mode = GET_MODE (dest);
82614948
RR
5169
5170 /* Check on what type of symbol it is. */
77e994c9 5171 scalar_int_mode int_mode;
3793ecc1
AC
5172 if ((SYMBOL_REF_P (imm)
5173 || LABEL_REF_P (imm)
43cacb12
RS
5174 || GET_CODE (imm) == CONST
5175 || GET_CODE (imm) == CONST_POLY_INT)
77e994c9 5176 && is_a <scalar_int_mode> (mode, &int_mode))
82614948 5177 {
43cacb12
RS
5178 rtx mem;
5179 poly_int64 offset;
5180 HOST_WIDE_INT const_offset;
82614948
RR
5181 enum aarch64_symbol_type sty;
5182
5183 /* If we have (const (plus symbol offset)), separate out the offset
5184 before we start classifying the symbol. */
43cacb12 5185 rtx base = strip_offset (imm, &offset);
82614948 5186
43cacb12
RS
5187 /* We must always add an offset involving VL separately, rather than
5188 folding it into the relocation. */
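      /* For example (purely illustrative): a move of the poly_int
	 constant 2 + 2x -- the number of .D elements in an SVE
	 vector -- satisfies aarch64_sve_cnt_immediate_p below and is
	 emitted as a single CNTD.  */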
5189 if (!offset.is_constant (&const_offset))
5190 {
c0e0174b
RS
5191 if (!TARGET_SVE)
5192 {
5193 aarch64_report_sve_required ();
5194 return;
5195 }
43cacb12
RS
5196 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
5197 emit_insn (gen_rtx_SET (dest, imm));
5198 else
5199 {
5200 /* Do arithmetic on 32-bit values if the result is smaller
5201 than that. */
5202 if (partial_subreg_p (int_mode, SImode))
5203 {
5204 /* It is invalid to do symbol calculations in modes
5205 narrower than SImode. */
5206 gcc_assert (base == const0_rtx);
5207 dest = gen_lowpart (SImode, dest);
5208 int_mode = SImode;
5209 }
5210 if (base != const0_rtx)
5211 {
5212 base = aarch64_force_temporary (int_mode, dest, base);
5213 aarch64_add_offset (int_mode, dest, base, offset,
5214 NULL_RTX, NULL_RTX, false);
5215 }
5216 else
5217 aarch64_add_offset (int_mode, dest, base, offset,
5218 dest, NULL_RTX, false);
5219 }
5220 return;
5221 }
5222
5223 sty = aarch64_classify_symbol (base, const_offset);
82614948
RR
5224 switch (sty)
5225 {
5226 case SYMBOL_FORCE_TO_MEM:
43cacb12 5227 if (const_offset != 0
77e994c9 5228 && targetm.cannot_force_const_mem (int_mode, imm))
82614948
RR
5229 {
5230 gcc_assert (can_create_pseudo_p ());
77e994c9 5231 base = aarch64_force_temporary (int_mode, dest, base);
43cacb12
RS
5232 aarch64_add_offset (int_mode, dest, base, const_offset,
5233 NULL_RTX, NULL_RTX, false);
82614948
RR
5234 return;
5235 }
b4f50fd4 5236
82614948
RR
5237 mem = force_const_mem (ptr_mode, imm);
5238 gcc_assert (mem);
b4f50fd4
RR
5239
5240 /* If we aren't generating PC relative literals, then
5241 we need to expand the literal pool access carefully.
5242 This is something that needs to be done in a number
5243 of places, so could well live as a separate function. */
9ee6540a 5244 if (!aarch64_pcrelative_literal_loads)
b4f50fd4
RR
5245 {
5246 gcc_assert (can_create_pseudo_p ());
5247 base = gen_reg_rtx (ptr_mode);
5248 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
00eee3fa
WD
5249 if (ptr_mode != Pmode)
5250 base = convert_memory_address (Pmode, base);
b4f50fd4
RR
5251 mem = gen_rtx_MEM (ptr_mode, base);
5252 }
5253
77e994c9
RS
5254 if (int_mode != ptr_mode)
5255 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
b4f50fd4 5256
f7df4a84 5257 emit_insn (gen_rtx_SET (dest, mem));
b4f50fd4 5258
82614948
RR
5259 return;
5260
5261 case SYMBOL_SMALL_TLSGD:
5262 case SYMBOL_SMALL_TLSDESC:
79496620 5263 case SYMBOL_SMALL_TLSIE:
1b1e81f8 5264 case SYMBOL_SMALL_GOT_28K:
6642bdb4 5265 case SYMBOL_SMALL_GOT_4G:
82614948 5266 case SYMBOL_TINY_GOT:
5ae7caad 5267 case SYMBOL_TINY_TLSIE:
43cacb12 5268 if (const_offset != 0)
82614948
RR
5269 {
 5270 	    gcc_assert (can_create_pseudo_p ());
77e994c9 5271 base = aarch64_force_temporary (int_mode, dest, base);
43cacb12
RS
5272 aarch64_add_offset (int_mode, dest, base, const_offset,
5273 NULL_RTX, NULL_RTX, false);
82614948
RR
5274 return;
5275 }
5276 /* FALLTHRU */
5277
82614948
RR
5278 case SYMBOL_SMALL_ABSOLUTE:
5279 case SYMBOL_TINY_ABSOLUTE:
cbf5629e 5280 case SYMBOL_TLSLE12:
d18ba284 5281 case SYMBOL_TLSLE24:
cbf5629e
JW
5282 case SYMBOL_TLSLE32:
5283 case SYMBOL_TLSLE48:
82614948
RR
5284 aarch64_load_symref_appropriately (dest, imm, sty);
5285 return;
5286
5287 default:
5288 gcc_unreachable ();
5289 }
5290 }
5291
5292 if (!CONST_INT_P (imm))
5293 {
678faefc
RS
5294 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
5295 {
5296 /* Only the low bit of each .H, .S and .D element is defined,
5297 so we can set the upper bits to whatever we like. If the
5298 predicate is all-true in MODE, prefer to set all the undefined
5299 bits as well, so that we can share a single .B predicate for
5300 all modes. */
5301 if (imm == CONSTM1_RTX (mode))
5302 imm = CONSTM1_RTX (VNx16BImode);
5303
5304 /* All methods for constructing predicate modes wider than VNx16BI
5305 will set the upper bits of each element to zero. Expose this
5306 by moving such constants as a VNx16BI, so that all bits are
5307 significant and so that constants for different modes can be
5308 shared. The wider constant will still be available as a
5309 REG_EQUAL note. */
5310 rtx_vector_builder builder;
5311 if (aarch64_get_sve_pred_bits (builder, imm))
5312 {
5313 rtx res = aarch64_expand_sve_const_pred (dest, builder);
5314 if (dest != res)
5315 emit_move_insn (dest, gen_lowpart (mode, res));
5316 return;
5317 }
5318 }
5319
43cacb12
RS
5320 if (GET_CODE (imm) == HIGH
5321 || aarch64_simd_valid_immediate (imm, NULL))
43cacb12 5322 {
4aeb1ba7
RS
5323 emit_insn (gen_rtx_SET (dest, imm));
5324 return;
43e9d192 5325 }
82614948 5326
4aeb1ba7
RS
5327 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
5328 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
5329 {
5330 if (dest != res)
5331 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
5332 return;
5333 }
5334
5335 rtx mem = force_const_mem (mode, imm);
5336 gcc_assert (mem);
5337 emit_move_insn (dest, mem);
82614948 5338 return;
43e9d192 5339 }
82614948 5340
77e994c9
RS
5341 aarch64_internal_mov_immediate (dest, imm, true,
5342 as_a <scalar_int_mode> (mode));
43e9d192
IB
5343}
5344
74b27d8e
RS
5345/* Return the MEM rtx that provides the canary value that should be used
5346 for stack-smashing protection. MODE is the mode of the memory.
5347 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
5348 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
5349 indicates whether the caller is performing a SET or a TEST operation. */
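/* For example (options shown purely for illustration): with
   -mstack-protector-guard=sysreg -mstack-protector-guard-reg=sp_el0
   -mstack-protector-guard-offset=16, the non-SSP_GLOBAL path below
   reads sp_el0 via the reg_stack_protect_address_<mode> pattern and
   returns a MEM at offset 16 from that value.  */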
5350
5351rtx
5352aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
5353 aarch64_salt_type salt_type)
5354{
5355 rtx addr;
5356 if (aarch64_stack_protector_guard == SSP_GLOBAL)
5357 {
5358 gcc_assert (MEM_P (decl_rtl));
5359 addr = XEXP (decl_rtl, 0);
5360 poly_int64 offset;
5361 rtx base = strip_offset_and_salt (addr, &offset);
5362 if (!SYMBOL_REF_P (base))
5363 return decl_rtl;
5364
5365 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
5366 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
5367 addr = gen_rtx_CONST (Pmode, addr);
5368 addr = plus_constant (Pmode, addr, offset);
5369 }
5370 else
5371 {
5372 /* Calculate the address from the system register. */
5373 rtx salt = GEN_INT (salt_type);
5374 addr = gen_reg_rtx (mode);
5375 if (mode == DImode)
5376 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
5377 else
5378 {
5379 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
5380 addr = convert_memory_address (Pmode, addr);
5381 }
5382 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
5383 }
5384 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
5385}
5386
43cacb12
RS
5387/* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
5388 that is known to contain PTRUE. */
5389
5390void
5391aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
5392{
0c63a8ee
TC
5393 expand_operand ops[3];
5394 machine_mode mode = GET_MODE (dest);
5395 create_output_operand (&ops[0], dest, mode);
5396 create_input_operand (&ops[1], pred, GET_MODE(pred));
5397 create_input_operand (&ops[2], src, mode);
f2b29269 5398 temporary_volatile_ok v (true);
0c63a8ee 5399 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
43cacb12
RS
5400}
5401
5402/* Expand a pre-RA SVE data move from SRC to DEST in which at least one
5403 operand is in memory. In this case we need to use the predicated LD1
5404 and ST1 instead of LDR and STR, both for correctness on big-endian
5405 targets and because LD1 and ST1 support a wider range of addressing modes.
5406 PRED_MODE is the mode of the predicate.
5407
5408 See the comment at the head of aarch64-sve.md for details about the
5409 big-endian handling. */
5410
5411void
5412aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
5413{
5414 machine_mode mode = GET_MODE (dest);
16de3637 5415 rtx ptrue = aarch64_ptrue_reg (pred_mode);
43cacb12
RS
5416 if (!register_operand (src, mode)
5417 && !register_operand (dest, mode))
5418 {
5419 rtx tmp = gen_reg_rtx (mode);
5420 if (MEM_P (src))
5421 aarch64_emit_sve_pred_move (tmp, ptrue, src);
5422 else
5423 emit_move_insn (tmp, src);
5424 src = tmp;
5425 }
5426 aarch64_emit_sve_pred_move (dest, ptrue, src);
5427}
5428
002092be
RS
5429/* Called only on big-endian targets. See whether an SVE vector move
5430 from SRC to DEST is effectively a REV[BHW] instruction, because at
5431 least one operand is a subreg of an SVE vector that has wider or
5432 narrower elements. Return true and emit the instruction if so.
5433
5434 For example:
5435
5436 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
5437
5438 represents a VIEW_CONVERT between the following vectors, viewed
5439 in memory order:
5440
5441 R2: { [0].high, [0].low, [1].high, [1].low, ... }
5442 R1: { [0], [1], [2], [3], ... }
5443
5444 The high part of lane X in R2 should therefore correspond to lane X*2
5445 of R1, but the register representations are:
5446
5447 msb lsb
5448 R2: ...... [1].high [1].low [0].high [0].low
5449 R1: ...... [3] [2] [1] [0]
5450
5451 where the low part of lane X in R2 corresponds to lane X*2 in R1.
5452 We therefore need a reverse operation to swap the high and low values
5453 around.
5454
5455 This is purely an optimization. Without it we would spill the
5456 subreg operand to the stack in one mode and reload it in the
5457 other mode, which has the same effect as the REV. */
5458
5459bool
5460aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
5461{
5462 gcc_assert (BYTES_BIG_ENDIAN);
a4d9837e
RS
5463
5464 /* Do not try to optimize subregs that LRA has created for matched
5465 reloads. These subregs only exist as a temporary measure to make
5466 the RTL well-formed, but they are exempt from the usual
5467 TARGET_CAN_CHANGE_MODE_CLASS rules.
5468
5469 For example, if we have:
5470
5471 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
5472
5473 and the constraints require R1 and R2 to be in the same register,
5474 LRA may need to create RTL such as:
5475
5476 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
5477 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
5478 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
5479
5480 which forces both the input and output of the original instruction
5481 to use the same hard register. But for this to work, the normal
5482 rules have to be suppressed on the subreg input, otherwise LRA
5483 would need to reload that input too, meaning that the process
5484 would never terminate. To compensate for this, the normal rules
5485 are also suppressed for the subreg output of the first move.
5486 Ignoring the special case and handling the first move normally
5487 would therefore generate wrong code: we would reverse the elements
5488 for the first subreg but not reverse them back for the second subreg. */
5489 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
002092be 5490 dest = SUBREG_REG (dest);
a4d9837e 5491 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
002092be
RS
5492 src = SUBREG_REG (src);
5493
5494 /* The optimization handles two single SVE REGs with different element
5495 sizes. */
5496 if (!REG_P (dest)
5497 || !REG_P (src)
5498 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
5499 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
5500 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
5501 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
5502 return false;
5503
5504 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
16de3637 5505 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
002092be
RS
5506 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
5507 UNSPEC_REV_SUBREG);
5508 emit_insn (gen_rtx_SET (dest, unspec));
5509 return true;
5510}
5511
5512/* Return a copy of X with mode MODE, without changing its other
5513 attributes. Unlike gen_lowpart, this doesn't care whether the
5514 mode change is valid. */
5515
624d0f07 5516rtx
002092be
RS
5517aarch64_replace_reg_mode (rtx x, machine_mode mode)
5518{
5519 if (GET_MODE (x) == mode)
5520 return x;
5521
5522 x = shallow_copy_rtx (x);
5523 set_mode_and_regno (x, mode, REGNO (x));
5524 return x;
5525}
5526
d7a09c44
RS
 5527/* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
5528 stored in wider integer containers. */
5529
5530static unsigned int
5531aarch64_sve_rev_unspec (machine_mode mode)
5532{
5533 switch (GET_MODE_UNIT_SIZE (mode))
5534 {
5535 case 1: return UNSPEC_REVB;
5536 case 2: return UNSPEC_REVH;
5537 case 4: return UNSPEC_REVW;
5538 }
5539 gcc_unreachable ();
5540}
5541
002092be
RS
5542/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
5543 operands. */
5544
5545void
5546aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
5547{
d7a09c44
RS
5548 /* Decide which REV operation we need. The mode with wider elements
5549 determines the mode of the operands and the mode with the narrower
002092be 5550 elements determines the reverse width. */
5c06093c
RS
5551 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
5552 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
002092be
RS
5553 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
5554 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
5555 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
5556
d7a09c44 5557 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
cc68f7c2 5558 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
002092be 5559
d7a09c44 5560 /* Get the operands in the appropriate modes and emit the instruction. */
002092be 5561 ptrue = gen_lowpart (pred_mode, ptrue);
d7a09c44
RS
5562 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
5563 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
5564 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
5565 dest, ptrue, src));
002092be
RS
5566}
5567
43e9d192 5568static bool
c600df9a 5569aarch64_function_ok_for_sibcall (tree, tree exp)
43e9d192 5570{
c600df9a 5571 if (crtl->abi->id () != expr_callee_abi (exp).id ())
a0d0b980
SE
5572 return false;
5573
43e9d192
IB
5574 return true;
5575}
5576
38e62001
RS
5577/* Subroutine of aarch64_pass_by_reference for arguments that are not
5578 passed in SVE registers. */
43e9d192
IB
5579
5580static bool
56fe3ca3
RS
5581aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
5582 const function_arg_info &arg)
43e9d192
IB
5583{
5584 HOST_WIDE_INT size;
ef4bddc2 5585 machine_mode dummymode;
43e9d192
IB
5586 int nregs;
5587
5588 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
52090e4d
RS
5589 if (arg.mode == BLKmode && arg.type)
5590 size = int_size_in_bytes (arg.type);
6a70badb
RS
5591 else
5592 /* No frontends can create types with variable-sized modes, so we
5593 shouldn't be asked to pass or return them. */
52090e4d 5594 size = GET_MODE_SIZE (arg.mode).to_constant ();
43e9d192 5595
aadc1c43 5596 /* Aggregates are passed by reference based on their size. */
52090e4d
RS
5597 if (arg.aggregate_type_p ())
5598 size = int_size_in_bytes (arg.type);
43e9d192
IB
5599
 5600   /* Variable sized arguments are always passed by reference.  */
5601 if (size < 0)
5602 return true;
5603
5604 /* Can this be a candidate to be passed in fp/simd register(s)? */
52090e4d 5605 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
56fe3ca3
RS
5606 &dummymode, &nregs, NULL,
5607 !pcum || pcum->silent_p))
43e9d192
IB
5608 return false;
5609
5610 /* Arguments which are variable sized or larger than 2 registers are
 5611      passed by reference unless they are a homogeneous floating point
5612 aggregate. */
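  /* For example (illustrative only): a plain struct of three 64-bit
     integers (24 bytes) is passed by reference here, whereas a struct
     of four doubles is an HFA and was already accepted by the check
     above.  */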
5613 return size > 2 * UNITS_PER_WORD;
5614}
5615
38e62001
RS
5616/* Implement TARGET_PASS_BY_REFERENCE. */
5617
5618static bool
5619aarch64_pass_by_reference (cumulative_args_t pcum_v,
5620 const function_arg_info &arg)
5621{
5622 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5623
5624 if (!arg.type)
56fe3ca3 5625 return aarch64_pass_by_reference_1 (pcum, arg);
38e62001
RS
5626
5627 pure_scalable_type_info pst_info;
5628 switch (pst_info.analyze (arg.type))
5629 {
5630 case pure_scalable_type_info::IS_PST:
5631 if (pcum && !pcum->silent_p && !TARGET_SVE)
5632 /* We can't gracefully recover at this point, so make this a
5633 fatal error. */
5634 fatal_error (input_location, "arguments of type %qT require"
5635 " the SVE ISA extension", arg.type);
5636
5637 /* Variadic SVE types are passed by reference. Normal non-variadic
5638 arguments are too if we've run out of registers. */
5639 return (!arg.named
5640 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
5641 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
5642
5643 case pure_scalable_type_info::DOESNT_MATTER:
56fe3ca3 5644 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
38e62001
RS
5645 return true;
5646
5647 case pure_scalable_type_info::NO_ABI_IDENTITY:
5648 case pure_scalable_type_info::ISNT_PST:
56fe3ca3 5649 return aarch64_pass_by_reference_1 (pcum, arg);
38e62001
RS
5650 }
5651 gcc_unreachable ();
5652}
5653
43e9d192
IB
5654/* Return TRUE if VALTYPE is padded to its least significant bits. */
5655static bool
5656aarch64_return_in_msb (const_tree valtype)
5657{
ef4bddc2 5658 machine_mode dummy_mode;
43e9d192
IB
5659 int dummy_int;
5660
5661 /* Never happens in little-endian mode. */
5662 if (!BYTES_BIG_ENDIAN)
5663 return false;
5664
5665 /* Only composite types smaller than or equal to 16 bytes can
5666 be potentially returned in registers. */
5667 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
5668 || int_size_in_bytes (valtype) <= 0
5669 || int_size_in_bytes (valtype) > 16)
5670 return false;
5671
5672 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
5673 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
5674 is always passed/returned in the least significant bits of fp/simd
5675 register(s). */
5676 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
56fe3ca3
RS
5677 &dummy_mode, &dummy_int, NULL,
5678 false))
43e9d192
IB
5679 return false;
5680
38e62001
RS
5681 /* Likewise pure scalable types for SVE vector and predicate registers. */
5682 pure_scalable_type_info pst_info;
5683 if (pst_info.analyze_registers (valtype))
5684 return false;
5685
43e9d192
IB
5686 return true;
5687}
5688
38e62001
RS
5689/* Implement TARGET_FUNCTION_VALUE.
5690 Define how to find the value returned by a function. */
5691
43e9d192 5692static rtx
38e62001
RS
5693aarch64_function_value (const_tree type, const_tree func,
5694 bool outgoing ATTRIBUTE_UNUSED)
43e9d192 5695{
38e62001
RS
5696 machine_mode mode;
5697 int unsignedp;
c600df9a 5698
38e62001
RS
5699 mode = TYPE_MODE (type);
5700 if (INTEGRAL_TYPE_P (type))
5701 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
c600df9a 5702
38e62001
RS
5703 pure_scalable_type_info pst_info;
5704 if (type && pst_info.analyze_registers (type))
5705 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
c600df9a 5706
38e62001
RS
5707 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5708 are returned in memory, not by value. */
5709 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5710 bool sve_p = (vec_flags & VEC_ANY_SVE);
c600df9a 5711
43e9d192
IB
5712 if (aarch64_return_in_msb (type))
5713 {
5714 HOST_WIDE_INT size = int_size_in_bytes (type);
5715
5716 if (size % UNITS_PER_WORD != 0)
5717 {
5718 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
f4b31647 5719 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
43e9d192
IB
5720 }
5721 }
5722
6aa5370c
RS
5723 int count;
5724 machine_mode ag_mode;
56fe3ca3
RS
5725 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
5726 NULL, false))
43e9d192 5727 {
38e62001 5728 gcc_assert (!sve_p);
43e9d192
IB
5729 if (!aarch64_composite_type_p (type, mode))
5730 {
5731 gcc_assert (count == 1 && mode == ag_mode);
5732 return gen_rtx_REG (mode, V0_REGNUM);
5733 }
5734 else
5735 {
5736 int i;
5737 rtx par;
5738
5739 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
5740 for (i = 0; i < count; i++)
5741 {
5742 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
6a70badb
RS
5743 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
5744 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
43e9d192
IB
5745 XVECEXP (par, 0, i) = tmp;
5746 }
5747 return par;
5748 }
5749 }
5750 else
6aa5370c 5751 {
38e62001
RS
5752 if (sve_p)
5753 {
5754 /* Vector types can acquire a partial SVE mode using things like
5755 __attribute__((vector_size(N))), and this is potentially useful.
5756 However, the choice of mode doesn't affect the type's ABI
5757 identity, so we should treat the types as though they had
5758 the associated integer mode, just like they did before SVE
5759 was introduced.
5760
5761 We know that the vector must be 128 bits or smaller,
5762 otherwise we'd have returned it in memory instead. */
5763 gcc_assert (type
5764 && (aarch64_some_values_include_pst_objects_p (type)
5765 || (vec_flags & VEC_PARTIAL)));
5766
5767 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
5768 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
5769 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5770 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
5771 }
5772 return gen_rtx_REG (mode, R0_REGNUM);
6aa5370c 5773 }
6aa5370c
RS
5774}
5775
43e9d192
IB
5776/* Implements TARGET_FUNCTION_VALUE_REGNO_P.
5777 Return true if REGNO is the number of a hard register in which the values
5778 of called function may come back. */
5779
5780static bool
5781aarch64_function_value_regno_p (const unsigned int regno)
5782{
5783 /* Maximum of 16 bytes can be returned in the general registers. Examples
5784 of 16-byte return values are: 128-bit integers and 16-byte small
5785 structures (excluding homogeneous floating-point aggregates). */
5786 if (regno == R0_REGNUM || regno == R1_REGNUM)
5787 return true;
5788
5789 /* Up to four fp/simd registers can return a function value, e.g. a
5790 homogeneous floating-point aggregate having four members. */
5791 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
d5726973 5792 return TARGET_FLOAT;
43e9d192
IB
5793
5794 return false;
5795}
5796
38e62001
RS
5797/* Subroutine for aarch64_return_in_memory for types that are not returned
5798 in SVE registers. */
43e9d192
IB
5799
5800static bool
38e62001 5801aarch64_return_in_memory_1 (const_tree type)
43e9d192
IB
5802{
5803 HOST_WIDE_INT size;
ef4bddc2 5804 machine_mode ag_mode;
43e9d192
IB
5805 int count;
5806
5807 if (!AGGREGATE_TYPE_P (type)
5808 && TREE_CODE (type) != COMPLEX_TYPE
5809 && TREE_CODE (type) != VECTOR_TYPE)
 5810     /* Simple scalar types are always returned in registers.  */
5811 return false;
5812
56fe3ca3
RS
5813 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5814 &ag_mode, &count, NULL, false))
43e9d192
IB
5815 return false;
5816
 5817   /* Types larger than 2 registers are returned in memory.  */
5818 size = int_size_in_bytes (type);
5819 return (size < 0 || size > 2 * UNITS_PER_WORD);
5820}
5821
38e62001
RS
5822/* Implement TARGET_RETURN_IN_MEMORY.
5823
5824 If the type T of the result of a function is such that
5825 void func (T arg)
5826 would require that arg be passed as a value in a register (or set of
5827 registers) according to the parameter passing rules, then the result
5828 is returned in the same registers as would be used for such an
5829 argument. */
5830
5831static bool
5832aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
5833{
5834 pure_scalable_type_info pst_info;
5835 switch (pst_info.analyze (type))
5836 {
5837 case pure_scalable_type_info::IS_PST:
5838 return (pst_info.num_zr () > NUM_FP_ARG_REGS
5839 || pst_info.num_pr () > NUM_PR_ARG_REGS);
5840
5841 case pure_scalable_type_info::DOESNT_MATTER:
5842 gcc_assert (aarch64_return_in_memory_1 (type));
5843 return true;
5844
5845 case pure_scalable_type_info::NO_ABI_IDENTITY:
5846 case pure_scalable_type_info::ISNT_PST:
5847 return aarch64_return_in_memory_1 (type);
5848 }
5849 gcc_unreachable ();
5850}
5851
43e9d192 5852static bool
ef4bddc2 5853aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
43e9d192
IB
5854 const_tree type, int *nregs)
5855{
5856 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
56fe3ca3 5857 return aarch64_vfp_is_call_or_return_candidate (mode, type,
43e9d192 5858 &pcum->aapcs_vfp_rmode,
56fe3ca3 5859 nregs, NULL, pcum->silent_p);
43e9d192
IB
5860}
5861
985b8393 5862/* Given MODE and TYPE of a function argument, return the alignment in
43e9d192 5863 bits. The idea is to suppress any stronger alignment requested by
c590597c
RE
5864 the user and opt for the natural alignment (specified in AAPCS64 \S
5865 4.1). ABI_BREAK is set to true if the alignment was incorrectly
5866 calculated in versions of GCC prior to GCC-9. This is a helper
5867 function for local use only. */
43e9d192 5868
985b8393 5869static unsigned int
c590597c
RE
5870aarch64_function_arg_alignment (machine_mode mode, const_tree type,
5871 bool *abi_break)
43e9d192 5872{
c590597c 5873 *abi_break = false;
75d6cc81 5874 if (!type)
985b8393 5875 return GET_MODE_ALIGNMENT (mode);
2ec07fa6 5876
75d6cc81 5877 if (integer_zerop (TYPE_SIZE (type)))
985b8393 5878 return 0;
43e9d192 5879
75d6cc81
AL
5880 gcc_assert (TYPE_MODE (type) == mode);
5881
5882 if (!AGGREGATE_TYPE_P (type))
985b8393 5883 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
75d6cc81
AL
5884
5885 if (TREE_CODE (type) == ARRAY_TYPE)
985b8393 5886 return TYPE_ALIGN (TREE_TYPE (type));
75d6cc81 5887
985b8393 5888 unsigned int alignment = 0;
c590597c 5889 unsigned int bitfield_alignment = 0;
75d6cc81 5890 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
985b8393 5891 if (TREE_CODE (field) == FIELD_DECL)
c590597c 5892 {
56fe3ca3
RS
5893 /* Note that we explicitly consider zero-sized fields here,
5894 even though they don't map to AAPCS64 machine types.
5895 For example, in:
5896
5897 struct __attribute__((aligned(8))) empty {};
5898
5899 struct s {
5900 [[no_unique_address]] empty e;
5901 int x;
5902 };
5903
5904 "s" contains only one Fundamental Data Type (the int field)
5905 but gains 8-byte alignment and size thanks to "e". */
c590597c
RE
5906 alignment = std::max (alignment, DECL_ALIGN (field));
5907 if (DECL_BIT_FIELD_TYPE (field))
5908 bitfield_alignment
5909 = std::max (bitfield_alignment,
5910 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
5911 }
5912
5913 if (bitfield_alignment > alignment)
5914 {
5915 *abi_break = true;
5916 return bitfield_alignment;
5917 }
43e9d192 5918
985b8393 5919 return alignment;
43e9d192
IB
5920}
5921
5922/* Layout a function argument according to the AAPCS64 rules. The rule
6aa5370c
RS
5923 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
5924 mode that was originally given to us by the target hook, whereas the
5925 mode in ARG might be the result of replacing partial SVE modes with
5926 the equivalent integer mode. */
43e9d192
IB
5927
5928static void
38e62001 5929aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
43e9d192
IB
5930{
5931 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
c600df9a
RS
5932 tree type = arg.type;
5933 machine_mode mode = arg.mode;
43e9d192
IB
5934 int ncrn, nvrn, nregs;
5935 bool allocate_ncrn, allocate_nvrn;
3abf17cf 5936 HOST_WIDE_INT size;
c590597c 5937 bool abi_break;
43e9d192
IB
5938
5939 /* We need to do this once per argument. */
5940 if (pcum->aapcs_arg_processed)
5941 return;
5942
5943 pcum->aapcs_arg_processed = true;
5944
38e62001
RS
5945 pure_scalable_type_info pst_info;
5946 if (type && pst_info.analyze_registers (type))
c600df9a
RS
5947 {
5948 /* The PCS says that it is invalid to pass an SVE value to an
5949 unprototyped function. There is no ABI-defined location we
5950 can return in this case, so we have no real choice but to raise
5951 an error immediately, even though this is only a query function. */
5952 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
5953 {
5954 gcc_assert (!pcum->silent_p);
5955 error ("SVE type %qT cannot be passed to an unprototyped function",
5956 arg.type);
5957 /* Avoid repeating the message, and avoid tripping the assert
5958 below. */
5959 pcum->pcs_variant = ARM_PCS_SVE;
5960 }
5961
5962 /* We would have converted the argument into pass-by-reference
5963 form if it didn't fit in registers. */
38e62001
RS
5964 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
5965 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
c600df9a
RS
5966 gcc_assert (arg.named
5967 && pcum->pcs_variant == ARM_PCS_SVE
c600df9a
RS
5968 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
5969 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
38e62001
RS
5970 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
5971 P0_REGNUM + pcum->aapcs_nprn);
c600df9a
RS
5972 return;
5973 }
5974
38e62001
RS
5975 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5976 are passed by reference, not by value. */
5977 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5978 bool sve_p = (vec_flags & VEC_ANY_SVE);
5979 if (sve_p)
5980 /* Vector types can acquire a partial SVE mode using things like
5981 __attribute__((vector_size(N))), and this is potentially useful.
5982 However, the choice of mode doesn't affect the type's ABI
5983 identity, so we should treat the types as though they had
5984 the associated integer mode, just like they did before SVE
5985 was introduced.
5986
5987 We know that the vector must be 128 bits or smaller,
5988 otherwise we'd have passed it in memory instead. */
5989 gcc_assert (type
5990 && (aarch64_some_values_include_pst_objects_p (type)
5991 || (vec_flags & VEC_PARTIAL)));
c600df9a 5992
3abf17cf 5993 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
6a70badb
RS
5994 if (type)
5995 size = int_size_in_bytes (type);
5996 else
5997 /* No frontends can create types with variable-sized modes, so we
5998 shouldn't be asked to pass or return them. */
5999 size = GET_MODE_SIZE (mode).to_constant ();
6000 size = ROUND_UP (size, UNITS_PER_WORD);
3abf17cf 6001
43e9d192
IB
6002 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
6003 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
6004 mode,
6005 type,
6006 &nregs);
38e62001 6007 gcc_assert (!sve_p || !allocate_nvrn);
43e9d192
IB
6008
 6009   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
6010 The following code thus handles passing by SIMD/FP registers first. */
6011
6012 nvrn = pcum->aapcs_nvrn;
6013
 6014   /* C1 - C5 for floating point, homogeneous floating point aggregates (HFA)
 6015      and homogeneous short-vector aggregates (HVA).  */
6016 if (allocate_nvrn)
6017 {
c600df9a 6018 if (!pcum->silent_p && !TARGET_FLOAT)
fc29dfc9 6019 aarch64_err_no_fpadvsimd (mode);
261fb553 6020
43e9d192
IB
6021 if (nvrn + nregs <= NUM_FP_ARG_REGS)
6022 {
6023 pcum->aapcs_nextnvrn = nvrn + nregs;
6024 if (!aarch64_composite_type_p (type, mode))
6025 {
6026 gcc_assert (nregs == 1);
6027 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6028 }
6029 else
6030 {
6031 rtx par;
6032 int i;
6033 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6034 for (i = 0; i < nregs; i++)
6035 {
6036 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
6037 V0_REGNUM + nvrn + i);
6a70badb
RS
6038 rtx offset = gen_int_mode
6039 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
6040 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
43e9d192
IB
6041 XVECEXP (par, 0, i) = tmp;
6042 }
6043 pcum->aapcs_reg = par;
6044 }
6045 return;
6046 }
6047 else
6048 {
6049 /* C.3 NSRN is set to 8. */
6050 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
6051 goto on_stack;
6052 }
6053 }
6054
6055 ncrn = pcum->aapcs_ncrn;
3abf17cf 6056 nregs = size / UNITS_PER_WORD;
43e9d192
IB
6057
 6058   /* C6 - C9, though the sign and zero extension semantics are
 6059      handled elsewhere.  This is the case where the argument fits
 6060      entirely in general registers.  */
6061 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
6062 {
43e9d192
IB
6063 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
6064
6065 /* C.8 if the argument has an alignment of 16 then the NGRN is
c590597c 6066 rounded up to the next even number. */
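	 /* For example (illustrative only): when passing an int followed
	    by an __int128, the int occupies w0 and leaves NGRN == 1;
	    because __int128 has 16-byte alignment, NGRN is rounded up to
	    2 and the __int128 is passed in x2/x3 rather than x1/x2.  */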
985b8393
JJ
6067 if (nregs == 2
6068 && ncrn % 2
2ec07fa6 6069 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
985b8393 6070 comparison is there because for > 16 * BITS_PER_UNIT
2ec07fa6
RR
6071 alignment nregs should be > 2 and therefore it should be
6072 passed by reference rather than value. */
38e62001 6073 && (aarch64_function_arg_alignment (mode, type, &abi_break)
c590597c 6074 == 16 * BITS_PER_UNIT))
985b8393 6075 {
c590597c
RE
6076 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
6077 inform (input_location, "parameter passing for argument of type "
6078 "%qT changed in GCC 9.1", type);
985b8393
JJ
6079 ++ncrn;
6080 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
43e9d192 6081 }
2ec07fa6 6082
38e62001
RS
6083 /* If an argument with an SVE mode needs to be shifted up to the
6084 high part of the register, treat it as though it had an integer mode.
6085 Using the normal (parallel [...]) would suppress the shifting. */
6086 if (sve_p
6087 && BYTES_BIG_ENDIAN
6088 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
6089 && aarch64_pad_reg_upward (mode, type, false))
6090 {
6091 mode = int_mode_for_mode (mode).require ();
6092 sve_p = false;
6093 }
6094
43e9d192 6095 /* NREGS can be 0 when e.g. an empty structure is to be passed.
c590597c 6096 A reg is still generated for it, but the caller should be smart
43e9d192 6097 enough not to use it. */
38e62001
RS
6098 if (nregs == 0
6099 || (nregs == 1 && !sve_p)
6100 || GET_MODE_CLASS (mode) == MODE_INT)
2ec07fa6 6101 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
43e9d192
IB
6102 else
6103 {
6104 rtx par;
6105 int i;
6106
6107 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6108 for (i = 0; i < nregs; i++)
6109 {
38e62001
RS
6110 scalar_int_mode reg_mode = word_mode;
6111 if (nregs == 1)
6112 reg_mode = int_mode_for_mode (mode).require ();
6113 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
43e9d192
IB
6114 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
6115 GEN_INT (i * UNITS_PER_WORD));
6116 XVECEXP (par, 0, i) = tmp;
6117 }
6118 pcum->aapcs_reg = par;
6119 }
6120
6121 pcum->aapcs_nextncrn = ncrn + nregs;
6122 return;
6123 }
6124
6125 /* C.11 */
6126 pcum->aapcs_nextncrn = NUM_ARG_REGS;
6127
6128 /* The argument is passed on stack; record the needed number of words for
3abf17cf 6129 this argument and align the total size if necessary. */
43e9d192 6130on_stack:
3abf17cf 6131 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2ec07fa6 6132
38e62001 6133 if (aarch64_function_arg_alignment (mode, type, &abi_break)
c590597c
RE
6134 == 16 * BITS_PER_UNIT)
6135 {
6136 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
6137 if (pcum->aapcs_stack_size != new_size)
6138 {
6139 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
6140 inform (input_location, "parameter passing for argument of type "
6141 "%qT changed in GCC 9.1", type);
6142 pcum->aapcs_stack_size = new_size;
6143 }
6144 }
43e9d192
IB
6145 return;
6146}
6147
6148/* Implement TARGET_FUNCTION_ARG. */
6149
6150static rtx
6783fdb7 6151aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
43e9d192
IB
6152{
6153 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
08cc4d92 6154 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
c600df9a
RS
6155 || pcum->pcs_variant == ARM_PCS_SIMD
6156 || pcum->pcs_variant == ARM_PCS_SVE);
43e9d192 6157
6783fdb7 6158 if (arg.end_marker_p ())
08cc4d92 6159 return gen_int_mode (pcum->pcs_variant, DImode);
43e9d192 6160
38e62001 6161 aarch64_layout_arg (pcum_v, arg);
43e9d192
IB
6162 return pcum->aapcs_reg;
6163}
6164
6165void
6166aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
08cc4d92
RS
6167 const_tree fntype,
6168 rtx libname ATTRIBUTE_UNUSED,
6169 const_tree fndecl ATTRIBUTE_UNUSED,
c600df9a
RS
6170 unsigned n_named ATTRIBUTE_UNUSED,
6171 bool silent_p)
43e9d192
IB
6172{
6173 pcum->aapcs_ncrn = 0;
6174 pcum->aapcs_nvrn = 0;
c600df9a 6175 pcum->aapcs_nprn = 0;
43e9d192
IB
6176 pcum->aapcs_nextncrn = 0;
6177 pcum->aapcs_nextnvrn = 0;
c600df9a 6178 pcum->aapcs_nextnprn = 0;
08cc4d92
RS
6179 if (fntype)
6180 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
6181 else
6182 pcum->pcs_variant = ARM_PCS_AAPCS64;
43e9d192
IB
6183 pcum->aapcs_reg = NULL_RTX;
6184 pcum->aapcs_arg_processed = false;
6185 pcum->aapcs_stack_words = 0;
6186 pcum->aapcs_stack_size = 0;
c600df9a 6187 pcum->silent_p = silent_p;
43e9d192 6188
c600df9a
RS
6189 if (!silent_p
6190 && !TARGET_FLOAT
261fb553
AL
6191 && fntype && fntype != error_mark_node)
6192 {
6193 const_tree type = TREE_TYPE (fntype);
6194 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
6195 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
6196 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
56fe3ca3 6197 &mode, &nregs, NULL, false))
fc29dfc9 6198 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
261fb553 6199 }
c600df9a
RS
6200
6201 if (!silent_p
6202 && !TARGET_SVE
6203 && pcum->pcs_variant == ARM_PCS_SVE)
6204 {
6205 /* We can't gracefully recover at this point, so make this a
6206 fatal error. */
6207 if (fndecl)
6208 fatal_error (input_location, "%qE requires the SVE ISA extension",
6209 fndecl);
6210 else
6211 fatal_error (input_location, "calls to functions of type %qT require"
6212 " the SVE ISA extension", fntype);
6213 }
43e9d192
IB
6214}
6215
6216static void
6217aarch64_function_arg_advance (cumulative_args_t pcum_v,
6930c98c 6218 const function_arg_info &arg)
43e9d192
IB
6219{
6220 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
08cc4d92 6221 if (pcum->pcs_variant == ARM_PCS_AAPCS64
c600df9a
RS
6222 || pcum->pcs_variant == ARM_PCS_SIMD
6223 || pcum->pcs_variant == ARM_PCS_SVE)
43e9d192 6224 {
38e62001 6225 aarch64_layout_arg (pcum_v, arg);
43e9d192
IB
6226 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
6227 != (pcum->aapcs_stack_words != 0));
6228 pcum->aapcs_arg_processed = false;
6229 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
6230 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
c600df9a 6231 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
43e9d192
IB
6232 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
6233 pcum->aapcs_stack_words = 0;
6234 pcum->aapcs_reg = NULL_RTX;
6235 }
6236}
6237
6238bool
6239aarch64_function_arg_regno_p (unsigned regno)
6240{
6241 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
6242 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
6243}
6244
6245/* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
6246 PARM_BOUNDARY bits of alignment, but will be given anything up
6247 to STACK_BOUNDARY bits if the type requires it. This makes sure
6248 that both before and after the layout of each argument, the Next
6249 Stacked Argument Address (NSAA) will have a minimum alignment of
6250 8 bytes. */
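/* A rough illustration (assuming the usual AArch64 values of
   PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128): a char argument
   still gets a 64-bit boundary, a 16-byte-aligned type gets 128, and
   anything over-aligned beyond 16 bytes is still capped at 128.  */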
6251
6252static unsigned int
ef4bddc2 6253aarch64_function_arg_boundary (machine_mode mode, const_tree type)
43e9d192 6254{
c590597c
RE
6255 bool abi_break;
6256 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
6257 &abi_break);
 6258   if (abi_break && warn_psabi)
6259 inform (input_location, "parameter passing for argument of type "
6260 "%qT changed in GCC 9.1", type);
6261
985b8393 6262 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
43e9d192
IB
6263}
6264
43cacb12
RS
6265/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
6266
6267static fixed_size_mode
6268aarch64_get_reg_raw_mode (int regno)
6269{
6270 if (TARGET_SVE && FP_REGNUM_P (regno))
6271 /* Don't use the SVE part of the register for __builtin_apply and
6272 __builtin_return. The SVE registers aren't used by the normal PCS,
6273 so using them there would be a waste of time. The PCS extensions
6274 for SVE types are fundamentally incompatible with the
6275 __builtin_return/__builtin_apply interface. */
6276 return as_a <fixed_size_mode> (V16QImode);
6277 return default_get_reg_raw_mode (regno);
6278}
6279
76b0cbf8 6280/* Implement TARGET_FUNCTION_ARG_PADDING.
43e9d192
IB
6281
6282 Small aggregate types are placed in the lowest memory address.
6283
6284 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
6285
76b0cbf8
RS
6286static pad_direction
6287aarch64_function_arg_padding (machine_mode mode, const_tree type)
43e9d192
IB
6288{
6289 /* On little-endian targets, the least significant byte of every stack
6290 argument is passed at the lowest byte address of the stack slot. */
6291 if (!BYTES_BIG_ENDIAN)
76b0cbf8 6292 return PAD_UPWARD;
43e9d192 6293
00edcfbe 6294 /* Otherwise, integral, floating-point and pointer types are padded downward:
43e9d192
IB
6295 the least significant byte of a stack argument is passed at the highest
6296 byte address of the stack slot. */
6297 if (type
00edcfbe
YZ
6298 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
6299 || POINTER_TYPE_P (type))
43e9d192 6300 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
76b0cbf8 6301 return PAD_DOWNWARD;
43e9d192
IB
6302
6303 /* Everything else padded upward, i.e. data in first byte of stack slot. */
76b0cbf8 6304 return PAD_UPWARD;
43e9d192
IB
6305}
6306
6307/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
6308
6309 It specifies padding for the last (may also be the only)
6310 element of a block move between registers and memory. If
 6311    the block is in memory, padding upward means that the last
 6312    element is padded after its most significant byte, while in
 6313    downward padding, the last element is padded at its least
 6314    significant byte side.
6315
6316 Small aggregates and small complex types are always padded
6317 upwards.
6318
6319 We don't need to worry about homogeneous floating-point or
6320 short-vector aggregates; their move is not affected by the
6321 padding direction determined here. Regardless of endianness,
6322 each element of such an aggregate is put in the least
6323 significant bits of a fp/simd register.
6324
6325 Return !BYTES_BIG_ENDIAN if the least significant byte of the
6326 register has useful data, and return the opposite if the most
6327 significant byte does. */
6328
6329bool
ef4bddc2 6330aarch64_pad_reg_upward (machine_mode mode, const_tree type,
43e9d192
IB
6331 bool first ATTRIBUTE_UNUSED)
6332{
6333
38e62001
RS
6334 /* Aside from pure scalable types, small composite types are always
6335 padded upward. */
43e9d192
IB
6336 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
6337 {
6a70badb
RS
6338 HOST_WIDE_INT size;
6339 if (type)
6340 size = int_size_in_bytes (type);
6341 else
6342 /* No frontends can create types with variable-sized modes, so we
6343 shouldn't be asked to pass or return them. */
6344 size = GET_MODE_SIZE (mode).to_constant ();
43e9d192 6345 if (size < 2 * UNITS_PER_WORD)
38e62001
RS
6346 {
6347 pure_scalable_type_info pst_info;
6348 if (pst_info.analyze_registers (type))
6349 return false;
6350 return true;
6351 }
43e9d192
IB
6352 }
6353
6354 /* Otherwise, use the default padding. */
6355 return !BYTES_BIG_ENDIAN;
6356}
6357
095a2d76 6358static scalar_int_mode
43e9d192
IB
6359aarch64_libgcc_cmp_return_mode (void)
6360{
6361 return SImode;
6362}
6363
a3eb8a52
EB
6364#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
6365
6366/* We use the 12-bit shifted immediate arithmetic instructions so values
6367 must be multiple of (1 << 12), i.e. 4096. */
6368#define ARITH_FACTOR 4096
6369
6370#if (PROBE_INTERVAL % ARITH_FACTOR) != 0
6371#error Cannot use simple address calculation for stack probing
6372#endif
6373
6a70badb 6374/* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
a3eb8a52
EB
6375 inclusive. These are offsets from the current stack pointer. */
6376
6377static void
6a70badb 6378aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
a3eb8a52 6379{
6a70badb
RS
6380 HOST_WIDE_INT size;
6381 if (!poly_size.is_constant (&size))
6382 {
6383 sorry ("stack probes for SVE frames");
6384 return;
6385 }
6386
5773855c 6387 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
a3eb8a52
EB
6388
6389 /* See the same assertion on PROBE_INTERVAL above. */
6390 gcc_assert ((first % ARITH_FACTOR) == 0);
6391
6392 /* See if we have a constant small number of probes to generate. If so,
6393 that's the easy case. */
6394 if (size <= PROBE_INTERVAL)
6395 {
6396 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
6397
6398 emit_set_insn (reg1,
5f5c5e0f 6399 plus_constant (Pmode,
a3eb8a52 6400 stack_pointer_rtx, -(first + base)));
5f5c5e0f 6401 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
a3eb8a52
EB
6402 }
6403
6404 /* The run-time loop is made up of 8 insns in the generic case while the
6405 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
6406 else if (size <= 4 * PROBE_INTERVAL)
6407 {
6408 HOST_WIDE_INT i, rem;
6409
6410 emit_set_insn (reg1,
5f5c5e0f 6411 plus_constant (Pmode,
a3eb8a52
EB
6412 stack_pointer_rtx,
6413 -(first + PROBE_INTERVAL)));
6414 emit_stack_probe (reg1);
6415
6416 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
6417 it exceeds SIZE. If only two probes are needed, this will not
6418 generate any code. Then probe at FIRST + SIZE. */
6419 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
6420 {
6421 emit_set_insn (reg1,
5f5c5e0f 6422 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
a3eb8a52
EB
6423 emit_stack_probe (reg1);
6424 }
6425
6426 rem = size - (i - PROBE_INTERVAL);
6427 if (rem > 256)
6428 {
6429 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6430
5f5c5e0f
EB
6431 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
6432 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
a3eb8a52
EB
6433 }
6434 else
5f5c5e0f 6435 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
a3eb8a52
EB
6436 }
6437
6438 /* Otherwise, do the same as above, but in a loop. Note that we must be
6439 extra careful with variables wrapping around because we might be at
6440 the very top (or the very bottom) of the address space and we have
6441 to be able to handle this case properly; in particular, we use an
6442 equality test for the loop condition. */
6443 else
6444 {
5773855c 6445 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
a3eb8a52
EB
6446
6447 /* Step 1: round SIZE to the previous multiple of the interval. */
6448
6449 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
6450
6451
6452 /* Step 2: compute initial and final value of the loop counter. */
6453
6454 /* TEST_ADDR = SP + FIRST. */
6455 emit_set_insn (reg1,
5f5c5e0f 6456 plus_constant (Pmode, stack_pointer_rtx, -first));
a3eb8a52
EB
6457
6458 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
13f752b2
JL
6459 HOST_WIDE_INT adjustment = - (first + rounded_size);
6460 if (! aarch64_uimm12_shift (adjustment))
6461 {
6462 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
6463 true, Pmode);
6464 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
6465 }
6466 else
8dd64cdf
EB
6467 emit_set_insn (reg2,
6468 plus_constant (Pmode, stack_pointer_rtx, adjustment));
6469
a3eb8a52
EB
6470 /* Step 3: the loop
6471
6472 do
6473 {
6474 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
6475 probe at TEST_ADDR
6476 }
6477 while (TEST_ADDR != LAST_ADDR)
6478
6479 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
6480 until it is equal to ROUNDED_SIZE. */
6481
5f5c5e0f 6482 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
a3eb8a52
EB
6483
6484
6485 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
6486 that SIZE is equal to ROUNDED_SIZE. */
6487
6488 if (size != rounded_size)
6489 {
6490 HOST_WIDE_INT rem = size - rounded_size;
6491
6492 if (rem > 256)
6493 {
6494 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6495
5f5c5e0f
EB
6496 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
6497 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
a3eb8a52
EB
6498 }
6499 else
5f5c5e0f 6500 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
a3eb8a52
EB
6501 }
6502 }
6503
6504 /* Make sure nothing is scheduled before we are done. */
6505 emit_insn (gen_blockage ());
6506}
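/* Worked example (illustrative, assuming the default PROBE_INTERVAL of 4096
   and FIRST == 0): a request of 10000 bytes takes the unrolled branch above,
   since 10000 <= 4 * 4096.  Probes are emitted at SP - 4096 and SP - 8192,
   leaving a residual of 10000 - 8192 = 1808 bytes.  Since 1808 > 256, REG1 is
   stepped down by ROUND_UP (1808, 4096) = 4096 to SP - 12288 and the final
   probe lands at REG1 + 4096 - 1808 = SP - 10000, i.e. exactly at
   FIRST + SIZE as required.  */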
6507
6508/* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
6509 absolute addresses. */
6510
6511const char *
6512aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
6513{
6514 static int labelno = 0;
6515 char loop_lab[32];
6516 rtx xops[2];
6517
6518 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
6519
6520 /* Loop. */
6521 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
6522
cd1bef27 6523 HOST_WIDE_INT stack_clash_probe_interval
028d4092 6524 = 1 << param_stack_clash_protection_guard_size;
cd1bef27 6525
a3eb8a52
EB
6526 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
6527 xops[0] = reg1;
cd1bef27
JL
6528 HOST_WIDE_INT interval;
6529 if (flag_stack_clash_protection)
6530 interval = stack_clash_probe_interval;
6531 else
6532 interval = PROBE_INTERVAL;
6533
6534 gcc_assert (aarch64_uimm12_shift (interval));
6535 xops[1] = GEN_INT (interval);
6536
a3eb8a52
EB
6537 output_asm_insn ("sub\t%0, %0, %1", xops);
6538
cd1bef27
JL
6539 /* If doing stack clash protection then we probe up by the ABI specified
6540 amount. We do this because we're dropping full pages at a time in the
6541 loop. But if we're doing non-stack clash probing, probe at SP 0. */
6542 if (flag_stack_clash_protection)
6543 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
6544 else
6545 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
6546
6547 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
6548 by this amount for each iteration. */
6549 output_asm_insn ("str\txzr, [%0, %1]", xops);
a3eb8a52
EB
6550
6551 /* Test if TEST_ADDR == LAST_ADDR. */
6552 xops[1] = reg2;
6553 output_asm_insn ("cmp\t%0, %1", xops);
6554
6555 /* Branch. */
6556 fputs ("\tb.ne\t", asm_out_file);
6557 assemble_name_raw (asm_out_file, loop_lab);
6558 fputc ('\n', asm_out_file);
6559
6560 return "";
6561}
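/* Sketch of what the template above expands to in the default
   (non-stack-clash) case; the register numbers are illustrative only, since
   the actual operands come from the probe_stack_range expander:

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0

   i.e. one store of xzr per PROBE_INTERVAL until TEST_ADDR reaches
   LAST_ADDR.  */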
6562
eb471ba3
TC
6563/* Emit the probe loop for doing stack clash probes and stack adjustments for
6564 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
6565 of GUARD_SIZE. When a probe is emitted it is done at most
6566 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
6567 at most MIN_PROBE_THRESHOLD. By the end of this function
6568 BASE = BASE - ADJUSTMENT. */
6569
6570const char *
6571aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
6572 rtx min_probe_threshold, rtx guard_size)
6573{
6574 /* This function is not allowed to use any instruction generation function
6575 like gen_ and friends. If you do you'll likely ICE during CFG validation,
6576 so instead emit the code you want using output_asm_insn. */
6577 gcc_assert (flag_stack_clash_protection);
6578 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
6579 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
6580
6581 /* The minimum required allocation before the residual requires probing. */
6582 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
6583
6584 /* Clamp the value down to the nearest value that can be used with a cmp. */
6585 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
6586 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
6587
6588 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
6589 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
6590
6591 static int labelno = 0;
6592 char loop_start_lab[32];
6593 char loop_end_lab[32];
6594 rtx xops[2];
6595
6596 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
6597 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
6598
6599 /* Emit loop start label. */
6600 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
6601
6602 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
6603 xops[0] = adjustment;
6604 xops[1] = probe_offset_value_rtx;
6605 output_asm_insn ("cmp\t%0, %1", xops);
6606
6607 /* Branch to end if not enough adjustment to probe. */
6608 fputs ("\tb.lt\t", asm_out_file);
6609 assemble_name_raw (asm_out_file, loop_end_lab);
6610 fputc ('\n', asm_out_file);
6611
6612 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
6613 xops[0] = base;
6614 xops[1] = probe_offset_value_rtx;
6615 output_asm_insn ("sub\t%0, %0, %1", xops);
6616
6617 /* Probe at BASE. */
6618 xops[1] = const0_rtx;
6619 output_asm_insn ("str\txzr, [%0, %1]", xops);
6620
6621 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
6622 xops[0] = adjustment;
6623 xops[1] = probe_offset_value_rtx;
6624 output_asm_insn ("sub\t%0, %0, %1", xops);
6625
6626 /* Branch to start if still more bytes to allocate. */
6627 fputs ("\tb\t", asm_out_file);
6628 assemble_name_raw (asm_out_file, loop_start_lab);
6629 fputc ('\n', asm_out_file);
6630
6631 /* No probe needed: leave the loop. */
6632 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
6633
6634 /* BASE = BASE - ADJUSTMENT. */
6635 xops[0] = base;
6636 xops[1] = adjustment;
6637 output_asm_insn ("sub\t%0, %0, %1", xops);
6638 return "";
6639}
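/* Sketch of the sequence emitted above (register names and label suffixes
   are illustrative; BASE is shown as x10 and ADJUSTMENT as x11):

	.SVLPSPL0:
	cmp	x11, <residual_probe_guard>
	b.lt	.SVLPEND0
	sub	x10, x10, <residual_probe_guard>
	str	xzr, [x10, 0]
	sub	x11, x11, <residual_probe_guard>
	b	.SVLPSPL0
	.SVLPEND0:
	sub	x10, x10, x11

   so BASE is never dropped by more than RESIDUAL_PROBE_GUARD bytes between
   probes and finishes at BASE - ADJUSTMENT.  */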
6640
d6cb6d6a
WD
6641/* Determine whether a frame chain needs to be generated. */
6642static bool
6643aarch64_needs_frame_chain (void)
6644{
6645 /* Force a frame chain for EH returns so the return address is at FP+8. */
6646 if (frame_pointer_needed || crtl->calls_eh_return)
6647 return true;
6648
6649 /* A leaf function cannot have calls or write LR. */
6650 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
6651
6652 /* Don't use a frame chain in leaf functions if leaf frame pointers
6653 are disabled. */
6654 if (flag_omit_leaf_frame_pointer && is_leaf)
6655 return false;
6656
6657 return aarch64_use_frame_pointer;
6658}
6659
43e9d192
IB
6660/* Mark the registers that need to be saved by the callee and calculate
6661 the size of the callee-saved registers area and frame record (both FP
33a2e348 6662 and LR may be omitted). */
43e9d192
IB
6663static void
6664aarch64_layout_frame (void)
6665{
c600df9a 6666 poly_int64 offset = 0;
4b0685d9 6667 int regno, last_fp_reg = INVALID_REGNUM;
c600df9a
RS
6668 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
6669 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
6670 bool frame_related_fp_reg_p = false;
ab43763e 6671 aarch64_frame &frame = cfun->machine->frame;
43e9d192 6672
ab43763e 6673 frame.emit_frame_chain = aarch64_needs_frame_chain ();
7040939b 6674
8c6e3b23
TC
6675 /* Adjust the outgoing arguments size if required. Keep it in sync with what
6676 the mid-end is doing. */
6677 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
6678
97826595
MS
6679#define SLOT_NOT_REQUIRED (-2)
6680#define SLOT_REQUIRED (-1)
6681
ab43763e
RS
6682 frame.wb_candidate1 = INVALID_REGNUM;
6683 frame.wb_candidate2 = INVALID_REGNUM;
c600df9a 6684 frame.spare_pred_reg = INVALID_REGNUM;
363ffa50 6685
43e9d192 6686 /* First mark all the registers that really need to be saved... */
c600df9a 6687 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
ab43763e 6688 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
43e9d192
IB
6689
6690 /* ... that includes the eh data registers (if needed)... */
6691 if (crtl->calls_eh_return)
6692 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
ab43763e 6693 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
43e9d192
IB
6694
6695 /* ... and any callee saved register that dataflow says is live. */
6696 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6697 if (df_regs_ever_live_p (regno)
dcdd0f05 6698 && !fixed_regs[regno]
1c923b60 6699 && (regno == R30_REGNUM
dcdd0f05 6700 || !crtl->abi->clobbers_full_reg_p (regno)))
ab43763e 6701 frame.reg_offset[regno] = SLOT_REQUIRED;
43e9d192
IB
6702
6703 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6704 if (df_regs_ever_live_p (regno)
dcdd0f05
RS
6705 && !fixed_regs[regno]
6706 && !crtl->abi->clobbers_full_reg_p (regno))
4b0685d9 6707 {
ab43763e 6708 frame.reg_offset[regno] = SLOT_REQUIRED;
4b0685d9 6709 last_fp_reg = regno;
c600df9a
RS
6710 if (aarch64_emit_cfi_for_reg_p (regno))
6711 frame_related_fp_reg_p = true;
4b0685d9 6712 }
43e9d192 6713
c600df9a
RS
6714 /* Big-endian SVE frames need a spare predicate register in order
6715 to save Z8-Z15. Decide which register they should use. Prefer
6716 an unused argument register if possible, so that we don't force P4
6717 to be saved unnecessarily. */
6718 if (frame_related_fp_reg_p
6719 && crtl->abi->id () == ARM_PCS_SVE
6720 && BYTES_BIG_ENDIAN)
6721 {
6722 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
6723 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
6724 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
6725 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
6726 break;
6727 gcc_assert (regno <= P7_REGNUM);
6728 frame.spare_pred_reg = regno;
6729 df_set_regs_ever_live (regno, true);
6730 }
6731
6732 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6733 if (df_regs_ever_live_p (regno)
6734 && !fixed_regs[regno]
6735 && !crtl->abi->clobbers_full_reg_p (regno))
6736 frame.reg_offset[regno] = SLOT_REQUIRED;
6737
d6430e3c
TC
6738 /* With stack-clash, LR must be saved in non-leaf functions. The saving of
6739 LR counts as an implicit probe which allows us to maintain the invariant
6740 described in the comment at expand_prologue. */
c600df9a
RS
6741 gcc_assert (crtl->is_leaf
6742 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
6743
6744 /* Now assign stack slots for the registers. Start with the predicate
6745 registers, since predicate LDR and STR have a relatively small
6746 offset range. These saves happen below the hard frame pointer. */
6747 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6748 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6749 {
6750 frame.reg_offset[regno] = offset;
6751 offset += BYTES_PER_SVE_PRED;
6752 }
6753
c600df9a
RS
6754 if (maybe_ne (offset, 0))
6755 {
cb26919c
RS
6756 /* If we have any vector registers to save above the predicate registers,
6757 the offset of the vector register save slots needs to be a multiple
6758 of the vector size. This lets us use the immediate forms of LDR/STR
6759 (or LD1/ST1 for big-endian).
6760
6761 A vector register is 8 times the size of a predicate register,
6762 and we need to save a maximum of 12 predicate registers, so the
6763 first vector register will be at either #1, MUL VL or #2, MUL VL.
6764
6765 If we don't have any vector registers to save, and we know how
6766 big the predicate save area is, we can just round it up to the
6767 next 16-byte boundary. */
6768 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
6769 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6770 else
6771 {
6772 if (known_le (offset, vector_save_size))
6773 offset = vector_save_size;
6774 else if (known_le (offset, vector_save_size * 2))
6775 offset = vector_save_size * 2;
6776 else
6777 gcc_unreachable ();
6778 }
c600df9a
RS
6779 }
6780
6781 /* If we need to save any SVE vector registers, add them next. */
6782 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
6783 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6784 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6785 {
6786 frame.reg_offset[regno] = offset;
6787 offset += vector_save_size;
6788 }
6789
6790 /* OFFSET is now the offset of the hard frame pointer from the bottom
6791 of the callee save area. */
6792 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
6793 frame.below_hard_fp_saved_regs_size = offset;
ab43763e 6794 if (frame.emit_frame_chain)
43e9d192 6795 {
2e1cdae5 6796 /* FP and LR are placed in the linkage record. */
c600df9a 6797 frame.reg_offset[R29_REGNUM] = offset;
ab43763e 6798 frame.wb_candidate1 = R29_REGNUM;
c600df9a 6799 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
ab43763e 6800 frame.wb_candidate2 = R30_REGNUM;
c600df9a 6801 offset += 2 * UNITS_PER_WORD;
1f7bffd0 6802 }
43e9d192 6803
2e1cdae5 6804 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
c600df9a 6805 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
43e9d192 6806 {
ab43763e
RS
6807 frame.reg_offset[regno] = offset;
6808 if (frame.wb_candidate1 == INVALID_REGNUM)
6809 frame.wb_candidate1 = regno;
6810 else if (frame.wb_candidate2 == INVALID_REGNUM)
6811 frame.wb_candidate2 = regno;
43e9d192
IB
6812 offset += UNITS_PER_WORD;
6813 }
6814
c600df9a
RS
6815 poly_int64 max_int_offset = offset;
6816 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6817 bool has_align_gap = maybe_ne (offset, max_int_offset);
4b0685d9 6818
43e9d192 6819 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
c600df9a 6820 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
43e9d192 6821 {
4b0685d9
WD
6822 /* If there is an alignment gap between integer and fp callee-saves,
6823 allocate the last fp register to it if possible. */
a0d0b980
SE
6824 if (regno == last_fp_reg
6825 && has_align_gap
c600df9a
RS
6826 && known_eq (vector_save_size, 8)
6827 && multiple_p (offset, 16))
4b0685d9 6828 {
ab43763e 6829 frame.reg_offset[regno] = max_int_offset;
4b0685d9
WD
6830 break;
6831 }
6832
ab43763e
RS
6833 frame.reg_offset[regno] = offset;
6834 if (frame.wb_candidate1 == INVALID_REGNUM)
6835 frame.wb_candidate1 = regno;
6836 else if (frame.wb_candidate2 == INVALID_REGNUM
6837 && frame.wb_candidate1 >= V0_REGNUM)
6838 frame.wb_candidate2 = regno;
c600df9a 6839 offset += vector_save_size;
43e9d192
IB
6840 }
6841
c600df9a 6842 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
43e9d192 6843
ab43763e 6844 frame.saved_regs_size = offset;
1c960e02 6845
c600df9a 6846 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
71bfb77a 6847
c600df9a 6848 poly_int64 above_outgoing_args
6a70badb
RS
6849 = aligned_upper_bound (varargs_and_saved_regs_size
6850 + get_frame_size (),
6851 STACK_BOUNDARY / BITS_PER_UNIT);
1c960e02 6852
c600df9a
RS
6853 frame.hard_fp_offset
6854 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
6855
6a70badb
RS
6856 /* Both these values are already aligned. */
6857 gcc_assert (multiple_p (crtl->outgoing_args_size,
6858 STACK_BOUNDARY / BITS_PER_UNIT));
c600df9a 6859 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
1c960e02 6860
ab43763e 6861 frame.locals_offset = frame.saved_varargs_size;
71bfb77a 6862
ab43763e
RS
6863 frame.initial_adjust = 0;
6864 frame.final_adjust = 0;
6865 frame.callee_adjust = 0;
c600df9a 6866 frame.sve_callee_adjust = 0;
ab43763e 6867 frame.callee_offset = 0;
71bfb77a
WD
6868
6869 HOST_WIDE_INT max_push_offset = 0;
ab43763e 6870 if (frame.wb_candidate2 != INVALID_REGNUM)
71bfb77a 6871 max_push_offset = 512;
ab43763e 6872 else if (frame.wb_candidate1 != INVALID_REGNUM)
71bfb77a
WD
6873 max_push_offset = 256;
6874
9b17a646 6875 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
c600df9a 6876 HOST_WIDE_INT const_saved_regs_size;
ab43763e 6877 if (frame.frame_size.is_constant (&const_size)
6a70badb 6878 && const_size < max_push_offset
c600df9a 6879 && known_eq (frame.hard_fp_offset, const_size))
71bfb77a
WD
6880 {
6881 /* Simple, small frame with no outgoing arguments:
c600df9a 6882
71bfb77a
WD
6883 stp reg1, reg2, [sp, -frame_size]!
6884 stp reg3, reg4, [sp, 16] */
ab43763e 6885 frame.callee_adjust = const_size;
71bfb77a 6886 }
9b17a646 6887 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
c600df9a
RS
6888 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
6889 && const_outgoing_args_size + const_saved_regs_size < 512
6890 /* We could handle this case even with outgoing args, provided
6891 that the number of args left us with valid offsets for all
6892 predicate and vector save slots. It's such a rare case that
6893 it hardly seems worth the effort though. */
6894 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
71bfb77a 6895 && !(cfun->calls_alloca
9b17a646
RS
6896 && frame.hard_fp_offset.is_constant (&const_fp_offset)
6897 && const_fp_offset < max_push_offset))
71bfb77a
WD
6898 {
6899 /* Frame with small outgoing arguments:
c600df9a 6900
71bfb77a
WD
6901 sub sp, sp, frame_size
6902 stp reg1, reg2, [sp, outgoing_args_size]
6903 stp reg3, reg4, [sp, outgoing_args_size + 16] */
ab43763e 6904 frame.initial_adjust = frame.frame_size;
9b17a646 6905 frame.callee_offset = const_outgoing_args_size;
71bfb77a 6906 }
c600df9a
RS
6907 else if (saves_below_hard_fp_p
6908 && known_eq (frame.saved_regs_size,
6909 frame.below_hard_fp_saved_regs_size))
6910 {
6911 /* Frame in which all saves are SVE saves:
6912
6913 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6914 save SVE registers relative to SP
6915 sub sp, sp, outgoing_args_size */
6916 frame.initial_adjust = (frame.hard_fp_offset
6917 + frame.below_hard_fp_saved_regs_size);
6918 frame.final_adjust = crtl->outgoing_args_size;
6919 }
ab43763e 6920 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
6a70badb 6921 && const_fp_offset < max_push_offset)
71bfb77a 6922 {
c600df9a
RS
6923 /* Frame with large outgoing arguments or SVE saves, but with
6924 a small local area:
6925
71bfb77a
WD
6926 stp reg1, reg2, [sp, -hard_fp_offset]!
6927 stp reg3, reg4, [sp, 16]
c600df9a
RS
6928 [sub sp, sp, below_hard_fp_saved_regs_size]
6929 [save SVE registers relative to SP]
71bfb77a 6930 sub sp, sp, outgoing_args_size */
ab43763e 6931 frame.callee_adjust = const_fp_offset;
c600df9a 6932 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8e66b377 6933 frame.final_adjust = crtl->outgoing_args_size;
71bfb77a 6934 }
71bfb77a
WD
6935 else
6936 {
c600df9a
RS
6937 /* Frame with large local area and outgoing arguments or SVE saves,
6938 using frame pointer:
6939
71bfb77a
WD
6940 sub sp, sp, hard_fp_offset
6941 stp x29, x30, [sp, 0]
6942 add x29, sp, 0
6943 stp reg3, reg4, [sp, 16]
c600df9a
RS
6944 [sub sp, sp, below_hard_fp_saved_regs_size]
6945 [save SVE registers relative to SP]
71bfb77a 6946 sub sp, sp, outgoing_args_size */
ab43763e 6947 frame.initial_adjust = frame.hard_fp_offset;
c600df9a 6948 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8e66b377 6949 frame.final_adjust = crtl->outgoing_args_size;
71bfb77a
WD
6950 }
6951
8e66b377
RS
6952 /* Make sure the individual adjustments add up to the full frame size. */
6953 gcc_assert (known_eq (frame.initial_adjust
6954 + frame.callee_adjust
c600df9a 6955 + frame.sve_callee_adjust
8e66b377
RS
6956 + frame.final_adjust, frame.frame_size));
6957
59a3d73d
RS
6958 if (!frame.emit_frame_chain && frame.callee_adjust == 0)
6959 {
6960 /* We've decided not to associate any register saves with the initial
6961 stack allocation. */
6962 frame.wb_candidate1 = INVALID_REGNUM;
6963 frame.wb_candidate2 = INVALID_REGNUM;
6964 }
6965
ab43763e 6966 frame.laid_out = true;
43e9d192
IB
6967}
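/* Worked example (illustrative): a leaf function compiled without a frame
   chain that saves only x19 and x20, has 32 bytes of locals and no outgoing
   arguments ends up with reg_offset[x19] == 0, reg_offset[x20] == 8 and
   saved_regs_size == 16.  Rounding 16 + 32 up to a 16-byte boundary gives
   frame_size == hard_fp_offset == 48, which is within the 512-byte push
   limit, so the "simple, small frame" case above applies: callee_adjust is
   48 and the whole frame is allocated by "stp x19, x20, [sp, -48]!".  */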
6968
04ddfe06
KT
6969/* Return true if the register REGNO is saved on entry to
6970 the current function. */
6971
43e9d192
IB
6972static bool
6973aarch64_register_saved_on_entry (int regno)
6974{
c600df9a 6975 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
43e9d192
IB
6976}
6977
04ddfe06
KT
6978/* Return the next register up from REGNO up to LIMIT for the callee
6979 to save. */
6980
64dedd72
JW
6981static unsigned
6982aarch64_next_callee_save (unsigned regno, unsigned limit)
6983{
6984 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
6985 regno ++;
6986 return regno;
6987}
43e9d192 6988
04ddfe06
KT
6989/* Push the register number REGNO of mode MODE to the stack with write-back
6990 adjusting the stack by ADJUSTMENT. */
6991
c5e1f66e 6992static void
ef4bddc2 6993aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
c5e1f66e
JW
6994 HOST_WIDE_INT adjustment)
6995 {
6996 rtx base_rtx = stack_pointer_rtx;
6997 rtx insn, reg, mem;
6998
6999 reg = gen_rtx_REG (mode, regno);
7000 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
7001 plus_constant (Pmode, base_rtx, -adjustment));
30079dde 7002 mem = gen_frame_mem (mode, mem);
c5e1f66e
JW
7003
7004 insn = emit_move_insn (mem, reg);
7005 RTX_FRAME_RELATED_P (insn) = 1;
7006}
7007
04ddfe06
KT
7008/* Generate and return an instruction to store the pair of registers
7009 REG and REG2 of mode MODE to location BASE with write-back adjusting
7010 the stack location BASE by ADJUSTMENT. */
7011
80c11907 7012static rtx
ef4bddc2 7013aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
80c11907
JW
7014 HOST_WIDE_INT adjustment)
7015{
7016 switch (mode)
7017 {
4e10a5a7 7018 case E_DImode:
80c11907
JW
7019 return gen_storewb_pairdi_di (base, base, reg, reg2,
7020 GEN_INT (-adjustment),
7021 GEN_INT (UNITS_PER_WORD - adjustment));
4e10a5a7 7022 case E_DFmode:
80c11907
JW
7023 return gen_storewb_pairdf_di (base, base, reg, reg2,
7024 GEN_INT (-adjustment),
7025 GEN_INT (UNITS_PER_WORD - adjustment));
a0d0b980
SE
7026 case E_TFmode:
7027 return gen_storewb_pairtf_di (base, base, reg, reg2,
7028 GEN_INT (-adjustment),
7029 GEN_INT (UNITS_PER_VREG - adjustment));
80c11907
JW
7030 default:
7031 gcc_unreachable ();
7032 }
7033}
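/* For reference (illustrative), the DImode case above ultimately assembles
   to a single pre-indexed store pair of the form

	stp	x19, x20, [sp, -<adjustment>]!

   which performs the stack adjustment and saves both registers in one
   instruction; the extra GEN_INT offsets describe where the two registers
   land so that the RTL pattern matches this pre-indexed form.  */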
7034
04ddfe06
KT
7035/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
7036 stack pointer by ADJUSTMENT. */
7037
80c11907 7038static void
89ac681e 7039aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
80c11907 7040{
5d8a22a5 7041 rtx_insn *insn;
c600df9a 7042 machine_mode mode = aarch64_reg_save_mode (regno1);
89ac681e 7043
71bfb77a 7044 if (regno2 == INVALID_REGNUM)
89ac681e
WD
7045 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
7046
80c11907
JW
7047 rtx reg1 = gen_rtx_REG (mode, regno1);
7048 rtx reg2 = gen_rtx_REG (mode, regno2);
7049
7050 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
7051 reg2, adjustment));
7052 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
80c11907
JW
7053 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7054 RTX_FRAME_RELATED_P (insn) = 1;
7055}
7056
04ddfe06
KT
7057/* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
7058 adjusting it by ADJUSTMENT afterwards. */
7059
159313d9 7060static rtx
ef4bddc2 7061aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
159313d9
JW
7062 HOST_WIDE_INT adjustment)
7063{
7064 switch (mode)
7065 {
4e10a5a7 7066 case E_DImode:
159313d9 7067 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3e322b3f 7068 GEN_INT (UNITS_PER_WORD));
4e10a5a7 7069 case E_DFmode:
159313d9 7070 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3e322b3f 7071 GEN_INT (UNITS_PER_WORD));
a0d0b980
SE
7072 case E_TFmode:
7073 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
7074 GEN_INT (UNITS_PER_VREG));
159313d9
JW
7075 default:
7076 gcc_unreachable ();
7077 }
7078}
7079
04ddfe06
KT
7080/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
7081 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
7082 into CFI_OPS. */
7083
89ac681e
WD
7084static void
7085aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
7086 rtx *cfi_ops)
7087{
c600df9a 7088 machine_mode mode = aarch64_reg_save_mode (regno1);
89ac681e
WD
7089 rtx reg1 = gen_rtx_REG (mode, regno1);
7090
7091 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
7092
71bfb77a 7093 if (regno2 == INVALID_REGNUM)
89ac681e
WD
7094 {
7095 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
7096 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
30079dde 7097 emit_move_insn (reg1, gen_frame_mem (mode, mem));
89ac681e
WD
7098 }
7099 else
7100 {
7101 rtx reg2 = gen_rtx_REG (mode, regno2);
7102 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
7103 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
7104 reg2, adjustment));
7105 }
7106}
7107
04ddfe06
KT
7108/* Generate and return a store pair instruction of mode MODE to store
7109 register REG1 to MEM1 and register REG2 to MEM2. */
7110
72df5c1f 7111static rtx
ef4bddc2 7112aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
72df5c1f
JW
7113 rtx reg2)
7114{
7115 switch (mode)
7116 {
4e10a5a7 7117 case E_DImode:
dfe1da23 7118 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
72df5c1f 7119
4e10a5a7 7120 case E_DFmode:
dfe1da23 7121 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
72df5c1f 7122
a0d0b980
SE
7123 case E_TFmode:
7124 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
7125
7cda9e08
SD
7126 case E_V4SImode:
7127 return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
7128
54bbde55
SD
7129 case E_V16QImode:
7130 return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
7131
72df5c1f
JW
7132 default:
7133 gcc_unreachable ();
7134 }
7135}
7136
04ddfe06
KT
7137/* Generate and return a load pair instruction of mode MODE to load register
7138 REG1 from MEM1 and register REG2 from MEM2. */
7139
72df5c1f 7140static rtx
ef4bddc2 7141aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
72df5c1f
JW
7142 rtx mem2)
7143{
7144 switch (mode)
7145 {
4e10a5a7 7146 case E_DImode:
dfe1da23 7147 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
72df5c1f 7148
4e10a5a7 7149 case E_DFmode:
dfe1da23 7150 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
72df5c1f 7151
a0d0b980
SE
7152 case E_TFmode:
7153 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
7154
7cda9e08
SD
7155 case E_V4SImode:
7156 return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
7157
72df5c1f
JW
7158 default:
7159 gcc_unreachable ();
7160 }
7161}
7162
db58fd89
JW
7163/* Return TRUE if return address signing should be enabled for the current
7164 function, otherwise return FALSE. */
7165
7166bool
7167aarch64_return_address_signing_enabled (void)
7168{
7169 /* This function should only be called after frame laid out. */
7170 gcc_assert (cfun->machine->frame.laid_out);
7171
2bc95be3
SN
7172 /* Turn return address signing off in any function that uses
7173 __builtin_eh_return. The address passed to __builtin_eh_return
7174 is not signed so either it has to be signed (with original sp)
7175 or the code path that uses it has to avoid authenticating it.
7176 Currently eh return introduces a return to anywhere gadget, no
7177 matter what we do here since it uses ret with user provided
7178 address. An ideal fix for that is to use indirect branch which
7179 can be protected with BTI j (to some extent). */
7180 if (crtl->calls_eh_return)
7181 return false;
7182
db58fd89 7183 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
8fc16d72 7184 if its LR is pushed onto stack. */
db58fd89
JW
7185 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
7186 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
c600df9a 7187 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
db58fd89
JW
7188}
7189
30afdf34
SD
7190/* Return TRUE if Branch Target Identification Mechanism is enabled. */
7191bool
7192aarch64_bti_enabled (void)
7193{
7194 return (aarch64_enable_bti == 1);
7195}
7196
c600df9a
RS
7197/* The caller is going to use ST1D or LD1D to save or restore an SVE
7198 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
7199 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
7200
7201 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
7202 or LD1D address
7203
7204 (2) setting PTRUE to a valid predicate register for the ST1D or LD1D,
7205 if the variable isn't already nonnull
7206
7207 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
7208 Handle this case using a temporary base register that is suitable for
7209 all offsets in that range. Use ANCHOR_REG as this base register if it
7210 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
7211
7212static inline void
7213aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
7214 rtx &anchor_reg, poly_int64 &offset,
7215 rtx &ptrue)
7216{
7217 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
7218 {
7219 /* This is the maximum valid offset of the anchor from the base.
7220 Lower values would be valid too. */
7221 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
7222 if (!anchor_reg)
7223 {
7224 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7225 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7226 gen_int_mode (anchor_offset, Pmode)));
7227 }
7228 base_rtx = anchor_reg;
7229 offset -= anchor_offset;
7230 }
7231 if (!ptrue)
7232 {
7233 int pred_reg = cfun->machine->frame.spare_pred_reg;
7234 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
7235 CONSTM1_RTX (VNx16BImode));
7236 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
7237 }
7238}
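/* Worked example (illustrative): if OFFSET is 10 * GET_MODE_SIZE (MODE), it
   is outside the signed 4-bit range [-8, 7] * GET_MODE_SIZE (MODE) that
   ST1D/LD1D can encode, so an anchor at BASE + 16 * GET_MODE_SIZE (MODE) is
   (re)used and OFFSET becomes -6 * GET_MODE_SIZE (MODE), which is directly
   addressable from the anchor.  Any offset in [8, 16] * GET_MODE_SIZE (MODE)
   maps into [-8, 0] * GET_MODE_SIZE (MODE) the same way.  */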
7239
7240/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
7241 is saved at BASE + OFFSET. */
7242
7243static void
7244aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
7245 rtx base, poly_int64 offset)
7246{
7247 rtx mem = gen_frame_mem (GET_MODE (reg),
7248 plus_constant (Pmode, base, offset));
7249 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
7250}
7251
04ddfe06
KT
7252/* Emit code to save the callee-saved registers from register number START
7253 to LIMIT to the stack at the location starting at offset START_OFFSET,
c600df9a
RS
7254 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
7255 is true if the hard frame pointer has been set up. */
43e9d192 7256
43e9d192 7257static void
c600df9a
RS
7258aarch64_save_callee_saves (poly_int64 start_offset,
7259 unsigned start, unsigned limit, bool skip_wb,
7260 bool hard_fp_valid_p)
43e9d192 7261{
5d8a22a5 7262 rtx_insn *insn;
43e9d192
IB
7263 unsigned regno;
7264 unsigned regno2;
c600df9a 7265 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
43e9d192 7266
0ec74a1e 7267 for (regno = aarch64_next_callee_save (start, limit);
64dedd72
JW
7268 regno <= limit;
7269 regno = aarch64_next_callee_save (regno + 1, limit))
43e9d192 7270 {
ae13fce3 7271 rtx reg, mem;
6a70badb 7272 poly_int64 offset;
c600df9a 7273 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
64dedd72 7274
ae13fce3
JW
7275 if (skip_wb
7276 && (regno == cfun->machine->frame.wb_candidate1
7277 || regno == cfun->machine->frame.wb_candidate2))
7278 continue;
7279
827ab47a 7280 if (cfun->machine->reg_is_wrapped_separately[regno])
c600df9a 7281 continue;
827ab47a 7282
c600df9a 7283 machine_mode mode = aarch64_reg_save_mode (regno);
ae13fce3
JW
7284 reg = gen_rtx_REG (mode, regno);
7285 offset = start_offset + cfun->machine->frame.reg_offset[regno];
c600df9a
RS
7286 rtx base_rtx = stack_pointer_rtx;
7287 poly_int64 sp_offset = offset;
64dedd72 7288
c600df9a
RS
7289 HOST_WIDE_INT const_offset;
7290 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7291 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7292 offset, ptrue);
7293 else if (GP_REGNUM_P (regno)
7294 && (!offset.is_constant (&const_offset) || const_offset >= 512))
7295 {
7296 gcc_assert (known_eq (start_offset, 0));
7297 poly_int64 fp_offset
7298 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7299 if (hard_fp_valid_p)
7300 base_rtx = hard_frame_pointer_rtx;
7301 else
7302 {
7303 if (!anchor_reg)
7304 {
7305 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7306 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7307 gen_int_mode (fp_offset, Pmode)));
7308 }
7309 base_rtx = anchor_reg;
7310 }
7311 offset -= fp_offset;
7312 }
7313 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7314 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
64dedd72 7315
c600df9a
RS
7316 if (!aarch64_sve_mode_p (mode)
7317 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
827ab47a 7318 && !cfun->machine->reg_is_wrapped_separately[regno2]
c600df9a
RS
7319 && known_eq (GET_MODE_SIZE (mode),
7320 cfun->machine->frame.reg_offset[regno2]
7321 - cfun->machine->frame.reg_offset[regno]))
43e9d192 7322 {
0ec74a1e 7323 rtx reg2 = gen_rtx_REG (mode, regno2);
64dedd72
JW
7324 rtx mem2;
7325
c600df9a
RS
7326 offset += GET_MODE_SIZE (mode);
7327 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8ed2fc62
JW
7328 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
7329 reg2));
0b4a9743 7330
64dedd72
JW
7331 /* The first part of a frame-related parallel insn is
7332 always assumed to be relevant to the frame
7333 calculations; subsequent parts, are only
7334 frame-related if explicitly marked. */
c600df9a
RS
7335 if (aarch64_emit_cfi_for_reg_p (regno2))
7336 {
7337 if (need_cfa_note_p)
7338 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
7339 sp_offset + GET_MODE_SIZE (mode));
7340 else
7341 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7342 }
7343
64dedd72
JW
7344 regno = regno2;
7345 }
c600df9a
RS
7346 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7347 {
7348 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
7349 need_cfa_note_p = true;
7350 }
7351 else if (aarch64_sve_mode_p (mode))
7352 insn = emit_insn (gen_rtx_SET (mem, reg));
64dedd72 7353 else
8ed2fc62
JW
7354 insn = emit_move_insn (mem, reg);
7355
c600df9a
RS
7356 RTX_FRAME_RELATED_P (insn) = frame_related_p;
7357 if (frame_related_p && need_cfa_note_p)
7358 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
8ed2fc62
JW
7359 }
7360}
7361
c600df9a
RS
7362/* Emit code to restore the callee registers from register number START
7363 up to and including LIMIT. Restore from the stack offset START_OFFSET,
7364 skipping any write-back candidates if SKIP_WB is true. Write the
7365 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
04ddfe06 7366
8ed2fc62 7367static void
c600df9a 7368aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
dd991abb 7369 unsigned limit, bool skip_wb, rtx *cfi_ops)
8ed2fc62 7370{
8ed2fc62
JW
7371 unsigned regno;
7372 unsigned regno2;
6a70badb 7373 poly_int64 offset;
c600df9a 7374 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8ed2fc62
JW
7375
7376 for (regno = aarch64_next_callee_save (start, limit);
7377 regno <= limit;
7378 regno = aarch64_next_callee_save (regno + 1, limit))
7379 {
c600df9a 7380 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
827ab47a 7381 if (cfun->machine->reg_is_wrapped_separately[regno])
c600df9a 7382 continue;
827ab47a 7383
ae13fce3 7384 rtx reg, mem;
8ed2fc62 7385
ae13fce3
JW
7386 if (skip_wb
7387 && (regno == cfun->machine->frame.wb_candidate1
7388 || regno == cfun->machine->frame.wb_candidate2))
7389 continue;
7390
c600df9a 7391 machine_mode mode = aarch64_reg_save_mode (regno);
ae13fce3 7392 reg = gen_rtx_REG (mode, regno);
8ed2fc62 7393 offset = start_offset + cfun->machine->frame.reg_offset[regno];
c600df9a
RS
7394 rtx base_rtx = stack_pointer_rtx;
7395 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7396 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7397 offset, ptrue);
30079dde 7398 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8ed2fc62 7399
c600df9a
RS
7400 if (!aarch64_sve_mode_p (mode)
7401 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
827ab47a 7402 && !cfun->machine->reg_is_wrapped_separately[regno2]
c600df9a
RS
7403 && known_eq (GET_MODE_SIZE (mode),
7404 cfun->machine->frame.reg_offset[regno2]
7405 - cfun->machine->frame.reg_offset[regno]))
64dedd72 7406 {
8ed2fc62
JW
7407 rtx reg2 = gen_rtx_REG (mode, regno2);
7408 rtx mem2;
7409
c600df9a 7410 offset += GET_MODE_SIZE (mode);
30079dde 7411 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
dd991abb 7412 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
8ed2fc62 7413
dd991abb 7414 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8ed2fc62 7415 regno = regno2;
43e9d192 7416 }
c600df9a
RS
7417 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7418 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
7419 else if (aarch64_sve_mode_p (mode))
7420 emit_insn (gen_rtx_SET (reg, mem));
8ed2fc62 7421 else
dd991abb 7422 emit_move_insn (reg, mem);
c600df9a
RS
7423 if (frame_related_p)
7424 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
43e9d192 7425 }
43e9d192
IB
7426}
7427
43cacb12
RS
7428/* Return true if OFFSET is a signed 4-bit value multiplied by the size
7429 of MODE. */
7430
7431static inline bool
7432offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7433{
7434 HOST_WIDE_INT multiple;
7435 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7436 && IN_RANGE (multiple, -8, 7));
7437}
7438
7439/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
7440 of MODE. */
7441
7442static inline bool
7443offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7444{
7445 HOST_WIDE_INT multiple;
7446 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7447 && IN_RANGE (multiple, 0, 63));
7448}
7449
7450/* Return true if OFFSET is a signed 7-bit value multiplied by the size
7451 of MODE. */
7452
7453bool
7454aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7455{
7456 HOST_WIDE_INT multiple;
7457 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7458 && IN_RANGE (multiple, -64, 63));
7459}
7460
7461/* Return true if OFFSET is a signed 9-bit value. */
7462
3c5af608
MM
7463bool
7464aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
7465 poly_int64 offset)
827ab47a 7466{
6a70badb
RS
7467 HOST_WIDE_INT const_offset;
7468 return (offset.is_constant (&const_offset)
7469 && IN_RANGE (const_offset, -256, 255));
827ab47a
KT
7470}
7471
43cacb12
RS
7472/* Return true if OFFSET is a signed 9-bit value multiplied by the size
7473 of MODE. */
7474
827ab47a 7475static inline bool
43cacb12 7476offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
827ab47a 7477{
6a70badb
RS
7478 HOST_WIDE_INT multiple;
7479 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
43cacb12 7480 && IN_RANGE (multiple, -256, 255));
827ab47a
KT
7481}
7482
43cacb12
RS
7483/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
7484 of MODE. */
7485
7486static inline bool
7487offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
827ab47a 7488{
6a70badb
RS
7489 HOST_WIDE_INT multiple;
7490 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
43cacb12 7491 && IN_RANGE (multiple, 0, 4095));
827ab47a
KT
7492}
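/* To make the ranges above concrete (illustrative, for DImode with an 8-byte
   access size): the signed 7-bit scaled form covers byte offsets -512 to 504
   in steps of 8, the signed 9-bit unscaled form covers -256 to 255 in steps
   of 1, and the unsigned 12-bit scaled form covers 0 to 32760 in steps
   of 8.  */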
7493
7494/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
7495
7496static sbitmap
7497aarch64_get_separate_components (void)
7498{
827ab47a
KT
7499 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7500 bitmap_clear (components);
7501
7502 /* The registers we need saved to the frame. */
7503 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7504 if (aarch64_register_saved_on_entry (regno))
7505 {
c600df9a
RS
7506 /* Punt on saves and restores that use ST1D and LD1D. We could
7507 try to be smarter, but it would involve making sure that the
7508 spare predicate register itself is safe to use at the save
7509 and restore points. Also, when a frame pointer is being used,
7510 the slots are often out of reach of ST1D and LD1D anyway. */
7511 machine_mode mode = aarch64_reg_save_mode (regno);
7512 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7513 continue;
7514
6a70badb 7515 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
c600df9a
RS
7516
7517 /* If the register is saved in the first SVE save slot, we use
7518 it as a stack probe for -fstack-clash-protection. */
7519 if (flag_stack_clash_protection
7520 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
7521 && known_eq (offset, 0))
7522 continue;
7523
7524 /* Get the offset relative to the register we'll use. */
7525 if (frame_pointer_needed)
7526 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7527 else
7528 offset += crtl->outgoing_args_size;
7529
827ab47a
KT
7530 /* Check that we can access the stack slot of the register with one
7531 direct load with no adjustments needed. */
c600df9a
RS
7532 if (aarch64_sve_mode_p (mode)
7533 ? offset_9bit_signed_scaled_p (mode, offset)
7534 : offset_12bit_unsigned_scaled_p (mode, offset))
827ab47a
KT
7535 bitmap_set_bit (components, regno);
7536 }
7537
7538 /* Don't mess with the hard frame pointer. */
7539 if (frame_pointer_needed)
7540 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
7541
c600df9a
RS
7542 /* If the spare predicate register used by big-endian SVE code
7543 is call-preserved, it must be saved in the main prologue
7544 before any saves that use it. */
7545 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
7546 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
7547
827ab47a
KT
7548 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7549 unsigned reg2 = cfun->machine->frame.wb_candidate2;
0795f659 7550 /* If registers have been chosen to be stored/restored with
827ab47a
KT
7551 writeback, don't interfere with them to avoid having to output explicit
7552 stack adjustment instructions. */
7553 if (reg2 != INVALID_REGNUM)
7554 bitmap_clear_bit (components, reg2);
7555 if (reg1 != INVALID_REGNUM)
7556 bitmap_clear_bit (components, reg1);
7557
7558 bitmap_clear_bit (components, LR_REGNUM);
7559 bitmap_clear_bit (components, SP_REGNUM);
7560
7561 return components;
7562}
7563
7564/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
7565
7566static sbitmap
7567aarch64_components_for_bb (basic_block bb)
7568{
7569 bitmap in = DF_LIVE_IN (bb);
7570 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
7571 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
7572
7573 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7574 bitmap_clear (components);
7575
ce9d2a37
RS
7576 /* Clobbered registers don't generate values in any meaningful sense,
7577 since nothing after the clobber can rely on their value. And we can't
7578 say that partially-clobbered registers are unconditionally killed,
7579 because whether they're killed or not depends on the mode of the
7580 value they're holding. Thus partially call-clobbered registers
7581 appear in neither the kill set nor the gen set.
7582
7583 Check manually for any calls that clobber more of a register than the
7584 current function can. */
7585 function_abi_aggregator callee_abis;
7586 rtx_insn *insn;
7587 FOR_BB_INSNS (bb, insn)
7588 if (CALL_P (insn))
7589 callee_abis.note_callee_abi (insn_callee_abi (insn));
7590 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
7591
827ab47a
KT
7592 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
7593 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
dcdd0f05
RS
7594 if (!fixed_regs[regno]
7595 && !crtl->abi->clobbers_full_reg_p (regno)
ce9d2a37
RS
7596 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
7597 || bitmap_bit_p (in, regno)
7598 || bitmap_bit_p (gen, regno)
7599 || bitmap_bit_p (kill, regno)))
3f26f054 7600 {
3f26f054
WD
7601 bitmap_set_bit (components, regno);
7602
7603 /* If there is a callee-save at an adjacent offset, add it too
7604 to increase the use of LDP/STP. */
c600df9a
RS
7605 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7606 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
3f26f054
WD
7607
7608 if (regno2 <= LAST_SAVED_REGNUM)
7609 {
c600df9a
RS
7610 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7611 if (regno < regno2
7612 ? known_eq (offset + 8, offset2)
7613 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
3f26f054
WD
7614 bitmap_set_bit (components, regno2);
7615 }
7616 }
827ab47a
KT
7617
7618 return components;
7619}
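/* For instance (illustrative), if a block uses x22 and its save slot is at
   offset 16, the partner slot considered is x23 at offset 24; since the two
   offsets differ by 8 and the lower one is 16-byte aligned, x23 is added to
   the component set as well so the pair can be handled with a single
   STP/LDP.  */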
7620
7621/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
7622 Nothing to do for aarch64. */
7623
7624static void
7625aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
7626{
7627}
7628
7629/* Return the next set bit in BMP from START onwards. Return the total number
7630 of bits in BMP if no set bit is found at or after START. */
7631
7632static unsigned int
7633aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
7634{
7635 unsigned int nbits = SBITMAP_SIZE (bmp);
7636 if (start == nbits)
7637 return start;
7638
7639 gcc_assert (start < nbits);
7640 for (unsigned int i = start; i < nbits; i++)
7641 if (bitmap_bit_p (bmp, i))
7642 return i;
7643
7644 return nbits;
7645}
7646
7647/* Do the work for aarch64_emit_prologue_components and
7648 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
7649 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
7650 for these components or the epilogue sequence. That is, it determines
7651 whether we should emit stores or loads and what kind of CFA notes to attach
7652 to the insns. Otherwise the logic for the two sequences is very
7653 similar. */
7654
7655static void
7656aarch64_process_components (sbitmap components, bool prologue_p)
7657{
7658 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
7659 ? HARD_FRAME_POINTER_REGNUM
7660 : STACK_POINTER_REGNUM);
7661
7662 unsigned last_regno = SBITMAP_SIZE (components);
7663 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
7664 rtx_insn *insn = NULL;
7665
7666 while (regno != last_regno)
7667 {
c600df9a
RS
7668 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7669 machine_mode mode = aarch64_reg_save_mode (regno);
a0d0b980 7670
827ab47a 7671 rtx reg = gen_rtx_REG (mode, regno);
6a70badb 7672 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
c600df9a
RS
7673 if (frame_pointer_needed)
7674 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7675 else
7676 offset += crtl->outgoing_args_size;
7677
827ab47a
KT
7678 rtx addr = plus_constant (Pmode, ptr_reg, offset);
7679 rtx mem = gen_frame_mem (mode, addr);
7680
7681 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
7682 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
7683 /* No more registers to handle after REGNO.
7684 Emit a single save/restore and exit. */
7685 if (regno2 == last_regno)
7686 {
7687 insn = emit_insn (set);
c600df9a
RS
7688 if (frame_related_p)
7689 {
7690 RTX_FRAME_RELATED_P (insn) = 1;
7691 if (prologue_p)
7692 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7693 else
7694 add_reg_note (insn, REG_CFA_RESTORE, reg);
7695 }
827ab47a
KT
7696 break;
7697 }
7698
6a70badb 7699 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
827ab47a
KT
7700 /* The next register is not of the same class or its offset is not
7701 mergeable with the current one into a pair. */
c600df9a
RS
7702 if (aarch64_sve_mode_p (mode)
7703 || !satisfies_constraint_Ump (mem)
827ab47a 7704 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
dcdd0f05 7705 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
6a70badb
RS
7706 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
7707 GET_MODE_SIZE (mode)))
827ab47a
KT
7708 {
7709 insn = emit_insn (set);
c600df9a
RS
7710 if (frame_related_p)
7711 {
7712 RTX_FRAME_RELATED_P (insn) = 1;
7713 if (prologue_p)
7714 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7715 else
7716 add_reg_note (insn, REG_CFA_RESTORE, reg);
7717 }
827ab47a
KT
7718
7719 regno = regno2;
7720 continue;
7721 }
7722
c600df9a
RS
7723 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
7724
827ab47a
KT
7725 /* REGNO2 can be saved/restored in a pair with REGNO. */
7726 rtx reg2 = gen_rtx_REG (mode, regno2);
c600df9a
RS
7727 if (frame_pointer_needed)
7728 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7729 else
7730 offset2 += crtl->outgoing_args_size;
827ab47a
KT
7731 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
7732 rtx mem2 = gen_frame_mem (mode, addr2);
7733 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
7734 : gen_rtx_SET (reg2, mem2);
7735
7736 if (prologue_p)
7737 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
7738 else
7739 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7740
c600df9a 7741 if (frame_related_p || frame_related2_p)
827ab47a 7742 {
c600df9a
RS
7743 RTX_FRAME_RELATED_P (insn) = 1;
7744 if (prologue_p)
7745 {
7746 if (frame_related_p)
7747 add_reg_note (insn, REG_CFA_OFFSET, set);
7748 if (frame_related2_p)
7749 add_reg_note (insn, REG_CFA_OFFSET, set2);
7750 }
7751 else
7752 {
7753 if (frame_related_p)
7754 add_reg_note (insn, REG_CFA_RESTORE, reg);
7755 if (frame_related2_p)
7756 add_reg_note (insn, REG_CFA_RESTORE, reg2);
7757 }
827ab47a
KT
7758 }
7759
7760 regno = aarch64_get_next_set_bit (components, regno2 + 1);
7761 }
7762}
7763
7764/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
7765
7766static void
7767aarch64_emit_prologue_components (sbitmap components)
7768{
7769 aarch64_process_components (components, true);
7770}
7771
7772/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
7773
7774static void
7775aarch64_emit_epilogue_components (sbitmap components)
7776{
7777 aarch64_process_components (components, false);
7778}
7779
7780/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
7781
7782static void
7783aarch64_set_handled_components (sbitmap components)
7784{
7785 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7786 if (bitmap_bit_p (components, regno))
7787 cfun->machine->reg_is_wrapped_separately[regno] = true;
7788}
7789
8c6e3b23
TC
7790/* On AArch64 we have an ABI defined safe buffer. This constant is used to
7791 determine the probe offset for alloca. */
7792
7793static HOST_WIDE_INT
7794aarch64_stack_clash_protection_alloca_probe_range (void)
7795{
7796 return STACK_CLASH_CALLER_GUARD;
7797}
7798
7799
cd1bef27
JL
7800/* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
7801 registers. If POLY_SIZE is not large enough to require a probe this function
7802 will only adjust the stack. When allocating the stack space
7803 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
7804 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
7805 arguments. If we are then we ensure that any allocation larger than the ABI
7806 defined buffer needs a probe so that the invariant of having a 1KB buffer is
7807 maintained.
7808
7809 We emit barriers after each stack adjustment to prevent optimizations from
7810 breaking the invariant that we never drop the stack more than a page. This
7811 invariant is needed to make it easier to correctly handle asynchronous
7812 events, e.g. if we were to allow the stack to be dropped by more than a page
7813 and then have multiple probes up and we take a signal somewhere in between
7814 then the signal handler doesn't know the state of the stack and can make no
7815 assumptions about which pages have been probed. */
7816
7817static void
7818aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
7819 poly_int64 poly_size,
7820 bool frame_related_p,
7821 bool final_adjustment_p)
7822{
7823 HOST_WIDE_INT guard_size
028d4092 7824 = 1 << param_stack_clash_protection_guard_size;
cd1bef27 7825 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
cd1bef27 7826 HOST_WIDE_INT min_probe_threshold
c600df9a
RS
7827 = (final_adjustment_p
7828 ? guard_used_by_caller
7829 : guard_size - guard_used_by_caller);
7830 /* When doing the final adjustment for the outgoing arguments, take into
7831 account any unprobed space there is above the current SP. There are
7832 two cases:
7833
7834 - When saving SVE registers below the hard frame pointer, we force
7835 the lowest save to take place in the prologue before doing the final
7836 adjustment (i.e. we don't allow the save to be shrink-wrapped).
7837 This acts as a probe at SP, so there is no unprobed space.
7838
7839 - When there are no SVE register saves, we use the store of the link
7840 register as a probe. We can't assume that LR was saved at position 0
7841 though, so treat any space below it as unprobed. */
7842 if (final_adjustment_p
7843 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
7844 {
7845 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
7846 if (known_ge (lr_offset, 0))
7847 min_probe_threshold -= lr_offset.to_constant ();
7848 else
7849 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
7850 }
cd1bef27
JL
7851
7852 poly_int64 frame_size = cfun->machine->frame.frame_size;
7853
7854 /* We should always have a positive probe threshold. */
7855 gcc_assert (min_probe_threshold > 0);
7856
7857 if (flag_stack_clash_protection && !final_adjustment_p)
7858 {
7859 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
c600df9a 7860 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
cd1bef27
JL
7861 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7862
7863 if (known_eq (frame_size, 0))
7864 {
7865 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
7866 }
c600df9a
RS
7867 else if (known_lt (initial_adjust + sve_callee_adjust,
7868 guard_size - guard_used_by_caller)
cd1bef27
JL
7869 && known_lt (final_adjust, guard_used_by_caller))
7870 {
7871 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
7872 }
7873 }
7874
cd1bef27
JL
7875 /* If SIZE is not large enough to require probing, just adjust the stack and
7876 exit. */
eb471ba3 7877 if (known_lt (poly_size, min_probe_threshold)
cd1bef27
JL
7878 || !flag_stack_clash_protection)
7879 {
7880 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
7881 return;
7882 }
7883
eb471ba3
TC
7884 HOST_WIDE_INT size;
7885 /* Handle the SVE non-constant case first. */
7886 if (!poly_size.is_constant (&size))
7887 {
7888 if (dump_file)
7889 {
7890 fprintf (dump_file, "Stack clash SVE prologue: ");
7891 print_dec (poly_size, dump_file);
7892 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
7893 }
7894
7895 /* First calculate the amount of bytes we're actually spilling. */
7896 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
7897 poly_size, temp1, temp2, false, true);
7898
7899 rtx_insn *insn = get_last_insn ();
7900
7901 if (frame_related_p)
7902 {
7903 /* This is done to provide unwinding information for the stack
7904 adjustments we're about to do; however, to prevent the optimizers
143d3b15 7905 from removing the R11 move and leaving the CFA note (which would be
eb471ba3
TC
7906 very wrong) we tie the old and new stack pointer together.
7907 The tie will expand to nothing but the optimizers will not touch
7908 the instruction. */
143d3b15 7909 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
eb471ba3
TC
7910 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
7911 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
7912
7913 /* We want the CFA independent of the stack pointer for the
7914 duration of the loop. */
7915 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
7916 RTX_FRAME_RELATED_P (insn) = 1;
7917 }
7918
7919 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
7920 rtx guard_const = gen_int_mode (guard_size, Pmode);
7921
7922 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
7923 stack_pointer_rtx, temp1,
7924 probe_const, guard_const));
7925
7926 /* Now reset the CFA register if needed. */
7927 if (frame_related_p)
7928 {
7929 add_reg_note (insn, REG_CFA_DEF_CFA,
7930 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
7931 gen_int_mode (poly_size, Pmode)));
7932 RTX_FRAME_RELATED_P (insn) = 1;
7933 }
7934
7935 return;
7936 }
7937
7938 if (dump_file)
7939 fprintf (dump_file,
7940 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7941 " bytes, probing will be required.\n", size);
7942
7943 /* Round size down to a multiple of guard_size, and calculate the
7944 residual as the difference between the original size and the rounded
7945 size. */
7946 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
7947 HOST_WIDE_INT residual = size - rounded_size;
7948
7949 /* We can handle a small number of allocations/probes inline. Otherwise
7950 punt to a loop. */
7951 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
7952 {
7953 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
7954 {
7955 aarch64_sub_sp (NULL, temp2, guard_size, true);
7956 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7957 guard_used_by_caller));
7958 emit_insn (gen_blockage ());
7959 }
7960 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
7961 }
7962 else
7963 {
7964 /* Compute the ending address. */
7965 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
7966 temp1, NULL, false, true);
7967 rtx_insn *insn = get_last_insn ();
7968
7969 /* For the initial allocation, we don't have a frame pointer
7970 set up, so we always need CFI notes. If we're doing the
7971 final allocation, then we may have a frame pointer, in which
7972 case it is the CFA, otherwise we need CFI notes.
7973
7974 We can determine which allocation we are doing by looking at
7975 the value of FRAME_RELATED_P since the final allocations are not
7976 frame related. */
7977 if (frame_related_p)
7978 {
7979 /* We want the CFA independent of the stack pointer for the
7980 duration of the loop. */
7981 add_reg_note (insn, REG_CFA_DEF_CFA,
7982 plus_constant (Pmode, temp1, rounded_size));
7983 RTX_FRAME_RELATED_P (insn) = 1;
7984 }
7985
7986 /* This allocates and probes the stack. Note that this re-uses some of
7987 the existing Ada stack protection code. However we are guaranteed not
7988 to enter the non-loop or residual branches of that code.
7989
7990 The non-loop part won't be entered because if our allocation amount
7991 doesn't require a loop, the case above would handle it.
7992
7993 The residual part won't be entered because TEMP1 is a multiple of
7994 the allocation size, so the residual will always be 0. As such, the only
7995 part we are actually using from that code is the loop setup. The
7996 actual probing is done in aarch64_output_probe_stack_range. */
7997 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
7998 stack_pointer_rtx, temp1));
7999
8000 /* Now reset the CFA register if needed. */
8001 if (frame_related_p)
8002 {
8003 add_reg_note (insn, REG_CFA_DEF_CFA,
8004 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
8005 RTX_FRAME_RELATED_P (insn) = 1;
8006 }
8007
8008 emit_insn (gen_blockage ());
8009 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
8010 }
8011
8012 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
8013 be probed. This maintains the requirement that each page is probed at
8014 least once. For the initial probing we probe only if the allocation is
8015 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
8016 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
8017 GUARD_SIZE. This ensures that any allocation that is large enough to
8018 trigger a probe here gets at least one, and if an allocation is not large
8019 enough for this code to emit anything for it, the page would have been
8020 probed by the saving of FP/LR, either by this function or any callees. If
8021 we don't have any callees then we won't have more stack adjustments and so
8022 are still safe. */
8023 if (residual)
8024 {
8025 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
8026 /* If we're doing final adjustments, and we've done any full page
8027 allocations then any residual needs to be probed. */
8028 if (final_adjustment_p && rounded_size != 0)
8029 min_probe_threshold = 0;
8030 /* If doing a small final adjustment, we always probe at offset 0.
8031 This is done to avoid issues when LR is not at position 0 or when
8032 the final adjustment is smaller than the probing offset. */
8033 else if (final_adjustment_p && rounded_size == 0)
8034 residual_probe_offset = 0;
8035
8036 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
8037 if (residual >= min_probe_threshold)
8038 {
8039 if (dump_file)
8040 fprintf (dump_file,
8041 "Stack clash AArch64 prologue residuals: "
8042 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
8043 "\n", residual);
8044
8045 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
8046 residual_probe_offset));
8047 emit_insn (gen_blockage ());
8048 }
8049 }
8050}
8051
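/* As an illustration of the residual handling above (assuming the default
   64KB guard size and the 1KB buffer reserved for the caller's outgoing
   arguments; the numbers are purely illustrative): for an initial
   allocation of 150KB,

     rounded_size = ROUND_DOWN (150KB, 64KB) = 128KB
     residual     = 150KB - 128KB            = 22KB

   The 128KB part is allocated and probed 64KB at a time.  The 22KB
   residual is allocated but not explicitly probed here, because it is
   below the initial-probing threshold of GUARD_SIZE - buffer (63KB);
   the store of FP/LR into that page acts as the probe instead, as
   described in the comment above.  */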
8052/* Return 1 if the register is used by the epilogue. We need to say the
8053 return register is used, but only after epilogue generation is complete.
8054 Note that in the case of sibcalls, the values "used by the epilogue" are
8055 considered live at the start of the called function.
8056
8057 For SIMD functions we need to return 1 for FP registers that are saved and
8058 restored by a function but are not zero in call_used_regs. If we do not do
8059 this, optimizations may remove the restore of the register. */
8060
8061int
8062aarch64_epilogue_uses (int regno)
8063{
8064 if (epilogue_completed)
8065 {
8066 if (regno == LR_REGNUM)
8067 return 1;
8068 }
8069 return 0;
8070}
8071
8072/* AArch64 stack frames generated by this compiler look like:
8073
8074 +-------------------------------+
8075 |                               |
8076 |  incoming stack arguments     |
8077 |                               |
8078 +-------------------------------+
8079 |                               | <-- incoming stack pointer (aligned)
8080 |  callee-allocated save area   |
8081 |  for register varargs         |
8082 |                               |
8083 +-------------------------------+
8084 |  local variables              | <-- frame_pointer_rtx
8085 |                               |
8086 +-------------------------------+
8087 |  padding                      | \
8088 +-------------------------------+  |
8089 |  callee-saved registers       |  | frame.saved_regs_size
8090 +-------------------------------+  |
8091 |  LR'                          |  |
8092 +-------------------------------+  |
8093 |  FP'                          |  |
8094 +-------------------------------+  |<- hard_frame_pointer_rtx (aligned)
8095 |  SVE vector registers         |  | \
8096 +-------------------------------+  | | below_hard_fp_saved_regs_size
8097 |  SVE predicate registers      | / /
8098 +-------------------------------+
8099 |  dynamic allocation           |
8100 +-------------------------------+
8101 |  padding                      |
8102 +-------------------------------+
8103 |  outgoing stack arguments     | <-- arg_pointer
8104 |                               |
8105 +-------------------------------+
8106 |                               | <-- stack_pointer_rtx (aligned)
8107
8108 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
8109 but leave frame_pointer_rtx and hard_frame_pointer_rtx
8110 unchanged.
8111
8112 By default for stack-clash we assume the guard is at least 64KB, but this
8113 value is configurable to either 4KB or 64KB. We also force the guard size to
8114 be the same as the probing interval, and the two values are kept in sync.
8115
8116 With those assumptions the callee can allocate up to 63KB (or 3KB depending
8117 on the guard size) of stack space without probing.
8118
8119 When probing is needed, we emit a probe at the start of the prologue
8120 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
8121
8122 We have to track how much space has been allocated; the only stores
8123 to the stack that we track as implicit probes are the FP/LR stores.
8124
8125 For outgoing arguments we probe if the size is larger than 1KB, such that
8126 the ABI-specified buffer is maintained for the next callee.
8127
8128 The following registers are reserved during frame layout and should not be
8129 used for any other purpose:
8130
8131 - r11: Used by stack clash protection when SVE is enabled, and also
8132 as an anchor register when saving and restoring registers.
8133 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
8134 - r14 and r15: Used for speculation tracking.
8135 - r16(IP0), r17(IP1): Used by indirect tailcalls.
8136 - r30(LR), r29(FP): Used by standard frame layout.
8137
8138 These registers must be avoided in frame-layout-related code unless the
8139 explicit intention is to interact with one of the features listed above. */
8140
8141/* Generate the prologue instructions for entry into a function.
8142 Establish the stack frame by decreasing the stack pointer with a
8143 properly calculated size and, if necessary, create a frame record
8144 filled with the values of LR and previous frame pointer. The
6991c977 8145 current FP is also set up if it is in use. */
43e9d192
IB
8146
8147void
8148aarch64_expand_prologue (void)
8149{
6a70badb
RS
8150 poly_int64 frame_size = cfun->machine->frame.frame_size;
8151 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
71bfb77a 8152 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6a70badb
RS
8153 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8154 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
c600df9a
RS
8155 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8156 poly_int64 below_hard_fp_saved_regs_size
8157 = cfun->machine->frame.below_hard_fp_saved_regs_size;
71bfb77a
WD
8158 unsigned reg1 = cfun->machine->frame.wb_candidate1;
8159 unsigned reg2 = cfun->machine->frame.wb_candidate2;
204d2c03 8160 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
71bfb77a 8161 rtx_insn *insn;
43e9d192 8162
c600df9a
RS
8163 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
8164 {
8165 /* Fold the SVE allocation into the initial allocation.
8166 We don't do this in aarch64_layout_arg to avoid pessimizing
8167 the epilogue code. */
8168 initial_adjust += sve_callee_adjust;
8169 sve_callee_adjust = 0;
8170 }
8171
db58fd89
JW
8172 /* Sign return address for functions. */
8173 if (aarch64_return_address_signing_enabled ())
27169e45 8174 {
8fc16d72
ST
8175 switch (aarch64_ra_sign_key)
8176 {
8177 case AARCH64_KEY_A:
8178 insn = emit_insn (gen_paciasp ());
8179 break;
8180 case AARCH64_KEY_B:
8181 insn = emit_insn (gen_pacibsp ());
8182 break;
8183 default:
8184 gcc_unreachable ();
8185 }
27169e45
JW
8186 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8187 RTX_FRAME_RELATED_P (insn) = 1;
8188 }
db58fd89 8189
dd991abb 8190 if (flag_stack_usage_info)
6a70badb 8191 current_function_static_stack_size = constant_lower_bound (frame_size);
43e9d192 8192
a3eb8a52
EB
8193 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
8194 {
8195 if (crtl->is_leaf && !cfun->calls_alloca)
8196 {
6a70badb
RS
8197 if (maybe_gt (frame_size, PROBE_INTERVAL)
8198 && maybe_gt (frame_size, get_stack_check_protect ()))
8c1dd970
JL
8199 aarch64_emit_probe_stack_range (get_stack_check_protect (),
8200 (frame_size
8201 - get_stack_check_protect ()));
a3eb8a52 8202 }
6a70badb 8203 else if (maybe_gt (frame_size, 0))
8c1dd970 8204 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
a3eb8a52
EB
8205 }
8206
901e66e0
SD
8207 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
8208 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
f5470a77 8209
cd1bef27
JL
8210 /* In theory we should never have both an initial adjustment
8211 and a callee-save adjustment. Verify that this is the case, since the
8212 code below does not handle it for -fstack-clash-protection. */
8213 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
8214
8215 /* Will only probe if the initial adjustment is larger than the guard
8216 less the amount of the guard reserved for use by the caller's
8217 outgoing args. */
901e66e0 8218 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
cd1bef27 8219 true, false);
43e9d192 8220
71bfb77a
WD
8221 if (callee_adjust != 0)
8222 aarch64_push_regs (reg1, reg2, callee_adjust);
43e9d192 8223
c600df9a
RS
8224 /* The offset of the frame chain record (if any) from the current SP. */
8225 poly_int64 chain_offset = (initial_adjust + callee_adjust
8226 - cfun->machine->frame.hard_fp_offset);
8227 gcc_assert (known_ge (chain_offset, 0));
8228
8229 /* The offset of the bottom of the save area from the current SP. */
8230 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
8231
204d2c03 8232 if (emit_frame_chain)
43e9d192 8233 {
71bfb77a 8234 if (callee_adjust == 0)
43cacb12
RS
8235 {
8236 reg1 = R29_REGNUM;
8237 reg2 = R30_REGNUM;
c600df9a
RS
8238 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
8239 false, false);
43cacb12 8240 }
c600df9a
RS
8241 else
8242 gcc_assert (known_eq (chain_offset, 0));
f5470a77 8243 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
c600df9a 8244 stack_pointer_rtx, chain_offset,
901e66e0 8245 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
43cacb12
RS
8246 if (frame_pointer_needed && !frame_size.is_constant ())
8247 {
8248 /* Variable-sized frames need to describe the save slot
8249 address using DW_CFA_expression rather than DW_CFA_offset.
8250 This means that, without taking further action, the
8251 locations of the registers that we've already saved would
8252 remain based on the stack pointer even after we redefine
8253 the CFA based on the frame pointer. We therefore need new
8254 DW_CFA_expressions to re-express the save slots with addresses
8255 based on the frame pointer. */
8256 rtx_insn *insn = get_last_insn ();
8257 gcc_assert (RTX_FRAME_RELATED_P (insn));
8258
8259 /* Add an explicit CFA definition if this was previously
8260 implicit. */
8261 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
8262 {
8263 rtx src = plus_constant (Pmode, stack_pointer_rtx,
8264 callee_offset);
8265 add_reg_note (insn, REG_CFA_ADJUST_CFA,
8266 gen_rtx_SET (hard_frame_pointer_rtx, src));
8267 }
8268
8269 /* Change the save slot expressions for the registers that
8270 we've already saved. */
c600df9a
RS
8271 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
8272 hard_frame_pointer_rtx, UNITS_PER_WORD);
8273 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
8274 hard_frame_pointer_rtx, 0);
43cacb12 8275 }
71bfb77a 8276 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
43e9d192 8277 }
71bfb77a 8278
c600df9a
RS
8279 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
8280 callee_adjust != 0 || emit_frame_chain,
8281 emit_frame_chain);
8282 if (maybe_ne (sve_callee_adjust, 0))
8283 {
8284 gcc_assert (!flag_stack_clash_protection
8285 || known_eq (initial_adjust, 0));
8286 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
8287 sve_callee_adjust,
8288 !frame_pointer_needed, false);
8289 saved_regs_offset += sve_callee_adjust;
8290 }
8291 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
8292 false, emit_frame_chain);
8293 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
8294 callee_adjust != 0 || emit_frame_chain,
8295 emit_frame_chain);
cd1bef27
JL
8296
8297 /* We may need to probe the final adjustment if it is larger than the guard
8298 that is assumed by the callee. */
901e66e0 8299 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
cd1bef27 8300 !frame_pointer_needed, true);
43e9d192
IB
8301}
8302
4f942779
RL
8303/* Return TRUE if we can use a simple_return insn.
8304
8305 This function checks whether the callee-saved stack is empty, which
8306 means no restore actions are needed. The pro_and_epilogue pass will use
8307 this to check whether the shrink-wrapping optimization is feasible. */
8308
8309bool
8310aarch64_use_return_insn_p (void)
8311{
8312 if (!reload_completed)
8313 return false;
8314
8315 if (crtl->profile)
8316 return false;
8317
6a70badb 8318 return known_eq (cfun->machine->frame.frame_size, 0);
4f942779
RL
8319}
8320
71bfb77a
WD
8321/* Generate the epilogue instructions for returning from a function.
8322 This is almost exactly the reverse of the prologue sequence, except
8323 that we need to insert barriers to avoid scheduling loads that read
8324 from a deallocated stack, and we optimize the unwind records by
8325 emitting them all together if possible. */
43e9d192
IB
8326void
8327aarch64_expand_epilogue (bool for_sibcall)
8328{
6a70badb 8329 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
71bfb77a 8330 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6a70badb
RS
8331 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8332 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
c600df9a
RS
8333 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8334 poly_int64 below_hard_fp_saved_regs_size
8335 = cfun->machine->frame.below_hard_fp_saved_regs_size;
71bfb77a
WD
8336 unsigned reg1 = cfun->machine->frame.wb_candidate1;
8337 unsigned reg2 = cfun->machine->frame.wb_candidate2;
8338 rtx cfi_ops = NULL;
8339 rtx_insn *insn;
901e66e0
SD
8340 /* A stack clash protection prologue may not have left EP0_REGNUM or
8341 EP1_REGNUM in a usable state. The same is true for allocations
43cacb12 8342 with an SVE component, since we then need both temporary registers
cd1bef27
JL
8343 for each allocation. For stack clash we are in a usable state if
8344 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
8345 HOST_WIDE_INT guard_size
028d4092 8346 = 1 << param_stack_clash_protection_guard_size;
cd1bef27
JL
8347 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
8348
c600df9a
RS
8349 /* We can re-use the registers when:
8350
8351 (a) the deallocation amount is the same as the corresponding
8352 allocation amount (which is false if we combine the initial
8353 and SVE callee save allocations in the prologue); and
8354
8355 (b) the allocation amount doesn't need a probe (which is false
8356 if the amount is guard_size - guard_used_by_caller or greater).
8357
8358 In such situations the register should remain live with the correct
cd1bef27 8359 value. */
43cacb12 8360 bool can_inherit_p = (initial_adjust.is_constant ()
c600df9a 8361 && final_adjust.is_constant ()
cd1bef27 8362 && (!flag_stack_clash_protection
c600df9a
RS
8363 || (known_lt (initial_adjust,
8364 guard_size - guard_used_by_caller)
8365 && known_eq (sve_callee_adjust, 0))));
44c0e7b9 8366
71bfb77a 8367 /* We need to add memory barrier to prevent read from deallocated stack. */
6a70badb
RS
8368 bool need_barrier_p
8369 = maybe_ne (get_frame_size ()
8370 + cfun->machine->frame.saved_varargs_size, 0);
43e9d192 8371
71bfb77a 8372 /* Emit a barrier to prevent loads from a deallocated stack. */
6a70badb
RS
8373 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
8374 || cfun->calls_alloca
8144a493 8375 || crtl->calls_eh_return)
43e9d192 8376 {
71bfb77a
WD
8377 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8378 need_barrier_p = false;
8379 }
7e8c2bd5 8380
71bfb77a
WD
8381 /* Restore the stack pointer from the frame pointer if it may not
8382 be the same as the stack pointer. */
901e66e0
SD
8383 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
8384 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6a70badb
RS
8385 if (frame_pointer_needed
8386 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
f5470a77
RS
8387 /* If writeback is used when restoring callee-saves, the CFA
8388 is restored on the instruction doing the writeback. */
8389 aarch64_add_offset (Pmode, stack_pointer_rtx,
c600df9a
RS
8390 hard_frame_pointer_rtx,
8391 -callee_offset - below_hard_fp_saved_regs_size,
901e66e0 8392 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
71bfb77a 8393 else
cd1bef27
JL
8394 /* The case where we need to re-use the register here is very rare, so
8395 avoid the complicated condition and just always emit a move if the
8396 immediate doesn't fit. */
901e66e0 8397 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
43e9d192 8398
c600df9a
RS
8399 /* Restore the vector registers before the predicate registers,
8400 so that we can use P4 as a temporary for big-endian SVE frames. */
8401 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
8402 callee_adjust != 0, &cfi_ops);
8403 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
8404 false, &cfi_ops);
8405 if (maybe_ne (sve_callee_adjust, 0))
8406 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
8407 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
8408 R0_REGNUM, R30_REGNUM,
71bfb77a 8409 callee_adjust != 0, &cfi_ops);
43e9d192 8410
71bfb77a
WD
8411 if (need_barrier_p)
8412 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8413
8414 if (callee_adjust != 0)
8415 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
8416
1ccbfffb
RS
8417 /* If we have no register restore information, the CFA must have been
8418 defined in terms of the stack pointer since the end of the prologue. */
8419 gcc_assert (cfi_ops || !frame_pointer_needed);
8420
8421 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
71bfb77a
WD
8422 {
8423 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
89ac681e 8424 insn = get_last_insn ();
71bfb77a
WD
8425 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
8426 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
43e9d192 8427 RTX_FRAME_RELATED_P (insn) = 1;
71bfb77a 8428 cfi_ops = NULL;
43e9d192
IB
8429 }
8430
901e66e0
SD
8431 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
8432 restrict the emit_move optimization to leaf functions. */
8433 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
8434 (!can_inherit_p || !crtl->is_leaf
8435 || df_regs_ever_live_p (EP0_REGNUM)));
7e8c2bd5 8436
71bfb77a
WD
8437 if (cfi_ops)
8438 {
8439 /* Emit delayed restores and reset the CFA to be SP. */
8440 insn = get_last_insn ();
8441 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
8442 REG_NOTES (insn) = cfi_ops;
8443 RTX_FRAME_RELATED_P (insn) = 1;
dd991abb
RH
8444 }
8445
db58fd89
JW
8446 /* We prefer to emit the combined return/authenticate instruction RETAA,
8447 however there are three cases in which we must instead emit an explicit
8448 authentication instruction.
8449
8450 1) Sibcalls don't return in a normal way, so if we're about to call one
8451 we must authenticate.
8452
8453 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
8454 generating code for !TARGET_ARMV8_3 we can't use it and must
8455 explicitly authenticate.
8456
8457 3) On an eh_return path we make extra stack adjustments to update the
8458 canonical frame address to be the exception handler's CFA. We want
8459 to authenticate using the CFA of the function which calls eh_return.
8460 */
8461 if (aarch64_return_address_signing_enabled ()
8462 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
27169e45 8463 {
8fc16d72
ST
8464 switch (aarch64_ra_sign_key)
8465 {
8466 case AARCH64_KEY_A:
8467 insn = emit_insn (gen_autiasp ());
8468 break;
8469 case AARCH64_KEY_B:
8470 insn = emit_insn (gen_autibsp ());
8471 break;
8472 default:
8473 gcc_unreachable ();
8474 }
27169e45
JW
8475 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8476 RTX_FRAME_RELATED_P (insn) = 1;
8477 }
db58fd89 8478
dd991abb 8479 /* Stack adjustment for exception handler. */
b5b9147d 8480 if (crtl->calls_eh_return && !for_sibcall)
dd991abb
RH
8481 {
8482 /* We need to unwind the stack by the offset computed by
8483 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
8484 to be SP; letting the CFA move during this adjustment
8485 is just as correct as retaining the CFA from the body
8486 of the function. Therefore, do nothing special. */
8487 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
43e9d192
IB
8488 }
8489
8490 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
8491 if (!for_sibcall)
8492 emit_jump_insn (ret_rtx);
8493}
8494
8144a493
WD
8495/* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
8496 normally or return to a previous frame after unwinding.
1c960e02 8497
8144a493
WD
8498 An EH return uses a single shared return sequence. The epilogue is
8499 exactly like a normal epilogue except that it has an extra input
8500 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
8501 that must be applied after the frame has been destroyed. An extra label
8502 is inserted before the epilogue which initializes this register to zero,
8503 and this is the entry point for a normal return.
43e9d192 8504
8144a493
WD
8505 An actual EH return updates the return address, initializes the stack
8506 adjustment and jumps directly into the epilogue (bypassing the zeroing
8507 of the adjustment). Since the return address is typically saved on the
8508 stack when a function makes a call, the saved LR must be updated outside
8509 the epilogue.
43e9d192 8510
8144a493
WD
8511 This poses problems as the store is generated well before the epilogue,
8512 so the offset of LR is not known yet. Also optimizations will remove the
8513 store as it appears dead, even after the epilogue is generated (as the
8514 base or offset for loading LR is different in many cases).
43e9d192 8515
8144a493
WD
8516 To avoid these problems this implementation forces the frame pointer
8517 in eh_return functions so that the location of LR is fixed and known early.
8518 It also marks the store volatile, so no optimization is permitted to
8519 remove the store. */
8520rtx
8521aarch64_eh_return_handler_rtx (void)
8522{
8523 rtx tmp = gen_frame_mem (Pmode,
8524 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
43e9d192 8525
8144a493
WD
8526 /* Mark the store volatile, so no optimization is permitted to remove it. */
8527 MEM_VOLATILE_P (tmp) = true;
8528 return tmp;
43e9d192
IB
8529}
8530
43e9d192
IB
8531/* Output code to add DELTA to the first argument, and then jump
8532 to FUNCTION. Used for C++ multiple inheritance. */
8533static void
8534aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
8535 HOST_WIDE_INT delta,
8536 HOST_WIDE_INT vcall_offset,
8537 tree function)
8538{
8539 /* The this pointer is always in x0. Note that this differs from
8540 Arm, where the this pointer may be bumped to r1 if r0 is required
8541 to return a pointer to an aggregate. On AArch64 a result value
8542 pointer will be in x8. */
8543 int this_regno = R0_REGNUM;
5d8a22a5
DM
8544 rtx this_rtx, temp0, temp1, addr, funexp;
8545 rtx_insn *insn;
6b5777c6 8546 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
43e9d192 8547
c904388d
SD
8548 if (aarch64_bti_enabled ())
8549 emit_insn (gen_bti_c());
8550
75f1d6fc
SN
8551 reload_completed = 1;
8552 emit_note (NOTE_INSN_PROLOGUE_END);
43e9d192 8553
f5470a77 8554 this_rtx = gen_rtx_REG (Pmode, this_regno);
901e66e0
SD
8555 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
8556 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
f5470a77 8557
43e9d192 8558 if (vcall_offset == 0)
43cacb12 8559 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
43e9d192
IB
8560 else
8561 {
28514dda 8562 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
43e9d192 8563
75f1d6fc
SN
8564 addr = this_rtx;
8565 if (delta != 0)
8566 {
8567 if (delta >= -256 && delta < 256)
8568 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
8569 plus_constant (Pmode, this_rtx, delta));
8570 else
43cacb12
RS
8571 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
8572 temp1, temp0, false);
43e9d192
IB
8573 }
8574
28514dda
YZ
8575 if (Pmode == ptr_mode)
8576 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
8577 else
8578 aarch64_emit_move (temp0,
8579 gen_rtx_ZERO_EXTEND (Pmode,
8580 gen_rtx_MEM (ptr_mode, addr)));
75f1d6fc 8581
28514dda 8582 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
75f1d6fc 8583 addr = plus_constant (Pmode, temp0, vcall_offset);
43e9d192
IB
8584 else
8585 {
f43657b4
JW
8586 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
8587 Pmode);
75f1d6fc 8588 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
43e9d192
IB
8589 }
8590
28514dda
YZ
8591 if (Pmode == ptr_mode)
8592 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
8593 else
8594 aarch64_emit_move (temp1,
8595 gen_rtx_SIGN_EXTEND (Pmode,
8596 gen_rtx_MEM (ptr_mode, addr)));
8597
75f1d6fc 8598 emit_insn (gen_add2_insn (this_rtx, temp1));
43e9d192
IB
8599 }
8600
75f1d6fc
SN
8601 /* Generate a tail call to the target function. */
8602 if (!TREE_USED (function))
8603 {
8604 assemble_external (function);
8605 TREE_USED (function) = 1;
8606 }
8607 funexp = XEXP (DECL_RTL (function), 0);
8608 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
08cc4d92
RS
8609 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
8610 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
75f1d6fc
SN
8611 SIBLING_CALL_P (insn) = 1;
8612
8613 insn = get_insns ();
8614 shorten_branches (insn);
6b5777c6
MF
8615
8616 assemble_start_function (thunk, fnname);
75f1d6fc
SN
8617 final_start_function (insn, file, 1);
8618 final (insn, file, 1);
43e9d192 8619 final_end_function ();
6b5777c6 8620 assemble_end_function (thunk, fnname);
75f1d6fc
SN
8621
8622 /* Stop pretending to be a post-reload pass. */
8623 reload_completed = 0;
43e9d192
IB
8624}
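/* For reference, the code above implements the usual thunk semantics:
   add DELTA to the incoming this pointer in x0 and, if VCALL_OFFSET is
   nonzero, additionally load the vtable pointer from the adjusted object
   and add the value found VCALL_OFFSET bytes into the vtable.  With the
   purely illustrative values DELTA == 8 and VCALL_OFFSET == -16 this
   corresponds to:

     this = this + 8;
     this = this + *(*this - 16);
     tail call FUNCTION;  */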
8625
43e9d192
IB
8626static bool
8627aarch64_tls_referenced_p (rtx x)
8628{
8629 if (!TARGET_HAVE_TLS)
8630 return false;
e7de8563
RS
8631 subrtx_iterator::array_type array;
8632 FOR_EACH_SUBRTX (iter, array, x, ALL)
8633 {
8634 const_rtx x = *iter;
3793ecc1 8635 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
e7de8563
RS
8636 return true;
8637 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
8638 TLS offsets, not real symbol references. */
8639 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8640 iter.skip_subrtxes ();
8641 }
8642 return false;
43e9d192
IB
8643}
8644
8645
43e9d192
IB
8646/* Return true if val can be encoded as a 12-bit unsigned immediate with
8647 a left shift of 0 or 12 bits. */
8648bool
8649aarch64_uimm12_shift (HOST_WIDE_INT val)
8650{
8651 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
8652 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
8653 );
8654}
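/* For example (illustrative values): 0x456 and 0x456000 both satisfy
   aarch64_uimm12_shift (with a shift of 0 and 12 respectively), whereas
   0x456001 does not, since it has set bits both below and above bit 12.  */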
8655
eb471ba3
TC
8656/* Return the largest value no greater than VAL that can be encoded as a
8657 12-bit unsigned immediate with a left shift of 0 or 12. */
8658static HOST_WIDE_INT
8659aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
8660{
8661 /* Check to see if the value fits in 24 bits, as that is the maximum we can
8662 handle correctly. */
8663 gcc_assert ((val & 0xffffff) == val);
8664
8665 if (((val & 0xfff) << 0) == val)
8666 return val;
8667
8668 return val & (0xfff << 12);
8669}
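/* For example (illustrative values): 0x456 already fits in the low 12 bits
   and is returned unchanged, whereas for 0x123456 the function returns
   0x123456 & (0xfff << 12) == 0x123000, i.e. 0x123 shifted left by 12,
   which is the largest encodable value not exceeding the input.  */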
43e9d192
IB
8670
8671/* Return true if val is an immediate that can be loaded into a
8672 register by a MOVZ instruction. */
8673static bool
77e994c9 8674aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
43e9d192
IB
8675{
8676 if (GET_MODE_SIZE (mode) > 4)
8677 {
8678 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
8679 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
8680 return 1;
8681 }
8682 else
8683 {
43cacb12
RS
8684 /* Ignore sign extension. */
8685 val &= (HOST_WIDE_INT) 0xffffffff;
8686 }
8687 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
8688 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
8689}
8690
bba0c624
RS
8691/* Test whether:
8692
8693 X = (X & AND_VAL) | IOR_VAL;
8694
8695 can be implemented using:
8696
8697 MOVK X, #(IOR_VAL >> shift), LSL #shift
8698
8699 Return the shift if so, otherwise return -1. */
8700int
8701aarch64_movk_shift (const wide_int_ref &and_val,
8702 const wide_int_ref &ior_val)
8703{
8704 unsigned int precision = and_val.get_precision ();
8705 unsigned HOST_WIDE_INT mask = 0xffff;
8706 for (unsigned int shift = 0; shift < precision; shift += 16)
8707 {
8708 if (and_val == ~mask && (ior_val & mask) == ior_val)
8709 return shift;
8710 mask <<= 16;
8711 }
8712 return -1;
8713}
8714
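/* For example (illustrative values): for a 64-bit X,

     X = (X & 0xffffffff0000ffff) | 0x0000000012340000;

   can be implemented as MOVK X, #0x1234, LSL #16, so aarch64_movk_shift
   returns 16 for this AND/IOR pair.  */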
43cacb12
RS
8715/* VAL is a value with the inner mode of MODE. Replicate it to fill a
8716 64-bit (DImode) integer. */
8717
8718static unsigned HOST_WIDE_INT
8719aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
8720{
8721 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
8722 while (size < 64)
8723 {
8724 val &= (HOST_WIDE_INT_1U << size) - 1;
8725 val |= val << size;
8726 size *= 2;
43e9d192 8727 }
43cacb12 8728 return val;
43e9d192
IB
8729}
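/* For example (illustrative values): replicating the QImode value 0xab
   gives 0xabababababababab, and replicating the HImode value 0x12ab
   gives 0x12ab12ab12ab12ab.  */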
8730
a64c73a2
WD
8731/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
8732
8733static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
8734 {
8735 0x0000000100000001ull,
8736 0x0001000100010001ull,
8737 0x0101010101010101ull,
8738 0x1111111111111111ull,
8739 0x5555555555555555ull,
8740 };
8741
43e9d192
IB
8742
8743/* Return true if val is a valid bitmask immediate. */
a64c73a2 8744
43e9d192 8745bool
a64c73a2 8746aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
43e9d192 8747{
a64c73a2
WD
8748 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
8749 int bits;
8750
8751 /* Check for a single sequence of one bits and return quickly if so.
8752 The special cases of all ones and all zeroes return false. */
43cacb12 8753 val = aarch64_replicate_bitmask_imm (val_in, mode);
a64c73a2
WD
8754 tmp = val + (val & -val);
8755
8756 if (tmp == (tmp & -tmp))
8757 return (val + 1) > 1;
8758
8759 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
8760 if (mode == SImode)
8761 val = (val << 32) | (val & 0xffffffff);
8762
8763 /* Invert if the immediate doesn't start with a zero bit - this means we
8764 only need to search for sequences of one bits. */
8765 if (val & 1)
8766 val = ~val;
8767
8768 /* Find the first set bit and set tmp to val with the first sequence of one
8769 bits removed. Return success if there is a single sequence of ones. */
8770 first_one = val & -val;
8771 tmp = val & (val + first_one);
8772
8773 if (tmp == 0)
8774 return true;
8775
8776 /* Find the next set bit and compute the difference in bit position. */
8777 next_one = tmp & -tmp;
8778 bits = clz_hwi (first_one) - clz_hwi (next_one);
8779 mask = val ^ tmp;
8780
8781 /* Check the bit position difference is a power of 2, and that the first
8782 sequence of one bits fits within 'bits' bits. */
8783 if ((mask >> bits) != 0 || bits != (bits & -bits))
8784 return false;
8785
8786 /* Check the sequence of one bits is repeated 64/bits times. */
8787 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
43e9d192
IB
8788}
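/* For example (illustrative values): 0x00ff00ff00ff00ff is a valid bitmask
   immediate for DImode (a run of eight ones repeated in every 16-bit
   chunk), whereas 0x0000000012345678 is not, because its set bits do not
   form a single repeated run of contiguous ones.  */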
8789
43fd192f
MC
8790/* Create a mask of ones, covering the lowest to highest bits set in VAL_IN.
8791 Assumed precondition: VAL_IN is not zero. */
8792
8793unsigned HOST_WIDE_INT
8794aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
8795{
8796 int lowest_bit_set = ctz_hwi (val_in);
8797 int highest_bit_set = floor_log2 (val_in);
8798 gcc_assert (val_in != 0);
8799
8800 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
8801 (HOST_WIDE_INT_1U << lowest_bit_set));
8802}
8803
8804/* Create a constant in which all bits outside the range from the lowest set
8805 bit to the highest set bit of VAL_IN are set to 1. */
8806
8807unsigned HOST_WIDE_INT
8808aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
8809{
8810 return val_in | ~aarch64_and_split_imm1 (val_in);
8811}
8812
8813/* Return true if VAL_IN is a valid 'and' bitmask immediate. */
8814
8815bool
8816aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
8817{
77e994c9
RS
8818 scalar_int_mode int_mode;
8819 if (!is_a <scalar_int_mode> (mode, &int_mode))
8820 return false;
8821
8822 if (aarch64_bitmask_imm (val_in, int_mode))
43fd192f
MC
8823 return false;
8824
77e994c9 8825 if (aarch64_move_imm (val_in, int_mode))
43fd192f
MC
8826 return false;
8827
8828 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
8829
77e994c9 8830 return aarch64_bitmask_imm (imm2, int_mode);
43fd192f 8831}
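/* For example (illustrative value): 0x00ff0f00 is neither a valid bitmask
   immediate nor a MOV immediate, but

     aarch64_and_split_imm1 (0x00ff0f00) == 0x00ffff00
     aarch64_and_split_imm2 (0x00ff0f00) == 0xffffffffffff0fff (as a 64-bit value)

   and both of those are valid bitmask immediates whose bitwise AND
   recovers 0x00ff0f00, so aarch64_and_bitmask_imm returns true and the
   AND can be expanded as two AND-immediate instructions.  */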
43e9d192
IB
8832
8833/* Return true if val is an immediate that can be loaded into a
8834 register in a single instruction. */
8835bool
ef4bddc2 8836aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
43e9d192 8837{
77e994c9
RS
8838 scalar_int_mode int_mode;
8839 if (!is_a <scalar_int_mode> (mode, &int_mode))
8840 return false;
8841
8842 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
43e9d192 8843 return 1;
77e994c9 8844 return aarch64_bitmask_imm (val, int_mode);
43e9d192
IB
8845}
8846
8847static bool
ef4bddc2 8848aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
43e9d192 8849{
43e9d192
IB
8850 if (GET_CODE (x) == HIGH)
8851 return true;
8852
43cacb12
RS
8853 /* There's no way to calculate VL-based values using relocations. */
8854 subrtx_iterator::array_type array;
8855 FOR_EACH_SUBRTX (iter, array, x, ALL)
8856 if (GET_CODE (*iter) == CONST_POLY_INT)
8857 return true;
8858
74b27d8e
RS
8859 poly_int64 offset;
8860 rtx base = strip_offset_and_salt (x, &offset);
3793ecc1 8861 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
28514dda 8862 {
74b27d8e
RS
8863 /* We checked for POLY_INT_CST offsets above. */
8864 if (aarch64_classify_symbol (base, offset.to_constant ())
28514dda
YZ
8865 != SYMBOL_FORCE_TO_MEM)
8866 return true;
8867 else
8868 /* Avoid generating a 64-bit relocation in ILP32; leave
8869 to aarch64_expand_mov_immediate to handle it properly. */
8870 return mode != ptr_mode;
8871 }
43e9d192
IB
8872
8873 return aarch64_tls_referenced_p (x);
8874}
8875
e79136e4
WD
8876/* Implement TARGET_CASE_VALUES_THRESHOLD.
8877 The expansion for a table switch is quite expensive due to the number
8878 of instructions, the table lookup and the hard-to-predict indirect jump.
8879 When optimizing for speed with -O3 enabled, use the per-core tuning if
8880 set, otherwise use tables for > 16 cases as a tradeoff between size and
8881 performance. When optimizing for size, use the default setting. */
50487d79
EM
8882
8883static unsigned int
8884aarch64_case_values_threshold (void)
8885{
8886 /* Use the specified limit for the number of cases before using jump
8887 tables at higher optimization levels. */
8888 if (optimize > 2
8889 && selected_cpu->tune->max_case_values != 0)
8890 return selected_cpu->tune->max_case_values;
8891 else
e79136e4 8892 return optimize_size ? default_case_values_threshold () : 17;
50487d79
EM
8893}
8894
43e9d192
IB
8895/* Return true if register REGNO is a valid index register.
8896 STRICT_P is true if REG_OK_STRICT is in effect. */
8897
8898bool
8899aarch64_regno_ok_for_index_p (int regno, bool strict_p)
8900{
8901 if (!HARD_REGISTER_NUM_P (regno))
8902 {
8903 if (!strict_p)
8904 return true;
8905
8906 if (!reg_renumber)
8907 return false;
8908
8909 regno = reg_renumber[regno];
8910 }
8911 return GP_REGNUM_P (regno);
8912}
8913
8914/* Return true if register REGNO is a valid base register for mode MODE.
8915 STRICT_P is true if REG_OK_STRICT is in effect. */
8916
8917bool
8918aarch64_regno_ok_for_base_p (int regno, bool strict_p)
8919{
8920 if (!HARD_REGISTER_NUM_P (regno))
8921 {
8922 if (!strict_p)
8923 return true;
8924
8925 if (!reg_renumber)
8926 return false;
8927
8928 regno = reg_renumber[regno];
8929 }
8930
8931 /* The fake registers will be eliminated to either the stack or
8932 hard frame pointer, both of which are usually valid base registers.
8933 Reload deals with the cases where the eliminated form isn't valid. */
8934 return (GP_REGNUM_P (regno)
8935 || regno == SP_REGNUM
8936 || regno == FRAME_POINTER_REGNUM
8937 || regno == ARG_POINTER_REGNUM);
8938}
8939
8940/* Return true if X is a valid base register for mode MODE.
8941 STRICT_P is true if REG_OK_STRICT is in effect. */
8942
8943static bool
8944aarch64_base_register_rtx_p (rtx x, bool strict_p)
8945{
76160199 8946 if (!strict_p
3793ecc1 8947 && SUBREG_P (x)
76160199 8948 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
43e9d192
IB
8949 x = SUBREG_REG (x);
8950
8951 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
8952}
8953
8954/* Return true if address offset is a valid index. If it is, fill in INFO
8955 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
8956
8957static bool
8958aarch64_classify_index (struct aarch64_address_info *info, rtx x,
ef4bddc2 8959 machine_mode mode, bool strict_p)
43e9d192
IB
8960{
8961 enum aarch64_address_type type;
8962 rtx index;
8963 int shift;
8964
8965 /* (reg:P) */
3793ecc1 8966 if ((REG_P (x) || SUBREG_P (x))
43e9d192
IB
8967 && GET_MODE (x) == Pmode)
8968 {
8969 type = ADDRESS_REG_REG;
8970 index = x;
8971 shift = 0;
8972 }
8973 /* (sign_extend:DI (reg:SI)) */
8974 else if ((GET_CODE (x) == SIGN_EXTEND
8975 || GET_CODE (x) == ZERO_EXTEND)
8976 && GET_MODE (x) == DImode
8977 && GET_MODE (XEXP (x, 0)) == SImode)
8978 {
8979 type = (GET_CODE (x) == SIGN_EXTEND)
8980 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8981 index = XEXP (x, 0);
8982 shift = 0;
8983 }
8984 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8985 else if (GET_CODE (x) == MULT
8986 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8987 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8988 && GET_MODE (XEXP (x, 0)) == DImode
8989 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8990 && CONST_INT_P (XEXP (x, 1)))
8991 {
8992 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8993 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8994 index = XEXP (XEXP (x, 0), 0);
8995 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8996 }
8997 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8998 else if (GET_CODE (x) == ASHIFT
8999 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
9000 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
9001 && GET_MODE (XEXP (x, 0)) == DImode
9002 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
9003 && CONST_INT_P (XEXP (x, 1)))
9004 {
9005 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
9006 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
9007 index = XEXP (XEXP (x, 0), 0);
9008 shift = INTVAL (XEXP (x, 1));
9009 }
43e9d192
IB
9010 /* (and:DI (mult:DI (reg:DI) (const_int scale))
9011 (const_int 0xffffffff<<shift)) */
9012 else if (GET_CODE (x) == AND
9013 && GET_MODE (x) == DImode
9014 && GET_CODE (XEXP (x, 0)) == MULT
9015 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
9016 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9017 && CONST_INT_P (XEXP (x, 1)))
9018 {
9019 type = ADDRESS_REG_UXTW;
9020 index = XEXP (XEXP (x, 0), 0);
9021 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
9022 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
9023 shift = -1;
9024 }
43e9d192
IB
9025 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
9026 (const_int 0xffffffff<<shift)) */
9027 else if (GET_CODE (x) == AND
9028 && GET_MODE (x) == DImode
9029 && GET_CODE (XEXP (x, 0)) == ASHIFT
9030 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
9031 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9032 && CONST_INT_P (XEXP (x, 1)))
9033 {
9034 type = ADDRESS_REG_UXTW;
9035 index = XEXP (XEXP (x, 0), 0);
9036 shift = INTVAL (XEXP (XEXP (x, 0), 1));
9037 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
9038 shift = -1;
9039 }
9040 /* (mult:P (reg:P) (const_int scale)) */
9041 else if (GET_CODE (x) == MULT
9042 && GET_MODE (x) == Pmode
9043 && GET_MODE (XEXP (x, 0)) == Pmode
9044 && CONST_INT_P (XEXP (x, 1)))
9045 {
9046 type = ADDRESS_REG_REG;
9047 index = XEXP (x, 0);
9048 shift = exact_log2 (INTVAL (XEXP (x, 1)));
9049 }
9050 /* (ashift:P (reg:P) (const_int shift)) */
9051 else if (GET_CODE (x) == ASHIFT
9052 && GET_MODE (x) == Pmode
9053 && GET_MODE (XEXP (x, 0)) == Pmode
9054 && CONST_INT_P (XEXP (x, 1)))
9055 {
9056 type = ADDRESS_REG_REG;
9057 index = XEXP (x, 0);
9058 shift = INTVAL (XEXP (x, 1));
9059 }
9060 else
9061 return false;
9062
76160199 9063 if (!strict_p
3793ecc1 9064 && SUBREG_P (index)
76160199 9065 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
43e9d192
IB
9066 index = SUBREG_REG (index);
9067
43cacb12
RS
9068 if (aarch64_sve_data_mode_p (mode))
9069 {
9070 if (type != ADDRESS_REG_REG
9071 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
9072 return false;
9073 }
9074 else
9075 {
9076 if (shift != 0
9077 && !(IN_RANGE (shift, 1, 3)
9078 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
9079 return false;
9080 }
9081
9082 if (REG_P (index)
43e9d192
IB
9083 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
9084 {
9085 info->type = type;
9086 info->offset = index;
9087 info->shift = shift;
9088 return true;
9089 }
9090
9091 return false;
9092}
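/* For example (illustrative RTL): for a DImode access, the index expression

     (ashift:DI (reg:DI x1) (const_int 3))

   is accepted as ADDRESS_REG_REG with shift == 3, matching the scaled form
   [Xn, x1, LSL #3]; a shift of 2 would be rejected because 1 << 2 does not
   equal the 8-byte access size.  */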
9093
abc52318
KT
9094/* Return true if MODE is one of the modes for which we
9095 support LDP/STP operations. */
9096
9097static bool
9098aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
9099{
9100 return mode == SImode || mode == DImode
9101 || mode == SFmode || mode == DFmode
9102 || (aarch64_vector_mode_supported_p (mode)
9f5361c8
KT
9103 && (known_eq (GET_MODE_SIZE (mode), 8)
9104 || (known_eq (GET_MODE_SIZE (mode), 16)
9105 && (aarch64_tune_params.extra_tuning_flags
9106 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
abc52318
KT
9107}
9108
9e0218fc
RH
9109/* Return true if REGNO is a virtual pointer register, or an eliminable
9110 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
9111 include stack_pointer or hard_frame_pointer. */
9112static bool
9113virt_or_elim_regno_p (unsigned regno)
9114{
9115 return ((regno >= FIRST_VIRTUAL_REGISTER
9116 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
9117 || regno == FRAME_POINTER_REGNUM
9118 || regno == ARG_POINTER_REGNUM);
9119}
9120
a97d8b98
RS
9121/* Return true if X is a valid address of type TYPE for machine mode MODE.
9122 If it is, fill in INFO appropriately. STRICT_P is true if
9123 REG_OK_STRICT is in effect. */
43e9d192 9124
a98824ac 9125bool
43e9d192 9126aarch64_classify_address (struct aarch64_address_info *info,
a97d8b98 9127 rtx x, machine_mode mode, bool strict_p,
a98824ac 9128 aarch64_addr_query_type type)
43e9d192
IB
9129{
9130 enum rtx_code code = GET_CODE (x);
9131 rtx op0, op1;
dc640181
RS
9132 poly_int64 offset;
9133
6a70badb 9134 HOST_WIDE_INT const_size;
2d8c6dc1 9135
550a3380
RS
9136 /* Whether a vector mode is partial doesn't affect address legitimacy.
9137 Partial vectors like VNx8QImode allow the same indexed addressing
9138 mode and MUL VL addressing mode as full vectors like VNx16QImode;
9139 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
9140 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
9141 vec_flags &= ~VEC_PARTIAL;
9142
80d43579
WD
9143 /* On BE, we use load/store pair for all large int mode load/stores.
9144 TI/TFmode may also use a load/store pair. */
43cacb12 9145 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
a97d8b98 9146 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
a25831ac 9147 || type == ADDR_QUERY_LDP_STP_N
80d43579
WD
9148 || mode == TImode
9149 || mode == TFmode
43cacb12 9150 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
2d8c6dc1 9151
a25831ac
AV
9152 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
9153 corresponds to the actual size of the memory being loaded/stored and the
9154 mode used for the addressing calculation is half of that. */
9155 if (type == ADDR_QUERY_LDP_STP_N
9156 && known_eq (GET_MODE_SIZE (mode), 16))
9157 mode = DFmode;
9158
6a70badb 9159 bool allow_reg_index_p = (!load_store_pair_p
43cacb12
RS
9160 && (known_lt (GET_MODE_SIZE (mode), 16)
9161 || vec_flags == VEC_ADVSIMD
fa9863e7 9162 || vec_flags & VEC_SVE_DATA));
43cacb12
RS
9163
9164 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
9165 [Rn, #offset, MUL VL]. */
9166 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
9167 && (code != REG && code != PLUS))
9168 return false;
2d8c6dc1
AH
9169
9170 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
9171 REG addressing. */
43cacb12
RS
9172 if (advsimd_struct_p
9173 && !BYTES_BIG_ENDIAN
43e9d192
IB
9174 && (code != POST_INC && code != REG))
9175 return false;
9176
43cacb12
RS
9177 gcc_checking_assert (GET_MODE (x) == VOIDmode
9178 || SCALAR_INT_MODE_P (GET_MODE (x)));
9179
43e9d192
IB
9180 switch (code)
9181 {
9182 case REG:
9183 case SUBREG:
9184 info->type = ADDRESS_REG_IMM;
9185 info->base = x;
9186 info->offset = const0_rtx;
dc640181 9187 info->const_offset = 0;
43e9d192
IB
9188 return aarch64_base_register_rtx_p (x, strict_p);
9189
9190 case PLUS:
9191 op0 = XEXP (x, 0);
9192 op1 = XEXP (x, 1);
15c0c5c9
JW
9193
9194 if (! strict_p
4aa81c2e 9195 && REG_P (op0)
9e0218fc 9196 && virt_or_elim_regno_p (REGNO (op0))
dc640181 9197 && poly_int_rtx_p (op1, &offset))
15c0c5c9
JW
9198 {
9199 info->type = ADDRESS_REG_IMM;
9200 info->base = op0;
9201 info->offset = op1;
dc640181 9202 info->const_offset = offset;
15c0c5c9
JW
9203
9204 return true;
9205 }
9206
6a70badb 9207 if (maybe_ne (GET_MODE_SIZE (mode), 0)
dc640181
RS
9208 && aarch64_base_register_rtx_p (op0, strict_p)
9209 && poly_int_rtx_p (op1, &offset))
43e9d192 9210 {
43e9d192
IB
9211 info->type = ADDRESS_REG_IMM;
9212 info->base = op0;
9213 info->offset = op1;
dc640181 9214 info->const_offset = offset;
43e9d192
IB
9215
9216 /* TImode and TFmode values are allowed in both pairs of X
9217 registers and individual Q registers. The available
9218 address modes are:
9219 X,X: 7-bit signed scaled offset
9220 Q: 9-bit signed offset
9221 We conservatively require an offset representable in both modes.
8ed49fab
KT
9222 When performing the check for pairs of X registers i.e. LDP/STP
9223 pass down DImode since that is the natural size of the LDP/STP
9224 instruction memory accesses. */
43e9d192 9225 if (mode == TImode || mode == TFmode)
8ed49fab 9226 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
3c5af608 9227 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8734dfac 9228 || offset_12bit_unsigned_scaled_p (mode, offset)));
43e9d192 9229
2d8c6dc1
AH
9230 /* A 7-bit offset check because OImode will emit an ldp/stp
9231 instruction (only big endian will get here).
9232 For ldp/stp instructions, the offset is scaled for the size of a
9233 single element of the pair. */
9234 if (mode == OImode)
9235 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
9236
9237 /* Three 9/12-bit offset checks because CImode will emit three
9238 ldr/str instructions (only big endian will get here). */
9239 if (mode == CImode)
9240 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3c5af608
MM
9241 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
9242 offset + 32)
2d8c6dc1
AH
9243 || offset_12bit_unsigned_scaled_p (V16QImode,
9244 offset + 32)));
9245
9246 /* Two 7-bit offset checks because XImode will emit two ldp/stp
9247 instructions (only big endian will get here). */
9248 if (mode == XImode)
9249 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9250 && aarch64_offset_7bit_signed_scaled_p (TImode,
9251 offset + 32));
9252
43cacb12
RS
9253 /* Make "m" use the LD1 offset range for SVE data modes, so
9254 that pre-RTL optimizers like ivopts will work to that
9255 instead of the wider LDR/STR range. */
9256 if (vec_flags == VEC_SVE_DATA)
9257 return (type == ADDR_QUERY_M
9258 ? offset_4bit_signed_scaled_p (mode, offset)
9259 : offset_9bit_signed_scaled_p (mode, offset));
9260
9f4cbab8
RS
9261 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
9262 {
9263 poly_int64 end_offset = (offset
9264 + GET_MODE_SIZE (mode)
9265 - BYTES_PER_SVE_VECTOR);
9266 return (type == ADDR_QUERY_M
9267 ? offset_4bit_signed_scaled_p (mode, offset)
9268 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
9269 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
9270 end_offset)));
9271 }
9272
43cacb12
RS
9273 if (vec_flags == VEC_SVE_PRED)
9274 return offset_9bit_signed_scaled_p (mode, offset);
9275
2d8c6dc1 9276 if (load_store_pair_p)
6a70badb 9277 return ((known_eq (GET_MODE_SIZE (mode), 4)
9f5361c8
KT
9278 || known_eq (GET_MODE_SIZE (mode), 8)
9279 || known_eq (GET_MODE_SIZE (mode), 16))
44707478 9280 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
43e9d192 9281 else
3c5af608 9282 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
43e9d192
IB
9283 || offset_12bit_unsigned_scaled_p (mode, offset));
9284 }
9285
9286 if (allow_reg_index_p)
9287 {
9288 /* Look for base + (scaled/extended) index register. */
9289 if (aarch64_base_register_rtx_p (op0, strict_p)
9290 && aarch64_classify_index (info, op1, mode, strict_p))
9291 {
9292 info->base = op0;
9293 return true;
9294 }
9295 if (aarch64_base_register_rtx_p (op1, strict_p)
9296 && aarch64_classify_index (info, op0, mode, strict_p))
9297 {
9298 info->base = op1;
9299 return true;
9300 }
9301 }
9302
9303 return false;
9304
9305 case POST_INC:
9306 case POST_DEC:
9307 case PRE_INC:
9308 case PRE_DEC:
9309 info->type = ADDRESS_REG_WB;
9310 info->base = XEXP (x, 0);
9311 info->offset = NULL_RTX;
9312 return aarch64_base_register_rtx_p (info->base, strict_p);
9313
9314 case POST_MODIFY:
9315 case PRE_MODIFY:
9316 info->type = ADDRESS_REG_WB;
9317 info->base = XEXP (x, 0);
9318 if (GET_CODE (XEXP (x, 1)) == PLUS
dc640181 9319 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
43e9d192
IB
9320 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
9321 && aarch64_base_register_rtx_p (info->base, strict_p))
9322 {
43e9d192 9323 info->offset = XEXP (XEXP (x, 1), 1);
dc640181 9324 info->const_offset = offset;
43e9d192
IB
9325
9326 /* TImode and TFmode values are allowed in both pairs of X
9327 registers and individual Q registers. The available
9328 address modes are:
9329 X,X: 7-bit signed scaled offset
9330 Q: 9-bit signed offset
9331 We conservatively require an offset representable in both modes.
9332 */
9333 if (mode == TImode || mode == TFmode)
44707478 9334 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3c5af608 9335 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
43e9d192 9336
2d8c6dc1 9337 if (load_store_pair_p)
6a70badb 9338 return ((known_eq (GET_MODE_SIZE (mode), 4)
9f5361c8
KT
9339 || known_eq (GET_MODE_SIZE (mode), 8)
9340 || known_eq (GET_MODE_SIZE (mode), 16))
44707478 9341 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
43e9d192 9342 else
3c5af608 9343 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
43e9d192
IB
9344 }
9345 return false;
9346
9347 case CONST:
9348 case SYMBOL_REF:
9349 case LABEL_REF:
79517551
SN
9350 /* load literal: pc-relative constant pool entry. Only supported
9351 for SI mode or larger. */
43e9d192 9352 info->type = ADDRESS_SYMBOLIC;
2d8c6dc1 9353
6a70badb
RS
9354 if (!load_store_pair_p
9355 && GET_MODE_SIZE (mode).is_constant (&const_size)
9356 && const_size >= 4)
43e9d192 9357 {
74b27d8e
RS
9358 poly_int64 offset;
9359 rtx sym = strip_offset_and_salt (x, &offset);
3793ecc1
AC
9360 return ((LABEL_REF_P (sym)
9361 || (SYMBOL_REF_P (sym)
b4f50fd4 9362 && CONSTANT_POOL_ADDRESS_P (sym)
9ee6540a 9363 && aarch64_pcrelative_literal_loads)));
43e9d192
IB
9364 }
9365 return false;
9366
9367 case LO_SUM:
9368 info->type = ADDRESS_LO_SUM;
9369 info->base = XEXP (x, 0);
9370 info->offset = XEXP (x, 1);
9371 if (allow_reg_index_p
9372 && aarch64_base_register_rtx_p (info->base, strict_p))
9373 {
74b27d8e
RS
9374 poly_int64 offset;
9375 HOST_WIDE_INT const_offset;
9376 rtx sym = strip_offset_and_salt (info->offset, &offset);
3793ecc1 9377 if (SYMBOL_REF_P (sym)
74b27d8e
RS
9378 && offset.is_constant (&const_offset)
9379 && (aarch64_classify_symbol (sym, const_offset)
43cacb12 9380 == SYMBOL_SMALL_ABSOLUTE))
43e9d192
IB
9381 {
9382 /* The symbol and offset must be aligned to the access size. */
9383 unsigned int align;
43e9d192
IB
9384
9385 if (CONSTANT_POOL_ADDRESS_P (sym))
9386 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
9387 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
9388 {
9389 tree exp = SYMBOL_REF_DECL (sym);
9390 align = TYPE_ALIGN (TREE_TYPE (exp));
58e17cf8 9391 align = aarch64_constant_alignment (exp, align);
43e9d192
IB
9392 }
9393 else if (SYMBOL_REF_DECL (sym))
9394 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6c031d8d
KV
9395 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
9396 && SYMBOL_REF_BLOCK (sym) != NULL)
9397 align = SYMBOL_REF_BLOCK (sym)->alignment;
43e9d192
IB
9398 else
9399 align = BITS_PER_UNIT;
9400
6a70badb
RS
9401 poly_int64 ref_size = GET_MODE_SIZE (mode);
9402 if (known_eq (ref_size, 0))
43e9d192
IB
9403 ref_size = GET_MODE_SIZE (DImode);
9404
74b27d8e 9405 return (multiple_p (const_offset, ref_size)
6a70badb 9406 && multiple_p (align / BITS_PER_UNIT, ref_size));
43e9d192
IB
9407 }
9408 }
9409 return false;
9410
9411 default:
9412 return false;
9413 }
9414}
9415
9bf2f779
KT
9416/* Return true if the address X is valid for a PRFM instruction.
9417 STRICT_P is true if we should do strict checking with
9418 aarch64_classify_address. */
9419
9420bool
9421aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
9422{
9423 struct aarch64_address_info addr;
9424
9425 /* PRFM accepts the same addresses as DImode... */
a97d8b98 9426 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
9bf2f779
KT
9427 if (!res)
9428 return false;
9429
9430 /* ... except writeback forms. */
9431 return addr.type != ADDRESS_REG_WB;
9432}
9433
43e9d192
IB
9434bool
9435aarch64_symbolic_address_p (rtx x)
9436{
74b27d8e
RS
9437 poly_int64 offset;
9438 x = strip_offset_and_salt (x, &offset);
3793ecc1 9439 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
43e9d192
IB
9440}
9441
a6e0bfa7 9442/* Classify the base of symbolic expression X. */
da4f13a4
MS
9443
9444enum aarch64_symbol_type
a6e0bfa7 9445aarch64_classify_symbolic_expression (rtx x)
43e9d192
IB
9446{
9447 rtx offset;
da4f13a4 9448
43e9d192 9449 split_const (x, &x, &offset);
43cacb12 9450 return aarch64_classify_symbol (x, INTVAL (offset));
43e9d192
IB
9451}
9452
9453
9454/* Return TRUE if X is a legitimate address for accessing memory in
9455 mode MODE. */
9456static bool
ef4bddc2 9457aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
43e9d192
IB
9458{
9459 struct aarch64_address_info addr;
9460
a97d8b98 9461 return aarch64_classify_address (&addr, x, mode, strict_p);
43e9d192
IB
9462}
9463
a97d8b98
RS
9464/* Return TRUE if X is a legitimate address of type TYPE for accessing
9465 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
43e9d192 9466bool
a97d8b98
RS
9467aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
9468 aarch64_addr_query_type type)
43e9d192
IB
9469{
9470 struct aarch64_address_info addr;
9471
a97d8b98 9472 return aarch64_classify_address (&addr, x, mode, strict_p, type);
43e9d192
IB
9473}
9474
9005477f
RS
9475/* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
9476
491ec060 9477static bool
9005477f
RS
9478aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
9479 poly_int64 orig_offset,
9480 machine_mode mode)
491ec060 9481{
6a70badb
RS
9482 HOST_WIDE_INT size;
9483 if (GET_MODE_SIZE (mode).is_constant (&size))
9484 {
9005477f
RS
9485 HOST_WIDE_INT const_offset, second_offset;
9486
9487 /* A general SVE offset is A * VQ + B. Remove the A component from
9488 coefficient 0 in order to get the constant B. */
9489 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
9490
9491 /* Split an out-of-range address displacement into a base and
9492 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
9493 range otherwise to increase opportunities for sharing the base
9494 address of different sizes. Unaligned accesses use the signed
9495 9-bit range, TImode/TFmode use the intersection of signed
9496 scaled 7-bit and signed 9-bit offset. */
6a70badb 9497 if (mode == TImode || mode == TFmode)
9005477f
RS
9498 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
9499 else if ((const_offset & (size - 1)) != 0)
9500 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6a70badb 9501 else
9005477f 9502 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
491ec060 9503
9005477f
RS
9504 if (second_offset == 0 || known_eq (orig_offset, second_offset))
9505 return false;
9506
9507 /* Split the offset into second_offset and the rest. */
9508 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9509 *offset2 = gen_int_mode (second_offset, Pmode);
9510 return true;
9511 }
9512 else
9513 {
9514 /* Get the mode we should use as the basis of the range. For structure
9515 modes this is the mode of one vector. */
9516 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
9517 machine_mode step_mode
9518 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
9519
9520 /* Get the "mul vl" multiplier we'd like to use. */
9521 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
9522 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
9523 if (vec_flags & VEC_SVE_DATA)
9524 /* LDR supports a 9-bit range, but the move patterns for
9525 structure modes require all vectors to be in range of the
 9526	 same base.  The simplest way of accommodating that while still
9527 promoting reuse of anchor points between different modes is
9528 to use an 8-bit range unconditionally. */
9529 vnum = ((vnum + 128) & 255) - 128;
9530 else
9531 /* Predicates are only handled singly, so we might as well use
9532 the full range. */
9533 vnum = ((vnum + 256) & 511) - 256;
9534 if (vnum == 0)
9535 return false;
9536
9537 /* Convert the "mul vl" multiplier into a byte offset. */
9538 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
9539 if (known_eq (second_offset, orig_offset))
9540 return false;
9541
9542 /* Split the offset into second_offset and the rest. */
9543 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9544 *offset2 = gen_int_mode (second_offset, Pmode);
6a70badb
RS
9545 return true;
9546 }
491ec060
WD
9547}
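/* A few worked examples of the constant-offset split above (an
   illustrative sketch only):
     DImode, offset 0x10008 (aligned):    0x10008 & 0x3ffc                   -> 0x10000 + 0x8
     SImode, offset 0x1001 (misaligned):  ((0x1001 + 0x100) & 0x1ff) - 0x100 -> 0x1000 + 0x1
     TImode, offset 0x10010:              ((0x10010 + 0x100) & 0x1f8) - 0x100 -> 0x10000 + 0x10
   In each case *offset1 receives the large anchor part and *offset2 the
   small in-range part.  */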
9548
a2170965
TC
9549/* Return the binary representation of floating point constant VALUE in INTVAL.
9550 If the value cannot be converted, return false without setting INTVAL.
9551 The conversion is done in the given MODE. */
9552bool
9553aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
9554{
9555
9556 /* We make a general exception for 0. */
9557 if (aarch64_float_const_zero_rtx_p (value))
9558 {
9559 *intval = 0;
9560 return true;
9561 }
9562
0d0e0188 9563 scalar_float_mode mode;
3793ecc1 9564 if (!CONST_DOUBLE_P (value)
0d0e0188 9565 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
a2170965
TC
9566 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
9567 /* Only support up to DF mode. */
9568 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
9569 return false;
9570
9571 unsigned HOST_WIDE_INT ival = 0;
9572
9573 long res[2];
9574 real_to_target (res,
9575 CONST_DOUBLE_REAL_VALUE (value),
9576 REAL_MODE_FORMAT (mode));
9577
5c22bb48
TC
9578 if (mode == DFmode)
9579 {
9580 int order = BYTES_BIG_ENDIAN ? 1 : 0;
9581 ival = zext_hwi (res[order], 32);
9582 ival |= (zext_hwi (res[1 - order], 32) << 32);
9583 }
9584 else
9585 ival = zext_hwi (res[0], 32);
a2170965
TC
9586
9587 *intval = ival;
9588 return true;
9589}
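/* For illustration (assuming the usual IEEE single/double encodings),
   the bit patterns returned above would be, for example:
     (const_double:SF -2.0) -> 0xc0000000
     (const_double:DF  1.0) -> 0x3ff0000000000000
   and 0.0 in any mode yields 0 via the early exit.  */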
9590
9591/* Return TRUE if rtx X is an immediate constant that can be moved using a
9592 single MOV(+MOVK) followed by an FMOV. */
9593bool
9594aarch64_float_const_rtx_p (rtx x)
9595{
9596 machine_mode mode = GET_MODE (x);
9597 if (mode == VOIDmode)
9598 return false;
9599
9600 /* Determine whether it's cheaper to write float constants as
 9601	     mov/movk pairs rather than loading them via ldr/adrp pairs.  */
9602 unsigned HOST_WIDE_INT ival;
9603
3793ecc1 9604 if (CONST_DOUBLE_P (x)
a2170965
TC
9605 && SCALAR_FLOAT_MODE_P (mode)
9606 && aarch64_reinterpret_float_as_int (x, &ival))
9607 {
77e994c9
RS
9608 scalar_int_mode imode = (mode == HFmode
9609 ? SImode
9610 : int_mode_for_mode (mode).require ());
a2170965
TC
9611 int num_instr = aarch64_internal_mov_immediate
9612 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9613 return num_instr < 3;
9614 }
9615
9616 return false;
9617}
9618
43e9d192
IB
 9619/* Return TRUE if rtx X is the immediate constant 0.0.  */
9620bool
3520f7cc 9621aarch64_float_const_zero_rtx_p (rtx x)
43e9d192 9622{
43e9d192
IB
9623 if (GET_MODE (x) == VOIDmode)
9624 return false;
9625
34a72c33 9626 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
43e9d192 9627 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
34a72c33 9628 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
43e9d192
IB
9629}
9630
a2170965
TC
 9631/* Return TRUE if rtx X is an immediate constant that fits in a single
9632 MOVI immediate operation. */
9633bool
9634aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
9635{
9636 if (!TARGET_SIMD)
9637 return false;
9638
77e994c9
RS
9639 machine_mode vmode;
9640 scalar_int_mode imode;
a2170965
TC
9641 unsigned HOST_WIDE_INT ival;
9642
3793ecc1 9643 if (CONST_DOUBLE_P (x)
a2170965
TC
9644 && SCALAR_FLOAT_MODE_P (mode))
9645 {
9646 if (!aarch64_reinterpret_float_as_int (x, &ival))
9647 return false;
9648
35c38fa6
TC
9649 /* We make a general exception for 0. */
9650 if (aarch64_float_const_zero_rtx_p (x))
9651 return true;
9652
304b9962 9653 imode = int_mode_for_mode (mode).require ();
a2170965 9654 }
3793ecc1 9655 else if (CONST_INT_P (x)
77e994c9
RS
9656 && is_a <scalar_int_mode> (mode, &imode))
9657 ival = INTVAL (x);
a2170965
TC
9658 else
9659 return false;
9660
 9661	  /* Use a 64-bit mode for everything except DImode/DFmode, where we use
 9662	     a 128-bit vector mode.  */
77e994c9 9663 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
a2170965
TC
9664
9665 vmode = aarch64_simd_container_mode (imode, width);
9666 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
9667
b187677b 9668 return aarch64_simd_valid_immediate (v_op, NULL);
a2170965
TC
9669}
9670
9671
70f09188
AP
9672/* Return the fixed registers used for condition codes. */
9673
9674static bool
9675aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
9676{
9677 *p1 = CC_REGNUM;
9678 *p2 = INVALID_REGNUM;
9679 return true;
9680}
9681
47210a04
RL
9682/* This function is used by the call expanders of the machine description.
9683 RESULT is the register in which the result is returned. It's NULL for
9684 "call" and "sibcall".
9685 MEM is the location of the function call.
08cc4d92 9686 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
47210a04
RL
 9687	   SIBCALL indicates whether this is a normal call or a sibling call;
 9688	   a different pattern is generated accordingly.  */
9689
9690void
08cc4d92 9691aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
47210a04
RL
9692{
9693 rtx call, callee, tmp;
9694 rtvec vec;
9695 machine_mode mode;
9696
9697 gcc_assert (MEM_P (mem));
9698 callee = XEXP (mem, 0);
9699 mode = GET_MODE (callee);
9700 gcc_assert (mode == Pmode);
9701
9702 /* Decide if we should generate indirect calls by loading the
9703 address of the callee into a register before performing
9704 the branch-and-link. */
9705 if (SYMBOL_REF_P (callee)
9706 ? (aarch64_is_long_call_p (callee)
9707 || aarch64_is_noplt_call_p (callee))
9708 : !REG_P (callee))
9709 XEXP (mem, 0) = force_reg (mode, callee);
9710
9711 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
9712
9713 if (result != NULL_RTX)
9714 call = gen_rtx_SET (result, call);
9715
9716 if (sibcall)
9717 tmp = ret_rtx;
9718 else
9719 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
9720
08cc4d92
RS
9721 gcc_assert (CONST_INT_P (callee_abi));
9722 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
9723 UNSPEC_CALLEE_ABI);
9724
9725 vec = gen_rtvec (3, call, callee_abi, tmp);
47210a04
RL
9726 call = gen_rtx_PARALLEL (VOIDmode, vec);
9727
9728 aarch64_emit_call_insn (call);
9729}
9730
78607708
TV
9731/* Emit call insn with PAT and do aarch64-specific handling. */
9732
d07a3fed 9733void
78607708
TV
9734aarch64_emit_call_insn (rtx pat)
9735{
9736 rtx insn = emit_call_insn (pat);
9737
9738 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
9739 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
9740 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
9741}
9742
ef4bddc2 9743machine_mode
43e9d192
IB
9744aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
9745{
f7343f20
RE
9746 machine_mode mode_x = GET_MODE (x);
9747 rtx_code code_x = GET_CODE (x);
9748
43e9d192
IB
9749 /* All floating point compares return CCFP if it is an equality
9750 comparison, and CCFPE otherwise. */
f7343f20 9751 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
43e9d192
IB
9752 {
9753 switch (code)
9754 {
9755 case EQ:
9756 case NE:
9757 case UNORDERED:
9758 case ORDERED:
9759 case UNLT:
9760 case UNLE:
9761 case UNGT:
9762 case UNGE:
9763 case UNEQ:
43e9d192
IB
9764 return CCFPmode;
9765
9766 case LT:
9767 case LE:
9768 case GT:
9769 case GE:
8332c5ee 9770 case LTGT:
43e9d192
IB
9771 return CCFPEmode;
9772
9773 default:
9774 gcc_unreachable ();
9775 }
9776 }
9777
2b8568fe
KT
9778 /* Equality comparisons of short modes against zero can be performed
9779 using the TST instruction with the appropriate bitmask. */
f73dc006 9780 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
2b8568fe 9781 && (code == EQ || code == NE)
f7343f20 9782 && (mode_x == HImode || mode_x == QImode))
2b8568fe
KT
9783 return CC_NZmode;
9784
b06335f9
KT
9785 /* Similarly, comparisons of zero_extends from shorter modes can
9786 be performed using an ANDS with an immediate mask. */
f7343f20
RE
9787 if (y == const0_rtx && code_x == ZERO_EXTEND
9788 && (mode_x == SImode || mode_x == DImode)
b06335f9
KT
9789 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
9790 && (code == EQ || code == NE))
9791 return CC_NZmode;
9792
f7343f20 9793 if ((mode_x == SImode || mode_x == DImode)
43e9d192
IB
9794 && y == const0_rtx
9795 && (code == EQ || code == NE || code == LT || code == GE)
f7343f20
RE
9796 && (code_x == PLUS || code_x == MINUS || code_x == AND
9797 || code_x == NEG
9798 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7325d85a 9799 && CONST_INT_P (XEXP (x, 2)))))
43e9d192
IB
9800 return CC_NZmode;
9801
1c992d1e 9802 /* A compare with a shifted operand. Because of canonicalization,
43e9d192
IB
9803 the comparison will have to be swapped when we emit the assembly
9804 code. */
f7343f20 9805 if ((mode_x == SImode || mode_x == DImode)
3793ecc1 9806 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
f7343f20
RE
9807 && (code_x == ASHIFT || code_x == ASHIFTRT
9808 || code_x == LSHIFTRT
9809 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
43e9d192
IB
9810 return CC_SWPmode;
9811
1c992d1e
RE
9812 /* Similarly for a negated operand, but we can only do this for
9813 equalities. */
f7343f20 9814 if ((mode_x == SImode || mode_x == DImode)
3793ecc1 9815 && (REG_P (y) || SUBREG_P (y))
1c992d1e 9816 && (code == EQ || code == NE)
f7343f20 9817 && code_x == NEG)
1c992d1e
RE
9818 return CC_Zmode;
9819
f7343f20
RE
9820 /* A test for unsigned overflow from an addition. */
9821 if ((mode_x == DImode || mode_x == TImode)
9822 && (code == LTU || code == GEU)
9823 && code_x == PLUS
9824 && rtx_equal_p (XEXP (x, 0), y))
ef22810a
RH
9825 return CC_Cmode;
9826
f7343f20
RE
9827 /* A test for unsigned overflow from an add with carry. */
9828 if ((mode_x == DImode || mode_x == TImode)
9829 && (code == LTU || code == GEU)
9830 && code_x == PLUS
9831 && CONST_SCALAR_INT_P (y)
9832 && (rtx_mode_t (y, mode_x)
9833 == (wi::shwi (1, mode_x)
9834 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
9835 return CC_ADCmode;
9836
30c46053 9837 /* A test for signed overflow. */
f7343f20 9838 if ((mode_x == DImode || mode_x == TImode)
30c46053 9839 && code == NE
f7343f20 9840 && code_x == PLUS
30c46053
MC
9841 && GET_CODE (y) == SIGN_EXTEND)
9842 return CC_Vmode;
9843
43e9d192
IB
9844 /* For everything else, return CCmode. */
9845 return CCmode;
9846}
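/* A few illustrative selections made by the function above (a sketch,
   not an exhaustive list):
     (compare (plus:DI a b) (const_int 0)) used with EQ/NE/LT/GE -> CC_NZmode
     (compare (ashift:DI a n) (reg b))                           -> CC_SWPmode
     (compare (plus:DI a b) a) used with LTU/GEU                 -> CC_Cmode
     floating-point compares with EQ/NE/UNORDERED/UNLT/...       -> CCFPmode
     floating-point compares with LT/LE/GT/GE/LTGT               -> CCFPEmode  */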
9847
3dfa7055 9848static int
b8506a8a 9849aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
3dfa7055 9850
cd5660ab 9851int
43e9d192
IB
9852aarch64_get_condition_code (rtx x)
9853{
ef4bddc2 9854 machine_mode mode = GET_MODE (XEXP (x, 0));
43e9d192
IB
9855 enum rtx_code comp_code = GET_CODE (x);
9856
9857 if (GET_MODE_CLASS (mode) != MODE_CC)
9858 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3dfa7055
ZC
9859 return aarch64_get_condition_code_1 (mode, comp_code);
9860}
43e9d192 9861
3dfa7055 9862static int
b8506a8a 9863aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
3dfa7055 9864{
43e9d192
IB
9865 switch (mode)
9866 {
4e10a5a7
RS
9867 case E_CCFPmode:
9868 case E_CCFPEmode:
43e9d192
IB
9869 switch (comp_code)
9870 {
9871 case GE: return AARCH64_GE;
9872 case GT: return AARCH64_GT;
9873 case LE: return AARCH64_LS;
9874 case LT: return AARCH64_MI;
9875 case NE: return AARCH64_NE;
9876 case EQ: return AARCH64_EQ;
9877 case ORDERED: return AARCH64_VC;
9878 case UNORDERED: return AARCH64_VS;
9879 case UNLT: return AARCH64_LT;
9880 case UNLE: return AARCH64_LE;
9881 case UNGT: return AARCH64_HI;
9882 case UNGE: return AARCH64_PL;
cd5660ab 9883 default: return -1;
43e9d192
IB
9884 }
9885 break;
9886
4e10a5a7 9887 case E_CCmode:
43e9d192
IB
9888 switch (comp_code)
9889 {
9890 case NE: return AARCH64_NE;
9891 case EQ: return AARCH64_EQ;
9892 case GE: return AARCH64_GE;
9893 case GT: return AARCH64_GT;
9894 case LE: return AARCH64_LE;
9895 case LT: return AARCH64_LT;
9896 case GEU: return AARCH64_CS;
9897 case GTU: return AARCH64_HI;
9898 case LEU: return AARCH64_LS;
9899 case LTU: return AARCH64_CC;
cd5660ab 9900 default: return -1;
43e9d192
IB
9901 }
9902 break;
9903
4e10a5a7 9904 case E_CC_SWPmode:
43e9d192
IB
9905 switch (comp_code)
9906 {
9907 case NE: return AARCH64_NE;
9908 case EQ: return AARCH64_EQ;
9909 case GE: return AARCH64_LE;
9910 case GT: return AARCH64_LT;
9911 case LE: return AARCH64_GE;
9912 case LT: return AARCH64_GT;
9913 case GEU: return AARCH64_LS;
9914 case GTU: return AARCH64_CC;
9915 case LEU: return AARCH64_CS;
9916 case LTU: return AARCH64_HI;
cd5660ab 9917 default: return -1;
43e9d192
IB
9918 }
9919 break;
9920
57d6f4d0
RS
9921 case E_CC_NZCmode:
9922 switch (comp_code)
9923 {
9924 case NE: return AARCH64_NE; /* = any */
9925 case EQ: return AARCH64_EQ; /* = none */
9926 case GE: return AARCH64_PL; /* = nfrst */
9927 case LT: return AARCH64_MI; /* = first */
9928 case GEU: return AARCH64_CS; /* = nlast */
9929 case GTU: return AARCH64_HI; /* = pmore */
9930 case LEU: return AARCH64_LS; /* = plast */
9931 case LTU: return AARCH64_CC; /* = last */
9932 default: return -1;
9933 }
9934 break;
9935
4e10a5a7 9936 case E_CC_NZmode:
43e9d192
IB
9937 switch (comp_code)
9938 {
9939 case NE: return AARCH64_NE;
9940 case EQ: return AARCH64_EQ;
9941 case GE: return AARCH64_PL;
9942 case LT: return AARCH64_MI;
cd5660ab 9943 default: return -1;
43e9d192
IB
9944 }
9945 break;
9946
4e10a5a7 9947 case E_CC_Zmode:
1c992d1e
RE
9948 switch (comp_code)
9949 {
9950 case NE: return AARCH64_NE;
9951 case EQ: return AARCH64_EQ;
cd5660ab 9952 default: return -1;
1c992d1e
RE
9953 }
9954 break;
9955
4e10a5a7 9956 case E_CC_Cmode:
ef22810a
RH
9957 switch (comp_code)
9958 {
f7343f20
RE
9959 case LTU: return AARCH64_CS;
9960 case GEU: return AARCH64_CC;
9961 default: return -1;
9962 }
9963 break;
9964
9965 case E_CC_ADCmode:
9966 switch (comp_code)
9967 {
9968 case GEU: return AARCH64_CS;
9969 case LTU: return AARCH64_CC;
ef22810a
RH
9970 default: return -1;
9971 }
9972 break;
9973
30c46053
MC
9974 case E_CC_Vmode:
9975 switch (comp_code)
9976 {
9977 case NE: return AARCH64_VS;
9978 case EQ: return AARCH64_VC;
9979 default: return -1;
9980 }
9981 break;
9982
43e9d192 9983 default:
cd5660ab 9984 return -1;
43e9d192 9985 }
3dfa7055 9986
3dfa7055 9987 return -1;
43e9d192
IB
9988}
9989
ddeabd3e
AL
9990bool
9991aarch64_const_vec_all_same_in_range_p (rtx x,
6a70badb
RS
9992 HOST_WIDE_INT minval,
9993 HOST_WIDE_INT maxval)
ddeabd3e 9994{
6a70badb
RS
9995 rtx elt;
9996 return (const_vec_duplicate_p (x, &elt)
9997 && CONST_INT_P (elt)
9998 && IN_RANGE (INTVAL (elt), minval, maxval));
ddeabd3e
AL
9999}
10000
10001bool
10002aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
10003{
10004 return aarch64_const_vec_all_same_in_range_p (x, val, val);
10005}
10006
43cacb12
RS
10007/* Return true if VEC is a constant in which every element is in the range
10008 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
10009
10010static bool
10011aarch64_const_vec_all_in_range_p (rtx vec,
10012 HOST_WIDE_INT minval,
10013 HOST_WIDE_INT maxval)
10014{
10015 if (GET_CODE (vec) != CONST_VECTOR
10016 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
10017 return false;
10018
10019 int nunits;
10020 if (!CONST_VECTOR_STEPPED_P (vec))
10021 nunits = const_vector_encoded_nelts (vec);
10022 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
10023 return false;
10024
10025 for (int i = 0; i < nunits; i++)
10026 {
10027 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
10028 if (!CONST_INT_P (vec_elem)
10029 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
10030 return false;
10031 }
10032 return true;
10033}
43e9d192 10034
cf670503
ZC
10035/* N Z C V. */
10036#define AARCH64_CC_V 1
10037#define AARCH64_CC_C (1 << 1)
10038#define AARCH64_CC_Z (1 << 2)
10039#define AARCH64_CC_N (1 << 3)
10040
c8012fbc
WD
10041/* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
10042static const int aarch64_nzcv_codes[] =
10043{
10044 0, /* EQ, Z == 1. */
10045 AARCH64_CC_Z, /* NE, Z == 0. */
10046 0, /* CS, C == 1. */
10047 AARCH64_CC_C, /* CC, C == 0. */
10048 0, /* MI, N == 1. */
10049 AARCH64_CC_N, /* PL, N == 0. */
10050 0, /* VS, V == 1. */
10051 AARCH64_CC_V, /* VC, V == 0. */
 10052	  0,	/* HI, C == 1 && Z == 0.  */
10053 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
10054 AARCH64_CC_V, /* GE, N == V. */
10055 0, /* LT, N != V. */
10056 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
10057 0, /* LE, !(Z == 0 && N == V). */
10058 0, /* AL, Any. */
10059 0 /* NV, Any. */
cf670503
ZC
10060};
10061
43cacb12
RS
10062/* Print floating-point vector immediate operand X to F, negating it
10063 first if NEGATE is true. Return true on success, false if it isn't
10064 a constant we can handle. */
10065
10066static bool
10067aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
10068{
10069 rtx elt;
10070
10071 if (!const_vec_duplicate_p (x, &elt))
10072 return false;
10073
10074 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
10075 if (negate)
10076 r = real_value_negate (&r);
10077
d29f7dd5
RS
10078 /* Handle the SVE single-bit immediates specially, since they have a
10079 fixed form in the assembly syntax. */
43cacb12
RS
10080 if (real_equal (&r, &dconst0))
10081 asm_fprintf (f, "0.0");
a19ba9e1
RS
10082 else if (real_equal (&r, &dconst2))
10083 asm_fprintf (f, "2.0");
43cacb12
RS
10084 else if (real_equal (&r, &dconst1))
10085 asm_fprintf (f, "1.0");
10086 else if (real_equal (&r, &dconsthalf))
10087 asm_fprintf (f, "0.5");
10088 else
d29f7dd5
RS
10089 {
10090 const int buf_size = 20;
10091 char float_buf[buf_size] = {'\0'};
10092 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
10093 1, GET_MODE (elt));
10094 asm_fprintf (f, "%s", float_buf);
10095 }
43cacb12
RS
10096
10097 return true;
10098}
10099
9f4cbab8
RS
10100/* Return the equivalent letter for size. */
10101static char
10102sizetochar (int size)
10103{
10104 switch (size)
10105 {
10106 case 64: return 'd';
10107 case 32: return 's';
10108 case 16: return 'h';
10109 case 8 : return 'b';
10110 default: gcc_unreachable ();
10111 }
10112}
10113
bcf19844
JW
10114/* Print operand X to file F in a target specific manner according to CODE.
10115 The acceptable formatting commands given by CODE are:
10116 'c': An integer or symbol address without a preceding #
10117 sign.
43cacb12
RS
10118 'C': Take the duplicated element in a vector constant
10119 and print it in hex.
10120 'D': Take the duplicated element in a vector constant
10121 and print it as an unsigned integer, in decimal.
bcf19844 10122 'e': Print the sign/zero-extend size as a character 8->b,
d113ece6
RS
10123 16->h, 32->w. Can also be used for masks:
10124 0xff->b, 0xffff->h, 0xffffffff->w.
d29f7dd5
RS
10125 'I': If the operand is a duplicated vector constant,
10126 replace it with the duplicated scalar. If the
10127 operand is then a floating-point constant, replace
10128 it with the integer bit representation. Print the
10129 transformed constant as a signed decimal number.
bcf19844
JW
10130 'p': Prints N such that 2^N == X (X must be power of 2 and
10131 const int).
10132 'P': Print the number of non-zero bits in X (a const_int).
10133 'H': Print the higher numbered register of a pair (TImode)
10134 of regs.
10135 'm': Print a condition (eq, ne, etc).
10136 'M': Same as 'm', but invert condition.
43cacb12
RS
10137 'N': Take the duplicated element in a vector constant
10138 and print the negative of it in decimal.
bcf19844
JW
10139 'b/h/s/d/q': Print a scalar FP/SIMD register name.
10140 'S/T/U/V': Print a FP/SIMD register name for a register list.
10141 The register printed is the FP/SIMD register name
10142 of X + 0/1/2/3 for S/T/U/V.
e3f15286 10143 'R': Print a scalar Integer/FP/SIMD register name + 1.
bcf19844
JW
10144 'X': Print bottom 16 bits of integer constant in hex.
10145 'w/x': Print a general register name or the zero register
10146 (32-bit or 64-bit).
10147 '0': Print a normal operand, if it's a general register,
10148 then we assume DImode.
10149 'k': Print NZCV for conditional compare instructions.
10150 'A': Output address constant representing the first
10151 argument of X, specifying a relocation offset
10152 if appropriate.
10153 'L': Output constant address specified by X
10154 with a relocation offset if appropriate.
10155 'G': Prints address of X, specifying a PC relative
e69a816d
WD
10156 relocation mode if appropriate.
10157 'y': Output address of LDP or STP - this is used for
10158 some LDP/STPs which don't use a PARALLEL in their
10159 pattern (so the mode needs to be adjusted).
10160 'z': Output address of a typical LDP or STP. */
bcf19844 10161
cc8ca59e
JB
10162static void
10163aarch64_print_operand (FILE *f, rtx x, int code)
43e9d192 10164{
43cacb12 10165 rtx elt;
43e9d192
IB
10166 switch (code)
10167 {
f541a481 10168 case 'c':
74b27d8e
RS
10169 if (CONST_INT_P (x))
10170 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
10171 else
f541a481 10172 {
74b27d8e
RS
10173 poly_int64 offset;
10174 rtx base = strip_offset_and_salt (x, &offset);
10175 if (SYMBOL_REF_P (base))
10176 output_addr_const (f, x);
10177 else
10178 output_operand_lossage ("unsupported operand for code '%c'", code);
f541a481
KT
10179 }
10180 break;
10181
43e9d192 10182 case 'e':
43e9d192 10183 {
d113ece6
RS
10184 x = unwrap_const_vec_duplicate (x);
10185 if (!CONST_INT_P (x))
43e9d192
IB
10186 {
10187 output_operand_lossage ("invalid operand for '%%%c'", code);
10188 return;
10189 }
10190
d113ece6
RS
10191 HOST_WIDE_INT val = INTVAL (x);
10192 if ((val & ~7) == 8 || val == 0xff)
10193 fputc ('b', f);
10194 else if ((val & ~7) == 16 || val == 0xffff)
10195 fputc ('h', f);
10196 else if ((val & ~7) == 32 || val == 0xffffffff)
10197 fputc ('w', f);
10198 else
43e9d192 10199 {
43e9d192
IB
10200 output_operand_lossage ("invalid operand for '%%%c'", code);
10201 return;
10202 }
10203 }
10204 break;
10205
10206 case 'p':
10207 {
10208 int n;
10209
4aa81c2e 10210 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
43e9d192
IB
10211 {
10212 output_operand_lossage ("invalid operand for '%%%c'", code);
10213 return;
10214 }
10215
10216 asm_fprintf (f, "%d", n);
10217 }
10218 break;
10219
10220 case 'P':
4aa81c2e 10221 if (!CONST_INT_P (x))
43e9d192
IB
10222 {
10223 output_operand_lossage ("invalid operand for '%%%c'", code);
10224 return;
10225 }
10226
8d55c61b 10227 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
43e9d192
IB
10228 break;
10229
10230 case 'H':
c0111dc4
RE
10231 if (x == const0_rtx)
10232 {
10233 asm_fprintf (f, "xzr");
10234 break;
10235 }
10236
4aa81c2e 10237 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
43e9d192
IB
10238 {
10239 output_operand_lossage ("invalid operand for '%%%c'", code);
10240 return;
10241 }
10242
01a3a324 10243 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
43e9d192
IB
10244 break;
10245
d29f7dd5
RS
10246 case 'I':
10247 {
10248 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
10249 if (CONST_INT_P (x))
10250 asm_fprintf (f, "%wd", INTVAL (x));
10251 else
10252 {
10253 output_operand_lossage ("invalid operand for '%%%c'", code);
10254 return;
10255 }
10256 break;
10257 }
10258
43e9d192 10259 case 'M':
c8012fbc 10260 case 'm':
cd5660ab
KT
10261 {
10262 int cond_code;
c8012fbc
WD
10263 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
10264 if (x == const_true_rtx)
cd5660ab 10265 {
c8012fbc
WD
10266 if (code == 'M')
10267 fputs ("nv", f);
cd5660ab
KT
10268 return;
10269 }
43e9d192 10270
cd5660ab
KT
10271 if (!COMPARISON_P (x))
10272 {
10273 output_operand_lossage ("invalid operand for '%%%c'", code);
10274 return;
10275 }
c8012fbc 10276
cd5660ab
KT
10277 cond_code = aarch64_get_condition_code (x);
10278 gcc_assert (cond_code >= 0);
c8012fbc
WD
10279 if (code == 'M')
10280 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
57d6f4d0
RS
10281 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
10282 fputs (aarch64_sve_condition_codes[cond_code], f);
10283 else
10284 fputs (aarch64_condition_codes[cond_code], f);
cd5660ab 10285 }
43e9d192
IB
10286 break;
10287
43cacb12
RS
10288 case 'N':
10289 if (!const_vec_duplicate_p (x, &elt))
10290 {
10291 output_operand_lossage ("invalid vector constant");
10292 return;
10293 }
10294
10295 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10296 asm_fprintf (f, "%wd", -INTVAL (elt));
10297 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10298 && aarch64_print_vector_float_operand (f, x, true))
10299 ;
10300 else
10301 {
10302 output_operand_lossage ("invalid vector constant");
10303 return;
10304 }
10305 break;
10306
43e9d192
IB
10307 case 'b':
10308 case 'h':
10309 case 's':
10310 case 'd':
10311 case 'q':
43e9d192
IB
10312 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10313 {
10314 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10315 return;
10316 }
50ce6f88 10317 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
43e9d192
IB
10318 break;
10319
10320 case 'S':
10321 case 'T':
10322 case 'U':
10323 case 'V':
43e9d192
IB
10324 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10325 {
10326 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10327 return;
10328 }
43cacb12
RS
10329 asm_fprintf (f, "%c%d",
10330 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
10331 REGNO (x) - V0_REGNUM + (code - 'S'));
43e9d192
IB
10332 break;
10333
2d8c6dc1 10334 case 'R':
e3f15286
RH
10335 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
10336 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
10337 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10338 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
10339 else
10340 output_operand_lossage ("incompatible register operand for '%%%c'",
10341 code);
2d8c6dc1
AH
10342 break;
10343
a05c0ddf 10344 case 'X':
4aa81c2e 10345 if (!CONST_INT_P (x))
a05c0ddf
IB
10346 {
10347 output_operand_lossage ("invalid operand for '%%%c'", code);
10348 return;
10349 }
50d38551 10350 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
a05c0ddf
IB
10351 break;
10352
43cacb12
RS
10353 case 'C':
10354 {
10355 /* Print a replicated constant in hex. */
10356 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10357 {
10358 output_operand_lossage ("invalid operand for '%%%c'", code);
10359 return;
10360 }
10361 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10362 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10363 }
10364 break;
10365
10366 case 'D':
10367 {
10368 /* Print a replicated constant in decimal, treating it as
10369 unsigned. */
10370 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10371 {
10372 output_operand_lossage ("invalid operand for '%%%c'", code);
10373 return;
10374 }
10375 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10376 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10377 }
10378 break;
10379
43e9d192
IB
10380 case 'w':
10381 case 'x':
3520f7cc
JG
10382 if (x == const0_rtx
10383 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
43e9d192 10384 {
50ce6f88 10385 asm_fprintf (f, "%czr", code);
43e9d192
IB
10386 break;
10387 }
10388
10389 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10390 {
50ce6f88 10391 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
43e9d192
IB
10392 break;
10393 }
10394
10395 if (REG_P (x) && REGNO (x) == SP_REGNUM)
10396 {
50ce6f88 10397 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
43e9d192
IB
10398 break;
10399 }
10400
10401 /* Fall through */
10402
10403 case 0:
43e9d192
IB
10404 if (x == NULL)
10405 {
10406 output_operand_lossage ("missing operand");
10407 return;
10408 }
10409
10410 switch (GET_CODE (x))
10411 {
10412 case REG:
43cacb12 10413 if (aarch64_sve_data_mode_p (GET_MODE (x)))
9f4cbab8
RS
10414 {
10415 if (REG_NREGS (x) == 1)
10416 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
10417 else
10418 {
10419 char suffix
10420 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
10421 asm_fprintf (f, "{z%d.%c - z%d.%c}",
10422 REGNO (x) - V0_REGNUM, suffix,
10423 END_REGNO (x) - V0_REGNUM - 1, suffix);
10424 }
10425 }
43cacb12
RS
10426 else
10427 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
43e9d192
IB
10428 break;
10429
10430 case MEM:
cc8ca59e 10431 output_address (GET_MODE (x), XEXP (x, 0));
43e9d192
IB
10432 break;
10433
10434 case LABEL_REF:
10435 case SYMBOL_REF:
10436 output_addr_const (asm_out_file, x);
10437 break;
10438
10439 case CONST_INT:
10440 asm_fprintf (f, "%wd", INTVAL (x));
10441 break;
10442
43cacb12
RS
10443 case CONST:
10444 if (!VECTOR_MODE_P (GET_MODE (x)))
3520f7cc 10445 {
43cacb12
RS
10446 output_addr_const (asm_out_file, x);
10447 break;
3520f7cc 10448 }
43cacb12
RS
10449 /* fall through */
10450
10451 case CONST_VECTOR:
10452 if (!const_vec_duplicate_p (x, &elt))
3520f7cc 10453 {
43cacb12
RS
10454 output_operand_lossage ("invalid vector constant");
10455 return;
3520f7cc 10456 }
43cacb12
RS
10457
10458 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10459 asm_fprintf (f, "%wd", INTVAL (elt));
10460 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10461 && aarch64_print_vector_float_operand (f, x, false))
10462 ;
3520f7cc 10463 else
43cacb12
RS
10464 {
10465 output_operand_lossage ("invalid vector constant");
10466 return;
10467 }
43e9d192
IB
10468 break;
10469
3520f7cc 10470 case CONST_DOUBLE:
2ca5b430
KT
10471 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
10472 be getting CONST_DOUBLEs holding integers. */
10473 gcc_assert (GET_MODE (x) != VOIDmode);
10474 if (aarch64_float_const_zero_rtx_p (x))
3520f7cc
JG
10475 {
10476 fputc ('0', f);
10477 break;
10478 }
10479 else if (aarch64_float_const_representable_p (x))
10480 {
10481#define buf_size 20
10482 char float_buf[buf_size] = {'\0'};
34a72c33
RS
10483 real_to_decimal_for_mode (float_buf,
10484 CONST_DOUBLE_REAL_VALUE (x),
3520f7cc
JG
10485 buf_size, buf_size,
10486 1, GET_MODE (x));
10487 asm_fprintf (asm_out_file, "%s", float_buf);
10488 break;
10489#undef buf_size
10490 }
10491 output_operand_lossage ("invalid constant");
10492 return;
43e9d192
IB
10493 default:
10494 output_operand_lossage ("invalid operand");
10495 return;
10496 }
10497 break;
10498
10499 case 'A':
10500 if (GET_CODE (x) == HIGH)
10501 x = XEXP (x, 0);
10502
a6e0bfa7 10503 switch (aarch64_classify_symbolic_expression (x))
43e9d192 10504 {
6642bdb4 10505 case SYMBOL_SMALL_GOT_4G:
43e9d192
IB
10506 asm_fprintf (asm_out_file, ":got:");
10507 break;
10508
10509 case SYMBOL_SMALL_TLSGD:
10510 asm_fprintf (asm_out_file, ":tlsgd:");
10511 break;
10512
10513 case SYMBOL_SMALL_TLSDESC:
10514 asm_fprintf (asm_out_file, ":tlsdesc:");
10515 break;
10516
79496620 10517 case SYMBOL_SMALL_TLSIE:
43e9d192
IB
10518 asm_fprintf (asm_out_file, ":gottprel:");
10519 break;
10520
d18ba284 10521 case SYMBOL_TLSLE24:
43e9d192
IB
10522 asm_fprintf (asm_out_file, ":tprel:");
10523 break;
10524
87dd8ab0
MS
10525 case SYMBOL_TINY_GOT:
10526 gcc_unreachable ();
10527 break;
10528
43e9d192
IB
10529 default:
10530 break;
10531 }
10532 output_addr_const (asm_out_file, x);
10533 break;
10534
10535 case 'L':
a6e0bfa7 10536 switch (aarch64_classify_symbolic_expression (x))
43e9d192 10537 {
6642bdb4 10538 case SYMBOL_SMALL_GOT_4G:
43e9d192
IB
10539 asm_fprintf (asm_out_file, ":lo12:");
10540 break;
10541
10542 case SYMBOL_SMALL_TLSGD:
10543 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
10544 break;
10545
10546 case SYMBOL_SMALL_TLSDESC:
10547 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
10548 break;
10549
79496620 10550 case SYMBOL_SMALL_TLSIE:
43e9d192
IB
10551 asm_fprintf (asm_out_file, ":gottprel_lo12:");
10552 break;
10553
cbf5629e
JW
10554 case SYMBOL_TLSLE12:
10555 asm_fprintf (asm_out_file, ":tprel_lo12:");
10556 break;
10557
d18ba284 10558 case SYMBOL_TLSLE24:
43e9d192
IB
10559 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
10560 break;
10561
87dd8ab0
MS
10562 case SYMBOL_TINY_GOT:
10563 asm_fprintf (asm_out_file, ":got:");
10564 break;
10565
5ae7caad
JW
10566 case SYMBOL_TINY_TLSIE:
10567 asm_fprintf (asm_out_file, ":gottprel:");
10568 break;
10569
43e9d192
IB
10570 default:
10571 break;
10572 }
10573 output_addr_const (asm_out_file, x);
10574 break;
10575
10576 case 'G':
a6e0bfa7 10577 switch (aarch64_classify_symbolic_expression (x))
43e9d192 10578 {
d18ba284 10579 case SYMBOL_TLSLE24:
43e9d192
IB
10580 asm_fprintf (asm_out_file, ":tprel_hi12:");
10581 break;
10582 default:
10583 break;
10584 }
10585 output_addr_const (asm_out_file, x);
10586 break;
10587
cf670503
ZC
10588 case 'k':
10589 {
c8012fbc 10590 HOST_WIDE_INT cond_code;
cf670503 10591
c8012fbc 10592 if (!CONST_INT_P (x))
cf670503
ZC
10593 {
10594 output_operand_lossage ("invalid operand for '%%%c'", code);
10595 return;
10596 }
10597
c8012fbc
WD
10598 cond_code = INTVAL (x);
10599 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
10600 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
cf670503
ZC
10601 }
10602 break;
10603
e69a816d
WD
10604 case 'y':
10605 case 'z':
10606 {
10607 machine_mode mode = GET_MODE (x);
10608
3793ecc1 10609 if (!MEM_P (x)
6a70badb 10610 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
e69a816d
WD
10611 {
10612 output_operand_lossage ("invalid operand for '%%%c'", code);
10613 return;
10614 }
10615
a25831ac
AV
10616 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
10617 code == 'y'
10618 ? ADDR_QUERY_LDP_STP_N
10619 : ADDR_QUERY_LDP_STP))
c348cab0 10620 output_operand_lossage ("invalid operand prefix '%%%c'", code);
e69a816d
WD
10621 }
10622 break;
10623
43e9d192
IB
10624 default:
10625 output_operand_lossage ("invalid operand prefix '%%%c'", code);
10626 return;
10627 }
10628}
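/* A few examples of what the more common codes above print
   (illustrative sketch only):
     %w0 / %x0 on register x0      -> "w0" / "x0"
     %w1 on (const_int 0)          -> "wzr"
     %e  on (const_int 0xffff)     -> "h"
     %p  on (const_int 16)         -> "4"
     %H  on a TImode pair in x0/x1 -> "x1"
     %X  on (const_int 0x12345)    -> "0x2345"  */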
10629
e69a816d
WD
10630/* Print address 'x' of a memory access with mode 'mode'.
 10631	   'type' is the aarch64_addr_query_type context required by aarch64_classify_address;
 10632	   it distinguishes normal memory accesses from LDP/STP and other special accesses.  */
c348cab0 10633static bool
a97d8b98
RS
10634aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
10635 aarch64_addr_query_type type)
43e9d192
IB
10636{
10637 struct aarch64_address_info addr;
550a3380 10638 unsigned int size, vec_flags;
43e9d192 10639
e69a816d 10640 /* Check all addresses are Pmode - including ILP32. */
31460ed2
JJ
10641 if (GET_MODE (x) != Pmode
10642 && (!CONST_INT_P (x)
10643 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
10644 {
10645 output_operand_lossage ("invalid address mode");
10646 return false;
10647 }
e69a816d 10648
a97d8b98 10649 if (aarch64_classify_address (&addr, x, mode, true, type))
43e9d192
IB
10650 switch (addr.type)
10651 {
10652 case ADDRESS_REG_IMM:
dc640181 10653 if (known_eq (addr.const_offset, 0))
43cacb12 10654 {
550a3380
RS
10655 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
10656 return true;
43cacb12 10657 }
550a3380
RS
10658
10659 vec_flags = aarch64_classify_vector_mode (mode);
10660 if (vec_flags & VEC_ANY_SVE)
43cacb12
RS
10661 {
10662 HOST_WIDE_INT vnum
10663 = exact_div (addr.const_offset,
550a3380 10664 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
43cacb12
RS
10665 asm_fprintf (f, "[%s, #%wd, mul vl]",
10666 reg_names[REGNO (addr.base)], vnum);
550a3380 10667 return true;
43cacb12 10668 }
550a3380
RS
10669
10670 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
10671 INTVAL (addr.offset));
c348cab0 10672 return true;
43e9d192
IB
10673
10674 case ADDRESS_REG_REG:
10675 if (addr.shift == 0)
16a3246f 10676 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
01a3a324 10677 reg_names [REGNO (addr.offset)]);
43e9d192 10678 else
16a3246f 10679 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
01a3a324 10680 reg_names [REGNO (addr.offset)], addr.shift);
c348cab0 10681 return true;
43e9d192
IB
10682
10683 case ADDRESS_REG_UXTW:
10684 if (addr.shift == 0)
16a3246f 10685 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
43e9d192
IB
10686 REGNO (addr.offset) - R0_REGNUM);
10687 else
16a3246f 10688 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
43e9d192 10689 REGNO (addr.offset) - R0_REGNUM, addr.shift);
c348cab0 10690 return true;
43e9d192
IB
10691
10692 case ADDRESS_REG_SXTW:
10693 if (addr.shift == 0)
16a3246f 10694 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
43e9d192
IB
10695 REGNO (addr.offset) - R0_REGNUM);
10696 else
16a3246f 10697 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
43e9d192 10698 REGNO (addr.offset) - R0_REGNUM, addr.shift);
c348cab0 10699 return true;
43e9d192
IB
10700
10701 case ADDRESS_REG_WB:
6a70badb
RS
10702 /* Writeback is only supported for fixed-width modes. */
10703 size = GET_MODE_SIZE (mode).to_constant ();
43e9d192
IB
10704 switch (GET_CODE (x))
10705 {
10706 case PRE_INC:
6a70badb 10707 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
c348cab0 10708 return true;
43e9d192 10709 case POST_INC:
6a70badb 10710 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
c348cab0 10711 return true;
43e9d192 10712 case PRE_DEC:
6a70badb 10713 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
c348cab0 10714 return true;
43e9d192 10715 case POST_DEC:
6a70badb 10716 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
c348cab0 10717 return true;
43e9d192 10718 case PRE_MODIFY:
6a70badb 10719 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
43e9d192 10720 INTVAL (addr.offset));
c348cab0 10721 return true;
43e9d192 10722 case POST_MODIFY:
6a70badb 10723 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
43e9d192 10724 INTVAL (addr.offset));
c348cab0 10725 return true;
43e9d192
IB
10726 default:
10727 break;
10728 }
10729 break;
10730
10731 case ADDRESS_LO_SUM:
16a3246f 10732 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
43e9d192
IB
10733 output_addr_const (f, addr.offset);
10734 asm_fprintf (f, "]");
c348cab0 10735 return true;
43e9d192
IB
10736
10737 case ADDRESS_SYMBOLIC:
d6591257 10738 output_addr_const (f, x);
c348cab0 10739 return true;
43e9d192
IB
10740 }
10741
c348cab0 10742 return false;
43e9d192
IB
10743}
10744
e69a816d
WD
10745/* Print address 'x' of a memory access with mode 'mode'. */
10746static void
10747aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
10748{
43cacb12 10749 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
c348cab0 10750 output_addr_const (f, x);
e69a816d
WD
10751}
10752
74b27d8e
RS
10753/* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
10754
10755static bool
10756aarch64_output_addr_const_extra (FILE *file, rtx x)
10757{
10758 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
10759 {
10760 output_addr_const (file, XVECEXP (x, 0, 0));
10761 return true;
10762 }
10763 return false;
10764}
10765
43e9d192
IB
10766bool
10767aarch64_label_mentioned_p (rtx x)
10768{
10769 const char *fmt;
10770 int i;
10771
3793ecc1 10772 if (LABEL_REF_P (x))
43e9d192
IB
10773 return true;
10774
10775 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
10776 referencing instruction, but they are constant offsets, not
10777 symbols. */
10778 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10779 return false;
10780
10781 fmt = GET_RTX_FORMAT (GET_CODE (x));
10782 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
10783 {
10784 if (fmt[i] == 'E')
10785 {
10786 int j;
10787
10788 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
10789 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
10790 return 1;
10791 }
10792 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
10793 return 1;
10794 }
10795
10796 return 0;
10797}
10798
10799/* Implement REGNO_REG_CLASS. */
10800
10801enum reg_class
10802aarch64_regno_regclass (unsigned regno)
10803{
96b7f495
MM
10804 if (STUB_REGNUM_P (regno))
10805 return STUB_REGS;
10806
43e9d192 10807 if (GP_REGNUM_P (regno))
a4a182c6 10808 return GENERAL_REGS;
43e9d192
IB
10809
10810 if (regno == SP_REGNUM)
10811 return STACK_REG;
10812
10813 if (regno == FRAME_POINTER_REGNUM
10814 || regno == ARG_POINTER_REGNUM)
f24bb080 10815 return POINTER_REGS;
43e9d192
IB
10816
10817 if (FP_REGNUM_P (regno))
163b1f6a
RS
10818 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
10819 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
43e9d192 10820
43cacb12
RS
10821 if (PR_REGNUM_P (regno))
10822 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
10823
183bfdaf
RS
10824 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
10825 return FFR_REGS;
10826
43e9d192
IB
10827 return NO_REGS;
10828}
10829
6a70badb
RS
10830/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
10831 If OFFSET is out of range, return an offset of an anchor point
10832 that is in range. Return 0 otherwise. */
10833
10834static HOST_WIDE_INT
10835aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
10836 machine_mode mode)
10837{
10838 /* Does it look like we'll need a 16-byte load/store-pair operation? */
10839 if (size > 16)
10840 return (offset + 0x400) & ~0x7f0;
10841
10842 /* For offsets that aren't a multiple of the access size, the limit is
10843 -256...255. */
10844 if (offset & (size - 1))
10845 {
10846 /* BLKmode typically uses LDP of X-registers. */
10847 if (mode == BLKmode)
10848 return (offset + 512) & ~0x3ff;
10849 return (offset + 0x100) & ~0x1ff;
10850 }
10851
10852 /* Small negative offsets are supported. */
10853 if (IN_RANGE (offset, -256, 0))
10854 return 0;
10855
10856 if (mode == TImode || mode == TFmode)
10857 return (offset + 0x100) & ~0x1ff;
10858
10859 /* Use 12-bit offset by access size. */
10860 return offset & (~0xfff * size);
10861}
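/* Worked examples for DImode (size 8), as a sketch of the cases above:
     offset 0x12345 (misaligned) -> (0x12345 + 0x100) & ~0x1ff = 0x12400
     offset 0x12340 (aligned)    -> 0x12340 & (~0xfff * 8)     = 0x10000
     offset -64                  -> 0 (small negative offsets need no anchor)  */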
10862
0c4ec427 10863static rtx
ef4bddc2 10864aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
0c4ec427
RE
10865{
 10866	  /* Try to split X+CONST into Y = X + (CONST & ~mask), Y + (CONST & mask),
10867 where mask is selected by alignment and size of the offset.
10868 We try to pick as large a range for the offset as possible to
10869 maximize the chance of a CSE. However, for aligned addresses
10870 we limit the range to 4k so that structures with different sized
e8426e0a
BC
10871 elements are likely to use the same base. We need to be careful
10872 not to split a CONST for some forms of address expression, otherwise
10873 it will generate sub-optimal code. */
0c4ec427
RE
10874
10875 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
10876 {
9e0218fc 10877 rtx base = XEXP (x, 0);
17d7bdd8 10878 rtx offset_rtx = XEXP (x, 1);
9e0218fc 10879 HOST_WIDE_INT offset = INTVAL (offset_rtx);
0c4ec427 10880
9e0218fc 10881 if (GET_CODE (base) == PLUS)
e8426e0a 10882 {
9e0218fc
RH
10883 rtx op0 = XEXP (base, 0);
10884 rtx op1 = XEXP (base, 1);
10885
10886 /* Force any scaling into a temp for CSE. */
10887 op0 = force_reg (Pmode, op0);
10888 op1 = force_reg (Pmode, op1);
10889
10890 /* Let the pointer register be in op0. */
10891 if (REG_POINTER (op1))
10892 std::swap (op0, op1);
10893
10894 /* If the pointer is virtual or frame related, then we know that
10895 virtual register instantiation or register elimination is going
10896 to apply a second constant. We want the two constants folded
10897 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
10898 if (virt_or_elim_regno_p (REGNO (op0)))
e8426e0a 10899 {
9e0218fc
RH
10900 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
10901 NULL_RTX, true, OPTAB_DIRECT);
10902 return gen_rtx_PLUS (Pmode, base, op1);
e8426e0a 10903 }
e8426e0a 10904
9e0218fc
RH
 10905	  /* Otherwise, in order to encourage CSE (and thence loop strength
 10906	     reduction) of scaled addresses, emit as (OP0 + OP1) + CONST.  */
10907 base = expand_binop (Pmode, add_optab, op0, op1,
10908 NULL_RTX, true, OPTAB_DIRECT);
10909 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
e8426e0a
BC
10910 }
10911
6a70badb
RS
10912 HOST_WIDE_INT size;
10913 if (GET_MODE_SIZE (mode).is_constant (&size))
ff0f3f1c 10914 {
6a70badb
RS
10915 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
10916 mode);
10917 if (base_offset != 0)
10918 {
10919 base = plus_constant (Pmode, base, base_offset);
10920 base = force_operand (base, NULL_RTX);
10921 return plus_constant (Pmode, base, offset - base_offset);
10922 }
9e0218fc 10923 }
0c4ec427
RE
10924 }
10925
10926 return x;
10927}
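/* For example (a sketch): given a DImode access to
   (plus (reg X1) (const_int 0x10008)), aarch64_anchor_offset returns
   0x10000, so the address is roughly rewritten as
   (plus (plus (reg X1) 0x10000) 0x8), allowing nearby accesses to CSE
   the X1 + 0x10000 anchor.  */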
10928
43e9d192
IB
10929static reg_class_t
10930aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
10931 reg_class_t rclass,
ef4bddc2 10932 machine_mode mode,
43e9d192
IB
10933 secondary_reload_info *sri)
10934{
cc68f7c2
RS
10935 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10936 LDR and STR. See the comment at the head of aarch64-sve.md for
10937 more details about the big-endian handling. */
10938 if (reg_class_subset_p (rclass, FP_REGS)
9a1b9cb4
RS
10939 && !((REG_P (x) && HARD_REGISTER_P (x))
10940 || aarch64_simd_valid_immediate (x, NULL))
cc68f7c2 10941 && mode != VNx16QImode)
43cacb12 10942 {
cc68f7c2
RS
10943 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10944 if ((vec_flags & VEC_SVE_DATA)
10945 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
10946 {
10947 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
10948 return NO_REGS;
10949 }
43cacb12 10950 }
b4f50fd4
RR
10951
10952 /* If we have to disable direct literal pool loads and stores because the
10953 function is too big, then we need a scratch register. */
3793ecc1 10954 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
b4f50fd4
RR
10955 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
10956 || targetm.vector_mode_supported_p (GET_MODE (x)))
9ee6540a 10957 && !aarch64_pcrelative_literal_loads)
b4f50fd4 10958 {
0016d8d9 10959 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
b4f50fd4
RR
10960 return NO_REGS;
10961 }
10962
43e9d192
IB
10963 /* Without the TARGET_SIMD instructions we cannot move a Q register
10964 to a Q register directly. We need a scratch. */
10965 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
10966 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
10967 && reg_class_subset_p (rclass, FP_REGS))
10968 {
0016d8d9 10969 sri->icode = code_for_aarch64_reload_mov (mode);
43e9d192
IB
10970 return NO_REGS;
10971 }
10972
 10973	  /* A TFmode or TImode memory access should be handled via an FP register
10974 because AArch64 has richer addressing modes for LDR/STR instructions
10975 than LDP/STP instructions. */
d5726973 10976 if (TARGET_FLOAT && rclass == GENERAL_REGS
6a70badb 10977 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
43e9d192
IB
10978 return FP_REGS;
10979
 10980	  if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
a4a182c6 10981 return GENERAL_REGS;
43e9d192
IB
10982
10983 return NO_REGS;
10984}
10985
10986static bool
6216fd90 10987aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
43e9d192 10988{
6216fd90 10989 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
43e9d192 10990
6216fd90
WD
10991 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10992 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
43e9d192 10993 if (frame_pointer_needed)
6216fd90 10994 return to == HARD_FRAME_POINTER_REGNUM;
43e9d192
IB
10995 return true;
10996}
10997
6a70badb 10998poly_int64
43e9d192
IB
10999aarch64_initial_elimination_offset (unsigned from, unsigned to)
11000{
78c29983
MS
11001 if (to == HARD_FRAME_POINTER_REGNUM)
11002 {
11003 if (from == ARG_POINTER_REGNUM)
71bfb77a 11004 return cfun->machine->frame.hard_fp_offset;
78c29983
MS
11005
11006 if (from == FRAME_POINTER_REGNUM)
71bfb77a
WD
11007 return cfun->machine->frame.hard_fp_offset
11008 - cfun->machine->frame.locals_offset;
78c29983
MS
11009 }
11010
11011 if (to == STACK_POINTER_REGNUM)
11012 {
11013 if (from == FRAME_POINTER_REGNUM)
71bfb77a
WD
11014 return cfun->machine->frame.frame_size
11015 - cfun->machine->frame.locals_offset;
78c29983
MS
11016 }
11017
1c960e02 11018 return cfun->machine->frame.frame_size;
43e9d192
IB
11019}
11020
463a54e5
SN
11021
11022/* Get return address without mangling. */
11023
11024rtx
11025aarch64_return_addr_rtx (void)
11026{
11027 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
11028 /* Note: aarch64_return_address_signing_enabled only
11029 works after cfun->machine->frame.laid_out is set,
11030 so here we don't know if the return address will
11031 be signed or not. */
11032 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
11033 emit_move_insn (lr, val);
11034 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
11035 return lr;
11036}
11037
11038
43e9d192
IB
11039/* Implement RETURN_ADDR_RTX. We do not support moving back to a
11040 previous frame. */
11041
11042rtx
11043aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
11044{
11045 if (count != 0)
11046 return const0_rtx;
463a54e5 11047 return aarch64_return_addr_rtx ();
43e9d192
IB
11048}
11049
43e9d192
IB
11050static void
11051aarch64_asm_trampoline_template (FILE *f)
11052{
be7c41a5
OT
11053 /* Even if the current function doesn't have branch protection, some
11054 later function might, so since this template is only generated once
11055 we have to add a BTI just in case. */
11056 asm_fprintf (f, "\thint\t34 // bti c\n");
b5f794b4 11057
28514dda
YZ
11058 if (TARGET_ILP32)
11059 {
be178ecd
MM
11060 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
11061 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
28514dda
YZ
11062 }
11063 else
11064 {
be178ecd
MM
11065 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
11066 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
28514dda 11067 }
01a3a324 11068 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
b5f794b4 11069
be178ecd
MM
11070 /* We always emit a speculation barrier.
11071 This is because the same trampoline template is used for every nested
 11072	     function.  Since nested functions are not particularly common or
 11073	     performance-critical, we don't worry too much about the extra
 11074	     instructions needed to copy it around.
 11075	     This is not yet a problem, since we have not yet implemented
 11076	     function-specific attributes to choose between hardening against
 11077	     straight-line speculation or not, but such attributes are likely to
 11078	     appear in the future.  */
11079 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
11080
28514dda
YZ
11081 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
11082 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
43e9d192
IB
11083}
11084
11085static void
11086aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
11087{
11088 rtx fnaddr, mem, a_tramp;
be178ecd 11089 const int tramp_code_sz = 24;
43e9d192
IB
11090
11091 /* Don't need to copy the trailing D-words, we fill those in below. */
be178ecd
MM
11092 /* We create our own memory address in Pmode so that `emit_block_move` can
11093 use parts of the backend which expect Pmode addresses. */
11094 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
11095 emit_block_move (gen_rtx_MEM (BLKmode, temp),
11096 assemble_trampoline_template (),
28514dda
YZ
11097 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
11098 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
43e9d192 11099 fnaddr = XEXP (DECL_RTL (fndecl), 0);
28514dda
YZ
11100 if (GET_MODE (fnaddr) != ptr_mode)
11101 fnaddr = convert_memory_address (ptr_mode, fnaddr);
43e9d192
IB
11102 emit_move_insn (mem, fnaddr);
11103
28514dda 11104 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
43e9d192
IB
11105 emit_move_insn (mem, chain_value);
11106
11107 /* XXX We should really define a "clear_cache" pattern and use
11108 gen_clear_cache(). */
11109 a_tramp = XEXP (m_tramp, 0);
c05ece92
AO
11110 maybe_emit_call_builtin___clear_cache (a_tramp,
11111 plus_constant (ptr_mode,
11112 a_tramp,
11113 TRAMPOLINE_SIZE));
43e9d192
IB
11114}
11115
11116static unsigned char
ef4bddc2 11117aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
43e9d192 11118{
6a70badb
RS
11119 /* ??? Logically we should only need to provide a value when
11120 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
11121 can hold MODE, but at the moment we need to handle all modes.
11122 Just ignore any runtime parts for registers that can't store them. */
11123 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
550a3380 11124 unsigned int nregs, vec_flags;
43e9d192
IB
11125 switch (regclass)
11126 {
96b7f495 11127 case STUB_REGS:
d677263e 11128 case TAILCALL_ADDR_REGS:
43e9d192
IB
11129 case POINTER_REGS:
11130 case GENERAL_REGS:
11131 case ALL_REGS:
f25a140b 11132 case POINTER_AND_FP_REGS:
43e9d192
IB
11133 case FP_REGS:
11134 case FP_LO_REGS:
163b1f6a 11135 case FP_LO8_REGS:
550a3380
RS
11136 vec_flags = aarch64_classify_vector_mode (mode);
11137 if ((vec_flags & VEC_SVE_DATA)
43cacb12 11138 && constant_multiple_p (GET_MODE_SIZE (mode),
550a3380 11139 aarch64_vl_bytes (mode, vec_flags), &nregs))
43cacb12 11140 return nregs;
550a3380 11141 return (vec_flags & VEC_ADVSIMD
6a70badb
RS
11142 ? CEIL (lowest_size, UNITS_PER_VREG)
11143 : CEIL (lowest_size, UNITS_PER_WORD));
43e9d192 11144 case STACK_REG:
43cacb12
RS
11145 case PR_REGS:
11146 case PR_LO_REGS:
11147 case PR_HI_REGS:
183bfdaf
RS
11148 case FFR_REGS:
11149 case PR_AND_FFR_REGS:
43e9d192
IB
11150 return 1;
11151
11152 case NO_REGS:
11153 return 0;
11154
11155 default:
11156 break;
11157 }
11158 gcc_unreachable ();
11159}
11160
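/* Implement TARGET_PREFERRED_RELOAD_CLASS.  */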
11161static reg_class_t
78d8b9f0 11162aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
43e9d192 11163{
51bb310d 11164 if (regclass == POINTER_REGS)
78d8b9f0
IB
11165 return GENERAL_REGS;
11166
51bb310d
MS
11167 if (regclass == STACK_REG)
11168 {
11169 if (REG_P(x)
11170 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
11171 return regclass;
11172
11173 return NO_REGS;
11174 }
11175
27bd251b
IB
 11176  /* Register elimination can result in a request for
 11177     SP+constant->FP_REGS.  We cannot support such operations, which
 11178     use SP as source and an FP_REG as destination, so reject them
 11179     outright here.  */
11180 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
11181 {
11182 rtx lhs = XEXP (x, 0);
11183
11184 /* Look through a possible SUBREG introduced by ILP32. */
3793ecc1 11185 if (SUBREG_P (lhs))
27bd251b
IB
11186 lhs = SUBREG_REG (lhs);
11187
11188 gcc_assert (REG_P (lhs));
11189 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
11190 POINTER_REGS));
11191 return NO_REGS;
11192 }
11193
78d8b9f0 11194 return regclass;
43e9d192
IB
11195}
11196
11197void
11198aarch64_asm_output_labelref (FILE* f, const char *name)
11199{
11200 asm_fprintf (f, "%U%s", name);
11201}
11202
11203static void
11204aarch64_elf_asm_constructor (rtx symbol, int priority)
11205{
11206 if (priority == DEFAULT_INIT_PRIORITY)
11207 default_ctor_section_asm_out_constructor (symbol, priority);
11208 else
11209 {
11210 section *s;
53d190c1
AT
 11211      /* Although priority is known to be in the range [0, 65535] (so 18
 11212	 bytes would be enough), the compiler might not know that.  To avoid
 11213	 a -Wformat-truncation false positive, use a larger size.  */
11214 char buf[23];
43e9d192 11215 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
fcef3abd 11216 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
43e9d192
IB
11217 switch_to_section (s);
11218 assemble_align (POINTER_SIZE);
28514dda 11219 assemble_aligned_integer (POINTER_BYTES, symbol);
43e9d192
IB
11220 }
11221}
11222
11223static void
11224aarch64_elf_asm_destructor (rtx symbol, int priority)
11225{
11226 if (priority == DEFAULT_INIT_PRIORITY)
11227 default_dtor_section_asm_out_destructor (symbol, priority);
11228 else
11229 {
11230 section *s;
53d190c1
AT
 11231      /* Although priority is known to be in the range [0, 65535] (so 18
 11232	 bytes would be enough), the compiler might not know that.  To avoid
 11233	 a -Wformat-truncation false positive, use a larger size.  */
11234 char buf[23];
43e9d192 11235 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
fcef3abd 11236 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
43e9d192
IB
11237 switch_to_section (s);
11238 assemble_align (POINTER_SIZE);
28514dda 11239 assemble_aligned_integer (POINTER_BYTES, symbol);
43e9d192
IB
11240 }
11241}
11242
11243const char*
11244aarch64_output_casesi (rtx *operands)
11245{
11246 char buf[100];
11247 char label[100];
b32d5189 11248 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
43e9d192
IB
11249 int index;
11250 static const char *const patterns[4][2] =
11251 {
11252 {
11253 "ldrb\t%w3, [%0,%w1,uxtw]",
11254 "add\t%3, %4, %w3, sxtb #2"
11255 },
11256 {
11257 "ldrh\t%w3, [%0,%w1,uxtw #1]",
11258 "add\t%3, %4, %w3, sxth #2"
11259 },
11260 {
11261 "ldr\t%w3, [%0,%w1,uxtw #2]",
11262 "add\t%3, %4, %w3, sxtw #2"
11263 },
11264 /* We assume that DImode is only generated when not optimizing and
11265 that we don't really need 64-bit address offsets. That would
11266 imply an object file with 8GB of code in a single function! */
11267 {
11268 "ldr\t%w3, [%0,%w1,uxtw #2]",
11269 "add\t%3, %4, %w3, sxtw #2"
11270 }
11271 };
11272
11273 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
11274
77e994c9
RS
11275 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
11276 index = exact_log2 (GET_MODE_SIZE (mode));
43e9d192
IB
11277
11278 gcc_assert (index >= 0 && index <= 3);
11279
 11280  /* Need to implement table size reduction by changing the code below.  */
11281 output_asm_insn (patterns[index][0], operands);
11282 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
11283 snprintf (buf, sizeof (buf),
11284 "adr\t%%4, %s", targetm.strip_name_encoding (label));
11285 output_asm_insn (buf, operands);
11286 output_asm_insn (patterns[index][1], operands);
11287 output_asm_insn ("br\t%3", operands);
be178ecd
MM
11288 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
11289 operands);
43e9d192
IB
11290 assemble_label (asm_out_file, label);
11291 return "";
11292}
11293
11294
11295/* Return size in bits of an arithmetic operand which is shifted/scaled and
11296 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
11297 operator. */
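/* For example (illustrative): aarch64_uxt_size (1, 0x1fe) returns 8, since
   0x1fe == 0xff << 1 and the operand therefore fits a UXTB extend combined
   with LSL #1, whereas aarch64_uxt_size (0, 0x1ff) returns 0 because 0x1ff
   is not a byte, halfword or word mask.  */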
11298
11299int
11300aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
11301{
11302 if (shift >= 0 && shift <= 3)
11303 {
11304 int size;
11305 for (size = 8; size <= 32; size *= 2)
11306 {
11307 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
11308 if (mask == bits << shift)
11309 return size;
11310 }
11311 }
11312 return 0;
11313}
11314
e78d485e
RR
 11315/* Constant pools are per-function only when PC-relative
 11316   literal loads are enabled or we are in the large memory
 11317   model.  */
11318
11319static inline bool
11320aarch64_can_use_per_function_literal_pools_p (void)
11321{
9ee6540a 11322 return (aarch64_pcrelative_literal_loads
e78d485e
RR
11323 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
11324}
11325
43e9d192 11326static bool
e78d485e 11327aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
43e9d192 11328{
74a9301d
VM
11329 /* We can't use blocks for constants when we're using a per-function
11330 constant pool. */
11331 return !aarch64_can_use_per_function_literal_pools_p ();
43e9d192
IB
11332}
11333
e78d485e
RR
11334/* Select appropriate section for constants depending
11335 on where we place literal pools. */
11336
43e9d192 11337static section *
e78d485e
RR
11338aarch64_select_rtx_section (machine_mode mode,
11339 rtx x,
11340 unsigned HOST_WIDE_INT align)
43e9d192 11341{
e78d485e
RR
11342 if (aarch64_can_use_per_function_literal_pools_p ())
11343 return function_section (current_function_decl);
43e9d192 11344
e78d485e
RR
11345 return default_elf_select_rtx_section (mode, x, align);
11346}
43e9d192 11347
5fca7b66
RH
11348/* Implement ASM_OUTPUT_POOL_EPILOGUE. */
11349void
11350aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
11351 HOST_WIDE_INT offset)
11352{
11353 /* When using per-function literal pools, we must ensure that any code
11354 section is aligned to the minimal instruction length, lest we get
11355 errors from the assembler re "unaligned instructions". */
11356 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
11357 ASM_OUTPUT_ALIGN (f, 2);
11358}
11359
43e9d192
IB
11360/* Costs. */
11361
11362/* Helper function for rtx cost calculation. Strip a shift expression
11363 from X. Returns the inner operand if successful, or the original
11364 expression on failure. */
11365static rtx
11366aarch64_strip_shift (rtx x)
11367{
11368 rtx op = x;
11369
57b77d46
RE
11370 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
11371 we can convert both to ROR during final output. */
43e9d192
IB
11372 if ((GET_CODE (op) == ASHIFT
11373 || GET_CODE (op) == ASHIFTRT
57b77d46
RE
11374 || GET_CODE (op) == LSHIFTRT
11375 || GET_CODE (op) == ROTATERT
11376 || GET_CODE (op) == ROTATE)
43e9d192
IB
11377 && CONST_INT_P (XEXP (op, 1)))
11378 return XEXP (op, 0);
11379
11380 if (GET_CODE (op) == MULT
11381 && CONST_INT_P (XEXP (op, 1))
11382 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
11383 return XEXP (op, 0);
11384
11385 return x;
11386}
11387
4745e701 11388/* Helper function for rtx cost calculation. Strip an extend
43e9d192
IB
11389 expression from X. Returns the inner operand if successful, or the
11390 original expression on failure. We deal with a number of possible
b10f1009
AP
11391 canonicalization variations here. If STRIP_SHIFT is true, then
11392 we can strip off a shift also. */
43e9d192 11393static rtx
b10f1009 11394aarch64_strip_extend (rtx x, bool strip_shift)
43e9d192 11395{
77e994c9 11396 scalar_int_mode mode;
43e9d192
IB
11397 rtx op = x;
11398
77e994c9
RS
11399 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
11400 return op;
11401
43e9d192
IB
11402 if (GET_CODE (op) == AND
11403 && GET_CODE (XEXP (op, 0)) == MULT
11404 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
11405 && CONST_INT_P (XEXP (op, 1))
11406 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
11407 INTVAL (XEXP (op, 1))) != 0)
11408 return XEXP (XEXP (op, 0), 0);
11409
11410 /* Now handle extended register, as this may also have an optional
11411 left shift by 1..4. */
b10f1009
AP
11412 if (strip_shift
11413 && GET_CODE (op) == ASHIFT
43e9d192
IB
11414 && CONST_INT_P (XEXP (op, 1))
11415 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
11416 op = XEXP (op, 0);
11417
11418 if (GET_CODE (op) == ZERO_EXTEND
11419 || GET_CODE (op) == SIGN_EXTEND)
11420 op = XEXP (op, 0);
11421
11422 if (op != x)
11423 return op;
11424
4745e701
JG
11425 return x;
11426}
11427
0a78ebe4
KT
11428/* Return true iff CODE is a shift supported in combination
11429 with arithmetic instructions. */
4d1919ed 11430
0a78ebe4
KT
11431static bool
11432aarch64_shift_p (enum rtx_code code)
11433{
11434 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
11435}
11436
b10f1009
AP
11437
11438/* Return true iff X is a cheap shift without a sign extend. */
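/* For example (illustrative): when AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND is
   set, (ashift x 3) and (mult x 8) both count as cheap, whereas
   (mult (sign_extend x) 8) does not.  */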
11439
11440static bool
11441aarch64_cheap_mult_shift_p (rtx x)
11442{
11443 rtx op0, op1;
11444
11445 op0 = XEXP (x, 0);
11446 op1 = XEXP (x, 1);
11447
11448 if (!(aarch64_tune_params.extra_tuning_flags
11449 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
11450 return false;
11451
11452 if (GET_CODE (op0) == SIGN_EXTEND)
11453 return false;
11454
11455 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
11456 && UINTVAL (op1) <= 4)
11457 return true;
11458
11459 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
11460 return false;
11461
11462 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
11463
11464 if (l2 > 0 && l2 <= 4)
11465 return true;
11466
11467 return false;
11468}
11469
4745e701 11470/* Helper function for rtx cost calculation. Calculate the cost of
0a78ebe4
KT
11471 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
 11472   Return the calculated cost of the expression, recursing manually into
4745e701
JG
11473 operands where needed. */
11474
11475static int
e548c9df 11476aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
4745e701
JG
11477{
11478 rtx op0, op1;
11479 const struct cpu_cost_table *extra_cost
b175b679 11480 = aarch64_tune_params.insn_extra_cost;
4745e701 11481 int cost = 0;
0a78ebe4 11482 bool compound_p = (outer == PLUS || outer == MINUS);
ef4bddc2 11483 machine_mode mode = GET_MODE (x);
4745e701
JG
11484
11485 gcc_checking_assert (code == MULT);
11486
11487 op0 = XEXP (x, 0);
11488 op1 = XEXP (x, 1);
11489
11490 if (VECTOR_MODE_P (mode))
df81764b
TC
11491 {
11492 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11493 mode = GET_MODE_INNER (mode);
11494 if (vec_flags & VEC_ADVSIMD)
11495 {
11496 /* The by-element versions of the instruction have the same costs as
11497 the normal 3-vector version. So don't add the costs of the
11498 duplicate into the costs of the multiply. We make an assumption
11499 that the input to the VEC_DUPLICATE is already on the FP & SIMD
 11500	     side.  This means the costing of a MUL by element pre-RA is a bit
11501 optimistic. */
11502 if (GET_CODE (op0) == VEC_DUPLICATE)
11503 op0 = XEXP (op0, 0);
11504 else if (GET_CODE (op1) == VEC_DUPLICATE)
11505 op1 = XEXP (op1, 0);
11506 }
11507 }
4745e701
JG
11508
11509 /* Integer multiply/fma. */
11510 if (GET_MODE_CLASS (mode) == MODE_INT)
11511 {
11512 /* The multiply will be canonicalized as a shift, cost it as such. */
0a78ebe4
KT
11513 if (aarch64_shift_p (GET_CODE (x))
11514 || (CONST_INT_P (op1)
11515 && exact_log2 (INTVAL (op1)) > 0))
4745e701 11516 {
0a78ebe4
KT
11517 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
11518 || GET_CODE (op0) == SIGN_EXTEND;
4745e701
JG
11519 if (speed)
11520 {
0a78ebe4
KT
11521 if (compound_p)
11522 {
b10f1009
AP
11523 /* If the shift is considered cheap,
11524 then don't add any cost. */
11525 if (aarch64_cheap_mult_shift_p (x))
11526 ;
11527 else if (REG_P (op1))
0a78ebe4
KT
11528 /* ARITH + shift-by-register. */
11529 cost += extra_cost->alu.arith_shift_reg;
11530 else if (is_extend)
11531 /* ARITH + extended register. We don't have a cost field
11532 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
11533 cost += extra_cost->alu.extend_arith;
11534 else
11535 /* ARITH + shift-by-immediate. */
11536 cost += extra_cost->alu.arith_shift;
11537 }
4745e701
JG
11538 else
11539 /* LSL (immediate). */
0a78ebe4
KT
11540 cost += extra_cost->alu.shift;
11541
4745e701 11542 }
0a78ebe4
KT
11543 /* Strip extends as we will have costed them in the case above. */
11544 if (is_extend)
b10f1009 11545 op0 = aarch64_strip_extend (op0, true);
4745e701 11546
e548c9df 11547 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
4745e701
JG
11548
11549 return cost;
11550 }
11551
d2ac256b
KT
11552 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
11553 compound and let the below cases handle it. After all, MNEG is a
11554 special-case alias of MSUB. */
11555 if (GET_CODE (op0) == NEG)
11556 {
11557 op0 = XEXP (op0, 0);
11558 compound_p = true;
11559 }
11560
4745e701
JG
11561 /* Integer multiplies or FMAs have zero/sign extending variants. */
11562 if ((GET_CODE (op0) == ZERO_EXTEND
11563 && GET_CODE (op1) == ZERO_EXTEND)
11564 || (GET_CODE (op0) == SIGN_EXTEND
11565 && GET_CODE (op1) == SIGN_EXTEND))
11566 {
e548c9df
AM
11567 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
11568 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
4745e701
JG
11569
11570 if (speed)
11571 {
0a78ebe4 11572 if (compound_p)
d2ac256b 11573 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
4745e701
JG
11574 cost += extra_cost->mult[0].extend_add;
11575 else
11576 /* MUL/SMULL/UMULL. */
11577 cost += extra_cost->mult[0].extend;
11578 }
11579
11580 return cost;
11581 }
11582
d2ac256b 11583 /* This is either an integer multiply or a MADD. In both cases
4745e701 11584 we want to recurse and cost the operands. */
e548c9df
AM
11585 cost += rtx_cost (op0, mode, MULT, 0, speed);
11586 cost += rtx_cost (op1, mode, MULT, 1, speed);
4745e701
JG
11587
11588 if (speed)
11589 {
0a78ebe4 11590 if (compound_p)
d2ac256b 11591 /* MADD/MSUB. */
4745e701
JG
11592 cost += extra_cost->mult[mode == DImode].add;
11593 else
11594 /* MUL. */
11595 cost += extra_cost->mult[mode == DImode].simple;
11596 }
11597
11598 return cost;
11599 }
11600 else
11601 {
11602 if (speed)
11603 {
3d840f7d 11604 /* Floating-point FMA/FMUL can also support negations of the
d318517d
SN
 11605	     operands, unless the rounding mode is upward or downward, in
 11606	     which case FNMUL is different from FMUL with operand negation.  */
11607 bool neg0 = GET_CODE (op0) == NEG;
11608 bool neg1 = GET_CODE (op1) == NEG;
11609 if (compound_p || !flag_rounding_math || (neg0 && neg1))
11610 {
11611 if (neg0)
11612 op0 = XEXP (op0, 0);
11613 if (neg1)
11614 op1 = XEXP (op1, 0);
11615 }
4745e701 11616
0a78ebe4 11617 if (compound_p)
4745e701
JG
11618 /* FMADD/FNMADD/FNMSUB/FMSUB. */
11619 cost += extra_cost->fp[mode == DFmode].fma;
11620 else
3d840f7d 11621 /* FMUL/FNMUL. */
4745e701
JG
11622 cost += extra_cost->fp[mode == DFmode].mult;
11623 }
11624
e548c9df
AM
11625 cost += rtx_cost (op0, mode, MULT, 0, speed);
11626 cost += rtx_cost (op1, mode, MULT, 1, speed);
4745e701
JG
11627 return cost;
11628 }
43e9d192
IB
11629}
11630
67747367
JG
11631static int
11632aarch64_address_cost (rtx x,
ef4bddc2 11633 machine_mode mode,
67747367
JG
11634 addr_space_t as ATTRIBUTE_UNUSED,
11635 bool speed)
11636{
11637 enum rtx_code c = GET_CODE (x);
b175b679 11638 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
67747367
JG
11639 struct aarch64_address_info info;
11640 int cost = 0;
11641 info.shift = 0;
11642
a97d8b98 11643 if (!aarch64_classify_address (&info, x, mode, false))
67747367 11644 {
3793ecc1 11645 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
67747367
JG
11646 {
11647 /* This is a CONST or SYMBOL ref which will be split
11648 in a different way depending on the code model in use.
11649 Cost it through the generic infrastructure. */
e548c9df 11650 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
67747367
JG
11651 /* Divide through by the cost of one instruction to
11652 bring it to the same units as the address costs. */
11653 cost_symbol_ref /= COSTS_N_INSNS (1);
11654 /* The cost is then the cost of preparing the address,
11655 followed by an immediate (possibly 0) offset. */
11656 return cost_symbol_ref + addr_cost->imm_offset;
11657 }
11658 else
11659 {
11660 /* This is most likely a jump table from a case
11661 statement. */
11662 return addr_cost->register_offset;
11663 }
11664 }
11665
11666 switch (info.type)
11667 {
11668 case ADDRESS_LO_SUM:
11669 case ADDRESS_SYMBOLIC:
11670 case ADDRESS_REG_IMM:
11671 cost += addr_cost->imm_offset;
11672 break;
11673
11674 case ADDRESS_REG_WB:
11675 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
11676 cost += addr_cost->pre_modify;
11677 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
11678 cost += addr_cost->post_modify;
11679 else
11680 gcc_unreachable ();
11681
11682 break;
11683
11684 case ADDRESS_REG_REG:
11685 cost += addr_cost->register_offset;
11686 break;
11687
67747367 11688 case ADDRESS_REG_SXTW:
783879e6
EM
11689 cost += addr_cost->register_sextend;
11690 break;
11691
11692 case ADDRESS_REG_UXTW:
11693 cost += addr_cost->register_zextend;
67747367
JG
11694 break;
11695
11696 default:
11697 gcc_unreachable ();
11698 }
11699
11700
11701 if (info.shift > 0)
11702 {
11703 /* For the sake of calculating the cost of the shifted register
11704 component, we can treat same sized modes in the same way. */
6a70badb
RS
11705 if (known_eq (GET_MODE_BITSIZE (mode), 16))
11706 cost += addr_cost->addr_scale_costs.hi;
11707 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
11708 cost += addr_cost->addr_scale_costs.si;
11709 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
11710 cost += addr_cost->addr_scale_costs.di;
11711 else
11712 /* We can't tell, or this is a 128-bit vector. */
11713 cost += addr_cost->addr_scale_costs.ti;
67747367
JG
11714 }
11715
11716 return cost;
11717}
11718
b9066f5a
MW
11719/* Return the cost of a branch. If SPEED_P is true then the compiler is
11720 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
11721 to be taken. */
11722
11723int
11724aarch64_branch_cost (bool speed_p, bool predictable_p)
11725{
11726 /* When optimizing for speed, use the cost of unpredictable branches. */
11727 const struct cpu_branch_cost *branch_costs =
b175b679 11728 aarch64_tune_params.branch_costs;
b9066f5a
MW
11729
11730 if (!speed_p || predictable_p)
11731 return branch_costs->predictable;
11732 else
11733 return branch_costs->unpredictable;
11734}
11735
7de23b8c 11736/* Return true if X is a zero or sign extract
7cc2145f
JG
11737 usable in an ADD or SUB (extended register) instruction. */
11738static bool
7de23b8c 11739aarch64_rtx_arith_op_extract_p (rtx x)
7cc2145f 11740{
e47c4031
KT
11741 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
11742 No shift. */
7de23b8c
AC
11743 if (GET_CODE (x) == SIGN_EXTEND
11744 || GET_CODE (x) == ZERO_EXTEND)
e47c4031 11745 return REG_P (XEXP (x, 0));
7cc2145f
JG
11746
11747 return false;
11748}
11749
61263118
KT
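/* Return true if U is the UNSPEC number of one of the FRINT floating-point
   rounding operations.  */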
11750static bool
11751aarch64_frint_unspec_p (unsigned int u)
11752{
11753 switch (u)
11754 {
11755 case UNSPEC_FRINTZ:
11756 case UNSPEC_FRINTP:
11757 case UNSPEC_FRINTM:
11758 case UNSPEC_FRINTA:
11759 case UNSPEC_FRINTN:
11760 case UNSPEC_FRINTX:
11761 case UNSPEC_FRINTI:
11762 return true;
11763
11764 default:
11765 return false;
11766 }
11767}
11768
fb0cb7fa
KT
11769/* Return true iff X is an rtx that will match an extr instruction
11770 i.e. as described in the *extr<mode>5_insn family of patterns.
11771 OP0 and OP1 will be set to the operands of the shifts involved
11772 on success and will be NULL_RTX otherwise. */
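/* For example (illustrative): in SImode,
     (ior (ashift X (const_int 24)) (lshiftrt Y (const_int 8)))
   is accepted because the two shift amounts are constants summing to the
   mode width of 32; *RES_OP0 is set to X and *RES_OP1 to Y, matching an
   EXTR with an immediate of 8.  */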
11773
11774static bool
11775aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
11776{
11777 rtx op0, op1;
77e994c9
RS
11778 scalar_int_mode mode;
11779 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
11780 return false;
fb0cb7fa
KT
11781
11782 *res_op0 = NULL_RTX;
11783 *res_op1 = NULL_RTX;
11784
11785 if (GET_CODE (x) != IOR)
11786 return false;
11787
11788 op0 = XEXP (x, 0);
11789 op1 = XEXP (x, 1);
11790
11791 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
11792 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
11793 {
11794 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
11795 if (GET_CODE (op1) == ASHIFT)
11796 std::swap (op0, op1);
11797
11798 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
11799 return false;
11800
11801 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
11802 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
11803
11804 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
11805 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
11806 {
11807 *res_op0 = XEXP (op0, 0);
11808 *res_op1 = XEXP (op1, 0);
11809 return true;
11810 }
11811 }
11812
11813 return false;
11814}
11815
2d5ffe46
AP
11816/* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11817 storing it in *COST. Result is true if the total cost of the operation
11818 has now been calculated. */
11819static bool
11820aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
11821{
b9e3afe9
AP
11822 rtx inner;
11823 rtx comparator;
11824 enum rtx_code cmpcode;
e2a14bec
RS
11825 const struct cpu_cost_table *extra_cost
11826 = aarch64_tune_params.insn_extra_cost;
b9e3afe9
AP
11827
11828 if (COMPARISON_P (op0))
11829 {
11830 inner = XEXP (op0, 0);
11831 comparator = XEXP (op0, 1);
11832 cmpcode = GET_CODE (op0);
11833 }
11834 else
11835 {
11836 inner = op0;
11837 comparator = const0_rtx;
11838 cmpcode = NE;
11839 }
11840
2d5ffe46
AP
11841 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
11842 {
11843 /* Conditional branch. */
b9e3afe9 11844 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
2d5ffe46
AP
11845 return true;
11846 else
11847 {
b9e3afe9 11848 if (cmpcode == NE || cmpcode == EQ)
2d5ffe46 11849 {
2d5ffe46
AP
11850 if (comparator == const0_rtx)
11851 {
11852 /* TBZ/TBNZ/CBZ/CBNZ. */
11853 if (GET_CODE (inner) == ZERO_EXTRACT)
11854 /* TBZ/TBNZ. */
e548c9df
AM
11855 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
11856 ZERO_EXTRACT, 0, speed);
11857 else
11858 /* CBZ/CBNZ. */
11859 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
2d5ffe46 11860
e2a14bec
RS
11861 return true;
11862 }
11863 if (register_operand (inner, VOIDmode)
11864 && aarch64_imm24 (comparator, VOIDmode))
11865 {
11866 /* SUB and SUBS. */
11867 *cost += COSTS_N_INSNS (2);
11868 if (speed)
11869 *cost += extra_cost->alu.arith * 2;
11870 return true;
11871 }
2d5ffe46 11872 }
b9e3afe9 11873 else if (cmpcode == LT || cmpcode == GE)
2d5ffe46 11874 {
2d5ffe46
AP
11875 /* TBZ/TBNZ. */
11876 if (comparator == const0_rtx)
11877 return true;
11878 }
11879 }
11880 }
b9e3afe9 11881 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
2d5ffe46 11882 {
786298dc 11883 /* CCMP. */
6dfeb7ce 11884 if (GET_CODE (op1) == COMPARE)
786298dc
WD
11885 {
11886 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
11887 if (XEXP (op1, 1) == const0_rtx)
11888 *cost += 1;
11889 if (speed)
11890 {
11891 machine_mode mode = GET_MODE (XEXP (op1, 0));
786298dc
WD
11892
11893 if (GET_MODE_CLASS (mode) == MODE_INT)
11894 *cost += extra_cost->alu.arith;
11895 else
11896 *cost += extra_cost->fp[mode == DFmode].compare;
11897 }
11898 return true;
11899 }
11900
2d5ffe46
AP
11901 /* It's a conditional operation based on the status flags,
11902 so it must be some flavor of CSEL. */
11903
11904 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
11905 if (GET_CODE (op1) == NEG
11906 || GET_CODE (op1) == NOT
11907 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
11908 op1 = XEXP (op1, 0);
bad00732
KT
11909 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
11910 {
11911 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
11912 op1 = XEXP (op1, 0);
11913 op2 = XEXP (op2, 0);
11914 }
d572ad49
AC
11915 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
11916 {
11917 inner = XEXP (op1, 0);
11918 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
11919 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
11920 op1 = XEXP (inner, 0);
11921 }
2d5ffe46 11922
e548c9df
AM
11923 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
11924 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
2d5ffe46
AP
11925 return true;
11926 }
11927
11928 /* We don't know what this is, cost all operands. */
11929 return false;
11930}
11931
283b6c85
KT
11932/* Check whether X is a bitfield operation of the form shift + extend that
11933 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
11934 operand to which the bitfield operation is applied. Otherwise return
11935 NULL_RTX. */
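/* For example (illustrative):
     (zero_extend:SI (lshiftrt:HI R (const_int 8)))
   is such an operation (it extracts the top byte of R, i.e. a UBFX), and
   the function returns R.  */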
11936
11937static rtx
11938aarch64_extend_bitfield_pattern_p (rtx x)
11939{
11940 rtx_code outer_code = GET_CODE (x);
11941 machine_mode outer_mode = GET_MODE (x);
11942
11943 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
11944 && outer_mode != SImode && outer_mode != DImode)
11945 return NULL_RTX;
11946
11947 rtx inner = XEXP (x, 0);
11948 rtx_code inner_code = GET_CODE (inner);
11949 machine_mode inner_mode = GET_MODE (inner);
11950 rtx op = NULL_RTX;
11951
11952 switch (inner_code)
11953 {
11954 case ASHIFT:
11955 if (CONST_INT_P (XEXP (inner, 1))
11956 && (inner_mode == QImode || inner_mode == HImode))
11957 op = XEXP (inner, 0);
11958 break;
11959 case LSHIFTRT:
11960 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
11961 && (inner_mode == QImode || inner_mode == HImode))
11962 op = XEXP (inner, 0);
11963 break;
11964 case ASHIFTRT:
11965 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
11966 && (inner_mode == QImode || inner_mode == HImode))
11967 op = XEXP (inner, 0);
11968 break;
11969 default:
11970 break;
11971 }
11972
11973 return op;
11974}
11975
8c83f71d
KT
11976/* Return true if the mask and a shift amount from an RTX of the form
11977 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
11978 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
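/* For example (illustrative): in SImode, MASK == 0xff00 with SHFT_AMNT == 8
   is accepted, since (0xff00 >> 8) + 1 is a power of two and no mask bit
   lies below bit 8; the combination corresponds to a UBFIZ with lsb 8 and
   width 8.  */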
11979
11980bool
77e994c9
RS
11981aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
11982 rtx shft_amnt)
8c83f71d
KT
11983{
11984 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
11985 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
11986 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
1b6acf23
WD
11987 && (INTVAL (mask)
11988 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
8c83f71d
KT
11989}
11990
6a0d3939
SE
11991/* Return true if the masks and a shift amount from an RTX of the form
11992 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
11993 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
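/* For example (illustrative): in DImode, MASK2 == 0xff0, SHFT_AMNT == 4 and
   MASK1 == ~0xff0 are accepted, since the two masks are complementary and
   MASK2 is a contiguous run of bits starting at bit SHFT_AMNT; this
   corresponds to a BFI inserting 8 bits at position 4.  */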
11994
11995bool
11996aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
11997 unsigned HOST_WIDE_INT mask1,
11998 unsigned HOST_WIDE_INT shft_amnt,
11999 unsigned HOST_WIDE_INT mask2)
12000{
12001 unsigned HOST_WIDE_INT t;
12002
 12003  /* Verify that MASK1 is the exact complement of MASK2 (no overlapping bits, all bits covered).  */
12004 if (mask1 != ~mask2)
12005 return false;
12006
12007 /* Verify that mask2 is not all zeros or ones. */
12008 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
12009 return false;
12010
12011 /* The shift amount should always be less than the mode size. */
12012 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
12013
12014 /* Verify that the mask being shifted is contiguous and would be in the
12015 least significant bits after shifting by shft_amnt. */
12016 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
12017 return (t == (t & -t));
12018}
12019
43e9d192
IB
12020/* Calculate the cost of calculating X, storing it in *COST. Result
12021 is true if the total cost of the operation has now been calculated. */
12022static bool
e548c9df 12023aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
43e9d192
IB
12024 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
12025{
a8eecd00 12026 rtx op0, op1, op2;
73250c4c 12027 const struct cpu_cost_table *extra_cost
b175b679 12028 = aarch64_tune_params.insn_extra_cost;
e548c9df 12029 int code = GET_CODE (x);
b4206259 12030 scalar_int_mode int_mode;
43e9d192 12031
7fc5ef02
JG
12032 /* By default, assume that everything has equivalent cost to the
12033 cheapest instruction. Any additional costs are applied as a delta
12034 above this default. */
12035 *cost = COSTS_N_INSNS (1);
12036
43e9d192
IB
12037 switch (code)
12038 {
12039 case SET:
ba123b0d
JG
12040 /* The cost depends entirely on the operands to SET. */
12041 *cost = 0;
43e9d192
IB
12042 op0 = SET_DEST (x);
12043 op1 = SET_SRC (x);
12044
12045 switch (GET_CODE (op0))
12046 {
12047 case MEM:
12048 if (speed)
2961177e
JG
12049 {
12050 rtx address = XEXP (op0, 0);
b6875aac
KV
12051 if (VECTOR_MODE_P (mode))
12052 *cost += extra_cost->ldst.storev;
12053 else if (GET_MODE_CLASS (mode) == MODE_INT)
2961177e
JG
12054 *cost += extra_cost->ldst.store;
12055 else if (mode == SFmode)
12056 *cost += extra_cost->ldst.storef;
12057 else if (mode == DFmode)
12058 *cost += extra_cost->ldst.stored;
12059
12060 *cost +=
12061 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12062 0, speed));
12063 }
43e9d192 12064
e548c9df 12065 *cost += rtx_cost (op1, mode, SET, 1, speed);
43e9d192
IB
12066 return true;
12067
12068 case SUBREG:
12069 if (! REG_P (SUBREG_REG (op0)))
e548c9df 12070 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
ba123b0d 12071
43e9d192
IB
12072 /* Fall through. */
12073 case REG:
b6875aac
KV
12074 /* The cost is one per vector-register copied. */
12075 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
12076 {
fe1447a1
RS
12077 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
12078 *cost = COSTS_N_INSNS (nregs);
b6875aac 12079 }
ba123b0d
JG
12080 /* const0_rtx is in general free, but we will use an
12081 instruction to set a register to 0. */
b6875aac
KV
12082 else if (REG_P (op1) || op1 == const0_rtx)
12083 {
12084 /* The cost is 1 per register copied. */
fe1447a1
RS
12085 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
12086 *cost = COSTS_N_INSNS (nregs);
b6875aac 12087 }
ba123b0d
JG
12088 else
12089 /* Cost is just the cost of the RHS of the set. */
e548c9df 12090 *cost += rtx_cost (op1, mode, SET, 1, speed);
43e9d192
IB
12091 return true;
12092
ba123b0d 12093 case ZERO_EXTRACT:
43e9d192 12094 case SIGN_EXTRACT:
ba123b0d
JG
12095 /* Bit-field insertion. Strip any redundant widening of
12096 the RHS to meet the width of the target. */
43e9d192
IB
12097 if (GET_CODE (op1) == SUBREG)
12098 op1 = SUBREG_REG (op1);
12099 if ((GET_CODE (op1) == ZERO_EXTEND
12100 || GET_CODE (op1) == SIGN_EXTEND)
4aa81c2e 12101 && CONST_INT_P (XEXP (op0, 1))
77e994c9
RS
12102 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
12103 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
43e9d192 12104 op1 = XEXP (op1, 0);
ba123b0d
JG
12105
12106 if (CONST_INT_P (op1))
12107 {
12108 /* MOV immediate is assumed to always be cheap. */
12109 *cost = COSTS_N_INSNS (1);
12110 }
12111 else
12112 {
12113 /* BFM. */
12114 if (speed)
12115 *cost += extra_cost->alu.bfi;
e548c9df 12116 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
ba123b0d
JG
12117 }
12118
43e9d192
IB
12119 return true;
12120
12121 default:
ba123b0d
JG
12122 /* We can't make sense of this, assume default cost. */
12123 *cost = COSTS_N_INSNS (1);
61263118 12124 return false;
43e9d192
IB
12125 }
12126 return false;
12127
9dfc162c
JG
12128 case CONST_INT:
12129 /* If an instruction can incorporate a constant within the
12130 instruction, the instruction's expression avoids calling
12131 rtx_cost() on the constant. If rtx_cost() is called on a
12132 constant, then it is usually because the constant must be
12133 moved into a register by one or more instructions.
12134
12135 The exception is constant 0, which can be expressed
12136 as XZR/WZR and is therefore free. The exception to this is
12137 if we have (set (reg) (const0_rtx)) in which case we must cost
12138 the move. However, we can catch that when we cost the SET, so
12139 we don't need to consider that here. */
12140 if (x == const0_rtx)
12141 *cost = 0;
12142 else
12143 {
 12144	  /* To an approximation, the cost of building any other constant is
 12145	     proportional to the number of instructions required to build that
 12146	     constant.  This is true whether we are compiling for SPEED or
 12147	     otherwise.  */
77e994c9
RS
12148 if (!is_a <scalar_int_mode> (mode, &int_mode))
12149 int_mode = word_mode;
82614948 12150 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
77e994c9 12151 (NULL_RTX, x, false, int_mode));
9dfc162c
JG
12152 }
12153 return true;
12154
12155 case CONST_DOUBLE:
a2170965
TC
12156
 12157	 /* First determine the number of instructions needed to do the move
12158 as an integer constant. */
12159 if (!aarch64_float_const_representable_p (x)
12160 && !aarch64_can_const_movi_rtx_p (x, mode)
12161 && aarch64_float_const_rtx_p (x))
12162 {
12163 unsigned HOST_WIDE_INT ival;
12164 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
12165 gcc_assert (succeed);
12166
77e994c9
RS
12167 scalar_int_mode imode = (mode == HFmode
12168 ? SImode
12169 : int_mode_for_mode (mode).require ());
a2170965
TC
12170 int ncost = aarch64_internal_mov_immediate
12171 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
12172 *cost += COSTS_N_INSNS (ncost);
12173 return true;
12174 }
12175
9dfc162c
JG
12176 if (speed)
12177 {
12178 /* mov[df,sf]_aarch64. */
12179 if (aarch64_float_const_representable_p (x))
12180 /* FMOV (scalar immediate). */
12181 *cost += extra_cost->fp[mode == DFmode].fpconst;
12182 else if (!aarch64_float_const_zero_rtx_p (x))
12183 {
12184 /* This will be a load from memory. */
12185 if (mode == DFmode)
12186 *cost += extra_cost->ldst.loadd;
12187 else
12188 *cost += extra_cost->ldst.loadf;
12189 }
12190 else
12191 /* Otherwise this is +0.0. We get this using MOVI d0, #0
 12192	     or MOV v0.s[0], wzr - neither of which is modeled by the
12193 cost tables. Just use the default cost. */
12194 {
12195 }
12196 }
12197
12198 return true;
12199
43e9d192
IB
12200 case MEM:
12201 if (speed)
2961177e
JG
12202 {
12203 /* For loads we want the base cost of a load, plus an
12204 approximation for the additional cost of the addressing
12205 mode. */
12206 rtx address = XEXP (x, 0);
b6875aac
KV
12207 if (VECTOR_MODE_P (mode))
12208 *cost += extra_cost->ldst.loadv;
12209 else if (GET_MODE_CLASS (mode) == MODE_INT)
2961177e
JG
12210 *cost += extra_cost->ldst.load;
12211 else if (mode == SFmode)
12212 *cost += extra_cost->ldst.loadf;
12213 else if (mode == DFmode)
12214 *cost += extra_cost->ldst.loadd;
12215
12216 *cost +=
12217 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12218 0, speed));
12219 }
43e9d192
IB
12220
12221 return true;
12222
12223 case NEG:
4745e701
JG
12224 op0 = XEXP (x, 0);
12225
b6875aac
KV
12226 if (VECTOR_MODE_P (mode))
12227 {
12228 if (speed)
12229 {
12230 /* FNEG. */
12231 *cost += extra_cost->vect.alu;
12232 }
12233 return false;
12234 }
12235
e548c9df
AM
12236 if (GET_MODE_CLASS (mode) == MODE_INT)
12237 {
4745e701
JG
12238 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12239 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12240 {
12241 /* CSETM. */
e548c9df 12242 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
4745e701
JG
12243 return true;
12244 }
12245
12246 /* Cost this as SUB wzr, X. */
e548c9df 12247 op0 = CONST0_RTX (mode);
4745e701
JG
12248 op1 = XEXP (x, 0);
12249 goto cost_minus;
12250 }
12251
e548c9df 12252 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
4745e701
JG
12253 {
12254 /* Support (neg(fma...)) as a single instruction only if
12255 sign of zeros is unimportant. This matches the decision
12256 making in aarch64.md. */
12257 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
12258 {
12259 /* FNMADD. */
e548c9df 12260 *cost = rtx_cost (op0, mode, NEG, 0, speed);
4745e701
JG
12261 return true;
12262 }
d318517d
SN
12263 if (GET_CODE (op0) == MULT)
12264 {
12265 /* FNMUL. */
12266 *cost = rtx_cost (op0, mode, NEG, 0, speed);
12267 return true;
12268 }
4745e701
JG
12269 if (speed)
12270 /* FNEG. */
12271 *cost += extra_cost->fp[mode == DFmode].neg;
12272 return false;
12273 }
12274
12275 return false;
43e9d192 12276
781aeb73
KT
12277 case CLRSB:
12278 case CLZ:
12279 if (speed)
b6875aac
KV
12280 {
12281 if (VECTOR_MODE_P (mode))
12282 *cost += extra_cost->vect.alu;
12283 else
12284 *cost += extra_cost->alu.clz;
12285 }
781aeb73
KT
12286
12287 return false;
12288
5bfc8303
WD
12289 case CTZ:
12290 *cost = COSTS_N_INSNS (2);
12291
12292 if (speed)
12293 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
12294 return false;
12295
43e9d192
IB
12296 case COMPARE:
12297 op0 = XEXP (x, 0);
12298 op1 = XEXP (x, 1);
12299
12300 if (op1 == const0_rtx
12301 && GET_CODE (op0) == AND)
12302 {
12303 x = op0;
e548c9df 12304 mode = GET_MODE (op0);
43e9d192
IB
12305 goto cost_logic;
12306 }
12307
a8eecd00
JG
12308 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
12309 {
 12310	  /* TODO: A write to the CC flags possibly costs extra; this
12311 needs encoding in the cost tables. */
12312
e548c9df 12313 mode = GET_MODE (op0);
a8eecd00
JG
12314 /* ANDS. */
12315 if (GET_CODE (op0) == AND)
12316 {
12317 x = op0;
12318 goto cost_logic;
12319 }
12320
12321 if (GET_CODE (op0) == PLUS)
12322 {
12323 /* ADDS (and CMN alias). */
12324 x = op0;
12325 goto cost_plus;
12326 }
12327
12328 if (GET_CODE (op0) == MINUS)
12329 {
12330 /* SUBS. */
12331 x = op0;
12332 goto cost_minus;
12333 }
12334
345854d8
KT
12335 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
12336 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
12337 && CONST_INT_P (XEXP (op0, 2)))
12338 {
12339 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
12340 Handle it here directly rather than going to cost_logic
12341 since we know the immediate generated for the TST is valid
12342 so we can avoid creating an intermediate rtx for it only
12343 for costing purposes. */
12344 if (speed)
12345 *cost += extra_cost->alu.logical;
12346
12347 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
12348 ZERO_EXTRACT, 0, speed);
12349 return true;
12350 }
12351
a8eecd00
JG
12352 if (GET_CODE (op1) == NEG)
12353 {
12354 /* CMN. */
12355 if (speed)
12356 *cost += extra_cost->alu.arith;
12357
e548c9df
AM
12358 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
12359 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
a8eecd00
JG
12360 return true;
12361 }
12362
12363 /* CMP.
12364
12365 Compare can freely swap the order of operands, and
12366 canonicalization puts the more complex operation first.
12367 But the integer MINUS logic expects the shift/extend
12368 operation in op1. */
12369 if (! (REG_P (op0)
12370 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
12371 {
12372 op0 = XEXP (x, 1);
12373 op1 = XEXP (x, 0);
12374 }
12375 goto cost_minus;
12376 }
12377
12378 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
12379 {
12380 /* FCMP. */
12381 if (speed)
12382 *cost += extra_cost->fp[mode == DFmode].compare;
12383
12384 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
12385 {
e548c9df 12386 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
a8eecd00
JG
12387 /* FCMP supports constant 0.0 for no extra cost. */
12388 return true;
12389 }
12390 return false;
12391 }
12392
b6875aac
KV
12393 if (VECTOR_MODE_P (mode))
12394 {
12395 /* Vector compare. */
12396 if (speed)
12397 *cost += extra_cost->vect.alu;
12398
12399 if (aarch64_float_const_zero_rtx_p (op1))
12400 {
12401 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
12402 cost. */
12403 return true;
12404 }
12405 return false;
12406 }
a8eecd00 12407 return false;
43e9d192
IB
12408
12409 case MINUS:
4745e701
JG
12410 {
12411 op0 = XEXP (x, 0);
12412 op1 = XEXP (x, 1);
12413
12414cost_minus:
e548c9df 12415 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
23cb6618 12416
4745e701
JG
12417 /* Detect valid immediates. */
12418 if ((GET_MODE_CLASS (mode) == MODE_INT
12419 || (GET_MODE_CLASS (mode) == MODE_CC
12420 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
12421 && CONST_INT_P (op1)
12422 && aarch64_uimm12_shift (INTVAL (op1)))
12423 {
4745e701
JG
12424 if (speed)
12425 /* SUB(S) (immediate). */
12426 *cost += extra_cost->alu.arith;
12427 return true;
4745e701
JG
12428 }
12429
7cc2145f 12430 /* Look for SUB (extended register). */
7de23b8c
AC
12431 if (is_a <scalar_int_mode> (mode)
12432 && aarch64_rtx_arith_op_extract_p (op1))
7cc2145f
JG
12433 {
12434 if (speed)
2533c820 12435 *cost += extra_cost->alu.extend_arith;
7cc2145f 12436
b10f1009 12437 op1 = aarch64_strip_extend (op1, true);
e47c4031 12438 *cost += rtx_cost (op1, VOIDmode,
e548c9df 12439 (enum rtx_code) GET_CODE (op1), 0, speed);
7cc2145f
JG
12440 return true;
12441 }
12442
b10f1009 12443 rtx new_op1 = aarch64_strip_extend (op1, false);
4745e701
JG
12444
12445 /* Cost this as an FMA-alike operation. */
12446 if ((GET_CODE (new_op1) == MULT
0a78ebe4 12447 || aarch64_shift_p (GET_CODE (new_op1)))
4745e701
JG
12448 && code != COMPARE)
12449 {
12450 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
12451 (enum rtx_code) code,
12452 speed);
4745e701
JG
12453 return true;
12454 }
43e9d192 12455
e548c9df 12456 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
43e9d192 12457
4745e701
JG
12458 if (speed)
12459 {
b6875aac
KV
12460 if (VECTOR_MODE_P (mode))
12461 {
12462 /* Vector SUB. */
12463 *cost += extra_cost->vect.alu;
12464 }
12465 else if (GET_MODE_CLASS (mode) == MODE_INT)
12466 {
12467 /* SUB(S). */
12468 *cost += extra_cost->alu.arith;
12469 }
4745e701 12470 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
b6875aac
KV
12471 {
12472 /* FSUB. */
12473 *cost += extra_cost->fp[mode == DFmode].addsub;
12474 }
4745e701
JG
12475 }
12476 return true;
12477 }
43e9d192
IB
12478
12479 case PLUS:
4745e701
JG
12480 {
12481 rtx new_op0;
43e9d192 12482
4745e701
JG
12483 op0 = XEXP (x, 0);
12484 op1 = XEXP (x, 1);
43e9d192 12485
a8eecd00 12486cost_plus:
4745e701
JG
12487 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12488 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12489 {
12490 /* CSINC. */
e548c9df
AM
12491 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
12492 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
4745e701
JG
12493 return true;
12494 }
43e9d192 12495
4745e701 12496 if (GET_MODE_CLASS (mode) == MODE_INT
835d50c6 12497 && (aarch64_plus_immediate (op1, mode)
43cacb12 12498 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
4745e701 12499 {
e548c9df 12500 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
43e9d192 12501
4745e701
JG
12502 if (speed)
12503 /* ADD (immediate). */
12504 *cost += extra_cost->alu.arith;
12505 return true;
12506 }
12507
e548c9df 12508 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
23cb6618 12509
7cc2145f 12510 /* Look for ADD (extended register). */
7de23b8c
AC
12511 if (is_a <scalar_int_mode> (mode)
12512 && aarch64_rtx_arith_op_extract_p (op0))
7cc2145f
JG
12513 {
12514 if (speed)
2533c820 12515 *cost += extra_cost->alu.extend_arith;
7cc2145f 12516
b10f1009 12517 op0 = aarch64_strip_extend (op0, true);
e47c4031 12518 *cost += rtx_cost (op0, VOIDmode,
e548c9df 12519 (enum rtx_code) GET_CODE (op0), 0, speed);
7cc2145f
JG
12520 return true;
12521 }
12522
4745e701
JG
 12523	/* Strip any extend; leave shifts behind, as we will
12524 cost them through mult_cost. */
b10f1009 12525 new_op0 = aarch64_strip_extend (op0, false);
4745e701
JG
12526
12527 if (GET_CODE (new_op0) == MULT
0a78ebe4 12528 || aarch64_shift_p (GET_CODE (new_op0)))
4745e701
JG
12529 {
12530 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
12531 speed);
4745e701
JG
12532 return true;
12533 }
12534
e548c9df 12535 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
4745e701
JG
12536
12537 if (speed)
12538 {
b6875aac
KV
12539 if (VECTOR_MODE_P (mode))
12540 {
12541 /* Vector ADD. */
12542 *cost += extra_cost->vect.alu;
12543 }
12544 else if (GET_MODE_CLASS (mode) == MODE_INT)
12545 {
12546 /* ADD. */
12547 *cost += extra_cost->alu.arith;
12548 }
4745e701 12549 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
b6875aac
KV
12550 {
12551 /* FADD. */
12552 *cost += extra_cost->fp[mode == DFmode].addsub;
12553 }
4745e701
JG
12554 }
12555 return true;
12556 }
43e9d192 12557
18b42b2a
KT
12558 case BSWAP:
12559 *cost = COSTS_N_INSNS (1);
12560
12561 if (speed)
b6875aac
KV
12562 {
12563 if (VECTOR_MODE_P (mode))
12564 *cost += extra_cost->vect.alu;
12565 else
12566 *cost += extra_cost->alu.rev;
12567 }
18b42b2a
KT
12568 return false;
12569
43e9d192 12570 case IOR:
f7d5cf8d
KT
12571 if (aarch_rev16_p (x))
12572 {
12573 *cost = COSTS_N_INSNS (1);
12574
b6875aac
KV
12575 if (speed)
12576 {
12577 if (VECTOR_MODE_P (mode))
12578 *cost += extra_cost->vect.alu;
12579 else
12580 *cost += extra_cost->alu.rev;
12581 }
12582 return true;
f7d5cf8d 12583 }
fb0cb7fa
KT
12584
12585 if (aarch64_extr_rtx_p (x, &op0, &op1))
12586 {
e548c9df
AM
12587 *cost += rtx_cost (op0, mode, IOR, 0, speed);
12588 *cost += rtx_cost (op1, mode, IOR, 1, speed);
fb0cb7fa
KT
12589 if (speed)
12590 *cost += extra_cost->alu.shift;
12591
12592 return true;
12593 }
f7d5cf8d 12594 /* Fall through. */
43e9d192
IB
12595 case XOR:
12596 case AND:
12597 cost_logic:
12598 op0 = XEXP (x, 0);
12599 op1 = XEXP (x, 1);
12600
b6875aac
KV
12601 if (VECTOR_MODE_P (mode))
12602 {
12603 if (speed)
12604 *cost += extra_cost->vect.alu;
12605 return true;
12606 }
12607
268c3b47
JG
12608 if (code == AND
12609 && GET_CODE (op0) == MULT
12610 && CONST_INT_P (XEXP (op0, 1))
12611 && CONST_INT_P (op1)
12612 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
12613 INTVAL (op1)) != 0)
12614 {
12615 /* This is a UBFM/SBFM. */
e548c9df 12616 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
268c3b47
JG
12617 if (speed)
12618 *cost += extra_cost->alu.bfx;
12619 return true;
12620 }
12621
b4206259 12622 if (is_int_mode (mode, &int_mode))
43e9d192 12623 {
8c83f71d 12624 if (CONST_INT_P (op1))
43e9d192 12625 {
8c83f71d
KT
12626 /* We have a mask + shift version of a UBFIZ
12627 i.e. the *andim_ashift<mode>_bfiz pattern. */
12628 if (GET_CODE (op0) == ASHIFT
b4206259
RS
12629 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
12630 XEXP (op0, 1)))
8c83f71d 12631 {
b4206259 12632 *cost += rtx_cost (XEXP (op0, 0), int_mode,
8c83f71d
KT
12633 (enum rtx_code) code, 0, speed);
12634 if (speed)
12635 *cost += extra_cost->alu.bfx;
268c3b47 12636
8c83f71d
KT
12637 return true;
12638 }
b4206259 12639 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
8c83f71d
KT
12640 {
 12641	      /* We possibly get the immediate for free; this is not
12642 modelled. */
b4206259
RS
12643 *cost += rtx_cost (op0, int_mode,
12644 (enum rtx_code) code, 0, speed);
8c83f71d
KT
12645 if (speed)
12646 *cost += extra_cost->alu.logical;
268c3b47 12647
8c83f71d
KT
12648 return true;
12649 }
43e9d192
IB
12650 }
12651 else
12652 {
268c3b47
JG
12653 rtx new_op0 = op0;
12654
12655 /* Handle ORN, EON, or BIC. */
43e9d192
IB
12656 if (GET_CODE (op0) == NOT)
12657 op0 = XEXP (op0, 0);
268c3b47
JG
12658
12659 new_op0 = aarch64_strip_shift (op0);
12660
12661 /* If we had a shift on op0 then this is a logical-shift-
12662 by-register/immediate operation. Otherwise, this is just
12663 a logical operation. */
12664 if (speed)
12665 {
12666 if (new_op0 != op0)
12667 {
12668 /* Shift by immediate. */
12669 if (CONST_INT_P (XEXP (op0, 1)))
12670 *cost += extra_cost->alu.log_shift;
12671 else
12672 *cost += extra_cost->alu.log_shift_reg;
12673 }
12674 else
12675 *cost += extra_cost->alu.logical;
12676 }
12677
12678 /* In both cases we want to cost both operands. */
b4206259
RS
12679 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
12680 0, speed);
12681 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
12682 1, speed);
268c3b47
JG
12683
12684 return true;
43e9d192 12685 }
43e9d192
IB
12686 }
12687 return false;
12688
268c3b47 12689 case NOT:
6365da9e
KT
12690 x = XEXP (x, 0);
12691 op0 = aarch64_strip_shift (x);
12692
b6875aac
KV
12693 if (VECTOR_MODE_P (mode))
12694 {
12695 /* Vector NOT. */
12696 *cost += extra_cost->vect.alu;
12697 return false;
12698 }
12699
6365da9e
KT
12700 /* MVN-shifted-reg. */
12701 if (op0 != x)
12702 {
e548c9df 12703 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6365da9e
KT
12704
12705 if (speed)
12706 *cost += extra_cost->alu.log_shift;
12707
12708 return true;
12709 }
12710 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
 12711	 Handle the second form here, taking care that 'a' in the above can
12712 be a shift. */
12713 else if (GET_CODE (op0) == XOR)
12714 {
12715 rtx newop0 = XEXP (op0, 0);
12716 rtx newop1 = XEXP (op0, 1);
12717 rtx op0_stripped = aarch64_strip_shift (newop0);
12718
e548c9df
AM
12719 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
12720 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6365da9e
KT
12721
12722 if (speed)
12723 {
12724 if (op0_stripped != newop0)
12725 *cost += extra_cost->alu.log_shift;
12726 else
12727 *cost += extra_cost->alu.logical;
12728 }
12729
12730 return true;
12731 }
268c3b47
JG
12732 /* MVN. */
12733 if (speed)
12734 *cost += extra_cost->alu.logical;
12735
268c3b47
JG
12736 return false;
12737
43e9d192 12738 case ZERO_EXTEND:
b1685e62
JG
12739
12740 op0 = XEXP (x, 0);
12741 /* If a value is written in SI mode, then zero extended to DI
 12742	 mode, the operation will in general be free, as a write to
12743 a 'w' register implicitly zeroes the upper bits of an 'x'
12744 register. However, if this is
12745
12746 (set (reg) (zero_extend (reg)))
12747
12748 we must cost the explicit register move. */
12749 if (mode == DImode
12750 && GET_MODE (op0) == SImode
12751 && outer == SET)
12752 {
e548c9df 12753 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
b1685e62 12754
dde23f43
KM
12755 /* If OP_COST is non-zero, then the cost of the zero extend
12756 is effectively the cost of the inner operation. Otherwise
12757 we have a MOV instruction and we take the cost from the MOV
12758 itself. This is true independently of whether we are
12759 optimizing for space or time. */
12760 if (op_cost)
b1685e62
JG
12761 *cost = op_cost;
12762
12763 return true;
12764 }
e548c9df 12765 else if (MEM_P (op0))
43e9d192 12766 {
b1685e62 12767 /* All loads can zero extend to any size for free. */
e548c9df 12768 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
43e9d192
IB
12769 return true;
12770 }
b1685e62 12771
283b6c85
KT
12772 op0 = aarch64_extend_bitfield_pattern_p (x);
12773 if (op0)
12774 {
12775 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
12776 if (speed)
12777 *cost += extra_cost->alu.bfx;
12778 return true;
12779 }
12780
b1685e62 12781 if (speed)
b6875aac
KV
12782 {
12783 if (VECTOR_MODE_P (mode))
12784 {
12785 /* UMOV. */
12786 *cost += extra_cost->vect.alu;
12787 }
12788 else
12789 {
63715e5e
WD
12790 /* We generate an AND instead of UXTB/UXTH. */
12791 *cost += extra_cost->alu.logical;
b6875aac
KV
12792 }
12793 }
43e9d192
IB
12794 return false;
12795
12796 case SIGN_EXTEND:
b1685e62 12797 if (MEM_P (XEXP (x, 0)))
43e9d192 12798 {
b1685e62
JG
12799 /* LDRSH. */
12800 if (speed)
12801 {
12802 rtx address = XEXP (XEXP (x, 0), 0);
12803 *cost += extra_cost->ldst.load_sign_extend;
12804
12805 *cost +=
12806 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12807 0, speed));
12808 }
43e9d192
IB
12809 return true;
12810 }
b1685e62 12811
283b6c85
KT
12812 op0 = aarch64_extend_bitfield_pattern_p (x);
12813 if (op0)
12814 {
12815 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
12816 if (speed)
12817 *cost += extra_cost->alu.bfx;
12818 return true;
12819 }
12820
b1685e62 12821 if (speed)
b6875aac
KV
12822 {
12823 if (VECTOR_MODE_P (mode))
12824 *cost += extra_cost->vect.alu;
12825 else
12826 *cost += extra_cost->alu.extend;
12827 }
43e9d192
IB
12828 return false;
12829
ba0cfa17
JG
12830 case ASHIFT:
12831 op0 = XEXP (x, 0);
12832 op1 = XEXP (x, 1);
12833
12834 if (CONST_INT_P (op1))
12835 {
ba0cfa17 12836 if (speed)
b6875aac
KV
12837 {
12838 if (VECTOR_MODE_P (mode))
12839 {
12840 /* Vector shift (immediate). */
12841 *cost += extra_cost->vect.alu;
12842 }
12843 else
12844 {
 12845	      /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
12846 aliases. */
12847 *cost += extra_cost->alu.shift;
12848 }
12849 }
ba0cfa17
JG
12850
12851 /* We can incorporate zero/sign extend for free. */
12852 if (GET_CODE (op0) == ZERO_EXTEND
12853 || GET_CODE (op0) == SIGN_EXTEND)
12854 op0 = XEXP (op0, 0);
12855
e548c9df 12856 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
ba0cfa17
JG
12857 return true;
12858 }
12859 else
12860 {
7813b280 12861 if (VECTOR_MODE_P (mode))
b6875aac 12862 {
7813b280
KT
12863 if (speed)
12864 /* Vector shift (register). */
12865 *cost += extra_cost->vect.alu;
12866 }
12867 else
12868 {
12869 if (speed)
12870 /* LSLV. */
12871 *cost += extra_cost->alu.shift_reg;
12872
12873 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12874 && CONST_INT_P (XEXP (op1, 1))
6a70badb
RS
12875 && known_eq (INTVAL (XEXP (op1, 1)),
12876 GET_MODE_BITSIZE (mode) - 1))
b6875aac 12877 {
7813b280
KT
12878 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12879 /* We already demanded XEXP (op1, 0) to be REG_P, so
12880 don't recurse into it. */
12881 return true;
b6875aac
KV
12882 }
12883 }
ba0cfa17
JG
12884 return false; /* All arguments need to be in registers. */
12885 }
12886
43e9d192 12887 case ROTATE:
43e9d192
IB
12888 case ROTATERT:
12889 case LSHIFTRT:
43e9d192 12890 case ASHIFTRT:
ba0cfa17
JG
12891 op0 = XEXP (x, 0);
12892 op1 = XEXP (x, 1);
43e9d192 12893
ba0cfa17
JG
12894 if (CONST_INT_P (op1))
12895 {
12896 /* ASR (immediate) and friends. */
12897 if (speed)
b6875aac
KV
12898 {
12899 if (VECTOR_MODE_P (mode))
12900 *cost += extra_cost->vect.alu;
12901 else
12902 *cost += extra_cost->alu.shift;
12903 }
43e9d192 12904
e548c9df 12905 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
ba0cfa17
JG
12906 return true;
12907 }
12908 else
12909 {
7813b280 12910 if (VECTOR_MODE_P (mode))
b6875aac 12911 {
7813b280
KT
12912 if (speed)
12913 /* Vector shift (register). */
b6875aac 12914 *cost += extra_cost->vect.alu;
7813b280
KT
12915 }
12916 else
12917 {
12918 if (speed)
12919 /* ASR (register) and friends. */
b6875aac 12920 *cost += extra_cost->alu.shift_reg;
7813b280
KT
12921
12922 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12923 && CONST_INT_P (XEXP (op1, 1))
6a70badb
RS
12924 && known_eq (INTVAL (XEXP (op1, 1)),
12925 GET_MODE_BITSIZE (mode) - 1))
7813b280
KT
12926 {
12927 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12928 /* We already demanded XEXP (op1, 0) to be REG_P, so
12929 don't recurse into it. */
12930 return true;
12931 }
b6875aac 12932 }
ba0cfa17
JG
12933 return false; /* All arguments need to be in registers. */
12934 }
43e9d192 12935
909734be
JG
12936 case SYMBOL_REF:
12937
1b1e81f8
JW
12938 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
12939 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
909734be
JG
12940 {
12941 /* LDR. */
12942 if (speed)
12943 *cost += extra_cost->ldst.load;
12944 }
12945 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
12946 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
12947 {
12948 /* ADRP, followed by ADD. */
12949 *cost += COSTS_N_INSNS (1);
12950 if (speed)
12951 *cost += 2 * extra_cost->alu.arith;
12952 }
12953 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
12954 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12955 {
12956 /* ADR. */
12957 if (speed)
12958 *cost += extra_cost->alu.arith;
12959 }
12960
12961 if (flag_pic)
12962 {
12963 /* One extra load instruction, after accessing the GOT. */
12964 *cost += COSTS_N_INSNS (1);
12965 if (speed)
12966 *cost += extra_cost->ldst.load;
12967 }
43e9d192
IB
12968 return true;
12969
909734be 12970 case HIGH:
43e9d192 12971 case LO_SUM:
909734be
JG
12972 /* ADRP/ADD (immediate). */
12973 if (speed)
12974 *cost += extra_cost->alu.arith;
43e9d192
IB
12975 return true;
12976
12977 case ZERO_EXTRACT:
12978 case SIGN_EXTRACT:
7cc2145f
JG
12979 /* UBFX/SBFX. */
12980 if (speed)
b6875aac
KV
12981 {
12982 if (VECTOR_MODE_P (mode))
12983 *cost += extra_cost->vect.alu;
12984 else
12985 *cost += extra_cost->alu.bfx;
12986 }
7cc2145f
JG
12987
12988 /* We can trust that the immediates used will be correct (there
12989 are no by-register forms), so we need only cost op0. */
e548c9df 12990 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
43e9d192
IB
12991 return true;
12992
12993 case MULT:
4745e701
JG
12994 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
12995 /* aarch64_rtx_mult_cost always handles recursion to its
12996 operands. */
12997 return true;
43e9d192
IB
12998
12999 case MOD:
4f58fe36
KT
13000 /* We can expand signed mod by power of 2 using a NEGS, two parallel
13001 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as
13002 that of an unconditional negate. This case should only ever be reached through
13003 the set_smod_pow2_cheap check in expmed.c. */
13004 if (CONST_INT_P (XEXP (x, 1))
13005 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
13006 && (mode == SImode || mode == DImode))
13007 {
13008 /* We expand to 4 instructions. Reset the baseline. */
13009 *cost = COSTS_N_INSNS (4);
13010
13011 if (speed)
13012 *cost += 2 * extra_cost->alu.logical
13013 + 2 * extra_cost->alu.arith;
13014
13015 return true;
13016 }
13017
13018 /* Fall-through. */
43e9d192 13019 case UMOD:
43e9d192
IB
13020 if (speed)
13021 {
cb9ac430 13022 /* Slightly prefer UMOD over SMOD. */
b6875aac
KV
13023 if (VECTOR_MODE_P (mode))
13024 *cost += extra_cost->vect.alu;
e548c9df
AM
13025 else if (GET_MODE_CLASS (mode) == MODE_INT)
13026 *cost += (extra_cost->mult[mode == DImode].add
cb9ac430
TC
13027 + extra_cost->mult[mode == DImode].idiv
13028 + (code == MOD ? 1 : 0));
43e9d192
IB
13029 }
13030 return false; /* All arguments need to be in registers. */
13031
13032 case DIV:
13033 case UDIV:
4105fe38 13034 case SQRT:
43e9d192
IB
13035 if (speed)
13036 {
b6875aac
KV
13037 if (VECTOR_MODE_P (mode))
13038 *cost += extra_cost->vect.alu;
13039 else if (GET_MODE_CLASS (mode) == MODE_INT)
4105fe38
JG
13040 /* There is no integer SQRT, so only DIV and UDIV can get
13041 here. */
cb9ac430
TC
13042 *cost += (extra_cost->mult[mode == DImode].idiv
13043 /* Slightly prefer UDIV over SDIV. */
13044 + (code == DIV ? 1 : 0));
4105fe38
JG
13045 else
13046 *cost += extra_cost->fp[mode == DFmode].div;
43e9d192
IB
13047 }
13048 return false; /* All arguments need to be in registers. */
13049
a8eecd00 13050 case IF_THEN_ELSE:
2d5ffe46
AP
13051 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
13052 XEXP (x, 2), cost, speed);
a8eecd00
JG
13053
13054 case EQ:
13055 case NE:
13056 case GT:
13057 case GTU:
13058 case LT:
13059 case LTU:
13060 case GE:
13061 case GEU:
13062 case LE:
13063 case LEU:
13064
13065 return false; /* All arguments must be in registers. */
13066
b292109f
JG
13067 case FMA:
13068 op0 = XEXP (x, 0);
13069 op1 = XEXP (x, 1);
13070 op2 = XEXP (x, 2);
13071
13072 if (speed)
b6875aac
KV
13073 {
13074 if (VECTOR_MODE_P (mode))
13075 *cost += extra_cost->vect.alu;
13076 else
13077 *cost += extra_cost->fp[mode == DFmode].fma;
13078 }
b292109f
JG
13079
13080 /* FMSUB, FNMADD, and FNMSUB are free. */
13081 if (GET_CODE (op0) == NEG)
13082 op0 = XEXP (op0, 0);
13083
13084 if (GET_CODE (op2) == NEG)
13085 op2 = XEXP (op2, 0);
13086
13087 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
13088 and the by-element operand as operand 0. */
13089 if (GET_CODE (op1) == NEG)
13090 op1 = XEXP (op1, 0);
13091
13092 /* Catch vector-by-element operations. The by-element operand can
13093 either be (vec_duplicate (vec_select (x))) or just
13094 (vec_select (x)), depending on whether we are multiplying by
13095 a vector or a scalar.
13096
13097 Canonicalization is not very good in these cases: FMA4 will put the
13098 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
13099 if (GET_CODE (op0) == VEC_DUPLICATE)
13100 op0 = XEXP (op0, 0);
13101 else if (GET_CODE (op1) == VEC_DUPLICATE)
13102 op1 = XEXP (op1, 0);
13103
13104 if (GET_CODE (op0) == VEC_SELECT)
13105 op0 = XEXP (op0, 0);
13106 else if (GET_CODE (op1) == VEC_SELECT)
13107 op1 = XEXP (op1, 0);
13108
13109 /* If the remaining parameters are not registers,
13110 get the cost to put them into registers. */
e548c9df
AM
13111 *cost += rtx_cost (op0, mode, FMA, 0, speed);
13112 *cost += rtx_cost (op1, mode, FMA, 1, speed);
13113 *cost += rtx_cost (op2, mode, FMA, 2, speed);
b292109f
JG
13114 return true;
13115
5e2a765b
KT
13116 case FLOAT:
13117 case UNSIGNED_FLOAT:
13118 if (speed)
13119 *cost += extra_cost->fp[mode == DFmode].fromint;
13120 return false;
13121
b292109f
JG
13122 case FLOAT_EXTEND:
13123 if (speed)
b6875aac
KV
13124 {
13125 if (VECTOR_MODE_P (mode))
13126 {
13127 /* Vector widening conversion. */
13128 *cost += extra_cost->vect.alu;
13129 }
13130 else
13131 *cost += extra_cost->fp[mode == DFmode].widen;
13132 }
b292109f
JG
13133 return false;
13134
13135 case FLOAT_TRUNCATE:
13136 if (speed)
b6875aac
KV
13137 {
13138 if (VECTOR_MODE_P (mode))
13139 {
13140 /* Vector narrowing conversion. */
13141 *cost += extra_cost->vect.alu;
13142 }
13143 else
13144 *cost += extra_cost->fp[mode == DFmode].narrow;
13145 }
b292109f
JG
13146 return false;
13147
61263118
KT
13148 case FIX:
13149 case UNSIGNED_FIX:
13150 x = XEXP (x, 0);
13151 /* Strip the rounding part. They will all be implemented
13152 by the fcvt* family of instructions anyway. */
13153 if (GET_CODE (x) == UNSPEC)
13154 {
13155 unsigned int uns_code = XINT (x, 1);
13156
13157 if (uns_code == UNSPEC_FRINTA
13158 || uns_code == UNSPEC_FRINTM
13159 || uns_code == UNSPEC_FRINTN
13160 || uns_code == UNSPEC_FRINTP
13161 || uns_code == UNSPEC_FRINTZ)
13162 x = XVECEXP (x, 0, 0);
13163 }
13164
13165 if (speed)
b6875aac
KV
13166 {
13167 if (VECTOR_MODE_P (mode))
13168 *cost += extra_cost->vect.alu;
13169 else
13170 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
13171 }
39252973
KT
13172
13173 /* We can combine fmul by a power of 2 followed by a fcvt into a single
13174 fixed-point fcvt. */
13175 if (GET_CODE (x) == MULT
13176 && ((VECTOR_MODE_P (mode)
13177 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
13178 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
13179 {
13180 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
13181 0, speed);
13182 return true;
13183 }
13184
e548c9df 13185 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
61263118
KT
13186 return true;
13187
b292109f 13188 case ABS:
b6875aac
KV
13189 if (VECTOR_MODE_P (mode))
13190 {
13191 /* ABS (vector). */
13192 if (speed)
13193 *cost += extra_cost->vect.alu;
13194 }
13195 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
b292109f 13196 {
19261b99
KT
13197 op0 = XEXP (x, 0);
13198
13199 /* FABD, which is analogous to FADD. */
13200 if (GET_CODE (op0) == MINUS)
13201 {
e548c9df
AM
13202 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
13203 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
19261b99
KT
13204 if (speed)
13205 *cost += extra_cost->fp[mode == DFmode].addsub;
13206
13207 return true;
13208 }
13209 /* Simple FABS is analogous to FNEG. */
b292109f
JG
13210 if (speed)
13211 *cost += extra_cost->fp[mode == DFmode].neg;
13212 }
13213 else
13214 {
13215 /* Integer ABS will either be split to
13216 two arithmetic instructions, or will be an ABS
13217 (scalar), which we don't model. */
13218 *cost = COSTS_N_INSNS (2);
13219 if (speed)
13220 *cost += 2 * extra_cost->alu.arith;
13221 }
13222 return false;
13223
13224 case SMAX:
13225 case SMIN:
13226 if (speed)
13227 {
b6875aac
KV
13228 if (VECTOR_MODE_P (mode))
13229 *cost += extra_cost->vect.alu;
13230 else
13231 {
13232 /* FMAXNM/FMINNM/FMAX/FMIN.
13233 TODO: This may not be accurate for all implementations, but
13234 we do not model this in the cost tables. */
13235 *cost += extra_cost->fp[mode == DFmode].addsub;
13236 }
b292109f
JG
13237 }
13238 return false;
13239
61263118
KT
13240 case UNSPEC:
13241 /* The floating point round to integer frint* instructions. */
13242 if (aarch64_frint_unspec_p (XINT (x, 1)))
13243 {
13244 if (speed)
13245 *cost += extra_cost->fp[mode == DFmode].roundint;
13246
13247 return false;
13248 }
781aeb73
KT
13249
13250 if (XINT (x, 1) == UNSPEC_RBIT)
13251 {
13252 if (speed)
13253 *cost += extra_cost->alu.rev;
13254
13255 return false;
13256 }
61263118
KT
13257 break;
13258
fb620c4a
JG
13259 case TRUNCATE:
13260
13261 /* Decompose <su>muldi3_highpart. */
13262 if (/* (truncate:DI */
13263 mode == DImode
13264 /* (lshiftrt:TI */
13265 && GET_MODE (XEXP (x, 0)) == TImode
13266 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
13267 /* (mult:TI */
13268 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13269 /* (ANY_EXTEND:TI (reg:DI))
13270 (ANY_EXTEND:TI (reg:DI))) */
13271 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
13272 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
13273 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
13274 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
13275 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
13276 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
13277 /* (const_int 64) */
13278 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13279 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
13280 {
13281 /* UMULH/SMULH. */
13282 if (speed)
13283 *cost += extra_cost->mult[mode == DImode].extend;
e548c9df
AM
13284 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
13285 mode, MULT, 0, speed);
13286 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
13287 mode, MULT, 1, speed);
fb620c4a
JG
13288 return true;
13289 }
13290
13291 /* Fall through. */
43e9d192 13292 default:
61263118 13293 break;
43e9d192 13294 }
61263118 13295
c10e3d7f
AP
13296 if (dump_file
13297 && flag_aarch64_verbose_cost)
61263118
KT
13298 fprintf (dump_file,
13299 "\nFailed to cost RTX. Assuming default cost.\n");
13300
13301 return true;
43e9d192
IB
13302}
13303
0ee859b5
JG
13304/* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
13305 calculated for X. This cost is stored in *COST. Returns true
13306 if the total cost of X was calculated. */
13307static bool
e548c9df 13308aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
0ee859b5
JG
13309 int param, int *cost, bool speed)
13310{
e548c9df 13311 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
0ee859b5 13312
c10e3d7f
AP
13313 if (dump_file
13314 && flag_aarch64_verbose_cost)
0ee859b5
JG
13315 {
13316 print_rtl_single (dump_file, x);
13317 fprintf (dump_file, "\n%s cost: %d (%s)\n",
13318 speed ? "Hot" : "Cold",
13319 *cost, result ? "final" : "partial");
13320 }
13321
13322 return result;
13323}
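The true/false protocol used throughout aarch64_rtx_costs is worth spelling out: returning true tells the generic costing code that the hook has already folded the operands' costs into *cost (the wrapper above reports this as a "final" cost), while returning false asks the generic code to recurse into the operands itself (reported as "partial"). Below is a minimal standalone sketch of that contract; the node type, operators and cost numbers are hypothetical and only illustrate the shape of the recursion, not GCC's actual implementation.

#include <stdio.h>

struct node { int op; struct node *kids[2]; int nkids; };

/* Hypothetical target hook: cost a shift-by-immediate as one ALU op and
   claim its operands too (return 1); everything else is "partial".  */
static int
target_cost (const struct node *n, int *cost)
{
  if (n->op == '<')		/* stand-in for ASHIFT by a constant */
    {
      *cost += 1;		/* alu.shift */
      return 1;			/* final: walker must not recurse */
    }
  *cost += 1;			/* default per-node cost */
  return 0;			/* partial: walker adds operand costs */
}

static int
walk_cost (const struct node *n)
{
  int cost = 0;
  if (!target_cost (n, &cost))
    for (int i = 0; i < n->nkids; i++)
      cost += walk_cost (n->kids[i]);
  return cost;
}

int
main (void)
{
  struct node reg = { 'r', { 0, 0 }, 0 };
  struct node shift = { '<', { &reg, 0 }, 1 };
  struct node add = { '+', { &shift, &reg }, 2 };
  /* 1 (add) + 1 (shift, operands claimed) + 1 (reg) = 3.  */
  printf ("cost = %d\n", walk_cost (&add));
  return 0;
}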
13324
43e9d192 13325static int
ef4bddc2 13326aarch64_register_move_cost (machine_mode mode,
8a3a7e67 13327 reg_class_t from_i, reg_class_t to_i)
43e9d192 13328{
8a3a7e67
RH
13329 enum reg_class from = (enum reg_class) from_i;
13330 enum reg_class to = (enum reg_class) to_i;
43e9d192 13331 const struct cpu_regmove_cost *regmove_cost
b175b679 13332 = aarch64_tune_params.regmove_cost;
43e9d192 13333
3be07662 13334 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
96b7f495
MM
13335 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
13336 || to == STUB_REGS)
3be07662
WD
13337 to = GENERAL_REGS;
13338
96b7f495
MM
13339 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
13340 || from == STUB_REGS)
3be07662
WD
13341 from = GENERAL_REGS;
13342
183bfdaf
RS
13343 /* Make RDFFR very expensive. In particular, if we know that the FFR
13344 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
13345 as a way of obtaining a PTRUE. */
13346 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
13347 && hard_reg_set_subset_p (reg_class_contents[from_i],
13348 reg_class_contents[FFR_REGS]))
13349 return 80;
13350
6ee70f81
AP
13351 /* Moving between GPR and stack cost is the same as GP2GP. */
13352 if ((from == GENERAL_REGS && to == STACK_REG)
13353 || (to == GENERAL_REGS && from == STACK_REG))
13354 return regmove_cost->GP2GP;
13355
13356 /* To/From the stack register, we move via the gprs. */
13357 if (to == STACK_REG || from == STACK_REG)
13358 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
13359 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
13360
6a70badb 13361 if (known_eq (GET_MODE_SIZE (mode), 16))
8919453c
WD
13362 {
13363 /* 128-bit operations on general registers require 2 instructions. */
13364 if (from == GENERAL_REGS && to == GENERAL_REGS)
13365 return regmove_cost->GP2GP * 2;
13366 else if (from == GENERAL_REGS)
13367 return regmove_cost->GP2FP * 2;
13368 else if (to == GENERAL_REGS)
13369 return regmove_cost->FP2GP * 2;
13370
13371 /* When AdvSIMD instructions are disabled it is not possible to move
13372 a 128-bit value directly between Q registers. This is handled in
13373 secondary reload. A general register is used as a scratch to move
13374 the upper DI value and the lower DI value is moved directly,
13375 hence the cost is the sum of three moves. */
13376 if (! TARGET_SIMD)
13377 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
13378
13379 return regmove_cost->FP2FP;
13380 }
13381
43e9d192
IB
13382 if (from == GENERAL_REGS && to == GENERAL_REGS)
13383 return regmove_cost->GP2GP;
13384 else if (from == GENERAL_REGS)
13385 return regmove_cost->GP2FP;
13386 else if (to == GENERAL_REGS)
13387 return regmove_cost->FP2GP;
13388
43e9d192
IB
13389 return regmove_cost->FP2FP;
13390}
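To make the composition above concrete, here is a standalone sketch (not GCC code) of the same decision tree with hypothetical regmove numbers; the real values come from the per-core cpu_regmove_cost tables, and the FFR special case is omitted.

#include <stdio.h>

struct regmove_cost { int GP2GP, GP2FP, FP2GP, FP2FP; };

enum rclass { GENERAL, FP, STACK };

static int
move_cost (const struct regmove_cost *c, int bits, int have_simd,
	   enum rclass from, enum rclass to)
{
  /* GPR <-> stack register is as cheap as GP2GP.  */
  if ((from == GENERAL && to == STACK) || (from == STACK && to == GENERAL))
    return c->GP2GP;

  /* Other moves to/from the stack register are routed via the GPRs.  */
  if (from == STACK || to == STACK)
    return move_cost (c, bits, have_simd,
		      from == STACK ? GENERAL : from, GENERAL)
	   + move_cost (c, bits, have_simd, GENERAL,
			to == STACK ? GENERAL : to);

  if (bits == 128)
    {
      if (from == GENERAL && to == GENERAL)
	return 2 * c->GP2GP;
      if (from == GENERAL)
	return 2 * c->GP2FP;
      if (to == GENERAL)
	return 2 * c->FP2GP;
      /* Without SIMD, a Q-register copy goes via a GPR scratch.  */
      if (!have_simd)
	return c->GP2FP + c->FP2GP + c->FP2FP;
      return c->FP2FP;
    }

  if (from == GENERAL && to == GENERAL)
    return c->GP2GP;
  if (from == GENERAL)
    return c->GP2FP;
  if (to == GENERAL)
    return c->FP2GP;
  return c->FP2FP;
}

int
main (void)
{
  struct regmove_cost c = { 1, 2, 2, 1 };	/* hypothetical numbers */
  /* 128-bit FP->FP with SIMD disabled: GP2FP + FP2GP + FP2FP = 5.  */
  printf ("%d\n", move_cost (&c, 128, 0, FP, FP));
  /* FP -> stack register: FP2GP, then GPR <-> stack (GP2GP) = 3.  */
  printf ("%d\n", move_cost (&c, 64, 1, FP, STACK));
  return 0;
}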
13391
13392static int
ef4bddc2 13393aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
43e9d192
IB
13394 reg_class_t rclass ATTRIBUTE_UNUSED,
13395 bool in ATTRIBUTE_UNUSED)
13396{
b175b679 13397 return aarch64_tune_params.memmov_cost;
43e9d192
IB
13398}
13399
6d4d616a
RS
13400/* Implement TARGET_INIT_BUILTINS. */
13401static void
13402aarch64_init_builtins ()
13403{
13404 aarch64_general_init_builtins ();
624d0f07 13405 aarch64_sve::init_builtins ();
6d4d616a
RS
13406}
13407
13408/* Implement TARGET_FOLD_BUILTIN. */
13409static tree
13410aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
13411{
13412 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13413 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13414 tree type = TREE_TYPE (TREE_TYPE (fndecl));
13415 switch (code & AARCH64_BUILTIN_CLASS)
13416 {
13417 case AARCH64_BUILTIN_GENERAL:
13418 return aarch64_general_fold_builtin (subcode, type, nargs, args);
624d0f07
RS
13419
13420 case AARCH64_BUILTIN_SVE:
13421 return NULL_TREE;
6d4d616a
RS
13422 }
13423 gcc_unreachable ();
13424}
13425
13426/* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
13427static bool
13428aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
13429{
13430 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
13431 tree fndecl = gimple_call_fndecl (stmt);
13432 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13433 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13434 gimple *new_stmt = NULL;
13435 switch (code & AARCH64_BUILTIN_CLASS)
13436 {
13437 case AARCH64_BUILTIN_GENERAL:
13438 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
13439 break;
624d0f07
RS
13440
13441 case AARCH64_BUILTIN_SVE:
13442 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
13443 break;
6d4d616a
RS
13444 }
13445
13446 if (!new_stmt)
13447 return false;
13448
13449 gsi_replace (gsi, new_stmt, true);
13450 return true;
13451}
13452
13453/* Implement TARGET_EXPAND_BUILTIN. */
13454static rtx
c5dc215d 13455aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
6d4d616a
RS
13456{
13457 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
13458 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13459 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13460 switch (code & AARCH64_BUILTIN_CLASS)
13461 {
13462 case AARCH64_BUILTIN_GENERAL:
c5dc215d 13463 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
624d0f07
RS
13464
13465 case AARCH64_BUILTIN_SVE:
13466 return aarch64_sve::expand_builtin (subcode, exp, target);
6d4d616a
RS
13467 }
13468 gcc_unreachable ();
13469}
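All of the dispatchers above rely on the same packing: the low bits of the MD function code carry the builtin class (general vs. SVE) and the remaining bits carry the per-class subcode. A minimal standalone sketch of that packing follows; the shift width, mask and class values are illustrative stand-ins for AARCH64_BUILTIN_SHIFT and AARCH64_BUILTIN_CLASS, not their real definitions.

#include <assert.h>
#include <stdio.h>

enum builtin_class { CLASS_GENERAL = 0, CLASS_SVE = 1 };

#define BUILTIN_SHIFT 1				/* hypothetical width */
#define BUILTIN_CLASS ((1u << BUILTIN_SHIFT) - 1)

static unsigned int
pack (enum builtin_class cls, unsigned int subcode)
{
  return (subcode << BUILTIN_SHIFT) | cls;
}

int
main (void)
{
  unsigned int code = pack (CLASS_SVE, 42);
  unsigned int subcode = code >> BUILTIN_SHIFT;
  assert ((code & BUILTIN_CLASS) == CLASS_SVE && subcode == 42);
  printf ("class=%u subcode=%u\n", code & BUILTIN_CLASS, subcode);
  return 0;
}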
13470
13471/* Implement TARGET_BUILTIN_DECL. */
13472static tree
13473aarch64_builtin_decl (unsigned int code, bool initialize_p)
13474{
13475 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13476 switch (code & AARCH64_BUILTIN_CLASS)
13477 {
13478 case AARCH64_BUILTIN_GENERAL:
13479 return aarch64_general_builtin_decl (subcode, initialize_p);
624d0f07
RS
13480
13481 case AARCH64_BUILTIN_SVE:
13482 return aarch64_sve::builtin_decl (subcode, initialize_p);
6d4d616a
RS
13483 }
13484 gcc_unreachable ();
13485}
13486
0c30e0f3
EM
13487/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
13488 to optimize 1.0/sqrt. */
ee62a5a6
RS
13489
13490static bool
9acc9cbe 13491use_rsqrt_p (machine_mode mode)
ee62a5a6
RS
13492{
13493 return (!flag_trapping_math
13494 && flag_unsafe_math_optimizations
9acc9cbe
EM
13495 && ((aarch64_tune_params.approx_modes->recip_sqrt
13496 & AARCH64_APPROX_MODE (mode))
1a33079e 13497 || flag_mrecip_low_precision_sqrt));
ee62a5a6
RS
13498}
13499
0c30e0f3
EM
13500/* Function to decide when to use the approximate reciprocal square root
13501 builtin. */
a6fc00da
BH
13502
13503static tree
ee62a5a6 13504aarch64_builtin_reciprocal (tree fndecl)
a6fc00da 13505{
9acc9cbe
EM
13506 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
13507
13508 if (!use_rsqrt_p (mode))
a6fc00da 13509 return NULL_TREE;
6d4d616a
RS
13510 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13511 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13512 switch (code & AARCH64_BUILTIN_CLASS)
13513 {
13514 case AARCH64_BUILTIN_GENERAL:
13515 return aarch64_general_builtin_rsqrt (subcode);
624d0f07
RS
13516
13517 case AARCH64_BUILTIN_SVE:
13518 return NULL_TREE;
6d4d616a
RS
13519 }
13520 gcc_unreachable ();
a6fc00da
BH
13521}
13522
04f307cb
RS
13523/* Emit code to perform the floating-point operation:
13524
13525 DST = SRC1 * SRC2
13526
13527 where all three operands are already known to be registers.
13528 If the operation is an SVE one, PTRUE is a suitable all-true
13529 predicate. */
13530
13531static void
13532aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
13533{
13534 if (ptrue)
13535 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
13536 dst, ptrue, src1, src2,
13537 gen_int_mode (SVE_RELAXED_GP, SImode)));
13538 else
13539 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
13540}
13541
98daafa0
EM
13542/* Emit instruction sequence to compute either the approximate square root
13543 or its approximate reciprocal, depending on the flag RECP, and return
13544 whether the sequence was emitted or not. */
a6fc00da 13545
98daafa0
EM
13546bool
13547aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
a6fc00da 13548{
98daafa0 13549 machine_mode mode = GET_MODE (dst);
daef0a8c
JW
13550
13551 if (GET_MODE_INNER (mode) == HFmode)
2e19adc8
RE
13552 {
13553 gcc_assert (!recp);
13554 return false;
13555 }
13556
2e19adc8
RE
13557 if (!recp)
13558 {
13559 if (!(flag_mlow_precision_sqrt
13560 || (aarch64_tune_params.approx_modes->sqrt
13561 & AARCH64_APPROX_MODE (mode))))
13562 return false;
13563
902d28bd 13564 if (!flag_finite_math_only
2e19adc8
RE
13565 || flag_trapping_math
13566 || !flag_unsafe_math_optimizations
13567 || optimize_function_for_size_p (cfun))
13568 return false;
13569 }
13570 else
13571 /* Caller assumes we cannot fail. */
13572 gcc_assert (use_rsqrt_p (mode));
daef0a8c 13573
a0ee8352
RS
13574 rtx pg = NULL_RTX;
13575 if (aarch64_sve_mode_p (mode))
13576 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
d7814449 13577 machine_mode mmsk = (VECTOR_MODE_P (mode)
d083ee47 13578 ? related_int_vector_mode (mode).require ()
d7814449 13579 : int_mode_for_mode (mode).require ());
0df28e68 13580 rtx xmsk = NULL_RTX;
98daafa0 13581 if (!recp)
0df28e68
RS
13582 {
13583 /* When calculating the approximate square root, compare the
13584 argument with 0.0 and create a mask. */
a0ee8352
RS
13585 rtx zero = CONST0_RTX (mode);
13586 if (pg)
13587 {
13588 xmsk = gen_reg_rtx (GET_MODE (pg));
13589 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
13590 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
13591 xmsk, pg, hint, src, zero));
13592 }
13593 else
13594 {
13595 xmsk = gen_reg_rtx (mmsk);
13596 emit_insn (gen_rtx_SET (xmsk,
13597 gen_rtx_NEG (mmsk,
13598 gen_rtx_EQ (mmsk, src, zero))));
13599 }
0df28e68 13600 }
a6fc00da 13601
98daafa0
EM
13602 /* Estimate the approximate reciprocal square root. */
13603 rtx xdst = gen_reg_rtx (mode);
0016d8d9 13604 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
a6fc00da 13605
98daafa0
EM
13606 /* Iterate over the series twice for SF and thrice for DF. */
13607 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
a6fc00da 13608
98daafa0
EM
13609 /* Optionally iterate over the series once less for faster performance,
13610 at the cost of some accuracy. */
13611 if ((recp && flag_mrecip_low_precision_sqrt)
13612 || (!recp && flag_mlow_precision_sqrt))
a6fc00da
BH
13613 iterations--;
13614
98daafa0
EM
13615 /* Iterate over the series to calculate the approximate reciprocal square
13616 root. */
13617 rtx x1 = gen_reg_rtx (mode);
13618 while (iterations--)
a6fc00da 13619 {
a6fc00da 13620 rtx x2 = gen_reg_rtx (mode);
a0ee8352 13621 aarch64_emit_mult (x2, pg, xdst, xdst);
98daafa0 13622
0016d8d9 13623 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
a6fc00da 13624
98daafa0 13625 if (iterations > 0)
a0ee8352 13626 aarch64_emit_mult (xdst, pg, xdst, x1);
98daafa0
EM
13627 }
13628
13629 if (!recp)
13630 {
a0ee8352
RS
13631 if (pg)
13632 /* Multiply nonzero source values by the corresponding intermediate
13633 result elements, so that the final calculation is the approximate
13634 square root rather than its reciprocal. Select a zero result for
13635 zero source values, to avoid the Inf * 0 -> NaN that we'd get
13636 otherwise. */
13637 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
13638 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
13639 else
13640 {
13641 /* Qualify the approximate reciprocal square root when the
13642 argument is 0.0 by squashing the intermediate result to 0.0. */
13643 rtx xtmp = gen_reg_rtx (mmsk);
13644 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
13645 gen_rtx_SUBREG (mmsk, xdst, 0)));
13646 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
a6fc00da 13647
a0ee8352
RS
13648 /* Calculate the approximate square root. */
13649 aarch64_emit_mult (xdst, pg, xdst, src);
13650 }
a6fc00da
BH
13651 }
13652
98daafa0 13653 /* Finalize the approximation. */
a0ee8352 13654 aarch64_emit_mult (dst, pg, xdst, x1);
98daafa0
EM
13655
13656 return true;
a6fc00da
BH
13657}
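Numerically, the sequence above is plain Newton-Raphson refinement of a reciprocal-square-root estimate: FRSQRTE supplies the starting value, each FRSQRTS step computes (3 - a*b)/2, and for the non-reciprocal case the result is multiplied by the source, with zero inputs forced to zero to avoid Inf * 0 -> NaN. The standalone sketch below (not GCC code) mimics that scheme; the initial estimate is a deliberately perturbed libm value standing in for the hardware estimate.

#include <math.h>
#include <stdio.h>

static double
approx_sqrt (double src, int iterations, int recip)
{
  /* Perturbed libm value as a stand-in for the low-precision FRSQRTE
     estimate (compile with -lm).  */
  double x = (1.0 / sqrt (src != 0.0 ? src : 1.0)) * 1.05;
  for (int i = 0; i < iterations; i++)
    x = x * ((3.0 - src * x * x) * 0.5);   /* FRSQRTS: (3 - a*b)/2 */
  if (recip)
    return x;				     /* ~ 1/sqrt (src) */
  return src == 0.0 ? 0.0 : src * x;	     /* ~ sqrt (src); 0.0 stays 0.0 */
}

int
main (void)
{
  /* Two refinement steps for single precision, three for double,
     mirroring the loop counts chosen above.  */
  printf ("sqrt(2)  ~ %.12f\n", approx_sqrt (2.0, 3, 0));
  printf ("rsqrt(2) ~ %.12f\n", approx_sqrt (2.0, 3, 1));
  printf ("sqrt(0)  = %.1f\n", approx_sqrt (0.0, 3, 0));
  return 0;
}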
13658
79a2bc2d
EM
13659/* Emit the instruction sequence to compute the approximation for the division
13660 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
13661
13662bool
13663aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
13664{
13665 machine_mode mode = GET_MODE (quo);
33d72b63
JW
13666
13667 if (GET_MODE_INNER (mode) == HFmode)
13668 return false;
13669
79a2bc2d
EM
13670 bool use_approx_division_p = (flag_mlow_precision_div
13671 || (aarch64_tune_params.approx_modes->division
13672 & AARCH64_APPROX_MODE (mode)));
13673
13674 if (!flag_finite_math_only
13675 || flag_trapping_math
13676 || !flag_unsafe_math_optimizations
13677 || optimize_function_for_size_p (cfun)
13678 || !use_approx_division_p)
13679 return false;
13680
1be49a38
RR
13681 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
13682 return false;
13683
04f307cb
RS
13684 rtx pg = NULL_RTX;
13685 if (aarch64_sve_mode_p (mode))
13686 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13687
79a2bc2d
EM
13688 /* Estimate the approximate reciprocal. */
13689 rtx xrcp = gen_reg_rtx (mode);
0016d8d9 13690 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
79a2bc2d
EM
13691
13692 /* Iterate over the series twice for SF and thrice for DF. */
13693 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
13694
dbf3dc75
BL
13695 /* Optionally iterate over the series fewer times for faster performance,
13696 at the cost of some accuracy. The default is 2 for DF and 1 for SF. */
79a2bc2d 13697 if (flag_mlow_precision_div)
dbf3dc75
BL
13698 iterations = (GET_MODE_INNER (mode) == DFmode
13699 ? aarch64_double_recp_precision
13700 : aarch64_float_recp_precision);
79a2bc2d
EM
13701
13702 /* Iterate over the series to calculate the approximate reciprocal. */
13703 rtx xtmp = gen_reg_rtx (mode);
13704 while (iterations--)
13705 {
0016d8d9 13706 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
79a2bc2d
EM
13707
13708 if (iterations > 0)
04f307cb 13709 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
79a2bc2d
EM
13710 }
13711
13712 if (num != CONST1_RTX (mode))
13713 {
13714 /* As the approximate reciprocal of DEN is already calculated, only
13715 calculate the approximate division when NUM is not 1.0. */
13716 rtx xnum = force_reg (mode, num);
04f307cb 13717 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
79a2bc2d
EM
13718 }
13719
13720 /* Finalize the approximation. */
04f307cb 13721 aarch64_emit_mult (quo, pg, xrcp, xtmp);
79a2bc2d
EM
13722 return true;
13723}
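The division approximation follows the same pattern: refine a reciprocal estimate of the divisor (FRECPE) with Newton-Raphson steps, where each FRECPS computes 2 - a*b, then multiply by the numerator. A standalone numerical sketch (not GCC code), again using a perturbed exact value in place of the hardware estimate:

#include <stdio.h>

static double
approx_div (double num, double den, int iterations)
{
  double xrcp = (1.0 / den) * 1.05;	/* stand-in for FRECPE */
  for (int i = 0; i < iterations; i++)
    {
      double xtmp = 2.0 - den * xrcp;	/* FRECPS step */
      xrcp = xrcp * xtmp;
    }
  return num * xrcp;
}

int
main (void)
{
  /* Two steps for single precision, three for double, as above.  */
  printf ("10/3 ~ %.12f\n", approx_div (10.0, 3.0, 3));
  return 0;
}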
13724
d126a4ae
AP
13725/* Return the number of instructions that can be issued per cycle. */
13726static int
13727aarch64_sched_issue_rate (void)
13728{
b175b679 13729 return aarch64_tune_params.issue_rate;
d126a4ae
AP
13730}
13731
d0bc0cb6
RS
13732/* Implement TARGET_SCHED_VARIABLE_ISSUE. */
13733static int
13734aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
13735{
13736 if (DEBUG_INSN_P (insn))
13737 return more;
13738
13739 rtx_code code = GET_CODE (PATTERN (insn));
13740 if (code == USE || code == CLOBBER)
13741 return more;
13742
13743 if (get_attr_type (insn) == TYPE_NO_INSN)
13744 return more;
13745
13746 return more - 1;
13747}
13748
d03f7e44
MK
13749static int
13750aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
13751{
13752 int issue_rate = aarch64_sched_issue_rate ();
13753
13754 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
13755}
13756
2d6bc7fa
KT
13757
13758/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
13759 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
13760 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
13761
13762static int
13763aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
13764 int ready_index)
13765{
13766 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
13767}
13768
13769
8990e73a
TB
13770/* Vectorizer cost model target hooks. */
13771
13772/* Implement targetm.vectorize.builtin_vectorization_cost. */
13773static int
13774aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
13775 tree vectype,
13776 int misalign ATTRIBUTE_UNUSED)
13777{
13778 unsigned elements;
cd8ae5ed
AP
13779 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
13780 bool fp = false;
13781
13782 if (vectype != NULL)
13783 fp = FLOAT_TYPE_P (vectype);
8990e73a 13784
76e4f444
KT
13785 const simd_vec_cost *simd_costs;
13786 if (vectype != NULL && aarch64_sve_mode_p (TYPE_MODE (vectype))
13787 && costs->sve != NULL)
13788 simd_costs = costs->sve;
13789 else
13790 simd_costs = costs->advsimd;
13791
8990e73a
TB
13792 switch (type_of_cost)
13793 {
13794 case scalar_stmt:
cd8ae5ed 13795 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8990e73a
TB
13796
13797 case scalar_load:
cd8ae5ed 13798 return costs->scalar_load_cost;
8990e73a
TB
13799
13800 case scalar_store:
cd8ae5ed 13801 return costs->scalar_store_cost;
8990e73a
TB
13802
13803 case vector_stmt:
76e4f444
KT
13804 return fp ? simd_costs->fp_stmt_cost
13805 : simd_costs->int_stmt_cost;
8990e73a
TB
13806
13807 case vector_load:
76e4f444 13808 return simd_costs->align_load_cost;
8990e73a
TB
13809
13810 case vector_store:
76e4f444 13811 return simd_costs->store_cost;
8990e73a
TB
13812
13813 case vec_to_scalar:
76e4f444 13814 return simd_costs->vec_to_scalar_cost;
8990e73a
TB
13815
13816 case scalar_to_vec:
76e4f444 13817 return simd_costs->scalar_to_vec_cost;
8990e73a
TB
13818
13819 case unaligned_load:
cc9fe6bb 13820 case vector_gather_load:
76e4f444 13821 return simd_costs->unalign_load_cost;
8990e73a
TB
13822
13823 case unaligned_store:
cc9fe6bb 13824 case vector_scatter_store:
76e4f444 13825 return simd_costs->unalign_store_cost;
8990e73a
TB
13826
13827 case cond_branch_taken:
cd8ae5ed 13828 return costs->cond_taken_branch_cost;
8990e73a
TB
13829
13830 case cond_branch_not_taken:
cd8ae5ed 13831 return costs->cond_not_taken_branch_cost;
8990e73a
TB
13832
13833 case vec_perm:
76e4f444 13834 return simd_costs->permute_cost;
c428f91c 13835
8990e73a 13836 case vec_promote_demote:
76e4f444
KT
13837 return fp ? simd_costs->fp_stmt_cost
13838 : simd_costs->int_stmt_cost;
8990e73a
TB
13839
13840 case vec_construct:
6a70badb 13841 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
8990e73a
TB
13842 return elements / 2 + 1;
13843
13844 default:
13845 gcc_unreachable ();
13846 }
13847}
13848
8b50d7a4
RS
13849/* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
13850 vectors would produce a series of LDP or STP operations. KIND is the
13851 kind of statement that STMT_INFO represents. */
13852static bool
13853aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
13854 stmt_vec_info stmt_info)
13855{
13856 switch (kind)
13857 {
13858 case vector_load:
13859 case vector_store:
13860 case unaligned_load:
13861 case unaligned_store:
13862 break;
13863
13864 default:
13865 return false;
13866 }
13867
13868 if (aarch64_tune_params.extra_tuning_flags
13869 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
13870 return false;
13871
13872 return is_gimple_assign (stmt_info->stmt);
13873}
13874
217ccab8
RS
13875/* Return true if STMT_INFO extends the result of a load. */
13876static bool
308bc496 13877aarch64_extending_load_p (class vec_info *vinfo, stmt_vec_info stmt_info)
217ccab8
RS
13878{
13879 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13880 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13881 return false;
13882
13883 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
13884 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13885 tree rhs_type = TREE_TYPE (rhs);
13886 if (!INTEGRAL_TYPE_P (lhs_type)
13887 || !INTEGRAL_TYPE_P (rhs_type)
13888 || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
13889 return false;
13890
308bc496 13891 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
217ccab8
RS
13892 return (def_stmt_info
13893 && STMT_VINFO_DATA_REF (def_stmt_info)
13894 && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
13895}
13896
2d56600c
RS
13897/* Return true if STMT_INFO is an integer truncation. */
13898static bool
13899aarch64_integer_truncation_p (stmt_vec_info stmt_info)
13900{
13901 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13902 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13903 return false;
13904
13905 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13906 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
13907 return (INTEGRAL_TYPE_P (lhs_type)
13908 && INTEGRAL_TYPE_P (rhs_type)
13909 && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
13910}
13911
217ccab8 13912/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
8b50d7a4
RS
13913 for STMT_INFO, which has cost kind KIND and which when vectorized would
13914 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
13915 targets. */
217ccab8 13916static unsigned int
308bc496 13917aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
8b50d7a4 13918 stmt_vec_info stmt_info, tree vectype,
217ccab8
RS
13919 unsigned int stmt_cost)
13920{
13921 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
13922 vector register size or number of units. Integer promotions of this
13923 type therefore map to SXT[BHW] or UXT[BHW].
13924
13925 Most loads have extending forms that can do the sign or zero extension
13926 on the fly. Optimistically assume that a load followed by an extension
13927 will fold to this form during combine, and that the extension therefore
13928 comes for free. */
308bc496 13929 if (kind == vector_stmt && aarch64_extending_load_p (vinfo, stmt_info))
217ccab8
RS
13930 stmt_cost = 0;
13931
2d56600c
RS
13932 /* For similar reasons, vector_stmt integer truncations are a no-op,
13933 because we can just ignore the unused upper bits of the source. */
13934 if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
13935 stmt_cost = 0;
13936
8b50d7a4
RS
13937 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
13938 but there are no equivalent instructions for SVE. This means that
13939 (all other things being equal) 128-bit SVE needs twice as many load
13940 and store instructions as Advanced SIMD in order to process vector pairs.
13941
13942 Also, scalar code can often use LDP and STP to access pairs of values,
13943 so it is too simplistic to say that one SVE load or store replaces
13944 VF scalar loads and stores.
13945
13946 Ideally we would account for this in the scalar and Advanced SIMD
13947 costs by making suitable load/store pairs as cheap as a single
13948 load/store. However, that would be a very invasive change and in
13949 practice it tends to stress other parts of the cost model too much.
13950 E.g. stores of scalar constants currently count just a store,
13951 whereas stores of vector constants count a store and a vec_init.
13952 This is an artificial distinction for AArch64, where stores of
13953 nonzero scalar constants need the same kind of register invariant
13954 as vector stores.
13955
13956 An alternative would be to double the cost of any SVE loads and stores
13957 that could be paired in Advanced SIMD (and possibly also paired in
13958 scalar code). But this tends to stress other parts of the cost model
13959 in the same way. It also means that we can fall back to Advanced SIMD
13960 even if full-loop predication would have been useful.
13961
13962 Here we go for a more conservative version: double the costs of SVE
13963 loads and stores if one iteration of the scalar loop processes enough
13964 elements for it to use a whole number of Advanced SIMD LDP or STP
13965 instructions. This makes it very likely that the VF would be 1 for
13966 Advanced SIMD, and so no epilogue should be needed. */
13967 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
13968 {
13969 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
13970 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
13971 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
13972 if (multiple_p (count * elt_bits, 256)
13973 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
13974 stmt_cost *= 2;
13975 }
13976
217ccab8
RS
13977 return stmt_cost;
13978}
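The doubling condition above boils down to: does one scalar iteration touch a whole number of 256-bit chunks, i.e. enough data for Advanced SIMD to use LDP/STP on pairs of 128-bit registers? A standalone sketch (not GCC code) of that check:

#include <stdio.h>

static int
sve_ldst_cost (int count, int elt_bits, int stmt_cost)
{
  /* Mirrors multiple_p (count * elt_bits, 256) above.  */
  if ((count * elt_bits) % 256 == 0)
    stmt_cost *= 2;
  return stmt_cost;
}

int
main (void)
{
  printf ("%d\n", sve_ldst_cost (8, 32, 1));	/* 8 x 32 = 256 bits -> 2 */
  printf ("%d\n", sve_ldst_cost (3, 32, 1));	/* 96 bits           -> 1 */
  return 0;
}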
13979
8990e73a
TB
13980/* Implement targetm.vectorize.add_stmt_cost. */
13981static unsigned
308bc496
RB
13982aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
13983 enum vect_cost_for_stmt kind,
78db0e09
RB
13984 struct _stmt_vec_info *stmt_info, tree vectype,
13985 int misalign, enum vect_cost_model_location where)
8990e73a
TB
13986{
13987 unsigned *cost = (unsigned *) data;
13988 unsigned retval = 0;
13989
13990 if (flag_vect_cost_model)
13991 {
8990e73a
TB
13992 int stmt_cost =
13993 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
13994
217ccab8 13995 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
308bc496
RB
13996 stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
13997 vectype, stmt_cost);
217ccab8 13998
8990e73a
TB
13999 /* Statements in an inner loop relative to the loop being
14000 vectorized are weighted more heavily. The value here is
058e4c71 14001 arbitrary and could potentially be improved with analysis. */
308bc496
RB
14002 if (where == vect_body && stmt_info
14003 && stmt_in_inner_loop_p (vinfo, stmt_info))
058e4c71 14004 count *= 50; /* FIXME */
8990e73a
TB
14005
14006 retval = (unsigned) (count * stmt_cost);
14007 cost[where] += retval;
14008 }
14009
14010 return retval;
14011}
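Putting the pieces together, the hook above contributes count * stmt_cost per statement, weights statements in an inner loop by the FIXME factor of 50, and accumulates the result into a per-location bucket. A standalone sketch (not GCC code) of that accumulation, with illustrative cost numbers:

#include <stdio.h>

enum where { PROLOGUE, BODY, EPILOGUE, NUM_WHERE };

static unsigned
add_stmt_cost (unsigned *buckets, int count, int stmt_cost,
	       enum where where, int in_inner_loop)
{
  if (where == BODY && in_inner_loop)
    count *= 50;			/* the FIXME heuristic above */
  unsigned retval = (unsigned) (count * stmt_cost);
  buckets[where] += retval;
  return retval;
}

int
main (void)
{
  unsigned buckets[NUM_WHERE] = { 0, 0, 0 };
  add_stmt_cost (buckets, 1, 4, BODY, 0);	/* plain vector load */
  add_stmt_cost (buckets, 1, 1, BODY, 1);	/* stmt in an inner loop */
  printf ("body cost = %u\n", buckets[BODY]);	/* 4 + 50 = 54 */
  return 0;
}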
14012
0cfff2a1 14013static void initialize_aarch64_code_model (struct gcc_options *);
43e9d192 14014
0cfff2a1
KT
14015/* Parse the TO_PARSE string and put the architecture struct that it
14016 selects into RES and the architectural features into ISA_FLAGS.
14017 Return an aarch64_parse_opt_result describing the parse result.
c7887347
ML
14018 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
14019 When the TO_PARSE string contains an invalid extension,
14020 a copy of the string is created and stored to INVALID_EXTENSION. */
43e9d192 14021
0cfff2a1
KT
14022static enum aarch64_parse_opt_result
14023aarch64_parse_arch (const char *to_parse, const struct processor **res,
28108a53 14024 uint64_t *isa_flags, std::string *invalid_extension)
43e9d192 14025{
ff150bc4 14026 const char *ext;
43e9d192 14027 const struct processor *arch;
43e9d192
IB
14028 size_t len;
14029
ff150bc4 14030 ext = strchr (to_parse, '+');
43e9d192
IB
14031
14032 if (ext != NULL)
ff150bc4 14033 len = ext - to_parse;
43e9d192 14034 else
ff150bc4 14035 len = strlen (to_parse);
43e9d192
IB
14036
14037 if (len == 0)
0cfff2a1
KT
14038 return AARCH64_PARSE_MISSING_ARG;
14039
43e9d192 14040
0cfff2a1 14041 /* Loop through the list of supported ARCHes to find a match. */
43e9d192
IB
14042 for (arch = all_architectures; arch->name != NULL; arch++)
14043 {
ff150bc4
ML
14044 if (strlen (arch->name) == len
14045 && strncmp (arch->name, to_parse, len) == 0)
43e9d192 14046 {
28108a53 14047 uint64_t isa_temp = arch->flags;
43e9d192
IB
14048
14049 if (ext != NULL)
14050 {
0cfff2a1
KT
14051 /* TO_PARSE string contains at least one extension. */
14052 enum aarch64_parse_opt_result ext_res
c7887347 14053 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
43e9d192 14054
0cfff2a1
KT
14055 if (ext_res != AARCH64_PARSE_OK)
14056 return ext_res;
ffee7aa9 14057 }
0cfff2a1
KT
14058 /* Extension parsing was successful. Confirm the result
14059 arch and ISA flags. */
14060 *res = arch;
14061 *isa_flags = isa_temp;
14062 return AARCH64_PARSE_OK;
43e9d192
IB
14063 }
14064 }
14065
14066 /* ARCH name not found in list. */
0cfff2a1 14067 return AARCH64_PARSE_INVALID_ARG;
43e9d192
IB
14068}
14069
0cfff2a1
KT
14070/* Parse the TO_PARSE string and put the result tuning in RES and the
14071 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
14072 describing the parse result. If there is an error parsing, RES and
c7887347
ML
14073 ISA_FLAGS are left unchanged.
14074 When the TO_PARSE string contains an invalid extension,
14075 a copy of the string is created and stored to INVALID_EXTENSION. */
43e9d192 14076
0cfff2a1
KT
14077static enum aarch64_parse_opt_result
14078aarch64_parse_cpu (const char *to_parse, const struct processor **res,
28108a53 14079 uint64_t *isa_flags, std::string *invalid_extension)
43e9d192 14080{
ff150bc4 14081 const char *ext;
43e9d192 14082 const struct processor *cpu;
43e9d192
IB
14083 size_t len;
14084
ff150bc4 14085 ext = strchr (to_parse, '+');
43e9d192
IB
14086
14087 if (ext != NULL)
ff150bc4 14088 len = ext - to_parse;
43e9d192 14089 else
ff150bc4 14090 len = strlen (to_parse);
43e9d192
IB
14091
14092 if (len == 0)
0cfff2a1
KT
14093 return AARCH64_PARSE_MISSING_ARG;
14094
43e9d192
IB
14095
14096 /* Loop through the list of supported CPUs to find a match. */
14097 for (cpu = all_cores; cpu->name != NULL; cpu++)
14098 {
ff150bc4 14099 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
43e9d192 14100 {
28108a53 14101 uint64_t isa_temp = cpu->flags;
0cfff2a1 14102
43e9d192
IB
14103
14104 if (ext != NULL)
14105 {
0cfff2a1
KT
14106 /* TO_PARSE string contains at least one extension. */
14107 enum aarch64_parse_opt_result ext_res
c7887347 14108 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
43e9d192 14109
0cfff2a1
KT
14110 if (ext_res != AARCH64_PARSE_OK)
14111 return ext_res;
14112 }
14113 /* Extension parsing was successful. Confirm the result
14114 cpu and ISA flags. */
14115 *res = cpu;
14116 *isa_flags = isa_temp;
14117 return AARCH64_PARSE_OK;
43e9d192
IB
14118 }
14119 }
14120
14121 /* CPU name not found in list. */
0cfff2a1 14122 return AARCH64_PARSE_INVALID_ARG;
43e9d192
IB
14123}
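Both parsers above split their argument at the first '+': the prefix names the architecture or core and the remainder (if any) is handed to the extension parser. A standalone sketch (not GCC code) of that split, using example strings:

#include <stdio.h>
#include <string.h>

static void
split_cpu_string (const char *to_parse)
{
  const char *ext = strchr (to_parse, '+');
  size_t len = ext ? (size_t) (ext - to_parse) : strlen (to_parse);

  if (len == 0)
    {
      printf ("missing argument\n");
      return;
    }
  printf ("name = %.*s, extensions = %s\n",
	  (int) len, to_parse, ext ? ext : "(none)");
}

int
main (void)
{
  split_cpu_string ("armv8.2-a+sve+nofp16");
  split_cpu_string ("cortex-a57");
  return 0;
}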
14124
0cfff2a1
KT
14125/* Parse the TO_PARSE string and put the cpu it selects into RES.
14126 Return an aarch64_parse_opt_result describing the parse result.
14127 If the parsing fails the RES does not change. */
43e9d192 14128
0cfff2a1
KT
14129static enum aarch64_parse_opt_result
14130aarch64_parse_tune (const char *to_parse, const struct processor **res)
43e9d192
IB
14131{
14132 const struct processor *cpu;
43e9d192
IB
14133
14134 /* Loop through the list of supported CPUs to find a match. */
14135 for (cpu = all_cores; cpu->name != NULL; cpu++)
14136 {
ff150bc4 14137 if (strcmp (cpu->name, to_parse) == 0)
43e9d192 14138 {
0cfff2a1
KT
14139 *res = cpu;
14140 return AARCH64_PARSE_OK;
43e9d192
IB
14141 }
14142 }
14143
14144 /* CPU name not found in list. */
0cfff2a1 14145 return AARCH64_PARSE_INVALID_ARG;
43e9d192
IB
14146}
14147
8dec06f2
JG
14148/* Parse TOKEN, which has length LENGTH to see if it is an option
14149 described in FLAG. If it is, return the index bit for that fusion type.
14150 If not, error (printing OPTION_NAME) and return zero. */
14151
14152static unsigned int
14153aarch64_parse_one_option_token (const char *token,
14154 size_t length,
14155 const struct aarch64_flag_desc *flag,
14156 const char *option_name)
14157{
14158 for (; flag->name != NULL; flag++)
14159 {
14160 if (length == strlen (flag->name)
14161 && !strncmp (flag->name, token, length))
14162 return flag->flag;
14163 }
14164
a3f9f006 14165 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
8dec06f2
JG
14166 return 0;
14167}
14168
14169/* Parse OPTION which is a comma-separated list of flags to enable.
14170 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
14171 default state we inherit from the CPU tuning structures. OPTION_NAME
14172 gives the top-level option we are parsing in the -moverride string,
14173 for use in error messages. */
14174
14175static unsigned int
14176aarch64_parse_boolean_options (const char *option,
14177 const struct aarch64_flag_desc *flags,
14178 unsigned int initial_state,
14179 const char *option_name)
14180{
14181 const char separator = '.';
14182 const char* specs = option;
14183 const char* ntoken = option;
14184 unsigned int found_flags = initial_state;
14185
14186 while ((ntoken = strchr (specs, separator)))
14187 {
14188 size_t token_length = ntoken - specs;
14189 unsigned token_ops = aarch64_parse_one_option_token (specs,
14190 token_length,
14191 flags,
14192 option_name);
14193 /* If we find "none" (or, for simplicity's sake, an error) anywhere
14194 in the token stream, reset the supported operations. So:
14195
14196 adrp+add.cmp+branch.none.adrp+add
14197
14198 would have the result of turning on only adrp+add fusion. */
14199 if (!token_ops)
14200 found_flags = 0;
14201
14202 found_flags |= token_ops;
14203 specs = ++ntoken;
14204 }
14205
14206 /* The string ended with a trailing separator; diagnose it as ill-formed. */
14207 if (!(*specs))
14208 {
14209 error ("%s string ill-formed\n", option_name);
14210 return 0;
14211 }
14212
14213 /* We still have one more token to parse. */
14214 size_t token_length = strlen (specs);
14215 unsigned token_ops = aarch64_parse_one_option_token (specs,
14216 token_length,
14217 flags,
14218 option_name);
14219 if (!token_ops)
14220 found_flags = 0;
14221
14222 found_flags |= token_ops;
14223 return found_flags;
14224}
14225
14226/* Support for overriding instruction fusion. */
14227
14228static void
14229aarch64_parse_fuse_string (const char *fuse_string,
14230 struct tune_params *tune)
14231{
14232 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
14233 aarch64_fusible_pairs,
14234 tune->fusible_ops,
14235 "fuse=");
14236}
14237
14238/* Support for overriding other tuning flags. */
14239
14240static void
14241aarch64_parse_tune_string (const char *tune_string,
14242 struct tune_params *tune)
14243{
14244 tune->extra_tuning_flags
14245 = aarch64_parse_boolean_options (tune_string,
14246 aarch64_tuning_flags,
14247 tune->extra_tuning_flags,
14248 "tune=");
14249}
14250
886f092f
KT
14251/* Parse the sve_width tuning moverride string in TUNE_STRING.
14252 Accept the valid SVE vector widths allowed by
14253 aarch64_sve_vector_bits_enum and use it to override sve_width
14254 in TUNE. */
14255
14256static void
14257aarch64_parse_sve_width_string (const char *tune_string,
14258 struct tune_params *tune)
14259{
14260 int width = -1;
14261
14262 int n = sscanf (tune_string, "%d", &width);
14263 if (n == EOF)
14264 {
14265 error ("invalid format for sve_width");
14266 return;
14267 }
14268 switch (width)
14269 {
14270 case SVE_128:
14271 case SVE_256:
14272 case SVE_512:
14273 case SVE_1024:
14274 case SVE_2048:
14275 break;
14276 default:
14277 error ("invalid sve_width value: %d", width);
14278 }
14279 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
14280}
14281
8dec06f2
JG
14282/* Parse TOKEN, which has length LENGTH to see if it is a tuning option
14283 we understand. If it is, extract the option string and handoff to
14284 the appropriate function. */
14285
14286void
14287aarch64_parse_one_override_token (const char* token,
14288 size_t length,
14289 struct tune_params *tune)
14290{
14291 const struct aarch64_tuning_override_function *fn
14292 = aarch64_tuning_override_functions;
14293
14294 const char *option_part = strchr (token, '=');
14295 if (!option_part)
14296 {
14297 error ("tuning string missing in option (%s)", token);
14298 return;
14299 }
14300
14301 /* Get the length of the option name. */
14302 length = option_part - token;
14303 /* Skip the '=' to get to the option string. */
14304 option_part++;
14305
14306 for (; fn->name != NULL; fn++)
14307 {
14308 if (!strncmp (fn->name, token, length))
14309 {
14310 fn->parse_override (option_part, tune);
14311 return;
14312 }
14313 }
14314
14315 error ("unknown tuning option (%s)",token);
14316 return;
14317}
14318
5eee3c34
JW
14319 /* Validate the requested TLS size and clamp it to what the code model allows. */
14320
14321static void
14322initialize_aarch64_tls_size (struct gcc_options *opts)
14323{
14324 if (aarch64_tls_size == 0)
14325 aarch64_tls_size = 24;
14326
14327 switch (opts->x_aarch64_cmodel_var)
14328 {
14329 case AARCH64_CMODEL_TINY:
14330 /* Both the default and maximum TLS size allowed under tiny are 1M, which
14331 needs two instructions to address, so we clamp the size to 24. */
14332 if (aarch64_tls_size > 24)
14333 aarch64_tls_size = 24;
14334 break;
14335 case AARCH64_CMODEL_SMALL:
14336 /* The maximum TLS size allowed under small is 4G. */
14337 if (aarch64_tls_size > 32)
14338 aarch64_tls_size = 32;
14339 break;
14340 case AARCH64_CMODEL_LARGE:
14341 /* The maximum TLS size allowed under large is 16E.
14342 FIXME: 16E should be 64bit, we only support 48bit offset now. */
14343 if (aarch64_tls_size > 48)
14344 aarch64_tls_size = 48;
14345 break;
14346 default:
14347 gcc_unreachable ();
14348 }
14349
14350 return;
14351}
14352
8dec06f2
JG
14353/* Parse STRING looking for options in the format:
14354 string :: option:string
14355 option :: name=substring
14356 name :: {a-z}
14357 substring :: defined by option. */
14358
14359static void
14360aarch64_parse_override_string (const char* input_string,
14361 struct tune_params* tune)
14362{
14363 const char separator = ':';
14364 size_t string_length = strlen (input_string) + 1;
14365 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
14366 char *string = string_root;
14367 strncpy (string, input_string, string_length);
14368 string[string_length - 1] = '\0';
14369
14370 char* ntoken = string;
14371
14372 while ((ntoken = strchr (string, separator)))
14373 {
14374 size_t token_length = ntoken - string;
14375 /* Make this substring look like a string. */
14376 *ntoken = '\0';
14377 aarch64_parse_one_override_token (string, token_length, tune);
14378 string = ++ntoken;
14379 }
14380
14381 /* One last option to parse. */
14382 aarch64_parse_one_override_token (string, strlen (string), tune);
14383 free (string_root);
14384}
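For reference, the -moverride grammar handled above is a ':'-separated list of name=value options, and boolean flag values (such as fuse=) are '.'-separated tokens in which "none" resets everything accumulated so far. The standalone sketch below (not GCC code) parses just such a flag list; the flag names and bit assignments are illustrative.

#include <stdio.h>
#include <string.h>

static unsigned int
parse_flags (const char *value)
{
  unsigned int flags = 0;
  const char *tok = value;
  for (;;)
    {
      const char *dot = strchr (tok, '.');
      size_t len = dot ? (size_t) (dot - tok) : strlen (tok);
      if (len == strlen ("none") && !strncmp (tok, "none", len))
	flags = 0;			/* "none" resets the set */
      else if (len == strlen ("adrp+add") && !strncmp (tok, "adrp+add", len))
	flags |= 1u << 0;		/* illustrative bit assignment */
      else if (len == strlen ("cmp+branch")
	       && !strncmp (tok, "cmp+branch", len))
	flags |= 1u << 1;
      if (!dot)
	break;
      tok = dot + 1;
    }
  return flags;
}

int
main (void)
{
  /* Within e.g. -moverride=fuse=<flags>:sve_width=256, the <flags>
     part is parsed as below; only adrp+add survives the "none".  */
  unsigned int ops = parse_flags ("adrp+add.cmp+branch.none.adrp+add");
  printf ("fusible_ops = %#x\n", ops);	/* 0x1 */
  return 0;
}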
43e9d192 14385
43e9d192
IB
14386
14387static void
0cfff2a1 14388aarch64_override_options_after_change_1 (struct gcc_options *opts)
43e9d192 14389{
efac62a3
ST
14390 if (accepted_branch_protection_string)
14391 {
14392 opts->x_aarch64_branch_protection_string
14393 = xstrdup (accepted_branch_protection_string);
14394 }
14395
acea40ac
WD
14396 /* PR 70044: We have to be careful about being called multiple times for the
14397 same function. This means all changes should be repeatable. */
14398
d6cb6d6a
WD
14399 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
14400 Disable the frame pointer flag so the mid-end will not use a frame
14401 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
14402 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
14403 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
14404 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
acea40ac 14405 if (opts->x_flag_omit_frame_pointer == 0)
a3dc8760 14406 opts->x_flag_omit_frame_pointer = 2;
43e9d192 14407
1be34295 14408 /* If not optimizing for size, set the default
0cfff2a1
KT
14409 alignment to what the target wants. */
14410 if (!opts->x_optimize_size)
43e9d192 14411 {
c518c102
ML
14412 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
14413 opts->x_str_align_loops = aarch64_tune_params.loop_align;
14414 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
14415 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
14416 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
14417 opts->x_str_align_functions = aarch64_tune_params.function_align;
43e9d192 14418 }
b4f50fd4 14419
9ee6540a
WD
14420 /* We default to no pc-relative literal loads. */
14421
14422 aarch64_pcrelative_literal_loads = false;
14423
14424 /* If -mpc-relative-literal-loads is set on the command line, this
b4f50fd4 14425 implies that the user asked for PC relative literal loads. */
9ee6540a
WD
14426 if (opts->x_pcrelative_literal_loads == 1)
14427 aarch64_pcrelative_literal_loads = true;
b4f50fd4 14428
9ee6540a
WD
14429 /* In the tiny memory model it makes no sense to disallow PC relative
14430 literal pool loads. */
14431 if (aarch64_cmodel == AARCH64_CMODEL_TINY
14432 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14433 aarch64_pcrelative_literal_loads = true;
98daafa0
EM
14434
14435 /* When enabling the lower precision Newton series for the square root, also
14436 enable it for the reciprocal square root, since the latter is an
14437 intermediary step for the former. */
14438 if (flag_mlow_precision_sqrt)
14439 flag_mrecip_low_precision_sqrt = true;
0cfff2a1 14440}
43e9d192 14441
0cfff2a1
KT
14442/* 'Unpack' up the internal tuning structs and update the options
14443 in OPTS. The caller must have set up selected_tune and selected_arch
14444 as all the other target-specific codegen decisions are
14445 derived from them. */
14446
e4ea20c8 14447void
0cfff2a1
KT
14448aarch64_override_options_internal (struct gcc_options *opts)
14449{
14450 aarch64_tune_flags = selected_tune->flags;
14451 aarch64_tune = selected_tune->sched_core;
14452 /* Make a copy of the tuning parameters attached to the core, which
14453 we may later overwrite. */
14454 aarch64_tune_params = *(selected_tune->tune);
14455 aarch64_architecture_version = selected_arch->architecture_version;
14456
14457 if (opts->x_aarch64_override_tune_string)
14458 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
14459 &aarch64_tune_params);
14460
14461 /* This target defaults to strict volatile bitfields. */
14462 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
14463 opts->x_flag_strict_volatile_bitfields = 1;
14464
cd0b2d36
RR
14465 if (aarch64_stack_protector_guard == SSP_GLOBAL
14466 && opts->x_aarch64_stack_protector_guard_offset_str)
14467 {
41804907 14468 error ("incompatible options %<-mstack-protector-guard=global%> and "
63d42e89 14469 "%<-mstack-protector-guard-offset=%s%>",
cd0b2d36
RR
14470 aarch64_stack_protector_guard_offset_str);
14471 }
14472
14473 if (aarch64_stack_protector_guard == SSP_SYSREG
14474 && !(opts->x_aarch64_stack_protector_guard_offset_str
14475 && opts->x_aarch64_stack_protector_guard_reg_str))
14476 {
a3f9f006
ML
14477 error ("both %<-mstack-protector-guard-offset%> and "
14478 "%<-mstack-protector-guard-reg%> must be used "
14479 "with %<-mstack-protector-guard=sysreg%>");
cd0b2d36
RR
14480 }
14481
14482 if (opts->x_aarch64_stack_protector_guard_reg_str)
14483 {
14484 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
14485 error ("specify a system register with a small string length.");
14486 }
14487
14488 if (opts->x_aarch64_stack_protector_guard_offset_str)
14489 {
14490 char *end;
14491 const char *str = aarch64_stack_protector_guard_offset_str;
14492 errno = 0;
14493 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
14494 if (!*str || *end || errno)
14495 error ("%qs is not a valid offset in %qs", str,
63d42e89 14496 "-mstack-protector-guard-offset=");
cd0b2d36
RR
14497 aarch64_stack_protector_guard_offset = offs;
14498 }
14499
0cfff2a1 14500 initialize_aarch64_code_model (opts);
5eee3c34 14501 initialize_aarch64_tls_size (opts);
63892fa2 14502
2d6bc7fa
KT
14503 int queue_depth = 0;
14504 switch (aarch64_tune_params.autoprefetcher_model)
14505 {
14506 case tune_params::AUTOPREFETCHER_OFF:
14507 queue_depth = -1;
14508 break;
14509 case tune_params::AUTOPREFETCHER_WEAK:
14510 queue_depth = 0;
14511 break;
14512 case tune_params::AUTOPREFETCHER_STRONG:
14513 queue_depth = max_insn_queue_index + 1;
14514 break;
14515 default:
14516 gcc_unreachable ();
14517 }
14518
14519 /* We don't mind passing in global_options_set here as we don't use
14520 the *options_set structs anyway. */
028d4092
ML
14521 SET_OPTION_IF_UNSET (opts, &global_options_set,
14522 param_sched_autopref_queue_depth, queue_depth);
2d6bc7fa 14523
5f29f3d5
KT
14524 /* If using Advanced SIMD only for autovectorization disable SVE vector costs
14525 comparison. */
14526 if (aarch64_autovec_preference == 1)
14527 SET_OPTION_IF_UNSET (opts, &global_options_set,
14528 aarch64_sve_compare_costs, 0);
14529
9d2c6e2e
MK
14530 /* Set up parameters to be used in prefetching algorithm. Do not
14531 override the defaults unless we are tuning for a core we have
14532 researched values for. */
14533 if (aarch64_tune_params.prefetch->num_slots > 0)
028d4092
ML
14534 SET_OPTION_IF_UNSET (opts, &global_options_set,
14535 param_simultaneous_prefetches,
14536 aarch64_tune_params.prefetch->num_slots);
9d2c6e2e 14537 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
028d4092
ML
14538 SET_OPTION_IF_UNSET (opts, &global_options_set,
14539 param_l1_cache_size,
14540 aarch64_tune_params.prefetch->l1_cache_size);
9d2c6e2e 14541 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
028d4092
ML
14542 SET_OPTION_IF_UNSET (opts, &global_options_set,
14543 param_l1_cache_line_size,
14544 aarch64_tune_params.prefetch->l1_cache_line_size);
9d2c6e2e 14545 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
028d4092
ML
14546 SET_OPTION_IF_UNSET (opts, &global_options_set,
14547 param_l2_cache_size,
14548 aarch64_tune_params.prefetch->l2_cache_size);
d2ff35c0 14549 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
028d4092
ML
14550 SET_OPTION_IF_UNSET (opts, &global_options_set,
14551 param_prefetch_dynamic_strides, 0);
59100dfc 14552 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
028d4092
ML
14553 SET_OPTION_IF_UNSET (opts, &global_options_set,
14554 param_prefetch_minimum_stride,
14555 aarch64_tune_params.prefetch->minimum_stride);
50487d79 14556
13494fcb 14557 /* Use the alternative scheduling-pressure algorithm by default. */
028d4092
ML
14558 SET_OPTION_IF_UNSET (opts, &global_options_set,
14559 param_sched_pressure_algorithm,
14560 SCHED_PRESSURE_MODEL);
13494fcb 14561
fbe9af50 14562 /* Validate the guard size. */
028d4092 14563 int guard_size = param_stack_clash_protection_guard_size;
fbe9af50 14564
8100e93b
ML
14565 if (guard_size != 12 && guard_size != 16)
14566 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
14567 "size. Given value %d (%llu KB) is out of range",
14568 guard_size, (1ULL << guard_size) / 1024ULL);
14569
fbe9af50
TC
14570 /* Enforce that interval is the same size as size so the mid-end does the
14571 right thing. */
028d4092
ML
14572 SET_OPTION_IF_UNSET (opts, &global_options_set,
14573 param_stack_clash_protection_probe_interval,
14574 guard_size);
fbe9af50
TC
14575
14576 /* The maybe_set calls won't update the value if the user has explicitly set
14577 one. Which means we need to validate that probing interval and guard size
14578 are equal. */
14579 int probe_interval
028d4092 14580 = param_stack_clash_protection_probe_interval;
fbe9af50 14581 if (guard_size != probe_interval)
904f3daa
ML
14582 error ("stack clash guard size %<%d%> must be equal to probing interval "
14583 "%<%d%>", guard_size, probe_interval);
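/* Editor's note: GUARD_SIZE here is the base-2 logarithm of the guard
   region in bytes, so the two supported values map to 4 KB and 64 KB, as
   the diagnostic above prints.  Illustrative check, not part of aarch64.c:

     (1ULL << 12) / 1024ULL == 4     and     (1ULL << 16) / 1024ULL == 64.  */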
fbe9af50 14584
16b2cafd
MK
14585 /* Enable sw prefetching at specified optimization level for
14586 CPUS that have prefetch. Lower optimization level threshold by 1
14587 when profiling is enabled. */
14588 if (opts->x_flag_prefetch_loop_arrays < 0
14589 && !opts->x_optimize_size
14590 && aarch64_tune_params.prefetch->default_opt_level >= 0
14591 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
14592 opts->x_flag_prefetch_loop_arrays = 1;
14593
266c2b54
ML
14594 if (opts->x_aarch64_arch_string == NULL)
14595 opts->x_aarch64_arch_string = selected_arch->name;
14596 if (opts->x_aarch64_cpu_string == NULL)
14597 opts->x_aarch64_cpu_string = selected_cpu->name;
14598 if (opts->x_aarch64_tune_string == NULL)
14599 opts->x_aarch64_tune_string = selected_tune->name;
14600
0cfff2a1
KT
14601 aarch64_override_options_after_change_1 (opts);
14602}
43e9d192 14603
01f44038
KT
14604/* Print a hint with a suggestion for a core or architecture name that
14605 most closely resembles what the user passed in STR. ARCH is true if
14606 the user is asking for an architecture name. ARCH is false if the user
14607 is asking for a core name. */
14608
14609static void
14610aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
14611{
14612 auto_vec<const char *> candidates;
14613 const struct processor *entry = arch ? all_architectures : all_cores;
14614 for (; entry->name != NULL; entry++)
14615 candidates.safe_push (entry->name);
a08b5429
ML
14616
14617#ifdef HAVE_LOCAL_CPU_DETECT
 14618 /* Also add "native" as a possible value. */
14619 if (arch)
14620 candidates.safe_push ("native");
14621#endif
14622
01f44038
KT
14623 char *s;
14624 const char *hint = candidates_list_and_hint (str, s, candidates);
14625 if (hint)
14626 inform (input_location, "valid arguments are: %s;"
14627 " did you mean %qs?", s, hint);
6285e915
ML
14628 else
14629 inform (input_location, "valid arguments are: %s", s);
14630
01f44038
KT
14631 XDELETEVEC (s);
14632}
14633
14634/* Print a hint with a suggestion for a core name that most closely resembles
14635 what the user passed in STR. */
14636
14637inline static void
14638aarch64_print_hint_for_core (const char *str)
14639{
14640 aarch64_print_hint_for_core_or_arch (str, false);
14641}
14642
14643/* Print a hint with a suggestion for an architecture name that most closely
14644 resembles what the user passed in STR. */
14645
14646inline static void
14647aarch64_print_hint_for_arch (const char *str)
14648{
14649 aarch64_print_hint_for_core_or_arch (str, true);
14650}
14651
c7887347
ML
14652
14653/* Print a hint with a suggestion for an extension name
14654 that most closely resembles what the user passed in STR. */
14655
14656void
14657aarch64_print_hint_for_extensions (const std::string &str)
14658{
14659 auto_vec<const char *> candidates;
14660 aarch64_get_all_extension_candidates (&candidates);
14661 char *s;
14662 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
14663 if (hint)
14664 inform (input_location, "valid arguments are: %s;"
14665 " did you mean %qs?", s, hint);
14666 else
 14667 inform (input_location, "valid arguments are: %s", s);
14668
14669 XDELETEVEC (s);
14670}
14671
0cfff2a1
KT
14672/* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
14673 specified in STR and throw errors if appropriate. Put the results if
361fb3ee
KT
14674 they are valid in RES and ISA_FLAGS. Return whether the option is
14675 valid. */
43e9d192 14676
361fb3ee 14677static bool
0cfff2a1 14678aarch64_validate_mcpu (const char *str, const struct processor **res,
28108a53 14679 uint64_t *isa_flags)
0cfff2a1 14680{
c7887347 14681 std::string invalid_extension;
0cfff2a1 14682 enum aarch64_parse_opt_result parse_res
c7887347 14683 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
0cfff2a1
KT
14684
14685 if (parse_res == AARCH64_PARSE_OK)
361fb3ee 14686 return true;
0cfff2a1
KT
14687
14688 switch (parse_res)
14689 {
14690 case AARCH64_PARSE_MISSING_ARG:
fb241da2 14691 error ("missing cpu name in %<-mcpu=%s%>", str);
0cfff2a1
KT
14692 break;
14693 case AARCH64_PARSE_INVALID_ARG:
a3f9f006 14694 error ("unknown value %qs for %<-mcpu%>", str);
01f44038 14695 aarch64_print_hint_for_core (str);
0cfff2a1
KT
14696 break;
14697 case AARCH64_PARSE_INVALID_FEATURE:
c7887347
ML
14698 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
14699 invalid_extension.c_str (), str);
14700 aarch64_print_hint_for_extensions (invalid_extension);
0cfff2a1
KT
14701 break;
14702 default:
14703 gcc_unreachable ();
14704 }
361fb3ee
KT
14705
14706 return false;
0cfff2a1
KT
14707}
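/* Editor's note: an illustrative (hypothetical) diagnostic produced by the
   error/hint pair above for a near-miss spelling; the exact candidate list
   and wording depend on the configuration and command line:

     error: unknown value 'cortex-a57x' for '-mcpu'
     note: valid arguments are: ...; did you mean 'cortex-a57'?  */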
14708
a9ba2a9b
MM
14709/* Straight line speculation indicators. */
14710enum aarch64_sls_hardening_type
14711{
14712 SLS_NONE = 0,
14713 SLS_RETBR = 1,
14714 SLS_BLR = 2,
14715 SLS_ALL = 3,
14716};
14717static enum aarch64_sls_hardening_type aarch64_sls_hardening;
14718
14719/* Return whether we should mitigate Straight Line Speculation for the RET
14720 and BR instructions. */
14721bool
14722aarch64_harden_sls_retbr_p (void)
14723{
14724 return aarch64_sls_hardening & SLS_RETBR;
14725}
14726
14727/* Return whether we should mitigate Straight Line Speculation for the BLR
14728 instruction. */
14729bool
14730aarch64_harden_sls_blr_p (void)
14731{
14732 return aarch64_sls_hardening & SLS_BLR;
14733}
14734
14735/* As of yet we only allow setting these options globally, in the future we may
14736 allow setting them per function. */
14737static void
14738aarch64_validate_sls_mitigation (const char *const_str)
14739{
14740 char *token_save = NULL;
14741 char *str = NULL;
14742
14743 if (strcmp (const_str, "none") == 0)
14744 {
14745 aarch64_sls_hardening = SLS_NONE;
14746 return;
14747 }
14748 if (strcmp (const_str, "all") == 0)
14749 {
14750 aarch64_sls_hardening = SLS_ALL;
14751 return;
14752 }
14753
14754 char *str_root = xstrdup (const_str);
14755 str = strtok_r (str_root, ",", &token_save);
14756 if (!str)
14757 error ("invalid argument given to %<-mharden-sls=%>");
14758
14759 int temp = SLS_NONE;
14760 while (str)
14761 {
14762 if (strcmp (str, "blr") == 0)
14763 temp |= SLS_BLR;
14764 else if (strcmp (str, "retbr") == 0)
14765 temp |= SLS_RETBR;
14766 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
14767 {
14768 error ("%<%s%> must be by itself for %<-mharden-sls=%>", str);
14769 break;
14770 }
14771 else
14772 {
14773 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
14774 break;
14775 }
14776 str = strtok_r (NULL, ",", &token_save);
14777 }
14778 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
14779 free (str_root);
14780}
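/* Editor's note: an illustrative summary of the -mharden-sls= spellings the
   parser above accepts and the resulting mask (derived from the code, not
   normative documentation):

     -mharden-sls=none        -> SLS_NONE
     -mharden-sls=retbr       -> SLS_RETBR
     -mharden-sls=blr         -> SLS_BLR
     -mharden-sls=retbr,blr   -> SLS_RETBR | SLS_BLR (equivalent to SLS_ALL)
     -mharden-sls=all         -> SLS_ALL

   "none" and "all" are only accepted on their own, not as part of a
   comma-separated list.  */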
14781
efac62a3
ST
14782/* Parses CONST_STR for branch protection features specified in
14783 aarch64_branch_protect_types, and set any global variables required. Returns
14784 the parsing result and assigns LAST_STR to the last processed token from
14785 CONST_STR so that it can be used for error reporting. */
14786
14787static enum
14788aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
14789 char** last_str)
14790{
14791 char *str_root = xstrdup (const_str);
14792 char* token_save = NULL;
14793 char *str = strtok_r (str_root, "+", &token_save);
14794 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
14795 if (!str)
14796 res = AARCH64_PARSE_MISSING_ARG;
14797 else
14798 {
14799 char *next_str = strtok_r (NULL, "+", &token_save);
14800 /* Reset the branch protection features to their defaults. */
14801 aarch64_handle_no_branch_protection (NULL, NULL);
14802
14803 while (str && res == AARCH64_PARSE_OK)
14804 {
14805 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
14806 bool found = false;
14807 /* Search for this type. */
14808 while (type && type->name && !found && res == AARCH64_PARSE_OK)
14809 {
14810 if (strcmp (str, type->name) == 0)
14811 {
14812 found = true;
14813 res = type->handler (str, next_str);
14814 str = next_str;
14815 next_str = strtok_r (NULL, "+", &token_save);
14816 }
14817 else
14818 type++;
14819 }
14820 if (found && res == AARCH64_PARSE_OK)
14821 {
14822 bool found_subtype = true;
14823 /* Loop through each token until we find one that isn't a
14824 subtype. */
14825 while (found_subtype)
14826 {
14827 found_subtype = false;
14828 const aarch64_branch_protect_type *subtype = type->subtypes;
14829 /* Search for the subtype. */
14830 while (str && subtype && subtype->name && !found_subtype
14831 && res == AARCH64_PARSE_OK)
14832 {
14833 if (strcmp (str, subtype->name) == 0)
14834 {
14835 found_subtype = true;
14836 res = subtype->handler (str, next_str);
14837 str = next_str;
14838 next_str = strtok_r (NULL, "+", &token_save);
14839 }
14840 else
14841 subtype++;
14842 }
14843 }
14844 }
14845 else if (!found)
14846 res = AARCH64_PARSE_INVALID_ARG;
14847 }
14848 }
14849 /* Copy the last processed token into the argument to pass it back.
14850 Used by option and attribute validation to print the offending token. */
14851 if (last_str)
14852 {
14853 if (str) strcpy (*last_str, str);
14854 else *last_str = NULL;
14855 }
14856 if (res == AARCH64_PARSE_OK)
14857 {
14858 /* If needed, alloc the accepted string then copy in const_str.
14859 Used by override_option_after_change_1. */
14860 if (!accepted_branch_protection_string)
14861 accepted_branch_protection_string = (char *) xmalloc (
14862 BRANCH_PROTECT_STR_MAX
14863 + 1);
14864 strncpy (accepted_branch_protection_string, const_str,
14865 BRANCH_PROTECT_STR_MAX + 1);
14866 /* Forcibly null-terminate. */
14867 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
14868 }
14869 return res;
14870}
14871
14872static bool
14873aarch64_validate_mbranch_protection (const char *const_str)
14874{
 14875 char *str = (char *) xmalloc (strlen (const_str) + 1);
14876 enum aarch64_parse_opt_result res =
14877 aarch64_parse_branch_protection (const_str, &str);
14878 if (res == AARCH64_PARSE_INVALID_ARG)
a9c697b8 14879 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
efac62a3 14880 else if (res == AARCH64_PARSE_MISSING_ARG)
a9c697b8 14881 error ("missing argument for %<-mbranch-protection=%>");
efac62a3
ST
14882 free (str);
14883 return res == AARCH64_PARSE_OK;
14884}
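/* Editor's note: illustrative examples of how the parser above splits its
   input on '+'; the concrete type and subtype names come from
   aarch64_branch_protect_types and are assumed here, not restated from this
   file:

     "none"          resets every protection to its default (off)
     "bti"           enables branch target identification only
     "pac-ret"       enables return-address signing for non-leaf functions
     "pac-ret+leaf"  "leaf" is consumed as a subtype of "pac-ret"

   An unrecognised leading token yields AARCH64_PARSE_INVALID_ARG, and the
   offending token is copied back through LAST_STR for diagnostics.  */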
14885
0cfff2a1
KT
14886/* Validate a command-line -march option. Parse the arch and extensions
14887 (if any) specified in STR and throw errors if appropriate. Put the
361fb3ee
KT
14888 results, if they are valid, in RES and ISA_FLAGS. Return whether the
14889 option is valid. */
0cfff2a1 14890
361fb3ee 14891static bool
0cfff2a1 14892aarch64_validate_march (const char *str, const struct processor **res,
28108a53 14893 uint64_t *isa_flags)
0cfff2a1 14894{
c7887347 14895 std::string invalid_extension;
0cfff2a1 14896 enum aarch64_parse_opt_result parse_res
c7887347 14897 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
0cfff2a1
KT
14898
14899 if (parse_res == AARCH64_PARSE_OK)
361fb3ee 14900 return true;
0cfff2a1
KT
14901
14902 switch (parse_res)
14903 {
14904 case AARCH64_PARSE_MISSING_ARG:
fb241da2 14905 error ("missing arch name in %<-march=%s%>", str);
0cfff2a1
KT
14906 break;
14907 case AARCH64_PARSE_INVALID_ARG:
a3f9f006 14908 error ("unknown value %qs for %<-march%>", str);
01f44038 14909 aarch64_print_hint_for_arch (str);
0cfff2a1
KT
14910 break;
14911 case AARCH64_PARSE_INVALID_FEATURE:
c7887347
ML
14912 error ("invalid feature modifier %qs in %<-march=%s%>",
14913 invalid_extension.c_str (), str);
14914 aarch64_print_hint_for_extensions (invalid_extension);
0cfff2a1
KT
14915 break;
14916 default:
14917 gcc_unreachable ();
14918 }
361fb3ee
KT
14919
14920 return false;
0cfff2a1
KT
14921}
14922
14923/* Validate a command-line -mtune option. Parse the cpu
14924 specified in STR and throw errors if appropriate. Put the
361fb3ee
KT
14925 result, if it is valid, in RES. Return whether the option is
14926 valid. */
0cfff2a1 14927
361fb3ee 14928static bool
0cfff2a1
KT
14929aarch64_validate_mtune (const char *str, const struct processor **res)
14930{
14931 enum aarch64_parse_opt_result parse_res
14932 = aarch64_parse_tune (str, res);
14933
14934 if (parse_res == AARCH64_PARSE_OK)
361fb3ee 14935 return true;
0cfff2a1
KT
14936
14937 switch (parse_res)
14938 {
14939 case AARCH64_PARSE_MISSING_ARG:
fb241da2 14940 error ("missing cpu name in %<-mtune=%s%>", str);
0cfff2a1
KT
14941 break;
14942 case AARCH64_PARSE_INVALID_ARG:
a3f9f006 14943 error ("unknown value %qs for %<-mtune%>", str);
01f44038 14944 aarch64_print_hint_for_core (str);
0cfff2a1
KT
14945 break;
14946 default:
14947 gcc_unreachable ();
14948 }
361fb3ee
KT
14949 return false;
14950}
14951
14952/* Return the CPU corresponding to the enum CPU.
14953 If it doesn't specify a cpu, return the default. */
14954
14955static const struct processor *
14956aarch64_get_tune_cpu (enum aarch64_processor cpu)
14957{
14958 if (cpu != aarch64_none)
14959 return &all_cores[cpu];
14960
14961 /* The & 0x3f is to extract the bottom 6 bits that encode the
14962 default cpu as selected by the --with-cpu GCC configure option
14963 in config.gcc.
14964 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
14965 flags mechanism should be reworked to make it more sane. */
14966 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14967}
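/* Editor's note: TARGET_CPU_DEFAULT packs the configure-time default core
   in its low 6 bits and the default ISA flags in the remaining bits; hence
   the "& 0x3f" above and the ">> 6" in aarch64_override_options below.
   Illustrative decode, not part of aarch64.c:

     cpu index  = TARGET_CPU_DEFAULT & 0x3f;
     isa flags  = TARGET_CPU_DEFAULT >> 6;  */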
14968
14969/* Return the architecture corresponding to the enum ARCH.
14970 If it doesn't specify a valid architecture, return the default. */
14971
14972static const struct processor *
14973aarch64_get_arch (enum aarch64_arch arch)
14974{
14975 if (arch != aarch64_no_arch)
14976 return &all_architectures[arch];
14977
14978 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14979
14980 return &all_architectures[cpu->arch];
0cfff2a1
KT
14981}
14982
43cacb12
RS
14983/* Return the VG value associated with -msve-vector-bits= value VALUE. */
14984
14985static poly_uint16
14986aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
14987{
9b070057
RS
14988 /* 128-bit SVE and Advanced SIMD modes use different register layouts
14989 on big-endian targets, so we would need to forbid subregs that convert
14990 from one to the other. By default a reinterpret sequence would then
14991 involve a store to memory in one mode and a load back in the other.
14992 Even if we optimize that sequence using reverse instructions,
14993 it would still be a significant potential overhead.
14994
14995 For now, it seems better to generate length-agnostic code for that
14996 case instead. */
14997 if (value == SVE_SCALABLE
14998 || (value == SVE_128 && BYTES_BIG_ENDIAN))
43cacb12
RS
14999 return poly_uint16 (2, 2);
15000 else
15001 return (int) value / 64;
15002}
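/* Editor's note: illustrative mapping of -msve-vector-bits= values to VG
   (the number of 64-bit granules), per the conversion above:

     scalable                 -> poly_uint16 (2, 2)  (length-agnostic)
     128 on big-endian        -> poly_uint16 (2, 2)  (see comment above)
     128 on little-endian     -> 2
     256                      -> 4
     512                      -> 8
     2048                     -> 32  */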
15003
0cfff2a1
KT
15004/* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
 15005 and is used to parse the -m{cpu,tune,arch} strings and set up the initial
15006 tuning structs. In particular it must set selected_tune and
15007 aarch64_isa_flags that define the available ISA features and tuning
15008 decisions. It must also set selected_arch as this will be used to
15009 output the .arch asm tags for each function. */
15010
15011static void
15012aarch64_override_options (void)
15013{
28108a53
MM
15014 uint64_t cpu_isa = 0;
15015 uint64_t arch_isa = 0;
0cfff2a1
KT
15016 aarch64_isa_flags = 0;
15017
361fb3ee
KT
15018 bool valid_cpu = true;
15019 bool valid_tune = true;
15020 bool valid_arch = true;
15021
0cfff2a1
KT
15022 selected_cpu = NULL;
15023 selected_arch = NULL;
15024 selected_tune = NULL;
15025
a9ba2a9b
MM
15026 if (aarch64_harden_sls_string)
15027 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
15028
efac62a3
ST
15029 if (aarch64_branch_protection_string)
15030 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
15031
0cfff2a1
KT
15032 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
15033 If either of -march or -mtune is given, they override their
15034 respective component of -mcpu. */
15035 if (aarch64_cpu_string)
361fb3ee
KT
15036 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
15037 &cpu_isa);
0cfff2a1
KT
15038
15039 if (aarch64_arch_string)
361fb3ee
KT
15040 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
15041 &arch_isa);
0cfff2a1
KT
15042
15043 if (aarch64_tune_string)
361fb3ee 15044 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
43e9d192 15045
6881e3c1
OH
15046#ifdef SUBTARGET_OVERRIDE_OPTIONS
15047 SUBTARGET_OVERRIDE_OPTIONS;
15048#endif
15049
43e9d192
IB
15050 /* If the user did not specify a processor, choose the default
15051 one for them. This will be the CPU set during configuration using
a3cd0246 15052 --with-cpu, otherwise it is "generic". */
43e9d192
IB
15053 if (!selected_cpu)
15054 {
0cfff2a1
KT
15055 if (selected_arch)
15056 {
15057 selected_cpu = &all_cores[selected_arch->ident];
15058 aarch64_isa_flags = arch_isa;
361fb3ee 15059 explicit_arch = selected_arch->arch;
0cfff2a1
KT
15060 }
15061 else
15062 {
361fb3ee
KT
15063 /* Get default configure-time CPU. */
15064 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
0cfff2a1
KT
15065 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
15066 }
361fb3ee
KT
15067
15068 if (selected_tune)
15069 explicit_tune_core = selected_tune->ident;
0cfff2a1
KT
15070 }
15071 /* If both -mcpu and -march are specified check that they are architecturally
15072 compatible, warn if they're not and prefer the -march ISA flags. */
15073 else if (selected_arch)
15074 {
15075 if (selected_arch->arch != selected_cpu->arch)
15076 {
a3f9f006 15077 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
349297b6
JH
15078 aarch64_cpu_string,
15079 aarch64_arch_string);
0cfff2a1
KT
15080 }
15081 aarch64_isa_flags = arch_isa;
361fb3ee
KT
15082 explicit_arch = selected_arch->arch;
15083 explicit_tune_core = selected_tune ? selected_tune->ident
15084 : selected_cpu->ident;
0cfff2a1
KT
15085 }
15086 else
15087 {
15088 /* -mcpu but no -march. */
15089 aarch64_isa_flags = cpu_isa;
361fb3ee
KT
15090 explicit_tune_core = selected_tune ? selected_tune->ident
15091 : selected_cpu->ident;
15092 gcc_assert (selected_cpu);
15093 selected_arch = &all_architectures[selected_cpu->arch];
15094 explicit_arch = selected_arch->arch;
43e9d192
IB
15095 }
15096
0cfff2a1
KT
 15097 /* Set the arch as well, as we will need it when outputting
15098 the .arch directive in assembly. */
15099 if (!selected_arch)
15100 {
15101 gcc_assert (selected_cpu);
15102 selected_arch = &all_architectures[selected_cpu->arch];
15103 }
43e9d192 15104
43e9d192 15105 if (!selected_tune)
3edaf26d 15106 selected_tune = selected_cpu;
43e9d192 15107
c7ff4f0f
SD
15108 if (aarch64_enable_bti == 2)
15109 {
15110#ifdef TARGET_ENABLE_BTI
15111 aarch64_enable_bti = 1;
15112#else
15113 aarch64_enable_bti = 0;
15114#endif
15115 }
15116
15117 /* Return address signing is currently not supported for ILP32 targets. For
15118 LP64 targets use the configured option in the absence of a command-line
15119 option for -mbranch-protection. */
15120 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
15121 {
15122#ifdef TARGET_ENABLE_PAC_RET
15123 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
c7ff4f0f
SD
15124#else
15125 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
15126#endif
15127 }
15128
0cfff2a1
KT
15129#ifndef HAVE_AS_MABI_OPTION
15130 /* The compiler may have been configured with 2.23.* binutils, which does
15131 not have support for ILP32. */
15132 if (TARGET_ILP32)
a3f9f006 15133 error ("assembler does not support %<-mabi=ilp32%>");
0cfff2a1 15134#endif
43e9d192 15135
43cacb12
RS
15136 /* Convert -msve-vector-bits to a VG count. */
15137 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
15138
db58fd89 15139 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
a3f9f006 15140 sorry ("return address signing is only supported for %<-mabi=lp64%>");
db58fd89 15141
361fb3ee
KT
15142 /* Make sure we properly set up the explicit options. */
15143 if ((aarch64_cpu_string && valid_cpu)
15144 || (aarch64_tune_string && valid_tune))
15145 gcc_assert (explicit_tune_core != aarch64_none);
15146
15147 if ((aarch64_cpu_string && valid_cpu)
15148 || (aarch64_arch_string && valid_arch))
15149 gcc_assert (explicit_arch != aarch64_no_arch);
15150
5f7dbaa0
RE
15151 /* The pass to insert speculation tracking runs before
15152 shrink-wrapping and the latter does not know how to update the
15153 tracking status. So disable it in this case. */
15154 if (aarch64_track_speculation)
15155 flag_shrink_wrap = 0;
15156
0cfff2a1
KT
15157 aarch64_override_options_internal (&global_options);
15158
15159 /* Save these options as the default ones in case we push and pop them later
15160 while processing functions with potential target attributes. */
15161 target_option_default_node = target_option_current_node
ba948b37 15162 = build_target_option_node (&global_options, &global_options_set);
43e9d192
IB
15163}
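/* Editor's note: an illustrative summary of the -mcpu/-march/-mtune
   precedence implemented above (a reading aid, not normative
   documentation):

     -mcpu=X only         arch and tuning both follow X
     -mcpu=X -march=Y     ISA flags follow Y, tuning follows X; a warning is
			  issued if X and Y name different architectures
     -march=Y only        the CPU defaults to a representative core for Y
			  and the ISA flags follow Y
     nothing given        the configure-time --with-cpu default is used,
			  otherwise "generic"
     -mtune=Z             overrides the tuning choice in all of the above  */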
15164
15165/* Implement targetm.override_options_after_change. */
15166
15167static void
15168aarch64_override_options_after_change (void)
15169{
0cfff2a1 15170 aarch64_override_options_after_change_1 (&global_options);
43e9d192
IB
15171}
15172
29a14a1a
MK
15173/* Implement the TARGET_OFFLOAD_OPTIONS hook. */
15174static char *
15175aarch64_offload_options (void)
15176{
15177 if (TARGET_ILP32)
15178 return xstrdup ("-foffload-abi=ilp32");
15179 else
15180 return xstrdup ("-foffload-abi=lp64");
15181}
15182
43e9d192
IB
15183static struct machine_function *
15184aarch64_init_machine_status (void)
15185{
15186 struct machine_function *machine;
766090c2 15187 machine = ggc_cleared_alloc<machine_function> ();
43e9d192
IB
15188 return machine;
15189}
15190
15191void
15192aarch64_init_expanders (void)
15193{
15194 init_machine_status = aarch64_init_machine_status;
15195}
15196
15197/* A checking mechanism for the implementation of the various code models. */
15198static void
0cfff2a1 15199initialize_aarch64_code_model (struct gcc_options *opts)
43e9d192 15200{
6c0ab626
X
15201 aarch64_cmodel = opts->x_aarch64_cmodel_var;
15202 switch (opts->x_aarch64_cmodel_var)
15203 {
15204 case AARCH64_CMODEL_TINY:
15205 if (opts->x_flag_pic)
15206 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
15207 break;
15208 case AARCH64_CMODEL_SMALL:
15209 if (opts->x_flag_pic)
15210 {
34ecdb0f 15211#ifdef HAVE_AS_SMALL_PIC_RELOCS
6c0ab626
X
15212 aarch64_cmodel = (flag_pic == 2
15213 ? AARCH64_CMODEL_SMALL_PIC
15214 : AARCH64_CMODEL_SMALL_SPIC);
34ecdb0f 15215#else
6c0ab626 15216 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
34ecdb0f 15217#endif
6c0ab626
X
15218 }
15219 break;
15220 case AARCH64_CMODEL_LARGE:
15221 if (opts->x_flag_pic)
15222 sorry ("code model %qs with %<-f%s%>", "large",
15223 opts->x_flag_pic > 1 ? "PIC" : "pic");
15224 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
15225 sorry ("code model %qs not supported in ilp32 mode", "large");
15226 break;
15227 case AARCH64_CMODEL_TINY_PIC:
15228 case AARCH64_CMODEL_SMALL_PIC:
15229 case AARCH64_CMODEL_SMALL_SPIC:
15230 gcc_unreachable ();
15231 }
43e9d192
IB
15232}
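/* Editor's note: illustrative effect of the switch above when PIC is in
   use (not normative documentation):

     -mcmodel=tiny  with -fpic/-fPIC  -> AARCH64_CMODEL_TINY_PIC
     -mcmodel=small with -fPIC        -> AARCH64_CMODEL_SMALL_PIC
     -mcmodel=small with -fpic        -> AARCH64_CMODEL_SMALL_SPIC when the
					 assembler has small-PIC relocations,
					 otherwise AARCH64_CMODEL_SMALL_PIC
     -mcmodel=large with any PIC flag -> rejected with sorry ()  */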
15233
361fb3ee
KT
15234/* Implement TARGET_OPTION_SAVE. */
15235
15236static void
ba948b37
JJ
15237aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts,
15238 struct gcc_options */* opts_set */)
361fb3ee
KT
15239{
15240 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
efac62a3
ST
15241 ptr->x_aarch64_branch_protection_string
15242 = opts->x_aarch64_branch_protection_string;
361fb3ee
KT
15243}
15244
15245/* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
15246 using the information saved in PTR. */
15247
15248static void
ba948b37
JJ
15249aarch64_option_restore (struct gcc_options *opts,
15250 struct gcc_options */* opts_set */,
15251 struct cl_target_option *ptr)
361fb3ee
KT
15252{
15253 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
15254 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
15255 opts->x_explicit_arch = ptr->x_explicit_arch;
15256 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
15257 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
efac62a3
ST
15258 opts->x_aarch64_branch_protection_string
15259 = ptr->x_aarch64_branch_protection_string;
15260 if (opts->x_aarch64_branch_protection_string)
15261 {
15262 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
15263 NULL);
15264 }
361fb3ee
KT
15265
15266 aarch64_override_options_internal (opts);
15267}
15268
15269/* Implement TARGET_OPTION_PRINT. */
15270
15271static void
15272aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
15273{
15274 const struct processor *cpu
15275 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
28108a53 15276 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
361fb3ee 15277 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
054b4005 15278 std::string extension
04a99ebe 15279 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
361fb3ee
KT
15280
15281 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
054b4005
JG
15282 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
15283 arch->name, extension.c_str ());
361fb3ee
KT
15284}
15285
d78006d9
KT
15286static GTY(()) tree aarch64_previous_fndecl;
15287
e4ea20c8
KT
15288void
15289aarch64_reset_previous_fndecl (void)
15290{
15291 aarch64_previous_fndecl = NULL;
15292}
15293
acfc1ac1
KT
15294/* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
15295 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
15296 make sure optab availability predicates are recomputed when necessary. */
15297
15298void
15299aarch64_save_restore_target_globals (tree new_tree)
15300{
15301 if (TREE_TARGET_GLOBALS (new_tree))
15302 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
15303 else if (new_tree == target_option_default_node)
15304 restore_target_globals (&default_target_globals);
15305 else
15306 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
15307}
15308
d78006d9
KT
15309/* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
15310 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
15311 of the function, if such exists. This function may be called multiple
15312 times on a single function so use aarch64_previous_fndecl to avoid
15313 setting up identical state. */
15314
15315static void
15316aarch64_set_current_function (tree fndecl)
15317{
acfc1ac1
KT
15318 if (!fndecl || fndecl == aarch64_previous_fndecl)
15319 return;
15320
d78006d9
KT
15321 tree old_tree = (aarch64_previous_fndecl
15322 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
15323 : NULL_TREE);
15324
acfc1ac1 15325 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
d78006d9 15326
acfc1ac1
KT
15327 /* If current function has no attributes but the previous one did,
15328 use the default node. */
15329 if (!new_tree && old_tree)
15330 new_tree = target_option_default_node;
d78006d9 15331
acfc1ac1
KT
15332 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
15333 the default have been handled by aarch64_save_restore_target_globals from
15334 aarch64_pragma_target_parse. */
15335 if (old_tree == new_tree)
15336 return;
d78006d9 15337
acfc1ac1 15338 aarch64_previous_fndecl = fndecl;
6e17a23b 15339
acfc1ac1 15340 /* First set the target options. */
ba948b37
JJ
15341 cl_target_option_restore (&global_options, &global_options_set,
15342 TREE_TARGET_OPTION (new_tree));
6e17a23b 15343
acfc1ac1 15344 aarch64_save_restore_target_globals (new_tree);
d78006d9 15345}
361fb3ee 15346
5a2c8331
KT
15347/* Enum describing the various ways we can handle attributes.
15348 In many cases we can reuse the generic option handling machinery. */
15349
15350enum aarch64_attr_opt_type
15351{
15352 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
15353 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
15354 aarch64_attr_enum, /* Attribute sets an enum variable. */
15355 aarch64_attr_custom /* Attribute requires a custom handling function. */
15356};
15357
15358/* All the information needed to handle a target attribute.
15359 NAME is the name of the attribute.
9c582551 15360 ATTR_TYPE specifies the type of behavior of the attribute as described
5a2c8331
KT
15361 in the definition of enum aarch64_attr_opt_type.
15362 ALLOW_NEG is true if the attribute supports a "no-" form.
ab93e9b7
SE
15363 HANDLER is the function that takes the attribute string as an argument
15364 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
5a2c8331 15365 OPT_NUM is the enum specifying the option that the attribute modifies.
9c582551 15366 This is needed for attributes that mirror the behavior of a command-line
5a2c8331
KT
15367 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
15368 aarch64_attr_enum. */
15369
15370struct aarch64_attribute_info
15371{
15372 const char *name;
15373 enum aarch64_attr_opt_type attr_type;
15374 bool allow_neg;
ab93e9b7 15375 bool (*handler) (const char *);
5a2c8331
KT
15376 enum opt_code opt_num;
15377};
15378
ab93e9b7 15379/* Handle the ARCH_STR argument to the arch= target attribute. */
5a2c8331
KT
15380
15381static bool
ab93e9b7 15382aarch64_handle_attr_arch (const char *str)
5a2c8331
KT
15383{
15384 const struct processor *tmp_arch = NULL;
c7887347 15385 std::string invalid_extension;
5a2c8331 15386 enum aarch64_parse_opt_result parse_res
c7887347 15387 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
5a2c8331
KT
15388
15389 if (parse_res == AARCH64_PARSE_OK)
15390 {
15391 gcc_assert (tmp_arch);
15392 selected_arch = tmp_arch;
15393 explicit_arch = selected_arch->arch;
15394 return true;
15395 }
15396
15397 switch (parse_res)
15398 {
15399 case AARCH64_PARSE_MISSING_ARG:
ab93e9b7 15400 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
5a2c8331
KT
15401 break;
15402 case AARCH64_PARSE_INVALID_ARG:
ab93e9b7 15403 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
01f44038 15404 aarch64_print_hint_for_arch (str);
5a2c8331
KT
15405 break;
15406 case AARCH64_PARSE_INVALID_FEATURE:
c7887347
ML
15407 error ("invalid feature modifier %s of value (\"%s\") in "
15408 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15409 aarch64_print_hint_for_extensions (invalid_extension);
5a2c8331
KT
15410 break;
15411 default:
15412 gcc_unreachable ();
15413 }
15414
15415 return false;
15416}
15417
ab93e9b7 15418/* Handle the argument CPU_STR to the cpu= target attribute. */
5a2c8331
KT
15419
15420static bool
ab93e9b7 15421aarch64_handle_attr_cpu (const char *str)
5a2c8331
KT
15422{
15423 const struct processor *tmp_cpu = NULL;
c7887347 15424 std::string invalid_extension;
5a2c8331 15425 enum aarch64_parse_opt_result parse_res
c7887347 15426 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
5a2c8331
KT
15427
15428 if (parse_res == AARCH64_PARSE_OK)
15429 {
15430 gcc_assert (tmp_cpu);
15431 selected_tune = tmp_cpu;
15432 explicit_tune_core = selected_tune->ident;
15433
15434 selected_arch = &all_architectures[tmp_cpu->arch];
15435 explicit_arch = selected_arch->arch;
15436 return true;
15437 }
15438
15439 switch (parse_res)
15440 {
15441 case AARCH64_PARSE_MISSING_ARG:
ab93e9b7 15442 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
5a2c8331
KT
15443 break;
15444 case AARCH64_PARSE_INVALID_ARG:
ab93e9b7 15445 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
01f44038 15446 aarch64_print_hint_for_core (str);
5a2c8331
KT
15447 break;
15448 case AARCH64_PARSE_INVALID_FEATURE:
c7887347
ML
15449 error ("invalid feature modifier %s of value (\"%s\") in "
15450 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15451 aarch64_print_hint_for_extensions (invalid_extension);
5a2c8331
KT
15452 break;
15453 default:
15454 gcc_unreachable ();
15455 }
15456
15457 return false;
15458}
15459
efac62a3
ST
15460/* Handle the argument STR to the branch-protection= attribute. */
15461
15462 static bool
15463 aarch64_handle_attr_branch_protection (const char* str)
15464 {
81e40f3a 15465 char *err_str = (char *) xmalloc (strlen (str) + 1);
efac62a3
ST
15466 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
15467 &err_str);
15468 bool success = false;
15469 switch (res)
15470 {
15471 case AARCH64_PARSE_MISSING_ARG:
15472 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
15473 " attribute");
15474 break;
15475 case AARCH64_PARSE_INVALID_ARG:
15476 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
15477 "=\")%> pragma or attribute", err_str);
15478 break;
15479 case AARCH64_PARSE_OK:
15480 success = true;
15481 /* Fall through. */
15482 case AARCH64_PARSE_INVALID_FEATURE:
15483 break;
15484 default:
15485 gcc_unreachable ();
15486 }
15487 free (err_str);
15488 return success;
15489 }
15490
ab93e9b7 15491/* Handle the argument STR to the tune= target attribute. */
5a2c8331
KT
15492
15493static bool
ab93e9b7 15494aarch64_handle_attr_tune (const char *str)
5a2c8331
KT
15495{
15496 const struct processor *tmp_tune = NULL;
15497 enum aarch64_parse_opt_result parse_res
15498 = aarch64_parse_tune (str, &tmp_tune);
15499
15500 if (parse_res == AARCH64_PARSE_OK)
15501 {
15502 gcc_assert (tmp_tune);
15503 selected_tune = tmp_tune;
15504 explicit_tune_core = selected_tune->ident;
15505 return true;
15506 }
15507
15508 switch (parse_res)
15509 {
15510 case AARCH64_PARSE_INVALID_ARG:
ab93e9b7 15511 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
01f44038 15512 aarch64_print_hint_for_core (str);
5a2c8331
KT
15513 break;
15514 default:
15515 gcc_unreachable ();
15516 }
15517
15518 return false;
15519}
15520
15521/* Parse an architecture extensions target attribute string specified in STR.
15522 For example "+fp+nosimd". Show any errors if needed. Return TRUE
15523 if successful. Update aarch64_isa_flags to reflect the ISA features
ab93e9b7 15524 modified. */
5a2c8331
KT
15525
15526static bool
ab93e9b7 15527aarch64_handle_attr_isa_flags (char *str)
5a2c8331
KT
15528{
15529 enum aarch64_parse_opt_result parse_res;
28108a53 15530 uint64_t isa_flags = aarch64_isa_flags;
5a2c8331 15531
e4ea20c8
KT
15532 /* We allow "+nothing" in the beginning to clear out all architectural
15533 features if the user wants to handpick specific features. */
15534 if (strncmp ("+nothing", str, 8) == 0)
15535 {
15536 isa_flags = 0;
15537 str += 8;
15538 }
15539
c7887347
ML
15540 std::string invalid_extension;
15541 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
5a2c8331
KT
15542
15543 if (parse_res == AARCH64_PARSE_OK)
15544 {
15545 aarch64_isa_flags = isa_flags;
15546 return true;
15547 }
15548
15549 switch (parse_res)
15550 {
15551 case AARCH64_PARSE_MISSING_ARG:
ab93e9b7 15552 error ("missing value in %<target()%> pragma or attribute");
5a2c8331
KT
15553 break;
15554
15555 case AARCH64_PARSE_INVALID_FEATURE:
c7887347
ML
15556 error ("invalid feature modifier %s of value (\"%s\") in "
15557 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
5a2c8331
KT
15558 break;
15559
15560 default:
15561 gcc_unreachable ();
15562 }
15563
15564 return false;
15565}
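/* Editor's note: illustrative attribute strings handled above; the
   extension names are the usual aarch64 modifiers and are assumed, not
   restated from this file:

     __attribute__ ((target ("+crc")))           adds CRC on top of the
						 current aarch64_isa_flags;
     __attribute__ ((target ("+nothing+simd")))  clears every extension and
						 then enables SIMD only.  */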
15566
15567/* The target attributes that we support. On top of these we also support just
15568 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
15569 handled explicitly in aarch64_process_one_target_attr. */
15570
15571static const struct aarch64_attribute_info aarch64_attributes[] =
15572{
15573 { "general-regs-only", aarch64_attr_mask, false, NULL,
15574 OPT_mgeneral_regs_only },
15575 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
15576 OPT_mfix_cortex_a53_835769 },
48bb1a55
CL
15577 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
15578 OPT_mfix_cortex_a53_843419 },
5a2c8331 15579 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
675d044c 15580 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
5a2c8331
KT
15581 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
15582 OPT_momit_leaf_frame_pointer },
15583 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
15584 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
15585 OPT_march_ },
15586 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
15587 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
15588 OPT_mtune_ },
efac62a3
ST
15589 { "branch-protection", aarch64_attr_custom, false,
15590 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
db58fd89
JW
15591 { "sign-return-address", aarch64_attr_enum, false, NULL,
15592 OPT_msign_return_address_ },
9e02b45f
ML
15593 { "outline-atomics", aarch64_attr_bool, true, NULL,
15594 OPT_moutline_atomics},
5a2c8331
KT
15595 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
15596};
15597
15598/* Parse ARG_STR which contains the definition of one target attribute.
ab93e9b7 15599 Show appropriate errors if any or return true if the attribute is valid. */
5a2c8331
KT
15600
15601static bool
ab93e9b7 15602aarch64_process_one_target_attr (char *arg_str)
5a2c8331
KT
15603{
15604 bool invert = false;
15605
15606 size_t len = strlen (arg_str);
15607
15608 if (len == 0)
15609 {
ab93e9b7 15610 error ("malformed %<target()%> pragma or attribute");
5a2c8331
KT
15611 return false;
15612 }
15613
15614 char *str_to_check = (char *) alloca (len + 1);
15615 strcpy (str_to_check, arg_str);
15616
5a2c8331
KT
15617 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
15618 It is easier to detect and handle it explicitly here rather than going
15619 through the machinery for the rest of the target attributes in this
15620 function. */
15621 if (*str_to_check == '+')
ab93e9b7 15622 return aarch64_handle_attr_isa_flags (str_to_check);
5a2c8331
KT
15623
15624 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
15625 {
15626 invert = true;
15627 str_to_check += 3;
15628 }
15629 char *arg = strchr (str_to_check, '=');
15630
15631 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
15632 and point ARG to "foo". */
15633 if (arg)
15634 {
15635 *arg = '\0';
15636 arg++;
15637 }
15638 const struct aarch64_attribute_info *p_attr;
16d12992 15639 bool found = false;
5a2c8331
KT
15640 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
15641 {
15642 /* If the names don't match up, or the user has given an argument
15643 to an attribute that doesn't accept one, or didn't give an argument
15644 to an attribute that expects one, fail to match. */
15645 if (strcmp (str_to_check, p_attr->name) != 0)
15646 continue;
15647
16d12992 15648 found = true;
5a2c8331
KT
15649 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
15650 || p_attr->attr_type == aarch64_attr_enum;
15651
15652 if (attr_need_arg_p ^ (arg != NULL))
15653 {
ab93e9b7 15654 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
5a2c8331
KT
15655 return false;
15656 }
15657
15658 /* If the name matches but the attribute does not allow "no-" versions
15659 then we can't match. */
15660 if (invert && !p_attr->allow_neg)
15661 {
ab93e9b7 15662 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
5a2c8331
KT
15663 return false;
15664 }
15665
15666 switch (p_attr->attr_type)
15667 {
15668 /* Has a custom handler registered.
15669 For example, cpu=, arch=, tune=. */
15670 case aarch64_attr_custom:
15671 gcc_assert (p_attr->handler);
ab93e9b7 15672 if (!p_attr->handler (arg))
5a2c8331
KT
15673 return false;
15674 break;
15675
15676 /* Either set or unset a boolean option. */
15677 case aarch64_attr_bool:
15678 {
15679 struct cl_decoded_option decoded;
15680
15681 generate_option (p_attr->opt_num, NULL, !invert,
15682 CL_TARGET, &decoded);
15683 aarch64_handle_option (&global_options, &global_options_set,
15684 &decoded, input_location);
15685 break;
15686 }
15687 /* Set or unset a bit in the target_flags. aarch64_handle_option
15688 should know what mask to apply given the option number. */
15689 case aarch64_attr_mask:
15690 {
15691 struct cl_decoded_option decoded;
15692 /* We only need to specify the option number.
15693 aarch64_handle_option will know which mask to apply. */
15694 decoded.opt_index = p_attr->opt_num;
15695 decoded.value = !invert;
15696 aarch64_handle_option (&global_options, &global_options_set,
15697 &decoded, input_location);
15698 break;
15699 }
15700 /* Use the option setting machinery to set an option to an enum. */
15701 case aarch64_attr_enum:
15702 {
15703 gcc_assert (arg);
15704 bool valid;
15705 int value;
15706 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
15707 &value, CL_TARGET);
15708 if (valid)
15709 {
15710 set_option (&global_options, NULL, p_attr->opt_num, value,
15711 NULL, DK_UNSPECIFIED, input_location,
15712 global_dc);
15713 }
15714 else
15715 {
ab93e9b7 15716 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
5a2c8331
KT
15717 }
15718 break;
15719 }
15720 default:
15721 gcc_unreachable ();
15722 }
15723 }
15724
16d12992
KT
15725 /* If we reached here we either have found an attribute and validated
15726 it or didn't match any. If we matched an attribute but its arguments
15727 were malformed we will have returned false already. */
15728 return found;
5a2c8331
KT
15729}
15730
15731/* Count how many times the character C appears in
15732 NULL-terminated string STR. */
15733
15734static unsigned int
15735num_occurences_in_str (char c, char *str)
15736{
15737 unsigned int res = 0;
15738 while (*str != '\0')
15739 {
15740 if (*str == c)
15741 res++;
15742
15743 str++;
15744 }
15745
15746 return res;
15747}
15748
15749/* Parse the tree in ARGS that contains the target attribute information
ab93e9b7 15750 and update the global target options space. */
5a2c8331
KT
15751
15752bool
ab93e9b7 15753aarch64_process_target_attr (tree args)
5a2c8331
KT
15754{
15755 if (TREE_CODE (args) == TREE_LIST)
15756 {
15757 do
15758 {
15759 tree head = TREE_VALUE (args);
15760 if (head)
15761 {
ab93e9b7 15762 if (!aarch64_process_target_attr (head))
5a2c8331
KT
15763 return false;
15764 }
15765 args = TREE_CHAIN (args);
15766 } while (args);
15767
15768 return true;
15769 }
3b6cb9e3
ML
15770
15771 if (TREE_CODE (args) != STRING_CST)
15772 {
15773 error ("attribute %<target%> argument not a string");
15774 return false;
15775 }
5a2c8331
KT
15776
15777 size_t len = strlen (TREE_STRING_POINTER (args));
15778 char *str_to_check = (char *) alloca (len + 1);
15779 strcpy (str_to_check, TREE_STRING_POINTER (args));
15780
15781 if (len == 0)
15782 {
ab93e9b7 15783 error ("malformed %<target()%> pragma or attribute");
5a2c8331
KT
15784 return false;
15785 }
15786
15787 /* Used to catch empty spaces between commas i.e.
15788 attribute ((target ("attr1,,attr2"))). */
15789 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
15790
15791 /* Handle multiple target attributes separated by ','. */
7185a4eb 15792 char *token = strtok_r (str_to_check, ",", &str_to_check);
5a2c8331
KT
15793
15794 unsigned int num_attrs = 0;
15795 while (token)
15796 {
15797 num_attrs++;
ab93e9b7 15798 if (!aarch64_process_one_target_attr (token))
5a2c8331 15799 {
ab93e9b7 15800 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
5a2c8331
KT
15801 return false;
15802 }
15803
7185a4eb 15804 token = strtok_r (NULL, ",", &str_to_check);
5a2c8331
KT
15805 }
15806
15807 if (num_attrs != num_commas + 1)
15808 {
ab93e9b7 15809 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
5a2c8331
KT
15810 return false;
15811 }
15812
15813 return true;
15814}
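/* Editor's note: illustrative forms accepted by the walker above:

     __attribute__ ((target ("arch=armv8.2-a,no-omit-leaf-frame-pointer")))

   is split on ',' and each token is handed to
   aarch64_process_one_target_attr; a string such as "attr1,,attr2" is
   rejected because the number of parsed tokens does not match the comma
   count plus one.  */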
15815
15816/* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
15817 process attribute ((target ("..."))). */
15818
15819static bool
15820aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
15821{
15822 struct cl_target_option cur_target;
15823 bool ret;
15824 tree old_optimize;
15825 tree new_target, new_optimize;
15826 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
91d0e8de
KT
15827
15828 /* If what we're processing is the current pragma string then the
15829 target option node is already stored in target_option_current_node
15830 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
15831 having to re-parse the string. This is especially useful to keep
15832 arm_neon.h compile times down since that header contains a lot
15833 of intrinsics enclosed in pragmas. */
15834 if (!existing_target && args == current_target_pragma)
15835 {
15836 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
15837 return true;
15838 }
5a2c8331
KT
15839 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15840
ba948b37
JJ
15841 old_optimize
15842 = build_optimization_node (&global_options, &global_options_set);
5a2c8331
KT
15843 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15844
15845 /* If the function changed the optimization levels as well as setting
15846 target options, start with the optimizations specified. */
15847 if (func_optimize && func_optimize != old_optimize)
ba948b37 15848 cl_optimization_restore (&global_options, &global_options_set,
5a2c8331
KT
15849 TREE_OPTIMIZATION (func_optimize));
15850
15851 /* Save the current target options to restore at the end. */
ba948b37 15852 cl_target_option_save (&cur_target, &global_options, &global_options_set);
5a2c8331
KT
15853
15854 /* If fndecl already has some target attributes applied to it, unpack
15855 them so that we add this attribute on top of them, rather than
15856 overwriting them. */
15857 if (existing_target)
15858 {
15859 struct cl_target_option *existing_options
15860 = TREE_TARGET_OPTION (existing_target);
15861
15862 if (existing_options)
ba948b37
JJ
15863 cl_target_option_restore (&global_options, &global_options_set,
15864 existing_options);
5a2c8331
KT
15865 }
15866 else
ba948b37
JJ
15867 cl_target_option_restore (&global_options, &global_options_set,
15868 TREE_TARGET_OPTION (target_option_current_node));
5a2c8331 15869
ab93e9b7 15870 ret = aarch64_process_target_attr (args);
5a2c8331
KT
15871
15872 /* Set up any additional state. */
15873 if (ret)
15874 {
15875 aarch64_override_options_internal (&global_options);
e95a988a
KT
15876 /* Initialize SIMD builtins if we haven't already.
15877 Set current_target_pragma to NULL for the duration so that
15878 the builtin initialization code doesn't try to tag the functions
15879 being built with the attributes specified by any current pragma, thus
15880 going into an infinite recursion. */
15881 if (TARGET_SIMD)
15882 {
15883 tree saved_current_target_pragma = current_target_pragma;
15884 current_target_pragma = NULL;
15885 aarch64_init_simd_builtins ();
15886 current_target_pragma = saved_current_target_pragma;
15887 }
ba948b37
JJ
15888 new_target = build_target_option_node (&global_options,
15889 &global_options_set);
5a2c8331
KT
15890 }
15891 else
15892 new_target = NULL;
15893
ba948b37
JJ
15894 new_optimize = build_optimization_node (&global_options,
15895 &global_options_set);
5a2c8331
KT
15896
15897 if (fndecl && ret)
15898 {
15899 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
15900
15901 if (old_optimize != new_optimize)
15902 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
15903 }
15904
ba948b37 15905 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
5a2c8331
KT
15906
15907 if (old_optimize != new_optimize)
ba948b37 15908 cl_optimization_restore (&global_options, &global_options_set,
5a2c8331
KT
15909 TREE_OPTIMIZATION (old_optimize));
15910 return ret;
15911}
15912
1fd8d40c
KT
15913/* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
15914 tri-bool options (yes, no, don't care) and the default value is
15915 DEF, determine whether to reject inlining. */
15916
15917static bool
15918aarch64_tribools_ok_for_inlining_p (int caller, int callee,
15919 int dont_care, int def)
15920{
15921 /* If the callee doesn't care, always allow inlining. */
15922 if (callee == dont_care)
15923 return true;
15924
15925 /* If the caller doesn't care, always allow inlining. */
15926 if (caller == dont_care)
15927 return true;
15928
15929 /* Otherwise, allow inlining if either the callee and caller values
15930 agree, or if the callee is using the default value. */
15931 return (callee == caller || callee == def);
15932}
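/* Editor's note: a restatement of the tri-bool rule above: inlining is
   permitted when either side "doesn't care", when caller and callee agree,
   or when the callee uses the default value; it is rejected only when both
   sides express a preference, they differ, and the callee's choice is not
   the default.  */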
15933
15934/* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
15935 to inline CALLEE into CALLER based on target-specific info.
15936 Make sure that the caller and callee have compatible architectural
15937 features. Then go through the other possible target attributes
15938 and see if they can block inlining. Try not to reject always_inline
15939 callees unless they are incompatible architecturally. */
15940
15941static bool
15942aarch64_can_inline_p (tree caller, tree callee)
15943{
15944 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
15945 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
15946
1fd8d40c
KT
15947 struct cl_target_option *caller_opts
15948 = TREE_TARGET_OPTION (caller_tree ? caller_tree
15949 : target_option_default_node);
15950
675d044c
SD
15951 struct cl_target_option *callee_opts
15952 = TREE_TARGET_OPTION (callee_tree ? callee_tree
15953 : target_option_default_node);
1fd8d40c
KT
15954
15955 /* Callee's ISA flags should be a subset of the caller's. */
15956 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
15957 != callee_opts->x_aarch64_isa_flags)
15958 return false;
15959
15960 /* Allow non-strict aligned functions inlining into strict
15961 aligned ones. */
15962 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
15963 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
15964 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
15965 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
15966 return false;
15967
15968 bool always_inline = lookup_attribute ("always_inline",
15969 DECL_ATTRIBUTES (callee));
15970
15971 /* If the architectural features match up and the callee is always_inline
15972 then the other attributes don't matter. */
15973 if (always_inline)
15974 return true;
15975
15976 if (caller_opts->x_aarch64_cmodel_var
15977 != callee_opts->x_aarch64_cmodel_var)
15978 return false;
15979
15980 if (caller_opts->x_aarch64_tls_dialect
15981 != callee_opts->x_aarch64_tls_dialect)
15982 return false;
15983
15984 /* Honour explicit requests to workaround errata. */
15985 if (!aarch64_tribools_ok_for_inlining_p (
15986 caller_opts->x_aarch64_fix_a53_err835769,
15987 callee_opts->x_aarch64_fix_a53_err835769,
15988 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
15989 return false;
15990
48bb1a55
CL
15991 if (!aarch64_tribools_ok_for_inlining_p (
15992 caller_opts->x_aarch64_fix_a53_err843419,
15993 callee_opts->x_aarch64_fix_a53_err843419,
15994 2, TARGET_FIX_ERR_A53_843419))
15995 return false;
15996
1fd8d40c
KT
15997 /* If the user explicitly specified -momit-leaf-frame-pointer for the
 15998 caller and callee and they don't match up, reject inlining. */
15999 if (!aarch64_tribools_ok_for_inlining_p (
16000 caller_opts->x_flag_omit_leaf_frame_pointer,
16001 callee_opts->x_flag_omit_leaf_frame_pointer,
16002 2, 1))
16003 return false;
16004
16005 /* If the callee has specific tuning overrides, respect them. */
16006 if (callee_opts->x_aarch64_override_tune_string != NULL
16007 && caller_opts->x_aarch64_override_tune_string == NULL)
16008 return false;
16009
16010 /* If the user specified tuning override strings for the
16011 caller and callee and they don't match up, reject inlining.
16012 We just do a string compare here, we don't analyze the meaning
16013 of the string, as it would be too costly for little gain. */
16014 if (callee_opts->x_aarch64_override_tune_string
16015 && caller_opts->x_aarch64_override_tune_string
16016 && (strcmp (callee_opts->x_aarch64_override_tune_string,
16017 caller_opts->x_aarch64_override_tune_string) != 0))
16018 return false;
16019
16020 return true;
16021}
16022
bb6ce448
RS
16023/* Return the ID of the TLSDESC ABI, initializing the descriptor if it hasn't
16024 been already. */
16025
16026unsigned int
16027aarch64_tlsdesc_abi_id ()
16028{
16029 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
16030 if (!tlsdesc_abi.initialized_p ())
16031 {
16032 HARD_REG_SET full_reg_clobbers;
16033 CLEAR_HARD_REG_SET (full_reg_clobbers);
16034 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
16035 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
16036 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
16037 SET_HARD_REG_BIT (full_reg_clobbers, regno);
16038 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
16039 }
16040 return tlsdesc_abi.id ();
16041}
16042
43e9d192
IB
16043/* Return true if SYMBOL_REF X binds locally. */
16044
16045static bool
16046aarch64_symbol_binds_local_p (const_rtx x)
16047{
16048 return (SYMBOL_REF_DECL (x)
16049 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
16050 : SYMBOL_REF_LOCAL_P (x));
16051}
16052
16053/* Return true if SYMBOL_REF X is thread local */
16054static bool
16055aarch64_tls_symbol_p (rtx x)
16056{
16057 if (! TARGET_HAVE_TLS)
16058 return false;
16059
74b27d8e 16060 x = strip_salt (x);
3793ecc1 16061 if (!SYMBOL_REF_P (x))
43e9d192
IB
16062 return false;
16063
16064 return SYMBOL_REF_TLS_MODEL (x) != 0;
16065}
16066
16067/* Classify a TLS symbol into one of the TLS kinds. */
16068enum aarch64_symbol_type
16069aarch64_classify_tls_symbol (rtx x)
16070{
16071 enum tls_model tls_kind = tls_symbolic_operand_type (x);
16072
16073 switch (tls_kind)
16074 {
16075 case TLS_MODEL_GLOBAL_DYNAMIC:
16076 case TLS_MODEL_LOCAL_DYNAMIC:
16077 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
16078
16079 case TLS_MODEL_INITIAL_EXEC:
5ae7caad
JW
16080 switch (aarch64_cmodel)
16081 {
16082 case AARCH64_CMODEL_TINY:
16083 case AARCH64_CMODEL_TINY_PIC:
16084 return SYMBOL_TINY_TLSIE;
16085 default:
79496620 16086 return SYMBOL_SMALL_TLSIE;
5ae7caad 16087 }
43e9d192
IB
16088
16089 case TLS_MODEL_LOCAL_EXEC:
cbf5629e
JW
16090 if (aarch64_tls_size == 12)
16091 return SYMBOL_TLSLE12;
16092 else if (aarch64_tls_size == 24)
16093 return SYMBOL_TLSLE24;
16094 else if (aarch64_tls_size == 32)
16095 return SYMBOL_TLSLE32;
16096 else if (aarch64_tls_size == 48)
16097 return SYMBOL_TLSLE48;
16098 else
16099 gcc_unreachable ();
43e9d192
IB
16100
16101 case TLS_MODEL_EMULATED:
16102 case TLS_MODEL_NONE:
16103 return SYMBOL_FORCE_TO_MEM;
16104
16105 default:
16106 gcc_unreachable ();
16107 }
16108}
16109
43cacb12
RS
16110/* Return the correct method for accessing X + OFFSET, where X is either
16111 a SYMBOL_REF or LABEL_REF. */
17f4d4bf 16112
43e9d192 16113enum aarch64_symbol_type
43cacb12 16114aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
43e9d192 16115{
74b27d8e
RS
16116 x = strip_salt (x);
16117
3793ecc1 16118 if (LABEL_REF_P (x))
43e9d192
IB
16119 {
16120 switch (aarch64_cmodel)
16121 {
16122 case AARCH64_CMODEL_LARGE:
16123 return SYMBOL_FORCE_TO_MEM;
16124
16125 case AARCH64_CMODEL_TINY_PIC:
16126 case AARCH64_CMODEL_TINY:
a5350ddc
CSS
16127 return SYMBOL_TINY_ABSOLUTE;
16128
1b1e81f8 16129 case AARCH64_CMODEL_SMALL_SPIC:
43e9d192
IB
16130 case AARCH64_CMODEL_SMALL_PIC:
16131 case AARCH64_CMODEL_SMALL:
16132 return SYMBOL_SMALL_ABSOLUTE;
16133
16134 default:
16135 gcc_unreachable ();
16136 }
16137 }
16138
3793ecc1 16139 if (SYMBOL_REF_P (x))
43e9d192 16140 {
43e9d192
IB
16141 if (aarch64_tls_symbol_p (x))
16142 return aarch64_classify_tls_symbol (x);
16143
17f4d4bf
CSS
16144 switch (aarch64_cmodel)
16145 {
16146 case AARCH64_CMODEL_TINY:
15f6e0da 16147 /* When we retrieve the address of symbol + offset, we have to make sure
f8b756b7
TB
16148 the offset does not cause overflow of the final address. But
16149 we have no way of knowing the address of symbol at compile time
16150 so we can't accurately say if the distance between the PC and
7d3b27ff
WD
16151 symbol + offset is outside the addressable range of +/-1MB in the
16152 TINY code model. So we limit the maximum offset to +/-64KB and
16153 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
16154 If offset_within_block_p is true we allow larger offsets.
16155 Furthermore, force to memory if the symbol is a weak reference to
16156 something that doesn't resolve to a symbol in this module. */
16157
16158 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
a5350ddc 16159 return SYMBOL_FORCE_TO_MEM;
7d3b27ff
WD
16160 if (!(IN_RANGE (offset, -0x10000, 0x10000)
16161 || offset_within_block_p (x, offset)))
16162 return SYMBOL_FORCE_TO_MEM;
16163
a5350ddc
CSS
16164 return SYMBOL_TINY_ABSOLUTE;
16165
17f4d4bf 16166 case AARCH64_CMODEL_SMALL:
f8b756b7 16167 /* Same reasoning as the tiny code model, but the offset cap here is
7d3b27ff
WD
16168 1MB, allowing +/-(4GB - 1MB) for the offset to the symbol. */
16169
16170 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
17f4d4bf 16171 return SYMBOL_FORCE_TO_MEM;
7d3b27ff
WD
16172 if (!(IN_RANGE (offset, -0x100000, 0x100000)
16173 || offset_within_block_p (x, offset)))
16174 return SYMBOL_FORCE_TO_MEM;
16175
17f4d4bf 16176 return SYMBOL_SMALL_ABSOLUTE;
43e9d192 16177
17f4d4bf 16178 case AARCH64_CMODEL_TINY_PIC:
38e6c9a6 16179 if (!aarch64_symbol_binds_local_p (x))
87dd8ab0 16180 return SYMBOL_TINY_GOT;
38e6c9a6
MS
16181 return SYMBOL_TINY_ABSOLUTE;
16182
1b1e81f8 16183 case AARCH64_CMODEL_SMALL_SPIC:
17f4d4bf
CSS
16184 case AARCH64_CMODEL_SMALL_PIC:
16185 if (!aarch64_symbol_binds_local_p (x))
1b1e81f8
JW
16186 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
16187 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
17f4d4bf 16188 return SYMBOL_SMALL_ABSOLUTE;
43e9d192 16189
9ee6540a
WD
16190 case AARCH64_CMODEL_LARGE:
16191 /* This is alright even in PIC code as the constant
16192 pool reference is always PC relative and within
16193 the same translation unit. */
d47d34bb 16194 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
9ee6540a
WD
16195 return SYMBOL_SMALL_ABSOLUTE;
16196 else
16197 return SYMBOL_FORCE_TO_MEM;
16198
17f4d4bf
CSS
16199 default:
16200 gcc_unreachable ();
16201 }
43e9d192 16202 }
17f4d4bf 16203
43e9d192
IB
16204 /* By default push everything into the constant pool. */
16205 return SYMBOL_FORCE_TO_MEM;
16206}
16207
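/* An illustrative standalone form of the offset screening used above for the
   tiny and small code models.  The bounds are the +/-64KB and +/-1MB caps
   from aarch64_classify_symbol; WITHIN_BLOCK_P stands in for
   offset_within_block_p.  */

static bool
illustrative_tiny_offset_ok_p (long long offset, bool within_block_p)
{
  return (offset >= -0x10000 && offset <= 0x10000) || within_block_p;
}

static bool
illustrative_small_offset_ok_p (long long offset, bool within_block_p)
{
  return (offset >= -0x100000 && offset <= 0x100000) || within_block_p;
}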
43e9d192
IB
16208bool
16209aarch64_constant_address_p (rtx x)
16210{
16211 return (CONSTANT_P (x) && memory_address_p (DImode, x));
16212}
16213
16214bool
16215aarch64_legitimate_pic_operand_p (rtx x)
16216{
74b27d8e
RS
16217 poly_int64 offset;
16218 x = strip_offset_and_salt (x, &offset);
3793ecc1 16219 if (SYMBOL_REF_P (x))
74b27d8e 16220 return false;
43e9d192
IB
16221
16222 return true;
16223}
16224
26895c21
WD
16225/* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
16226 that should be rematerialized rather than spilled. */
3520f7cc 16227
43e9d192 16228static bool
ef4bddc2 16229aarch64_legitimate_constant_p (machine_mode mode, rtx x)
43e9d192 16230{
26895c21 16231 /* Support CSE and rematerialization of common constants. */
c0bb5bc5 16232 if (CONST_INT_P (x)
9f7b87ca 16233 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
c0bb5bc5 16234 || GET_CODE (x) == CONST_VECTOR)
26895c21
WD
16235 return true;
16236
43cacb12
RS
16237 /* Do not allow vector struct mode constants for Advanced SIMD.
16238 We could support 0 and -1 easily, but they need support in
16239 aarch64-simd.md. */
16240 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16241 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
43e9d192
IB
16242 return false;
16243
43cacb12
RS
16244 /* Only accept variable-length vector constants if they can be
16245 handled directly.
16246
16247 ??? It would be possible to handle rematerialization of other
16248 constants via secondary reloads. */
16249 if (vec_flags & VEC_ANY_SVE)
16250 return aarch64_simd_valid_immediate (x, NULL);
16251
509bb9b6
RS
16252 if (GET_CODE (x) == HIGH)
16253 x = XEXP (x, 0);
16254
43cacb12
RS
16255 /* Accept polynomial constants that can be calculated by using the
16256 destination of a move as the sole temporary. Constants that
16257 require a second temporary cannot be rematerialized (they can't be
16258 forced to memory and also aren't legitimate constants). */
16259 poly_int64 offset;
16260 if (poly_int_rtx_p (x, &offset))
16261 return aarch64_offset_temporaries (false, offset) <= 1;
16262
16263 /* If an offset is being added to something else, we need to allow the
16264 base to be moved into the destination register, meaning that there
16265 are no free temporaries for the offset. */
74b27d8e 16266 x = strip_offset_and_salt (x, &offset);
43cacb12
RS
16267 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
16268 return false;
26895c21 16269
43cacb12
RS
16270 /* Do not allow const (plus (anchor_symbol, const_int)). */
16271 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
16272 return false;
26895c21 16273
f28e54bd
WD
16274 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
16275 so spilling them is better than rematerialization. */
16276 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
16277 return true;
16278
26895c21 16279 /* Label references are always constant. */
3793ecc1 16280 if (LABEL_REF_P (x))
26895c21
WD
16281 return true;
16282
16283 return false;
43e9d192
IB
16284}
16285
a5bc806c 16286rtx
43e9d192
IB
16287aarch64_load_tp (rtx target)
16288{
16289 if (!target
16290 || GET_MODE (target) != Pmode
16291 || !register_operand (target, Pmode))
16292 target = gen_reg_rtx (Pmode);
16293
16294 /* Can return in any reg. */
16295 emit_insn (gen_aarch64_load_tp_hard (target));
16296 return target;
16297}
16298
43e9d192
IB
16299/* On AAPCS systems, this is the "struct __va_list". */
16300static GTY(()) tree va_list_type;
16301
16302/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
16303 Return the type to use as __builtin_va_list.
16304
16305 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
16306
16307 struct __va_list
16308 {
16309 void *__stack;
16310 void *__gr_top;
16311 void *__vr_top;
16312 int __gr_offs;
16313 int __vr_offs;
16314 }; */
16315
16316static tree
16317aarch64_build_builtin_va_list (void)
16318{
16319 tree va_list_name;
16320 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16321
16322 /* Create the type. */
16323 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
16324 /* Give it the required name. */
16325 va_list_name = build_decl (BUILTINS_LOCATION,
16326 TYPE_DECL,
16327 get_identifier ("__va_list"),
16328 va_list_type);
16329 DECL_ARTIFICIAL (va_list_name) = 1;
16330 TYPE_NAME (va_list_type) = va_list_name;
665c56c6 16331 TYPE_STUB_DECL (va_list_type) = va_list_name;
43e9d192
IB
16332
16333 /* Create the fields. */
16334 f_stack = build_decl (BUILTINS_LOCATION,
16335 FIELD_DECL, get_identifier ("__stack"),
16336 ptr_type_node);
16337 f_grtop = build_decl (BUILTINS_LOCATION,
16338 FIELD_DECL, get_identifier ("__gr_top"),
16339 ptr_type_node);
16340 f_vrtop = build_decl (BUILTINS_LOCATION,
16341 FIELD_DECL, get_identifier ("__vr_top"),
16342 ptr_type_node);
16343 f_groff = build_decl (BUILTINS_LOCATION,
16344 FIELD_DECL, get_identifier ("__gr_offs"),
16345 integer_type_node);
16346 f_vroff = build_decl (BUILTINS_LOCATION,
16347 FIELD_DECL, get_identifier ("__vr_offs"),
16348 integer_type_node);
16349
88e3bdd1 16350 /* Tell tree-stdarg pass about our internal offset fields.
3fd6b9cc
JW
16351 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
16352 purposes, to identify whether the code is updating the va_list internal
16353 offset fields in an irregular way. */
16354 va_list_gpr_counter_field = f_groff;
16355 va_list_fpr_counter_field = f_vroff;
16356
43e9d192
IB
16357 DECL_ARTIFICIAL (f_stack) = 1;
16358 DECL_ARTIFICIAL (f_grtop) = 1;
16359 DECL_ARTIFICIAL (f_vrtop) = 1;
16360 DECL_ARTIFICIAL (f_groff) = 1;
16361 DECL_ARTIFICIAL (f_vroff) = 1;
16362
16363 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
16364 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
16365 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
16366 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
16367 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
16368
16369 TYPE_FIELDS (va_list_type) = f_stack;
16370 DECL_CHAIN (f_stack) = f_grtop;
16371 DECL_CHAIN (f_grtop) = f_vrtop;
16372 DECL_CHAIN (f_vrtop) = f_groff;
16373 DECL_CHAIN (f_groff) = f_vroff;
16374
16375 /* Compute its layout. */
16376 layout_type (va_list_type);
16377
16378 return va_list_type;
16379}
16380
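/* An illustrative sketch of the AAPCS64 va_list layout built above, written
   as plain C++.  The type name is an example only; the size notes assume an
   LP64 AArch64 target, where the whole record is 32 bytes.  */

struct illustrative_va_list
{
  void *__stack;	/* Next stacked (memory) argument.  */
  void *__gr_top;	/* End of the general-register save area.  */
  void *__vr_top;	/* End of the FP/SIMD register save area.  */
  int __gr_offs;	/* Negative offset from __gr_top to the next GP arg.  */
  int __vr_offs;	/* Negative offset from __vr_top to the next VR arg.  */
};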
16381/* Implement TARGET_EXPAND_BUILTIN_VA_START. */
16382static void
16383aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
16384{
16385 const CUMULATIVE_ARGS *cum;
16386 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16387 tree stack, grtop, vrtop, groff, vroff;
16388 tree t;
88e3bdd1
JW
16389 int gr_save_area_size = cfun->va_list_gpr_size;
16390 int vr_save_area_size = cfun->va_list_fpr_size;
43e9d192
IB
16391 int vr_offset;
16392
16393 cum = &crtl->args.info;
88e3bdd1
JW
16394 if (cfun->va_list_gpr_size)
16395 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
16396 cfun->va_list_gpr_size);
16397 if (cfun->va_list_fpr_size)
16398 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
16399 * UNITS_PER_VREG, cfun->va_list_fpr_size);
43e9d192 16400
d5726973 16401 if (!TARGET_FLOAT)
43e9d192 16402 {
261fb553 16403 gcc_assert (cum->aapcs_nvrn == 0);
43e9d192
IB
16404 vr_save_area_size = 0;
16405 }
16406
16407 f_stack = TYPE_FIELDS (va_list_type_node);
16408 f_grtop = DECL_CHAIN (f_stack);
16409 f_vrtop = DECL_CHAIN (f_grtop);
16410 f_groff = DECL_CHAIN (f_vrtop);
16411 f_vroff = DECL_CHAIN (f_groff);
16412
16413 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
16414 NULL_TREE);
16415 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
16416 NULL_TREE);
16417 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
16418 NULL_TREE);
16419 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
16420 NULL_TREE);
16421 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
16422 NULL_TREE);
16423
16424 /* Emit code to initialize STACK, which points to the next varargs stack
16425 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
16426 by named arguments. STACK is 8-byte aligned. */
16427 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
16428 if (cum->aapcs_stack_size > 0)
16429 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
16430 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
16431 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16432
16433 /* Emit code to initialize GRTOP, the top of the GR save area.
16434 virtual_incoming_args_rtx should have been 16 byte aligned. */
16435 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
16436 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
16437 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16438
16439 /* Emit code to initialize VRTOP, the top of the VR save area.
16440 This address is gr_save_area_bytes below GRTOP, rounded
16441 down to the next 16-byte boundary. */
16442 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
4f59f9f2
UB
16443 vr_offset = ROUND_UP (gr_save_area_size,
16444 STACK_BOUNDARY / BITS_PER_UNIT);
43e9d192
IB
16445
16446 if (vr_offset)
16447 t = fold_build_pointer_plus_hwi (t, -vr_offset);
16448 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
16449 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16450
16451 /* Emit code to initialize GROFF, the offset from GRTOP of the
16452 next GPR argument. */
16453 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
16454 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
16455 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16456
16457 /* Likewise emit code to initialize VROFF, the offset from VRTOP
16458 of the next VR argument. */
16459 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
16460 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
16461 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16462}
16463
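/* An illustrative sketch of the offsets the expansion above stores, for a
   callee whose named arguments consumed NCRN of the 8 general argument
   registers and NVRN of the 8 FP/SIMD argument registers, assuming the full
   save areas are laid out (i.e. no tree-stdarg trimming).  */

static void
illustrative_va_start_offsets (int ncrn, int nvrn, int *gr_offs, int *vr_offs)
{
  *gr_offs = -(8 - ncrn) * 8;	/* __gr_offs: bytes of unused x registers.  */
  *vr_offs = -(8 - nvrn) * 16;	/* __vr_offs: bytes of unused q registers.  */
}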
16464/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
16465
16466static tree
16467aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
16468 gimple_seq *post_p ATTRIBUTE_UNUSED)
16469{
16470 tree addr;
16471 bool indirect_p;
16472 bool is_ha; /* is HFA or HVA. */
16473 bool dw_align; /* double-word align. */
ef4bddc2 16474 machine_mode ag_mode = VOIDmode;
43e9d192 16475 int nregs;
ef4bddc2 16476 machine_mode mode;
43e9d192
IB
16477
16478 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16479 tree stack, f_top, f_off, off, arg, roundup, on_stack;
16480 HOST_WIDE_INT size, rsize, adjust, align;
16481 tree t, u, cond1, cond2;
16482
fde65a89 16483 indirect_p = pass_va_arg_by_reference (type);
43e9d192
IB
16484 if (indirect_p)
16485 type = build_pointer_type (type);
16486
16487 mode = TYPE_MODE (type);
16488
16489 f_stack = TYPE_FIELDS (va_list_type_node);
16490 f_grtop = DECL_CHAIN (f_stack);
16491 f_vrtop = DECL_CHAIN (f_grtop);
16492 f_groff = DECL_CHAIN (f_vrtop);
16493 f_vroff = DECL_CHAIN (f_groff);
16494
16495 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
16496 f_stack, NULL_TREE);
16497 size = int_size_in_bytes (type);
c590597c
RE
16498
16499 bool abi_break;
16500 align
16501 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
43e9d192
IB
16502
16503 dw_align = false;
16504 adjust = 0;
56fe3ca3
RS
16505 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
16506 &is_ha, false))
43e9d192 16507 {
6a70badb
RS
16508 /* No frontends can create types with variable-sized modes, so we
16509 shouldn't be asked to pass or return them. */
16510 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
16511
43e9d192 16512 /* TYPE passed in fp/simd registers. */
d5726973 16513 if (!TARGET_FLOAT)
fc29dfc9 16514 aarch64_err_no_fpadvsimd (mode);
43e9d192
IB
16515
16516 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
16517 unshare_expr (valist), f_vrtop, NULL_TREE);
16518 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
16519 unshare_expr (valist), f_vroff, NULL_TREE);
16520
16521 rsize = nregs * UNITS_PER_VREG;
16522
16523 if (is_ha)
16524 {
6a70badb
RS
16525 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
16526 adjust = UNITS_PER_VREG - ag_size;
43e9d192 16527 }
76b0cbf8 16528 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
43e9d192
IB
16529 && size < UNITS_PER_VREG)
16530 {
16531 adjust = UNITS_PER_VREG - size;
16532 }
16533 }
16534 else
16535 {
16536 /* TYPE passed in general registers. */
16537 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
16538 unshare_expr (valist), f_grtop, NULL_TREE);
16539 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
16540 unshare_expr (valist), f_groff, NULL_TREE);
4f59f9f2 16541 rsize = ROUND_UP (size, UNITS_PER_WORD);
43e9d192
IB
16542 nregs = rsize / UNITS_PER_WORD;
16543
16544 if (align > 8)
c590597c
RE
16545 {
16546 if (abi_break && warn_psabi)
16547 inform (input_location, "parameter passing for argument of type "
16548 "%qT changed in GCC 9.1", type);
16549 dw_align = true;
16550 }
43e9d192 16551
76b0cbf8 16552 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
43e9d192
IB
16553 && size < UNITS_PER_WORD)
16554 {
16555 adjust = UNITS_PER_WORD - size;
16556 }
16557 }
16558
16559 /* Get a local temporary for the field value. */
16560 off = get_initialized_tmp_var (f_off, pre_p, NULL);
16561
16562 /* Emit code to branch if off >= 0. */
16563 t = build2 (GE_EXPR, boolean_type_node, off,
16564 build_int_cst (TREE_TYPE (off), 0));
16565 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
16566
16567 if (dw_align)
16568 {
16569 /* Emit: offs = (offs + 15) & -16. */
16570 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16571 build_int_cst (TREE_TYPE (off), 15));
16572 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
16573 build_int_cst (TREE_TYPE (off), -16));
16574 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
16575 }
16576 else
16577 roundup = NULL;
16578
16579 /* Update ap.__[g|v]r_offs */
16580 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16581 build_int_cst (TREE_TYPE (off), rsize));
16582 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
16583
16584 /* String up. */
16585 if (roundup)
16586 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16587
16588 /* [cond2] if (ap.__[g|v]r_offs > 0) */
16589 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
16590 build_int_cst (TREE_TYPE (f_off), 0));
16591 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
16592
16593 /* String up: make sure the assignment happens before the use. */
16594 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
16595 COND_EXPR_ELSE (cond1) = t;
16596
16597 /* Prepare the trees handling the argument that is passed on the stack;
16598 the top-level node will be stored in ON_STACK. */
16599 arg = get_initialized_tmp_var (stack, pre_p, NULL);
16600 if (align > 8)
16601 {
16602 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
4bdc2738 16603 t = fold_build_pointer_plus_hwi (arg, 15);
43e9d192
IB
16604 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16605 build_int_cst (TREE_TYPE (t), -16));
43e9d192
IB
16606 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
16607 }
16608 else
16609 roundup = NULL;
16610 /* Advance ap.__stack */
4bdc2738 16611 t = fold_build_pointer_plus_hwi (arg, size + 7);
43e9d192
IB
16612 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16613 build_int_cst (TREE_TYPE (t), -8));
43e9d192
IB
16614 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
16615 /* String up roundup and advance. */
16616 if (roundup)
16617 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16618 /* String up with arg */
16619 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
16620 /* Big-endianness related address adjustment. */
76b0cbf8 16621 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
43e9d192
IB
16622 && size < UNITS_PER_WORD)
16623 {
16624 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
16625 size_int (UNITS_PER_WORD - size));
16626 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
16627 }
16628
16629 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
16630 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
16631
16632 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
16633 t = off;
16634 if (adjust)
16635 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
16636 build_int_cst (TREE_TYPE (off), adjust));
16637
16638 t = fold_convert (sizetype, t);
16639 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
16640
16641 if (is_ha)
16642 {
16643 /* type ha; // treat as "struct {ftype field[n];}"
16644 ... [computing offs]
16645 for (i = 0; i < nregs; ++i, offs += 16)
16646 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
16647 return ha; */
16648 int i;
16649 tree tmp_ha, field_t, field_ptr_t;
16650
16651 /* Declare a local variable. */
16652 tmp_ha = create_tmp_var_raw (type, "ha");
16653 gimple_add_tmp_var (tmp_ha);
16654
16655 /* Establish the base type. */
16656 switch (ag_mode)
16657 {
4e10a5a7 16658 case E_SFmode:
43e9d192
IB
16659 field_t = float_type_node;
16660 field_ptr_t = float_ptr_type_node;
16661 break;
4e10a5a7 16662 case E_DFmode:
43e9d192
IB
16663 field_t = double_type_node;
16664 field_ptr_t = double_ptr_type_node;
16665 break;
4e10a5a7 16666 case E_TFmode:
43e9d192
IB
16667 field_t = long_double_type_node;
16668 field_ptr_t = long_double_ptr_type_node;
16669 break;
4e10a5a7 16670 case E_HFmode:
1b62ed4f
JG
16671 field_t = aarch64_fp16_type_node;
16672 field_ptr_t = aarch64_fp16_ptr_type_node;
43e9d192 16673 break;
abbe1ed2
SMW
16674 case E_BFmode:
16675 field_t = aarch64_bf16_type_node;
16676 field_ptr_t = aarch64_bf16_ptr_type_node;
16677 break;
4e10a5a7
RS
16678 case E_V2SImode:
16679 case E_V4SImode:
43e9d192
IB
16680 {
16681 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
16682 field_t = build_vector_type_for_mode (innertype, ag_mode);
16683 field_ptr_t = build_pointer_type (field_t);
16684 }
16685 break;
16686 default:
16687 gcc_assert (0);
16688 }
16689
16690 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
ab563903 16691 TREE_ADDRESSABLE (tmp_ha) = 1;
43e9d192
IB
16692 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
16693 addr = t;
16694 t = fold_convert (field_ptr_t, addr);
16695 t = build2 (MODIFY_EXPR, field_t,
16696 build1 (INDIRECT_REF, field_t, tmp_ha),
16697 build1 (INDIRECT_REF, field_t, t));
16698
16699 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
16700 for (i = 1; i < nregs; ++i)
16701 {
16702 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
16703 u = fold_convert (field_ptr_t, addr);
16704 u = build2 (MODIFY_EXPR, field_t,
16705 build2 (MEM_REF, field_t, tmp_ha,
16706 build_int_cst (field_ptr_t,
16707 (i *
16708 int_size_in_bytes (field_t)))),
16709 build1 (INDIRECT_REF, field_t, u));
16710 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
16711 }
16712
16713 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
16714 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
16715 }
16716
16717 COND_EXPR_ELSE (cond2) = t;
16718 addr = fold_convert (build_pointer_type (type), cond1);
16719 addr = build_va_arg_indirect_ref (addr);
16720
16721 if (indirect_p)
16722 addr = build_va_arg_indirect_ref (addr);
16723
16724 return addr;
16725}
16726
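/* An illustrative sketch of the run-time test built by the gimplification
   above, specialised for a plain 8-byte integer argument on a little-endian
   target (no over-alignment, no HFA handling).  It reuses the
   illustrative_va_list type from the sketch after
   aarch64_build_builtin_va_list.  */

static void *
illustrative_va_arg_gr (struct illustrative_va_list *ap)
{
  int off = ap->__gr_offs;
  if (off >= 0)				/* Register save area already used up.  */
    {
      void *p = ap->__stack;
      ap->__stack = (char *) p + 8;	/* Advance to the next stack slot.  */
      return p;
    }
  ap->__gr_offs = off + 8;		/* Consume one x register.  */
  if (ap->__gr_offs > 0)		/* The argument ran off the save area.  */
    {
      void *p = ap->__stack;
      ap->__stack = (char *) p + 8;
      return p;
    }
  return (char *) ap->__gr_top + off;	/* Address within the GR save area.  */
}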
16727/* Implement TARGET_SETUP_INCOMING_VARARGS. */
16728
16729static void
e7056ca4
RS
16730aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
16731 const function_arg_info &arg,
16732 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
43e9d192
IB
16733{
16734 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
16735 CUMULATIVE_ARGS local_cum;
88e3bdd1
JW
16736 int gr_saved = cfun->va_list_gpr_size;
16737 int vr_saved = cfun->va_list_fpr_size;
43e9d192
IB
16738
16739 /* The caller has advanced CUM up to, but not beyond, the last named
16740 argument. Advance a local copy of CUM past the last "real" named
16741 argument, to find out how many registers are left over. */
16742 local_cum = *cum;
6930c98c 16743 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
43e9d192 16744
88e3bdd1
JW
16745 /* Find out how many registers we need to save.
16746 Honor the tree-stdarg analysis results. */
16747 if (cfun->va_list_gpr_size)
16748 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
16749 cfun->va_list_gpr_size / UNITS_PER_WORD);
16750 if (cfun->va_list_fpr_size)
16751 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
16752 cfun->va_list_fpr_size / UNITS_PER_VREG);
43e9d192 16753
d5726973 16754 if (!TARGET_FLOAT)
43e9d192 16755 {
261fb553 16756 gcc_assert (local_cum.aapcs_nvrn == 0);
43e9d192
IB
16757 vr_saved = 0;
16758 }
16759
16760 if (!no_rtl)
16761 {
16762 if (gr_saved > 0)
16763 {
16764 rtx ptr, mem;
16765
16766 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
16767 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
16768 - gr_saved * UNITS_PER_WORD);
16769 mem = gen_frame_mem (BLKmode, ptr);
16770 set_mem_alias_set (mem, get_varargs_alias_set ());
16771
16772 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
16773 mem, gr_saved);
16774 }
16775 if (vr_saved > 0)
16776 {
16777 /* We can't use move_block_from_reg, because it will use
16778 the wrong mode, storing D regs only. */
ef4bddc2 16779 machine_mode mode = TImode;
88e3bdd1 16780 int off, i, vr_start;
43e9d192
IB
16781
16782 /* Set OFF to the offset from virtual_incoming_args_rtx of
16783 the first vector register. The VR save area lies below
16784 the GR one, and is aligned to 16 bytes. */
4f59f9f2
UB
16785 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
16786 STACK_BOUNDARY / BITS_PER_UNIT);
43e9d192
IB
16787 off -= vr_saved * UNITS_PER_VREG;
16788
88e3bdd1
JW
16789 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
16790 for (i = 0; i < vr_saved; ++i)
43e9d192
IB
16791 {
16792 rtx ptr, mem;
16793
16794 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
16795 mem = gen_frame_mem (mode, ptr);
16796 set_mem_alias_set (mem, get_varargs_alias_set ());
88e3bdd1 16797 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
43e9d192
IB
16798 off += UNITS_PER_VREG;
16799 }
16800 }
16801 }
16802
16803 /* We don't save the size into *PRETEND_SIZE because we want to avoid
16804 any complication of having crtl->args.pretend_args_size changed. */
8799637a 16805 cfun->machine->frame.saved_varargs_size
4f59f9f2
UB
16806 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
16807 STACK_BOUNDARY / BITS_PER_UNIT)
43e9d192
IB
16808 + vr_saved * UNITS_PER_VREG);
16809}
16810
16811static void
16812aarch64_conditional_register_usage (void)
16813{
16814 int i;
16815 if (!TARGET_FLOAT)
16816 {
16817 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
16818 {
16819 fixed_regs[i] = 1;
16820 call_used_regs[i] = 1;
16821 }
16822 }
43cacb12
RS
16823 if (!TARGET_SVE)
16824 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
16825 {
16826 fixed_regs[i] = 1;
16827 call_used_regs[i] = 1;
16828 }
3751345d 16829
183bfdaf
RS
16830 /* Only allow the FFR and FFRT to be accessed via special patterns. */
16831 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
16832 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
16833
3751345d
RE
16834 /* When tracking speculation, we need a couple of call-clobbered registers
16835 to track the speculation state. It would be nice to just use
16836 IP0 and IP1, but currently there are numerous places that just
16837 assume these registers are free for other uses (eg pointer
16838 authentication). */
16839 if (aarch64_track_speculation)
16840 {
16841 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
16842 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
16843 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16844 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16845 }
43e9d192
IB
16846}
16847
38e62001
RS
16848/* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
16849
16850bool
16851aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
16852{
16853 /* For records we're passed a FIELD_DECL, for arrays we're passed
16854 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
16855 const_tree type = TREE_TYPE (field_or_array);
16856
16857 /* Assign BLKmode to anything that contains multiple SVE predicates.
16858 For structures, the "multiple" case is indicated by MODE being
16859 VOIDmode. */
16860 unsigned int num_zr, num_pr;
16861 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
16862 {
16863 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
16864 return !simple_cst_equal (TYPE_SIZE (field_or_array),
16865 TYPE_SIZE (type));
16866 return mode == VOIDmode;
16867 }
16868
16869 return default_member_type_forces_blk (field_or_array, mode);
16870}
16871
56fe3ca3
RS
16872/* Bitmasks that indicate whether earlier versions of GCC would have
16873 taken a different path through the ABI logic. This should result in
16874 a -Wpsabi warning if the earlier path led to a different ABI decision.
16875
16876 WARN_PSABI_EMPTY_CXX17_BASE
16877 Indicates that the type includes an artificial empty C++17 base field
16878 that, prior to GCC 10.1, would prevent the type from being treated as
16879 a HFA or HVA. See PR94383 for details.
16880
16881 WARN_PSABI_NO_UNIQUE_ADDRESS
16882 Indicates that the type includes an empty [[no_unique_address]] field
16883 that, prior to GCC 10.1, would prevent the type from being treated as
16884 a HFA or HVA. */
16885const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
16886const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
16887
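/* Illustrative instances of the two cases (type names are examples only):

     struct empty {};
     struct s1 : empty { float x, y; };	  // WARN_PSABI_EMPTY_CXX17_BASE
     struct s2 { [[no_unique_address]] empty e; float x, y; };
					  // WARN_PSABI_NO_UNIQUE_ADDRESS

   Before GCC 10.1 the artificial empty field stopped s1 (when compiled as
   C++17) and s2 from being treated as homogeneous floating-point aggregates;
   from GCC 10.1 onwards both are HFAs passed in FP/SIMD registers, and
   -Wpsabi reports the change via the diagnostics below.  */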
43e9d192
IB
16888/* Walk down the type tree of TYPE counting consecutive base elements.
16889 If *MODEP is VOIDmode, then set it to the first valid floating point
16890 type. If a non-floating point type is found, or if a floating point
16891 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
e73a32d6
MM
16892 otherwise return the count in the sub-tree.
16893
56fe3ca3
RS
16894 The WARN_PSABI_FLAGS argument allows the caller to check whether this
16895 function has changed its behavior relative to earlier versions of GCC.
16896 Normally the argument should be nonnull and point to a zero-initialized
16897 variable. The function then records whether the ABI decision might
16898 be affected by a known fix to the ABI logic, setting the associated
16899 WARN_PSABI_* bits if so.
16900
16901 When the argument is instead a null pointer, the function tries to
16902 simulate the behavior of GCC before all such ABI fixes were made.
16903 This is useful to check whether the function returns something
16904 different after the ABI fixes. */
43e9d192 16905static int
e73a32d6 16906aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
56fe3ca3 16907 unsigned int *warn_psabi_flags)
43e9d192 16908{
ef4bddc2 16909 machine_mode mode;
43e9d192
IB
16910 HOST_WIDE_INT size;
16911
38e62001
RS
16912 if (aarch64_sve::builtin_type_p (type))
16913 return -1;
c600df9a 16914
43e9d192
IB
16915 switch (TREE_CODE (type))
16916 {
16917 case REAL_TYPE:
16918 mode = TYPE_MODE (type);
1b62ed4f
JG
16919 if (mode != DFmode && mode != SFmode
16920 && mode != TFmode && mode != HFmode)
43e9d192
IB
16921 return -1;
16922
16923 if (*modep == VOIDmode)
16924 *modep = mode;
16925
16926 if (*modep == mode)
16927 return 1;
16928
16929 break;
16930
16931 case COMPLEX_TYPE:
16932 mode = TYPE_MODE (TREE_TYPE (type));
1b62ed4f
JG
16933 if (mode != DFmode && mode != SFmode
16934 && mode != TFmode && mode != HFmode)
43e9d192
IB
16935 return -1;
16936
16937 if (*modep == VOIDmode)
16938 *modep = mode;
16939
16940 if (*modep == mode)
16941 return 2;
16942
16943 break;
16944
16945 case VECTOR_TYPE:
16946 /* Use V2SImode and V4SImode as representatives of all 64-bit
16947 and 128-bit vector types. */
16948 size = int_size_in_bytes (type);
16949 switch (size)
16950 {
16951 case 8:
16952 mode = V2SImode;
16953 break;
16954 case 16:
16955 mode = V4SImode;
16956 break;
16957 default:
16958 return -1;
16959 }
16960
16961 if (*modep == VOIDmode)
16962 *modep = mode;
16963
16964 /* Vector modes are considered to be opaque: two vectors are
16965 equivalent for the purposes of being homogeneous aggregates
16966 if they are the same size. */
16967 if (*modep == mode)
16968 return 1;
16969
16970 break;
16971
16972 case ARRAY_TYPE:
16973 {
16974 int count;
16975 tree index = TYPE_DOMAIN (type);
16976
807e902e
KZ
16977 /* Can't handle incomplete types nor sizes that are not
16978 fixed. */
16979 if (!COMPLETE_TYPE_P (type)
16980 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
43e9d192
IB
16981 return -1;
16982
e73a32d6 16983 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
56fe3ca3 16984 warn_psabi_flags);
43e9d192
IB
16985 if (count == -1
16986 || !index
16987 || !TYPE_MAX_VALUE (index)
cc269bb6 16988 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
43e9d192 16989 || !TYPE_MIN_VALUE (index)
cc269bb6 16990 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
43e9d192
IB
16991 || count < 0)
16992 return -1;
16993
ae7e9ddd
RS
16994 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
16995 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
43e9d192
IB
16996
16997 /* There must be no padding. */
6a70badb
RS
16998 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16999 count * GET_MODE_BITSIZE (*modep)))
43e9d192
IB
17000 return -1;
17001
17002 return count;
17003 }
17004
17005 case RECORD_TYPE:
17006 {
17007 int count = 0;
17008 int sub_count;
17009 tree field;
17010
807e902e
KZ
17011 /* Can't handle incomplete types nor sizes that are not
17012 fixed. */
17013 if (!COMPLETE_TYPE_P (type)
17014 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
43e9d192
IB
17015 return -1;
17016
17017 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
17018 {
17019 if (TREE_CODE (field) != FIELD_DECL)
17020 continue;
17021
56fe3ca3 17022 if (DECL_FIELD_ABI_IGNORED (field))
e73a32d6 17023 {
56fe3ca3
RS
17024 /* See whether this is something that earlier versions of
17025 GCC failed to ignore. */
17026 unsigned int flag;
17027 if (lookup_attribute ("no_unique_address",
17028 DECL_ATTRIBUTES (field)))
17029 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
17030 else if (cxx17_empty_base_field_p (field))
17031 flag = WARN_PSABI_EMPTY_CXX17_BASE;
17032 else
17033 /* No compatibility problem. */
17034 continue;
17035
17036 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
17037 if (warn_psabi_flags)
17038 {
17039 *warn_psabi_flags |= flag;
17040 continue;
17041 }
e73a32d6
MM
17042 }
17043
17044 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
56fe3ca3 17045 warn_psabi_flags);
43e9d192
IB
17046 if (sub_count < 0)
17047 return -1;
17048 count += sub_count;
17049 }
17050
17051 /* There must be no padding. */
6a70badb
RS
17052 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
17053 count * GET_MODE_BITSIZE (*modep)))
43e9d192
IB
17054 return -1;
17055
17056 return count;
17057 }
17058
17059 case UNION_TYPE:
17060 case QUAL_UNION_TYPE:
17061 {
17062 /* These aren't very interesting except in a degenerate case. */
17063 int count = 0;
17064 int sub_count;
17065 tree field;
17066
807e902e
KZ
17067 /* Can't handle incomplete types nor sizes that are not
17068 fixed. */
17069 if (!COMPLETE_TYPE_P (type)
17070 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
43e9d192
IB
17071 return -1;
17072
17073 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
17074 {
17075 if (TREE_CODE (field) != FIELD_DECL)
17076 continue;
17077
e73a32d6 17078 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
56fe3ca3 17079 warn_psabi_flags);
43e9d192
IB
17080 if (sub_count < 0)
17081 return -1;
17082 count = count > sub_count ? count : sub_count;
17083 }
17084
17085 /* There must be no padding. */
6a70badb
RS
17086 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
17087 count * GET_MODE_BITSIZE (*modep)))
43e9d192
IB
17088 return -1;
17089
17090 return count;
17091 }
17092
17093 default:
17094 break;
17095 }
17096
17097 return -1;
17098}
17099
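/* A few illustrative inputs for the walk above (type names are examples
   only):

     struct hfa3  { float x, y, z; };	   // count 3, base mode SFmode
     struct hfa4  { double d[4]; };	   // count 4, base mode DFmode
     struct mixed { float f; double d; };  // -1: mismatched base modes
     struct hfa5  { float f[5]; };	   // count 5; rejected later by the
					   // HA_MAX_NUM_FLDS (4) check in
					   // aarch64_vfp_is_call_or_return_candidate
*/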
b6ec6215
KT
17100/* Return TRUE if the type, as described by TYPE and MODE, is a short vector
17101 type as described in AAPCS64 \S 4.1.2.
17102
17103 See the comment above aarch64_composite_type_p for the notes on MODE. */
17104
17105static bool
17106aarch64_short_vector_p (const_tree type,
17107 machine_mode mode)
17108{
6a70badb 17109 poly_int64 size = -1;
b6ec6215
KT
17110
17111 if (type && TREE_CODE (type) == VECTOR_TYPE)
38e62001
RS
17112 {
17113 if (aarch64_sve::builtin_type_p (type))
17114 return false;
17115 size = int_size_in_bytes (type);
17116 }
b6ec6215 17117 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
38e62001
RS
17118 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
17119 {
17120 /* Rely only on the type, not the mode, when processing SVE types. */
17121 if (type && aarch64_some_values_include_pst_objects_p (type))
b2672dd6
FY
17122 /* Leave later code to report an error if SVE is disabled. */
17123 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
38e62001
RS
17124 else
17125 size = GET_MODE_SIZE (mode);
17126 }
17127 if (known_eq (size, 8) || known_eq (size, 16))
17128 {
17129 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
17130 they are being treated as scalable AAPCS64 types. */
17131 gcc_assert (!aarch64_sve_mode_p (mode));
17132 return true;
17133 }
17134 return false;
b6ec6215
KT
17135}
17136
43e9d192
IB
17137/* Return TRUE if the type, as described by TYPE and MODE, is a composite
17138 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
17139 array types. The C99 floating-point complex types are also considered
17140 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
17141 types, which are GCC extensions and out of the scope of AAPCS64, are
17142 treated as composite types here as well.
17143
17144 Note that MODE itself is not sufficient in determining whether a type
17145 is such a composite type or not. This is because
17146 stor-layout.c:compute_record_mode may have already changed the MODE
17147 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
17148 structure with only one field may have its MODE set to the mode of the
17149 field. Also an integer mode whose size matches the size of the
17150 RECORD_TYPE type may be used to substitute the original mode
17151 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
17152 solely relied on. */
17153
17154static bool
17155aarch64_composite_type_p (const_tree type,
ef4bddc2 17156 machine_mode mode)
43e9d192 17157{
b6ec6215
KT
17158 if (aarch64_short_vector_p (type, mode))
17159 return false;
17160
43e9d192
IB
17161 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
17162 return true;
17163
17164 if (mode == BLKmode
17165 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
17166 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
17167 return true;
17168
17169 return false;
17170}
17171
43e9d192
IB
17172/* Return TRUE if an argument, whose type is described by TYPE and MODE,
17173 shall be passed or returned in simd/fp register(s) (providing these
17174 parameter passing registers are available).
17175
17176 Upon successful return, *COUNT returns the number of needed registers,
17177 *BASE_MODE returns the mode of the individual register and when IS_HA
17178 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
56fe3ca3
RS
17179 floating-point aggregate or a homogeneous short-vector aggregate.
17180
17181 SILENT_P is true if the function should refrain from reporting any
17182 diagnostics. This should only be used if the caller is certain that
17183 any ABI decisions would eventually come through this function with
17184 SILENT_P set to false. */
43e9d192
IB
17185
17186static bool
ef4bddc2 17187aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
43e9d192 17188 const_tree type,
ef4bddc2 17189 machine_mode *base_mode,
43e9d192 17190 int *count,
56fe3ca3
RS
17191 bool *is_ha,
17192 bool silent_p)
43e9d192 17193{
c600df9a
RS
17194 if (is_ha != NULL) *is_ha = false;
17195
ef4bddc2 17196 machine_mode new_mode = VOIDmode;
43e9d192
IB
17197 bool composite_p = aarch64_composite_type_p (type, mode);
17198
43e9d192
IB
17199 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
17200 || aarch64_short_vector_p (type, mode))
17201 {
17202 *count = 1;
17203 new_mode = mode;
17204 }
17205 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
17206 {
17207 if (is_ha != NULL) *is_ha = true;
17208 *count = 2;
17209 new_mode = GET_MODE_INNER (mode);
17210 }
17211 else if (type && composite_p)
17212 {
56fe3ca3
RS
17213 unsigned int warn_psabi_flags = 0;
17214 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
17215 &warn_psabi_flags);
43e9d192
IB
17216 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
17217 {
e73a32d6
MM
17218 static unsigned last_reported_type_uid;
17219 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
17220 int alt;
56fe3ca3
RS
17221 if (!silent_p
17222 && warn_psabi
17223 && warn_psabi_flags
e73a32d6
MM
17224 && uid != last_reported_type_uid
17225 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
17226 != ag_count))
17227 {
e33a1eae
JJ
17228 const char *url
17229 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
e73a32d6
MM
17230 gcc_assert (alt == -1);
17231 last_reported_type_uid = uid;
56fe3ca3
RS
17232 /* Use TYPE_MAIN_VARIANT to strip any redundant const
17233 qualification. */
17234 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
17235 inform (input_location, "parameter passing for argument of "
17236 "type %qT with %<[[no_unique_address]]%> members "
691eeb65
JJ
17237 "changed %{in GCC 10.1%}",
17238 TYPE_MAIN_VARIANT (type), url);
56fe3ca3
RS
17239 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
17240 inform (input_location, "parameter passing for argument of "
17241 "type %qT when C++17 is enabled changed to match "
691eeb65
JJ
17242 "C++14 %{in GCC 10.1%}",
17243 TYPE_MAIN_VARIANT (type), url);
e73a32d6
MM
17244 }
17245
43e9d192
IB
17246 if (is_ha != NULL) *is_ha = true;
17247 *count = ag_count;
17248 }
17249 else
17250 return false;
17251 }
17252 else
17253 return false;
17254
38e62001 17255 gcc_assert (!aarch64_sve_mode_p (new_mode));
43e9d192
IB
17256 *base_mode = new_mode;
17257 return true;
17258}
17259
17260/* Implement TARGET_STRUCT_VALUE_RTX. */
17261
17262static rtx
17263aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
17264 int incoming ATTRIBUTE_UNUSED)
17265{
17266 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
17267}
17268
17269/* Implements target hook vector_mode_supported_p. */
17270static bool
ef4bddc2 17271aarch64_vector_mode_supported_p (machine_mode mode)
43e9d192 17272{
43cacb12 17273 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
cc68f7c2 17274 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
43e9d192
IB
17275}
17276
4aeb1ba7
RS
17277/* Return the full-width SVE vector mode for element mode MODE, if one
17278 exists. */
17279opt_machine_mode
17280aarch64_full_sve_mode (scalar_mode mode)
17281{
17282 switch (mode)
17283 {
17284 case E_DFmode:
17285 return VNx2DFmode;
17286 case E_SFmode:
17287 return VNx4SFmode;
17288 case E_HFmode:
17289 return VNx8HFmode;
02fcd8ac
RS
17290 case E_BFmode:
17291 return VNx8BFmode;
4aeb1ba7 17292 case E_DImode:
02fcd8ac 17293 return VNx2DImode;
4aeb1ba7
RS
17294 case E_SImode:
17295 return VNx4SImode;
17296 case E_HImode:
17297 return VNx8HImode;
17298 case E_QImode:
17299 return VNx16QImode;
17300 default:
17301 return opt_machine_mode ();
17302 }
17303}
17304
17305/* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
17306 if it exists. */
17307opt_machine_mode
17308aarch64_vq_mode (scalar_mode mode)
17309{
17310 switch (mode)
17311 {
17312 case E_DFmode:
17313 return V2DFmode;
17314 case E_SFmode:
17315 return V4SFmode;
17316 case E_HFmode:
17317 return V8HFmode;
abbe1ed2
SMW
17318 case E_BFmode:
17319 return V8BFmode;
4aeb1ba7
RS
17320 case E_SImode:
17321 return V4SImode;
17322 case E_HImode:
17323 return V8HImode;
17324 case E_QImode:
17325 return V16QImode;
17326 case E_DImode:
17327 return V2DImode;
17328 default:
17329 return opt_machine_mode ();
17330 }
17331}
17332
b7342d25
IB
17333/* Return the appropriate SIMD container mode
17334 for MODE within a vector of WIDTH bits. */
ef4bddc2 17335static machine_mode
43cacb12 17336aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
43e9d192 17337{
9b070057
RS
17338 if (TARGET_SVE
17339 && maybe_ne (width, 128)
17340 && known_eq (width, BITS_PER_SVE_VECTOR))
4aeb1ba7 17341 return aarch64_full_sve_mode (mode).else_mode (word_mode);
43cacb12
RS
17342
17343 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
43e9d192 17344 if (TARGET_SIMD)
b7342d25 17345 {
43cacb12 17346 if (known_eq (width, 128))
4aeb1ba7 17347 return aarch64_vq_mode (mode).else_mode (word_mode);
b7342d25
IB
17348 else
17349 switch (mode)
17350 {
4e10a5a7 17351 case E_SFmode:
b7342d25 17352 return V2SFmode;
4e10a5a7 17353 case E_HFmode:
b719f884 17354 return V4HFmode;
abbe1ed2
SMW
17355 case E_BFmode:
17356 return V4BFmode;
4e10a5a7 17357 case E_SImode:
b7342d25 17358 return V2SImode;
4e10a5a7 17359 case E_HImode:
b7342d25 17360 return V4HImode;
4e10a5a7 17361 case E_QImode:
b7342d25
IB
17362 return V8QImode;
17363 default:
17364 break;
17365 }
17366 }
43e9d192
IB
17367 return word_mode;
17368}
17369
5f29f3d5
KT
17370/* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
17371 and return whether the SVE mode should be preferred over the
17372 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
17373static bool
17374aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
17375{
17376 /* Take into account the aarch64-autovec-preference param if non-zero. */
17377 bool only_asimd_p = aarch64_autovec_preference == 1;
17378 bool only_sve_p = aarch64_autovec_preference == 2;
17379
17380 if (only_asimd_p)
17381 return false;
17382 if (only_sve_p)
17383 return true;
17384
17385 /* The preference in case of a tie in costs. */
17386 bool prefer_asimd = aarch64_autovec_preference == 3;
17387 bool prefer_sve = aarch64_autovec_preference == 4;
17388
17389 aarch64_sve_vector_bits_enum tune_width = aarch64_tune_params.sve_width;
17390
17391 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
17392 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
17393 /* If the CPU information does not have an SVE width registered, use the
17394 generic poly_int comparison that prefers SVE. If a preference is
17395 explicitly requested, avoid this path. */
17396 if (tune_width == SVE_SCALABLE
17397 && !prefer_asimd
17398 && !prefer_sve)
17399 return maybe_gt (nunits_sve, nunits_asimd);
17400
17401 /* Otherwise estimate the runtime width of the modes involved. */
64432b68
KT
17402 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
17403 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
5f29f3d5
KT
17404
17405 /* Preferring SVE means picking it first unless the Advanced SIMD mode
17406 is clearly wider. */
17407 if (prefer_sve)
17408 return est_sve >= est_asimd;
17409 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
17410 is clearly wider. */
17411 if (prefer_asimd)
17412 return est_sve > est_asimd;
17413
17414 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
17415 return est_sve > est_asimd;
17416}
17417
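/* For reference, the values compared above come from
   --param=aarch64-autovec-preference=N, e.g.

     gcc -O3 -march=armv8.2-a+sve --param=aarch64-autovec-preference=4 ...

   where 0 uses the default heuristics, 1 means Advanced SIMD only, 2 means
   SVE only, 3 prefers Advanced SIMD on a cost tie and 4 prefers SVE on a
   cost tie (the command line itself is only an illustration).  */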
b7342d25 17418/* Return 128-bit container as the preferred SIMD mode for MODE. */
ef4bddc2 17419static machine_mode
005ba29c 17420aarch64_preferred_simd_mode (scalar_mode mode)
b7342d25 17421{
5f29f3d5
KT
17422 /* Take into account explicit auto-vectorization ISA preferences through
17423 aarch64_cmp_autovec_modes. */
7ff5706f
RS
17424 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
17425 return aarch64_full_sve_mode (mode).else_mode (word_mode);
17426 if (TARGET_SIMD)
17427 return aarch64_vq_mode (mode).else_mode (word_mode);
17428 return word_mode;
b7342d25
IB
17429}
17430
86e36728 17431/* Return a list of possible vector modes for the vectorizer
3b357264 17432 to iterate over. */
bcc7e346 17433static unsigned int
e021fb86 17434aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
3b357264 17435{
cc68f7c2
RS
17436 static const machine_mode sve_modes[] = {
17437 /* Try using full vectors for all element types. */
17438 VNx16QImode,
17439
17440 /* Try using 16-bit containers for 8-bit elements and full vectors
17441 for wider elements. */
17442 VNx8QImode,
17443
17444 /* Try using 32-bit containers for 8-bit and 16-bit elements and
17445 full vectors for wider elements. */
17446 VNx4QImode,
74166aab 17447
cc68f7c2
RS
17448 /* Try using 64-bit containers for all element types. */
17449 VNx2QImode
17450 };
17451
17452 static const machine_mode advsimd_modes[] = {
17453 /* Try using 128-bit vectors for all element types. */
17454 V16QImode,
17455
17456 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
17457 for wider elements. */
17458 V8QImode,
17459
17460 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
17461 for wider elements.
17462
17463 TODO: We could support a limited form of V4QImode too, so that
17464 we use 32-bit vectors for 8-bit elements. */
17465 V4HImode,
17466
17467 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
17468 for 64-bit elements.
74166aab 17469
cc68f7c2
RS
17470 TODO: We could similarly support limited forms of V2QImode and V2HImode
17471 for this case. */
17472 V2SImode
17473 };
74166aab 17474
cc68f7c2
RS
17475 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
17476 This is because:
74166aab 17477
cc68f7c2
RS
17478 - If we can't use N-byte Advanced SIMD vectors then the placement
17479 doesn't matter; we'll just continue as though the Advanced SIMD
17480 entry didn't exist.
74166aab 17481
cc68f7c2
RS
17482 - If an SVE main loop with N bytes ends up being cheaper than an
17483 Advanced SIMD main loop with N bytes then by default we'll replace
17484 the Advanced SIMD version with the SVE one.
74166aab 17485
cc68f7c2
RS
17486 - If an Advanced SIMD main loop with N bytes ends up being cheaper
17487 than an SVE main loop with N bytes then by default we'll try to
17488 use the SVE loop to vectorize the epilogue instead. */
5f29f3d5
KT
17489
17490 bool only_asimd_p = aarch64_autovec_preference == 1;
17491 bool only_sve_p = aarch64_autovec_preference == 2;
17492
17493 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
cc68f7c2 17494 unsigned int advsimd_i = 0;
5f29f3d5
KT
17495
17496 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
cc68f7c2
RS
17497 {
17498 if (sve_i < ARRAY_SIZE (sve_modes)
5f29f3d5
KT
17499 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
17500 advsimd_modes[advsimd_i]))
cc68f7c2
RS
17501 modes->safe_push (sve_modes[sve_i++]);
17502 else
17503 modes->safe_push (advsimd_modes[advsimd_i++]);
17504 }
17505 while (sve_i < ARRAY_SIZE (sve_modes))
5f29f3d5 17506 modes->safe_push (sve_modes[sve_i++]);
bcc7e346 17507
eb23241b
RS
17508 unsigned int flags = 0;
17509 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
17510 can compare SVE against Advanced SIMD and so that we can compare
17511 multiple SVE vectorization approaches against each other. There's
17512 not really any point doing this for Advanced SIMD only, since the
17513 first mode that works should always be the best. */
17514 if (TARGET_SVE && aarch64_sve_compare_costs)
17515 flags |= VECT_COMPARE_COSTS;
17516 return flags;
3b357264
JG
17517}
17518
ac2b960f
YZ
17519/* Implement TARGET_MANGLE_TYPE. */
17520
6f549691 17521static const char *
ac2b960f
YZ
17522aarch64_mangle_type (const_tree type)
17523{
17524 /* The AArch64 ABI documents say that "__va_list" has to be
17f8ace2 17525 mangled as if it is in the "std" namespace. */
ac2b960f
YZ
17526 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
17527 return "St9__va_list";
17528
abbe1ed2 17529 /* Half-precision and brain floating point types. */
c2ec330c 17530 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
abbe1ed2
SMW
17531 {
17532 if (TYPE_MODE (type) == BFmode)
17533 return "u6__bf16";
17534 else
17535 return "Dh";
17536 }
c2ec330c 17537
f9d53c27
TB
17538 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
17539 builtin types. */
17540 if (TYPE_NAME (type) != NULL)
624d0f07
RS
17541 {
17542 const char *res;
17543 if ((res = aarch64_general_mangle_builtin_type (type))
17544 || (res = aarch64_sve::mangle_builtin_type (type)))
17545 return res;
17546 }
c6fc9e43 17547
ac2b960f
YZ
17548 /* Use the default mangling. */
17549 return NULL;
17550}
17551
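/* Some illustrative manglings produced by the rules above (the function
   names are arbitrary examples):

     void f (__fp16);		     ->  _Z1fDh
     void g (__bf16);		     ->  _Z1gu6__bf16
     void h (__builtin_va_list);     ->  _Z1hSt9__va_list
*/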
65ef05d0
RS
17552/* Implement TARGET_VERIFY_TYPE_CONTEXT. */
17553
17554static bool
17555aarch64_verify_type_context (location_t loc, type_context_kind context,
17556 const_tree type, bool silent_p)
17557{
17558 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
17559}
17560
75cf1494
KT
17561/* Find the first rtx_insn before insn that will generate an assembly
17562 instruction. */
17563
17564static rtx_insn *
17565aarch64_prev_real_insn (rtx_insn *insn)
17566{
17567 if (!insn)
17568 return NULL;
17569
17570 do
17571 {
17572 insn = prev_real_insn (insn);
17573 }
17574 while (insn && recog_memoized (insn) < 0);
17575
17576 return insn;
17577}
17578
17579static bool
17580is_madd_op (enum attr_type t1)
17581{
17582 unsigned int i;
17583 /* A number of these may be AArch32 only. */
17584 enum attr_type mlatypes[] = {
17585 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
17586 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
17587 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
17588 };
17589
17590 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
17591 {
17592 if (t1 == mlatypes[i])
17593 return true;
17594 }
17595
17596 return false;
17597}
17598
17599/* Check if there is a register dependency between a load and the insn
17600 for which we hold recog_data. */
17601
17602static bool
17603dep_between_memop_and_curr (rtx memop)
17604{
17605 rtx load_reg;
17606 int opno;
17607
8baff86e 17608 gcc_assert (GET_CODE (memop) == SET);
75cf1494
KT
17609
17610 if (!REG_P (SET_DEST (memop)))
17611 return false;
17612
17613 load_reg = SET_DEST (memop);
8baff86e 17614 for (opno = 1; opno < recog_data.n_operands; opno++)
75cf1494
KT
17615 {
17616 rtx operand = recog_data.operand[opno];
17617 if (REG_P (operand)
17618 && reg_overlap_mentioned_p (load_reg, operand))
17619 return true;
17620
17621 }
17622 return false;
17623}
17624
8baff86e
KT
17625
17626/* When working around the Cortex-A53 erratum 835769,
17627 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
17628 instruction and has a preceding memory instruction such that a NOP
17629 should be inserted between them. */
17630
75cf1494
KT
17631bool
17632aarch64_madd_needs_nop (rtx_insn* insn)
17633{
17634 enum attr_type attr_type;
17635 rtx_insn *prev;
17636 rtx body;
17637
b32c1043 17638 if (!TARGET_FIX_ERR_A53_835769)
75cf1494
KT
17639 return false;
17640
e322d6e3 17641 if (!INSN_P (insn) || recog_memoized (insn) < 0)
75cf1494
KT
17642 return false;
17643
17644 attr_type = get_attr_type (insn);
17645 if (!is_madd_op (attr_type))
17646 return false;
17647
17648 prev = aarch64_prev_real_insn (insn);
3fea1a75
KT
17649 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
17650 Restore recog state to INSN to avoid state corruption. */
17651 extract_constrain_insn_cached (insn);
17652
550e2205 17653 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
75cf1494
KT
17654 return false;
17655
17656 body = single_set (prev);
17657
17658 /* If the previous insn is a memory op and there is no dependency between
8baff86e
KT
17659 it and the DImode madd, emit a NOP between them. If body is NULL then we
17660 have a complex memory operation, probably a load/store pair.
17661 Be conservative for now and emit a NOP. */
17662 if (GET_MODE (recog_data.operand[0]) == DImode
17663 && (!body || !dep_between_memop_and_curr (body)))
75cf1494
KT
17664 return true;
17665
17666 return false;
17667
17668}
17669
8baff86e
KT
17670
17671/* Implement FINAL_PRESCAN_INSN. */
17672
75cf1494
KT
17673void
17674aarch64_final_prescan_insn (rtx_insn *insn)
17675{
17676 if (aarch64_madd_needs_nop (insn))
17677 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
17678}
17679
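/* An illustrative instance of the sequence the workaround above guards
   against (enabled with -mfix-cortex-a53-835769): a 64-bit
   multiply-accumulate directly preceded by a memory operation gets a NOP
   inserted between them, e.g.

     ldr	x1, [x2]
     nop	// between mem op and mult-accumulate
     madd	x0, x3, x4, x5
*/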
17680
43cacb12
RS
17681/* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
17682 instruction. */
17683
17684bool
17685aarch64_sve_index_immediate_p (rtx base_or_step)
17686{
17687 return (CONST_INT_P (base_or_step)
17688 && IN_RANGE (INTVAL (base_or_step), -16, 15));
17689}
17690
f3582fda
RS
17691/* Return true if X is a valid immediate for the SVE ADD and SUB instructions
17692 when applied to mode MODE. Negate X first if NEGATE_P is true. */
43cacb12
RS
17693
17694bool
f3582fda 17695aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
43cacb12 17696{
f3582fda
RS
17697 rtx elt = unwrap_const_vec_duplicate (x);
17698 if (!CONST_INT_P (elt))
43cacb12
RS
17699 return false;
17700
17701 HOST_WIDE_INT val = INTVAL (elt);
17702 if (negate_p)
17703 val = -val;
f3582fda 17704 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
43cacb12
RS
17705
17706 if (val & 0xff)
17707 return IN_RANGE (val, 0, 0xff);
17708 return IN_RANGE (val, 0, 0xff00);
17709}
17710
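/* An equivalent standalone form of the range test above, as an illustration:
   SVE ADD/SUB immediates are an unsigned 8-bit value, optionally shifted
   left by 8.  */

static bool
illustrative_sve_add_imm_ok_p (unsigned int val)
{
  if (val & 0xff)
    return val <= 0xff;		/* 1..255, no shift.  */
  return val <= 0xff00;		/* 0, or 256..65280 in steps of 256 (LSL #8).  */
}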
624d0f07 17711/* Return true if X is a valid immediate for the SVE SQADD and SQSUB
f3582fda
RS
17712 instructions when applied to mode MODE. Negate X first if NEGATE_P
17713 is true. */
624d0f07
RS
17714
17715bool
f3582fda 17716aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
624d0f07 17717{
f3582fda 17718 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
624d0f07
RS
17719 return false;
17720
17721 /* After the optional negation, the immediate must be nonnegative.
17722 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
17723 instead of SQADD Zn.B, Zn.B, #129. */
f3582fda 17724 rtx elt = unwrap_const_vec_duplicate (x);
624d0f07
RS
17725 return negate_p == (INTVAL (elt) < 0);
17726}
17727
43cacb12
RS
17728/* Return true if X is a valid immediate operand for an SVE logical
17729 instruction such as AND. */
17730
17731bool
17732aarch64_sve_bitmask_immediate_p (rtx x)
17733{
17734 rtx elt;
17735
17736 return (const_vec_duplicate_p (x, &elt)
17737 && CONST_INT_P (elt)
17738 && aarch64_bitmask_imm (INTVAL (elt),
17739 GET_MODE_INNER (GET_MODE (x))));
17740}
17741
17742/* Return true if X is a valid immediate for the SVE DUP and CPY
17743 instructions. */
17744
17745bool
17746aarch64_sve_dup_immediate_p (rtx x)
17747{
d29f7dd5
RS
17748 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
17749 if (!CONST_INT_P (x))
43cacb12
RS
17750 return false;
17751
d29f7dd5 17752 HOST_WIDE_INT val = INTVAL (x);
43cacb12
RS
17753 if (val & 0xff)
17754 return IN_RANGE (val, -0x80, 0x7f);
17755 return IN_RANGE (val, -0x8000, 0x7f00);
17756}
17757
17758/* Return true if X is a valid immediate operand for an SVE CMP instruction.
17759 SIGNED_P says whether the operand is signed rather than unsigned. */
17760
17761bool
17762aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
17763{
6bc67182
RS
17764 x = unwrap_const_vec_duplicate (x);
17765 return (CONST_INT_P (x)
43cacb12 17766 && (signed_p
6bc67182
RS
17767 ? IN_RANGE (INTVAL (x), -16, 15)
17768 : IN_RANGE (INTVAL (x), 0, 127)));
43cacb12
RS
17769}
17770
17771/* Return true if X is a valid immediate operand for an SVE FADD or FSUB
17772 instruction. Negate X first if NEGATE_P is true. */
17773
17774bool
17775aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
17776{
17777 rtx elt;
17778 REAL_VALUE_TYPE r;
17779
17780 if (!const_vec_duplicate_p (x, &elt)
3793ecc1 17781 || !CONST_DOUBLE_P (elt))
43cacb12
RS
17782 return false;
17783
17784 r = *CONST_DOUBLE_REAL_VALUE (elt);
17785
17786 if (negate_p)
17787 r = real_value_negate (&r);
17788
17789 if (real_equal (&r, &dconst1))
17790 return true;
17791 if (real_equal (&r, &dconsthalf))
17792 return true;
17793 return false;
17794}
17795
17796/* Return true if X is a valid immediate operand for an SVE FMUL
17797 instruction. */
17798
17799bool
17800aarch64_sve_float_mul_immediate_p (rtx x)
17801{
17802 rtx elt;
17803
43cacb12 17804 return (const_vec_duplicate_p (x, &elt)
3793ecc1 17805 && CONST_DOUBLE_P (elt)
a19ba9e1
RS
17806 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
17807 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
43cacb12
RS
17808}
17809
b187677b
RS
17810/* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
17811 for the Advanced SIMD operation described by WHICH and INSN. If INFO
17812 is nonnull, use it to describe valid immediates. */
3520f7cc 17813static bool
b187677b
RS
17814aarch64_advsimd_valid_immediate_hs (unsigned int val32,
17815 simd_immediate_info *info,
17816 enum simd_immediate_check which,
17817 simd_immediate_info::insn_type insn)
17818{
17819 /* Try a 4-byte immediate with LSL. */
17820 for (unsigned int shift = 0; shift < 32; shift += 8)
17821 if ((val32 & (0xff << shift)) == val32)
17822 {
17823 if (info)
17824 *info = simd_immediate_info (SImode, val32 >> shift, insn,
17825 simd_immediate_info::LSL, shift);
17826 return true;
17827 }
3520f7cc 17828
b187677b
RS
17829 /* Try a 2-byte immediate with LSL. */
17830 unsigned int imm16 = val32 & 0xffff;
17831 if (imm16 == (val32 >> 16))
17832 for (unsigned int shift = 0; shift < 16; shift += 8)
17833 if ((imm16 & (0xff << shift)) == imm16)
48063b9d 17834 {
b187677b
RS
17835 if (info)
17836 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
17837 simd_immediate_info::LSL, shift);
17838 return true;
48063b9d 17839 }
3520f7cc 17840
b187677b
RS
17841 /* Try a 4-byte immediate with MSL, except for cases that MVN
17842 can handle. */
17843 if (which == AARCH64_CHECK_MOV)
17844 for (unsigned int shift = 8; shift < 24; shift += 8)
17845 {
17846 unsigned int low = (1 << shift) - 1;
17847 if (((val32 & (0xff << shift)) | low) == val32)
17848 {
17849 if (info)
17850 *info = simd_immediate_info (SImode, val32 >> shift, insn,
17851 simd_immediate_info::MSL, shift);
17852 return true;
17853 }
17854 }
43e9d192 17855
b187677b
RS
17856 return false;
17857}
17858
17859/* Return true if replicating VAL64 is a valid immediate for the
17860 Advanced SIMD operation described by WHICH. If INFO is nonnull,
17861 use it to describe valid immediates. */
17862static bool
17863aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
17864 simd_immediate_info *info,
17865 enum simd_immediate_check which)
17866{
17867 unsigned int val32 = val64 & 0xffffffff;
17868 unsigned int val16 = val64 & 0xffff;
17869 unsigned int val8 = val64 & 0xff;
17870
17871 if (val32 == (val64 >> 32))
43e9d192 17872 {
b187677b
RS
17873 if ((which & AARCH64_CHECK_ORR) != 0
17874 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
17875 simd_immediate_info::MOV))
17876 return true;
43e9d192 17877
b187677b
RS
17878 if ((which & AARCH64_CHECK_BIC) != 0
17879 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
17880 simd_immediate_info::MVN))
17881 return true;
ee78df47 17882
b187677b
RS
17883 /* Try using a replicated byte. */
17884 if (which == AARCH64_CHECK_MOV
17885 && val16 == (val32 >> 16)
17886 && val8 == (val16 >> 8))
ee78df47 17887 {
b187677b
RS
17888 if (info)
17889 *info = simd_immediate_info (QImode, val8);
17890 return true;
ee78df47 17891 }
43e9d192
IB
17892 }
17893
b187677b
RS
17894 /* Try using a bit-to-bytemask. */
17895 if (which == AARCH64_CHECK_MOV)
43e9d192 17896 {
b187677b
RS
17897 unsigned int i;
17898 for (i = 0; i < 64; i += 8)
ab6501d7 17899 {
b187677b
RS
17900 unsigned char byte = (val64 >> i) & 0xff;
17901 if (byte != 0 && byte != 0xff)
17902 break;
ab6501d7 17903 }
b187677b 17904 if (i == 64)
ab6501d7 17905 {
b187677b
RS
17906 if (info)
17907 *info = simd_immediate_info (DImode, val64);
17908 return true;
ab6501d7 17909 }
43e9d192 17910 }
b187677b
RS
17911 return false;
17912}
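/* Illustrative sketch, not part of GCC: the two whole-register forms that
   aarch64_advsimd_valid_immediate checks once the 2-byte/4-byte forms have
   failed - a single replicated byte, and the 64-bit "bytemask" form in
   which every byte is either 0x00 or 0xff.  The helper names are invented
   for the example.  */
#include <stdbool.h>
#include <stdint.h>

static bool
is_replicated_byte (uint64_t val64)
{
  uint64_t byte = val64 & 0xff;
  return val64 == byte * 0x0101010101010101ULL;
}

static bool
is_bytemask (uint64_t val64)
{
  for (unsigned int i = 0; i < 64; i += 8)
    {
      uint64_t byte = (val64 >> i) & 0xff;
      if (byte != 0x00 && byte != 0xff)
	return false;
    }
  return true;
}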
43e9d192 17913
43cacb12
RS
17914/* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
17915 instruction. If INFO is nonnull, use it to describe valid immediates. */
17916
17917static bool
17918aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
17919 simd_immediate_info *info)
17920{
17921 scalar_int_mode mode = DImode;
17922 unsigned int val32 = val64 & 0xffffffff;
17923 if (val32 == (val64 >> 32))
17924 {
17925 mode = SImode;
17926 unsigned int val16 = val32 & 0xffff;
17927 if (val16 == (val32 >> 16))
17928 {
17929 mode = HImode;
17930 unsigned int val8 = val16 & 0xff;
17931 if (val8 == (val16 >> 8))
17932 mode = QImode;
17933 }
17934 }
17935 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
17936 if (IN_RANGE (val, -0x80, 0x7f))
17937 {
17938 /* DUP with no shift. */
17939 if (info)
17940 *info = simd_immediate_info (mode, val);
17941 return true;
17942 }
17943 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
17944 {
17945 /* DUP with LSL #8. */
17946 if (info)
17947 *info = simd_immediate_info (mode, val);
17948 return true;
17949 }
17950 if (aarch64_bitmask_imm (val64, mode))
17951 {
17952 /* DUPM. */
17953 if (info)
17954 *info = simd_immediate_info (mode, val);
17955 return true;
17956 }
17957 return false;
17958}
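/* Illustrative sketch, not part of GCC: how aarch64_sve_valid_immediate
   narrows the replicated 64-bit value to the smallest element width that
   still reproduces it, before applying the DUP/DUPM range checks.  The
   helper name is invented; it returns the width in bits.  */
#include <stdint.h>

static unsigned int
narrowest_replicated_width (uint64_t val64)
{
  if ((uint32_t) val64 != (uint32_t) (val64 >> 32))
    return 64;
  uint32_t val32 = (uint32_t) val64;
  if ((uint16_t) val32 != (uint16_t) (val32 >> 16))
    return 32;
  uint16_t val16 = (uint16_t) val32;
  if ((uint8_t) val16 != (uint8_t) (val16 >> 8))
    return 16;
  return 8;
}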
17959
624d0f07
RS
17960/* Return true if X is an UNSPEC_PTRUE constant of the form:
17961
17962 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
17963
17964 where PATTERN is the svpattern as a CONST_INT and where ZERO
17965 is a zero constant of the required PTRUE mode (which can have
17966 fewer elements than X's mode, if zero bits are significant).
17967
17968 If so, and if INFO is nonnull, describe the immediate in INFO. */
17969bool
17970aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
17971{
17972 if (GET_CODE (x) != CONST)
17973 return false;
17974
17975 x = XEXP (x, 0);
17976 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
17977 return false;
17978
17979 if (info)
17980 {
17981 aarch64_svpattern pattern
17982 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
17983 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
17984 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
17985 *info = simd_immediate_info (int_mode, pattern);
17986 }
17987 return true;
17988}
17989
0b1fe8cf
RS
17990/* Return true if X is a valid SVE predicate. If INFO is nonnull, use
17991 it to describe valid immediates. */
17992
17993static bool
17994aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
17995{
624d0f07
RS
17996 if (aarch64_sve_ptrue_svpattern_p (x, info))
17997 return true;
17998
0b1fe8cf
RS
17999 if (x == CONST0_RTX (GET_MODE (x)))
18000 {
18001 if (info)
18002 *info = simd_immediate_info (DImode, 0);
18003 return true;
18004 }
18005
18006 /* Analyze the value as a VNx16BImode. This should be relatively
18007 efficient, since rtx_vector_builder has enough built-in capacity
18008 to store all VLA predicate constants without needing the heap. */
18009 rtx_vector_builder builder;
18010 if (!aarch64_get_sve_pred_bits (builder, x))
18011 return false;
18012
18013 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
18014 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
18015 {
18016 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
18017 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
18018 if (pattern != AARCH64_NUM_SVPATTERNS)
18019 {
18020 if (info)
18021 {
18022 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
18023 *info = simd_immediate_info (int_mode, pattern);
18024 }
18025 return true;
18026 }
18027 }
18028 return false;
18029}
18030
b187677b
RS
18031/* Return true if OP is a valid SIMD immediate for the operation
18032 described by WHICH. If INFO is nonnull, use it to describe valid
18033 immediates. */
18034bool
18035aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
18036 enum simd_immediate_check which)
18037{
43cacb12
RS
18038 machine_mode mode = GET_MODE (op);
18039 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18040 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
18041 return false;
18042
0b1fe8cf
RS
18043 if (vec_flags & VEC_SVE_PRED)
18044 return aarch64_sve_pred_valid_immediate (op, info);
18045
43cacb12 18046 scalar_mode elt_mode = GET_MODE_INNER (mode);
f9093f23 18047 rtx base, step;
b187677b 18048 unsigned int n_elts;
f9093f23
RS
18049 if (GET_CODE (op) == CONST_VECTOR
18050 && CONST_VECTOR_DUPLICATE_P (op))
18051 n_elts = CONST_VECTOR_NPATTERNS (op);
43cacb12
RS
18052 else if ((vec_flags & VEC_SVE_DATA)
18053 && const_vec_series_p (op, &base, &step))
18054 {
18055 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
18056 if (!aarch64_sve_index_immediate_p (base)
18057 || !aarch64_sve_index_immediate_p (step))
18058 return false;
18059
18060 if (info)
cc68f7c2
RS
18061 {
18062 /* Get the corresponding container mode. E.g. an INDEX on V2SI
18063 should yield two integer values per 128-bit block, meaning
18064 that we need to treat it in the same way as V2DI and then
18065 ignore the upper 32 bits of each element. */
18066 elt_mode = aarch64_sve_container_int_mode (mode);
18067 *info = simd_immediate_info (elt_mode, base, step);
18068 }
43cacb12
RS
18069 return true;
18070 }
6a70badb
RS
18071 else if (GET_CODE (op) == CONST_VECTOR
18072 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
18073 /* N_ELTS set above. */;
b187677b 18074 else
d8edd899 18075 return false;
43e9d192 18076
b187677b 18077 scalar_float_mode elt_float_mode;
f9093f23
RS
18078 if (n_elts == 1
18079 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
43e9d192 18080 {
f9093f23
RS
18081 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
18082 if (aarch64_float_const_zero_rtx_p (elt)
18083 || aarch64_float_const_representable_p (elt))
18084 {
18085 if (info)
18086 *info = simd_immediate_info (elt_float_mode, elt);
18087 return true;
18088 }
b187677b 18089 }
43e9d192 18090
b23c6a2c
RS
18091 /* If all elements in an SVE vector have the same value, we have a free
18092 choice between using the element mode and using the container mode.
18093 Using the element mode means that unused parts of the vector are
18094 duplicates of the used elements, while using the container mode means
18095 that the unused parts are an extension of the used elements. Using the
18096 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
18097 for its container mode VNx4SI while 0x00000101 isn't.
18098
18099 If not all elements in an SVE vector have the same value, we need the
18100 transition from one element to the next to occur at container boundaries.
18101 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
18102 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
18103 scalar_int_mode elt_int_mode;
18104 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
18105 elt_int_mode = aarch64_sve_container_int_mode (mode);
18106 else
18107 elt_int_mode = int_mode_for_mode (elt_mode).require ();
18108
18109 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
b187677b
RS
18110 if (elt_size > 8)
18111 return false;
e4f0f84d 18112
b187677b
RS
18113 /* Expand the vector constant out into a byte vector, with the least
18114 significant byte of the register first. */
18115 auto_vec<unsigned char, 16> bytes;
18116 bytes.reserve (n_elts * elt_size);
18117 for (unsigned int i = 0; i < n_elts; i++)
18118 {
f9093f23
RS
 18119	 /* The vector is provided in GCC's endian-neutral fashion.
18120 For aarch64_be Advanced SIMD, it must be laid out in the vector
18121 register in reverse order. */
18122 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
18123 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
43e9d192 18124
b187677b
RS
18125 if (elt_mode != elt_int_mode)
18126 elt = gen_lowpart (elt_int_mode, elt);
43e9d192 18127
b187677b
RS
18128 if (!CONST_INT_P (elt))
18129 return false;
43e9d192 18130
b187677b
RS
18131 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
18132 for (unsigned int byte = 0; byte < elt_size; byte++)
48063b9d 18133 {
b187677b
RS
18134 bytes.quick_push (elt_val & 0xff);
18135 elt_val >>= BITS_PER_UNIT;
48063b9d 18136 }
43e9d192
IB
18137 }
18138
b187677b
RS
18139 /* The immediate must repeat every eight bytes. */
18140 unsigned int nbytes = bytes.length ();
18141 for (unsigned i = 8; i < nbytes; ++i)
18142 if (bytes[i] != bytes[i - 8])
18143 return false;
18144
18145 /* Get the repeating 8-byte value as an integer. No endian correction
18146 is needed here because bytes is already in lsb-first order. */
18147 unsigned HOST_WIDE_INT val64 = 0;
18148 for (unsigned int i = 0; i < 8; i++)
18149 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
18150 << (i * BITS_PER_UNIT));
18151
43cacb12
RS
18152 if (vec_flags & VEC_SVE_DATA)
18153 return aarch64_sve_valid_immediate (val64, info);
18154 else
18155 return aarch64_advsimd_valid_immediate (val64, info, which);
18156}
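/* Illustrative sketch, not part of GCC: the tail of
   aarch64_simd_valid_immediate reduced to plain integers.  BYTES holds the
   constant in least-significant-byte-first order; the pattern must repeat
   every eight bytes and is folded into the single 64-bit value that the
   SVE/Advanced SIMD checks above operate on.  The helper name is invented.  */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool
fold_repeating_bytes (const uint8_t *bytes, size_t nbytes, uint64_t *val64)
{
  /* The immediate must repeat every eight bytes.  */
  for (size_t i = 8; i < nbytes; i++)
    if (bytes[i] != bytes[i - 8])
      return false;

  /* Fold to the repeating 8-byte value; no endian correction is needed
     because BYTES is already lsb-first.  */
  uint64_t val = 0;
  for (unsigned int i = 0; i < 8; i++)
    val |= (uint64_t) bytes[i % nbytes] << (i * 8);
  *val64 = val;
  return true;
}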
18157
18158/* Check whether X is a VEC_SERIES-like constant that starts at 0 and
 18160	 has a step in the immediate range of the SVE INDEX instruction.
 18161	 Return the step if so, otherwise return null. */
18161rtx
18162aarch64_check_zero_based_sve_index_immediate (rtx x)
18163{
18164 rtx base, step;
18165 if (const_vec_series_p (x, &base, &step)
18166 && base == const0_rtx
18167 && aarch64_sve_index_immediate_p (step))
18168 return step;
18169 return NULL_RTX;
43e9d192
IB
18170}
18171
43e9d192
IB
 18172/* Check whether immediate shift constants are within range. */
18173bool
ef4bddc2 18174aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
43e9d192 18175{
6bc67182
RS
18176 x = unwrap_const_vec_duplicate (x);
18177 if (!CONST_INT_P (x))
18178 return false;
43e9d192
IB
18179 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
18180 if (left)
6bc67182 18181 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
43e9d192 18182 else
6bc67182 18183 return IN_RANGE (INTVAL (x), 1, bit_width);
43e9d192
IB
18184}
18185
7325d85a
KT
18186/* Return the bitmask CONST_INT to select the bits required by a zero extract
18187 operation of width WIDTH at bit position POS. */
18188
18189rtx
18190aarch64_mask_from_zextract_ops (rtx width, rtx pos)
18191{
18192 gcc_assert (CONST_INT_P (width));
18193 gcc_assert (CONST_INT_P (pos));
18194
18195 unsigned HOST_WIDE_INT mask
18196 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
18197 return GEN_INT (mask << UINTVAL (pos));
18198}
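/* Illustrative sketch, not part of GCC: the zero-extract mask above is just
   ((1 << width) - 1) << pos, assuming width < 64.  For example, width 4 at
   position 8 selects bits 8..11 and yields 0xf00.  The helper name is
   invented.  */
#include <stdint.h>

static uint64_t
zextract_mask (unsigned int width, unsigned int pos)
{
  return (((uint64_t) 1 << width) - 1) << pos;
}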
18199
83f8c414 18200bool
a6e0bfa7 18201aarch64_mov_operand_p (rtx x, machine_mode mode)
83f8c414 18202{
83f8c414
CSS
18203 if (GET_CODE (x) == HIGH
18204 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
18205 return true;
18206
82614948 18207 if (CONST_INT_P (x))
83f8c414
CSS
18208 return true;
18209
43cacb12 18210 if (VECTOR_MODE_P (GET_MODE (x)))
678faefc
RS
18211 {
18212 /* Require predicate constants to be VNx16BI before RA, so that we
18213 force everything to have a canonical form. */
18214 if (!lra_in_progress
18215 && !reload_completed
18216 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
18217 && GET_MODE (x) != VNx16BImode)
18218 return false;
18219
18220 return aarch64_simd_valid_immediate (x, NULL);
18221 }
43cacb12 18222
74b27d8e 18223 x = strip_salt (x);
3793ecc1 18224 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
83f8c414
CSS
18225 return true;
18226
c0e0174b 18227 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
43cacb12
RS
18228 return true;
18229
a6e0bfa7 18230 return aarch64_classify_symbolic_expression (x)
a5350ddc 18231 == SYMBOL_TINY_ABSOLUTE;
83f8c414
CSS
18232}
18233
43e9d192
IB
18234/* Return a const_int vector of VAL. */
18235rtx
ab014eb3 18236aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
43e9d192 18237{
59d06c05
RS
18238 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
18239 return gen_const_vec_duplicate (mode, c);
43e9d192
IB
18240}
18241
051d0e2f
SN
18242/* Check OP is a legal scalar immediate for the MOVI instruction. */
18243
18244bool
77e994c9 18245aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
051d0e2f 18246{
ef4bddc2 18247 machine_mode vmode;
051d0e2f 18248
43cacb12 18249 vmode = aarch64_simd_container_mode (mode, 64);
051d0e2f 18250 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
b187677b 18251 return aarch64_simd_valid_immediate (op_v, NULL);
051d0e2f
SN
18252}
18253
988fa693
JG
18254/* Construct and return a PARALLEL RTX vector with elements numbering the
18255 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
18256 the vector - from the perspective of the architecture. This does not
18257 line up with GCC's perspective on lane numbers, so we end up with
18258 different masks depending on our target endian-ness. The diagram
18259 below may help. We must draw the distinction when building masks
18260 which select one half of the vector. An instruction selecting
 18262 architectural low-lanes for a big-endian target must be described using
18262 a mask selecting GCC high-lanes.
18263
18264 Big-Endian Little-Endian
18265
18266GCC 0 1 2 3 3 2 1 0
18267 | x | x | x | x | | x | x | x | x |
18268Architecture 3 2 1 0 3 2 1 0
18269
18270Low Mask: { 2, 3 } { 0, 1 }
18271High Mask: { 0, 1 } { 2, 3 }
f5cbabc1
RS
18272
18273 MODE Is the mode of the vector and NUNITS is the number of units in it. */
988fa693 18274
43e9d192 18275rtx
f5cbabc1 18276aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
43e9d192 18277{
43e9d192 18278 rtvec v = rtvec_alloc (nunits / 2);
988fa693
JG
18279 int high_base = nunits / 2;
18280 int low_base = 0;
18281 int base;
43e9d192
IB
18282 rtx t1;
18283 int i;
18284
988fa693
JG
18285 if (BYTES_BIG_ENDIAN)
18286 base = high ? low_base : high_base;
18287 else
18288 base = high ? high_base : low_base;
18289
18290 for (i = 0; i < nunits / 2; i++)
43e9d192
IB
18291 RTVEC_ELT (v, i) = GEN_INT (base + i);
18292
18293 t1 = gen_rtx_PARALLEL (mode, v);
18294 return t1;
18295}
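/* Illustrative sketch, not part of GCC: the lane-numbering rule from the
   diagram above, as a standalone helper.  It returns the first GCC lane of
   the architectural high or low half of an NUNITS-lane vector; the helper
   name and its int parameters are inventions for the example.  For a 4-lane
   vector on a big-endian target, the architectural high half is GCC lanes
   { 0, 1 }, matching the "High Mask" row of the diagram.  */
static int
half_base_lane (int nunits, int want_arch_high, int bytes_big_endian)
{
  int high_base = nunits / 2;
  int low_base = 0;
  if (bytes_big_endian)
    return want_arch_high ? low_base : high_base;
  return want_arch_high ? high_base : low_base;
}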
18296
988fa693
JG
18297/* Check OP for validity as a PARALLEL RTX vector with elements
 18298 numbering the lanes of either the high (HIGH == TRUE) or low half,
18299 from the perspective of the architecture. See the diagram above
18300 aarch64_simd_vect_par_cnst_half for more details. */
18301
18302bool
ef4bddc2 18303aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
988fa693
JG
18304 bool high)
18305{
6a70badb
RS
18306 int nelts;
18307 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
f5cbabc1
RS
18308 return false;
18309
6a70badb 18310 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
988fa693
JG
18311 HOST_WIDE_INT count_op = XVECLEN (op, 0);
18312 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
18313 int i = 0;
18314
988fa693
JG
18315 if (count_op != count_ideal)
18316 return false;
18317
18318 for (i = 0; i < count_ideal; i++)
18319 {
18320 rtx elt_op = XVECEXP (op, 0, i);
18321 rtx elt_ideal = XVECEXP (ideal, 0, i);
18322
4aa81c2e 18323 if (!CONST_INT_P (elt_op)
988fa693
JG
18324 || INTVAL (elt_ideal) != INTVAL (elt_op))
18325 return false;
18326 }
18327 return true;
18328}
18329
4aeb1ba7
RS
18330/* Return a PARALLEL containing NELTS elements, with element I equal
18331 to BASE + I * STEP. */
18332
18333rtx
18334aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
18335{
18336 rtvec vec = rtvec_alloc (nelts);
18337 for (unsigned int i = 0; i < nelts; ++i)
18338 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
18339 return gen_rtx_PARALLEL (VOIDmode, vec);
18340}
18341
18342/* Return true if OP is a PARALLEL of CONST_INTs that form a linear
18343 series with step STEP. */
18344
18345bool
18346aarch64_stepped_int_parallel_p (rtx op, int step)
18347{
18348 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
18349 return false;
18350
18351 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
18352 for (int i = 1; i < XVECLEN (op, 0); ++i)
18353 if (!CONST_INT_P (XVECEXP (op, 0, i))
18354 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
18355 return false;
18356
18357 return true;
18358}
18359
43e9d192
IB
18360/* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
18361 HIGH (exclusive). */
18362void
46ed6024
CB
18363aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
18364 const_tree exp)
43e9d192
IB
18365{
18366 HOST_WIDE_INT lane;
4aa81c2e 18367 gcc_assert (CONST_INT_P (operand));
43e9d192
IB
18368 lane = INTVAL (operand);
18369
18370 if (lane < low || lane >= high)
46ed6024
CB
18371 {
18372 if (exp)
cf0c27ef 18373 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
46ed6024 18374 else
cf0c27ef 18375 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
46ed6024 18376 }
43e9d192
IB
18377}
18378
7ac29c0f
RS
 18379/* Perform endian correction on lane number N, which indexes a vector
18380 of mode MODE, and return the result as an SImode rtx. */
18381
18382rtx
18383aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
18384{
18385 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
18386}
18387
43e9d192 18388/* Return TRUE if OP is a valid vector addressing mode. */
43cacb12 18389
43e9d192
IB
18390bool
18391aarch64_simd_mem_operand_p (rtx op)
18392{
18393 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
4aa81c2e 18394 || REG_P (XEXP (op, 0)));
43e9d192
IB
18395}
18396
43cacb12
RS
18397/* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
18398
18399bool
18400aarch64_sve_ld1r_operand_p (rtx op)
18401{
18402 struct aarch64_address_info addr;
18403 scalar_mode mode;
18404
18405 return (MEM_P (op)
18406 && is_a <scalar_mode> (GET_MODE (op), &mode)
18407 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
18408 && addr.type == ADDRESS_REG_IMM
18409 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
18410}
18411
9ceec73f
MM
18412/* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
18413 where the size of the read data is specified by `mode` and the size of the
 18414 vector elements is specified by `elem_mode`. */
4aeb1ba7 18415bool
9ceec73f
MM
18416aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
18417 scalar_mode elem_mode)
4aeb1ba7
RS
18418{
18419 struct aarch64_address_info addr;
4aeb1ba7
RS
18420 if (!MEM_P (op)
18421 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
18422 return false;
18423
18424 if (addr.type == ADDRESS_REG_IMM)
9ceec73f 18425 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
4aeb1ba7
RS
18426
18427 if (addr.type == ADDRESS_REG_REG)
18428 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
18429
18430 return false;
18431}
18432
9ceec73f
MM
18433/* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
18434bool
18435aarch64_sve_ld1rq_operand_p (rtx op)
18436{
18437 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
18438 GET_MODE_INNER (GET_MODE (op)));
18439}
18440
18441/* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
18442 accessing a vector where the element size is specified by `elem_mode`. */
18443bool
18444aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
18445{
18446 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
18447}
18448
624d0f07
RS
18449/* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
18450bool
18451aarch64_sve_ldff1_operand_p (rtx op)
18452{
18453 if (!MEM_P (op))
18454 return false;
18455
18456 struct aarch64_address_info addr;
18457 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
18458 return false;
18459
18460 if (addr.type == ADDRESS_REG_IMM)
18461 return known_eq (addr.const_offset, 0);
18462
18463 return addr.type == ADDRESS_REG_REG;
18464}
18465
18466/* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
18467bool
18468aarch64_sve_ldnf1_operand_p (rtx op)
18469{
18470 struct aarch64_address_info addr;
18471
18472 return (MEM_P (op)
18473 && aarch64_classify_address (&addr, XEXP (op, 0),
18474 GET_MODE (op), false)
18475 && addr.type == ADDRESS_REG_IMM);
18476}
18477
43cacb12
RS
18478/* Return true if OP is a valid MEM operand for an SVE LDR instruction.
18479 The conditions for STR are the same. */
18480bool
18481aarch64_sve_ldr_operand_p (rtx op)
18482{
18483 struct aarch64_address_info addr;
18484
18485 return (MEM_P (op)
18486 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
18487 false, ADDR_QUERY_ANY)
18488 && addr.type == ADDRESS_REG_IMM);
18489}
18490
624d0f07
RS
18491/* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
18492 addressing memory of mode MODE. */
18493bool
18494aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
18495{
18496 struct aarch64_address_info addr;
18497 if (!aarch64_classify_address (&addr, op, mode, false))
18498 return false;
18499
18500 if (addr.type == ADDRESS_REG_IMM)
18501 return known_eq (addr.const_offset, 0);
18502
18503 return addr.type == ADDRESS_REG_REG;
18504}
18505
9f4cbab8
RS
18506/* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
18507 We need to be able to access the individual pieces, so the range
18508 is different from LD[234] and ST[234]. */
18509bool
18510aarch64_sve_struct_memory_operand_p (rtx op)
18511{
18512 if (!MEM_P (op))
18513 return false;
18514
18515 machine_mode mode = GET_MODE (op);
18516 struct aarch64_address_info addr;
18517 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
18518 ADDR_QUERY_ANY)
18519 || addr.type != ADDRESS_REG_IMM)
18520 return false;
18521
18522 poly_int64 first = addr.const_offset;
18523 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
18524 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
18525 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
18526}
18527
2d8c6dc1
AH
18528/* Emit a register copy from operand to operand, taking care not to
18529 early-clobber source registers in the process.
43e9d192 18530
2d8c6dc1
AH
18531 COUNT is the number of components into which the copy needs to be
18532 decomposed. */
43e9d192 18533void
b8506a8a 18534aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
2d8c6dc1 18535 unsigned int count)
43e9d192
IB
18536{
18537 unsigned int i;
2d8c6dc1
AH
18538 int rdest = REGNO (operands[0]);
18539 int rsrc = REGNO (operands[1]);
43e9d192
IB
18540
18541 if (!reg_overlap_mentioned_p (operands[0], operands[1])
2d8c6dc1
AH
18542 || rdest < rsrc)
18543 for (i = 0; i < count; i++)
18544 emit_move_insn (gen_rtx_REG (mode, rdest + i),
18545 gen_rtx_REG (mode, rsrc + i));
43e9d192 18546 else
2d8c6dc1
AH
18547 for (i = 0; i < count; i++)
18548 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
18549 gen_rtx_REG (mode, rsrc + count - i - 1));
43e9d192
IB
18550}
18551
668046d1 18552/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
6ec0e5b9 18553 one of the VSTRUCT modes: OI, CI, or XI. */
668046d1 18554int
b8506a8a 18555aarch64_simd_attr_length_rglist (machine_mode mode)
668046d1 18556{
6a70badb
RS
18557 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
18558 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
668046d1
DS
18559}
18560
db0253a4 18561/* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
43cacb12
RS
18562 alignment of a vector to 128 bits. SVE predicates have an alignment of
18563 16 bits. */
db0253a4
TB
18564static HOST_WIDE_INT
18565aarch64_simd_vector_alignment (const_tree type)
18566{
07108a9e
RS
18567 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
18568 be set for non-predicate vectors of booleans. Modes are the most
18569 direct way we have of identifying real SVE predicate types. */
18570 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
18571 return 16;
cc68f7c2
RS
18572 widest_int min_size
18573 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
18574 return wi::umin (min_size, 128).to_uhwi ();
db0253a4
TB
18575}
18576
43cacb12 18577/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
ca31798e 18578static poly_uint64
43cacb12
RS
18579aarch64_vectorize_preferred_vector_alignment (const_tree type)
18580{
18581 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
18582 {
18583 /* If the length of the vector is fixed, try to align to that length,
18584 otherwise don't try to align at all. */
18585 HOST_WIDE_INT result;
18586 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
18587 result = TYPE_ALIGN (TREE_TYPE (type));
18588 return result;
18589 }
18590 return TYPE_ALIGN (type);
18591}
18592
db0253a4
TB
18593/* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
18594static bool
18595aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
18596{
18597 if (is_packed)
18598 return false;
18599
43cacb12
RS
18600 /* For fixed-length vectors, check that the vectorizer will aim for
18601 full-vector alignment. This isn't true for generic GCC vectors
18602 that are wider than the ABI maximum of 128 bits. */
ca31798e
AV
18603 poly_uint64 preferred_alignment =
18604 aarch64_vectorize_preferred_vector_alignment (type);
43cacb12 18605 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
ca31798e
AV
18606 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
18607 preferred_alignment))
db0253a4
TB
18608 return false;
18609
18610 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
18611 return true;
18612}
18613
7df76747
N
18614/* Return true if the vector misalignment factor is supported by the
18615 target. */
18616static bool
18617aarch64_builtin_support_vector_misalignment (machine_mode mode,
18618 const_tree type, int misalignment,
18619 bool is_packed)
18620{
18621 if (TARGET_SIMD && STRICT_ALIGNMENT)
18622 {
18623 /* Return if movmisalign pattern is not supported for this mode. */
18624 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
18625 return false;
18626
a509c571 18627 /* Misalignment factor is unknown at compile time. */
7df76747 18628 if (misalignment == -1)
a509c571 18629 return false;
7df76747
N
18630 }
18631 return default_builtin_support_vector_misalignment (mode, type, misalignment,
18632 is_packed);
18633}
18634
4369c11e
TB
18635/* If VALS is a vector constant that can be loaded into a register
18636 using DUP, generate instructions to do so and return an RTX to
18637 assign to the register. Otherwise return NULL_RTX. */
18638static rtx
18639aarch64_simd_dup_constant (rtx vals)
18640{
ef4bddc2
RS
18641 machine_mode mode = GET_MODE (vals);
18642 machine_mode inner_mode = GET_MODE_INNER (mode);
4369c11e 18643 rtx x;
4369c11e 18644
92695fbb 18645 if (!const_vec_duplicate_p (vals, &x))
4369c11e
TB
18646 return NULL_RTX;
18647
18648 /* We can load this constant by using DUP and a constant in a
 18649 single general-purpose register. This will be cheaper than a vector
18650 load. */
92695fbb 18651 x = copy_to_mode_reg (inner_mode, x);
59d06c05 18652 return gen_vec_duplicate (mode, x);
4369c11e
TB
18653}
18654
18655
18656/* Generate code to load VALS, which is a PARALLEL containing only
18657 constants (for vec_init) or CONST_VECTOR, efficiently into a
18658 register. Returns an RTX to copy into the register, or NULL_RTX
67914693 18659 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
1df3f464 18660static rtx
4369c11e
TB
18661aarch64_simd_make_constant (rtx vals)
18662{
ef4bddc2 18663 machine_mode mode = GET_MODE (vals);
4369c11e
TB
18664 rtx const_dup;
18665 rtx const_vec = NULL_RTX;
4369c11e
TB
18666 int n_const = 0;
18667 int i;
18668
18669 if (GET_CODE (vals) == CONST_VECTOR)
18670 const_vec = vals;
18671 else if (GET_CODE (vals) == PARALLEL)
18672 {
18673 /* A CONST_VECTOR must contain only CONST_INTs and
18674 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
18675 Only store valid constants in a CONST_VECTOR. */
6a70badb 18676 int n_elts = XVECLEN (vals, 0);
4369c11e
TB
18677 for (i = 0; i < n_elts; ++i)
18678 {
18679 rtx x = XVECEXP (vals, 0, i);
18680 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18681 n_const++;
18682 }
18683 if (n_const == n_elts)
18684 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
18685 }
18686 else
18687 gcc_unreachable ();
18688
18689 if (const_vec != NULL_RTX
b187677b 18690 && aarch64_simd_valid_immediate (const_vec, NULL))
4369c11e
TB
18691 /* Load using MOVI/MVNI. */
18692 return const_vec;
18693 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
18694 /* Loaded using DUP. */
18695 return const_dup;
18696 else if (const_vec != NULL_RTX)
67914693 18697 /* Load from constant pool. We cannot take advantage of single-cycle
4369c11e
TB
18698 LD1 because we need a PC-relative addressing mode. */
18699 return const_vec;
18700 else
18701 /* A PARALLEL containing something not valid inside CONST_VECTOR.
67914693 18702 We cannot construct an initializer. */
4369c11e
TB
18703 return NULL_RTX;
18704}
18705
35a093b6
JG
18706/* Expand a vector initialisation sequence, such that TARGET is
18707 initialised to contain VALS. */
18708
4369c11e
TB
18709void
18710aarch64_expand_vector_init (rtx target, rtx vals)
18711{
ef4bddc2 18712 machine_mode mode = GET_MODE (target);
146c2e3a 18713 scalar_mode inner_mode = GET_MODE_INNER (mode);
35a093b6 18714 /* The number of vector elements. */
6a70badb 18715 int n_elts = XVECLEN (vals, 0);
35a093b6 18716 /* The number of vector elements which are not constant. */
8b66a2d4
AL
18717 int n_var = 0;
18718 rtx any_const = NULL_RTX;
35a093b6
JG
18719 /* The first element of vals. */
18720 rtx v0 = XVECEXP (vals, 0, 0);
4369c11e 18721 bool all_same = true;
4369c11e 18722
41dab855
KT
18723 /* This is a special vec_init<M><N> where N is not an element mode but a
18724 vector mode with half the elements of M. We expect to find two entries
 18725 vector mode with half the elements of M. We expect to find two entries
 18725 of mode N in VALS and we must put their concatenation into TARGET. */
18726 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
18727 {
18728 gcc_assert (known_eq (GET_MODE_SIZE (mode),
18729 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
18730 rtx lo = XVECEXP (vals, 0, 0);
18731 rtx hi = XVECEXP (vals, 0, 1);
18732 machine_mode narrow_mode = GET_MODE (lo);
18733 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
18734 gcc_assert (narrow_mode == GET_MODE (hi));
18735
18736 /* When we want to concatenate a half-width vector with zeroes we can
18737 use the aarch64_combinez[_be] patterns. Just make sure that the
18738 zeroes are in the right half. */
18739 if (BYTES_BIG_ENDIAN
18740 && aarch64_simd_imm_zero (lo, narrow_mode)
18741 && general_operand (hi, narrow_mode))
18742 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
18743 else if (!BYTES_BIG_ENDIAN
18744 && aarch64_simd_imm_zero (hi, narrow_mode)
18745 && general_operand (lo, narrow_mode))
18746 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
18747 else
18748 {
18749 /* Else create the two half-width registers and combine them. */
18750 if (!REG_P (lo))
18751 lo = force_reg (GET_MODE (lo), lo);
18752 if (!REG_P (hi))
18753 hi = force_reg (GET_MODE (hi), hi);
18754
18755 if (BYTES_BIG_ENDIAN)
18756 std::swap (lo, hi);
18757 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
18758 }
18759 return;
18760 }
18761
35a093b6 18762 /* Count the number of variable elements to initialise. */
8b66a2d4 18763 for (int i = 0; i < n_elts; ++i)
4369c11e 18764 {
8b66a2d4 18765 rtx x = XVECEXP (vals, 0, i);
35a093b6 18766 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
8b66a2d4
AL
18767 ++n_var;
18768 else
18769 any_const = x;
4369c11e 18770
35a093b6 18771 all_same &= rtx_equal_p (x, v0);
4369c11e
TB
18772 }
18773
35a093b6
JG
18774 /* No variable elements, hand off to aarch64_simd_make_constant which knows
18775 how best to handle this. */
4369c11e
TB
18776 if (n_var == 0)
18777 {
18778 rtx constant = aarch64_simd_make_constant (vals);
18779 if (constant != NULL_RTX)
18780 {
18781 emit_move_insn (target, constant);
18782 return;
18783 }
18784 }
18785
18786 /* Splat a single non-constant element if we can. */
18787 if (all_same)
18788 {
35a093b6 18789 rtx x = copy_to_mode_reg (inner_mode, v0);
59d06c05 18790 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
4369c11e
TB
18791 return;
18792 }
18793
85c1b6d7
AP
18794 enum insn_code icode = optab_handler (vec_set_optab, mode);
18795 gcc_assert (icode != CODE_FOR_nothing);
18796
18797 /* If there are only variable elements, try to optimize
18798 the insertion using dup for the most common element
18799 followed by insertions. */
18800
18801 /* The algorithm will fill matches[*][0] with the earliest matching element,
18802 and matches[X][1] with the count of duplicate elements (if X is the
18803 earliest element which has duplicates). */
18804
18805 if (n_var == n_elts && n_elts <= 16)
18806 {
18807 int matches[16][2] = {0};
18808 for (int i = 0; i < n_elts; i++)
18809 {
18810 for (int j = 0; j <= i; j++)
18811 {
18812 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
18813 {
18814 matches[i][0] = j;
18815 matches[j][1]++;
18816 break;
18817 }
18818 }
18819 }
18820 int maxelement = 0;
18821 int maxv = 0;
18822 for (int i = 0; i < n_elts; i++)
18823 if (matches[i][1] > maxv)
18824 {
18825 maxelement = i;
18826 maxv = matches[i][1];
18827 }
18828
b4e2cd5b
JG
18829 /* Create a duplicate of the most common element, unless all elements
18830 are equally useless to us, in which case just immediately set the
18831 vector register using the first element. */
18832
18833 if (maxv == 1)
18834 {
18835 /* For vectors of two 64-bit elements, we can do even better. */
18836 if (n_elts == 2
18837 && (inner_mode == E_DImode
18838 || inner_mode == E_DFmode))
18839
18840 {
18841 rtx x0 = XVECEXP (vals, 0, 0);
18842 rtx x1 = XVECEXP (vals, 0, 1);
18843 /* Combine can pick up this case, but handling it directly
18844 here leaves clearer RTL.
18845
18846 This is load_pair_lanes<mode>, and also gives us a clean-up
18847 for store_pair_lanes<mode>. */
18848 if (memory_operand (x0, inner_mode)
18849 && memory_operand (x1, inner_mode)
18850 && !STRICT_ALIGNMENT
18851 && rtx_equal_p (XEXP (x1, 0),
18852 plus_constant (Pmode,
18853 XEXP (x0, 0),
18854 GET_MODE_SIZE (inner_mode))))
18855 {
18856 rtx t;
18857 if (inner_mode == DFmode)
18858 t = gen_load_pair_lanesdf (target, x0, x1);
18859 else
18860 t = gen_load_pair_lanesdi (target, x0, x1);
18861 emit_insn (t);
18862 return;
18863 }
18864 }
18865 /* The subreg-move sequence below will move into lane zero of the
18866 vector register. For big-endian we want that position to hold
18867 the last element of VALS. */
18868 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
18869 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18870 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
18871 }
18872 else
18873 {
18874 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18875 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
18876 }
85c1b6d7
AP
18877
18878 /* Insert the rest. */
18879 for (int i = 0; i < n_elts; i++)
18880 {
18881 rtx x = XVECEXP (vals, 0, i);
18882 if (matches[i][0] == maxelement)
18883 continue;
18884 x = copy_to_mode_reg (inner_mode, x);
18885 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
18886 }
18887 return;
18888 }
18889
35a093b6
JG
18890 /* Initialise a vector which is part-variable. We want to first try
18891 to build those lanes which are constant in the most efficient way we
18892 can. */
18893 if (n_var != n_elts)
4369c11e
TB
18894 {
18895 rtx copy = copy_rtx (vals);
4369c11e 18896
8b66a2d4
AL
18897 /* Load constant part of vector. We really don't care what goes into the
18898 parts we will overwrite, but we're more likely to be able to load the
18899 constant efficiently if it has fewer, larger, repeating parts
18900 (see aarch64_simd_valid_immediate). */
18901 for (int i = 0; i < n_elts; i++)
18902 {
18903 rtx x = XVECEXP (vals, 0, i);
18904 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18905 continue;
18906 rtx subst = any_const;
18907 for (int bit = n_elts / 2; bit > 0; bit /= 2)
18908 {
18909 /* Look in the copied vector, as more elements are const. */
18910 rtx test = XVECEXP (copy, 0, i ^ bit);
18911 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
18912 {
18913 subst = test;
18914 break;
18915 }
18916 }
18917 XVECEXP (copy, 0, i) = subst;
18918 }
4369c11e 18919 aarch64_expand_vector_init (target, copy);
35a093b6 18920 }
4369c11e 18921
35a093b6 18922 /* Insert the variable lanes directly. */
8b66a2d4 18923 for (int i = 0; i < n_elts; i++)
35a093b6
JG
18924 {
18925 rtx x = XVECEXP (vals, 0, i);
18926 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18927 continue;
18928 x = copy_to_mode_reg (inner_mode, x);
18929 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
18930 }
4369c11e
TB
18931}
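/* Illustrative sketch, not part of GCC: the duplicate-counting heuristic
   used above when every element is variable - pick the element that occurs
   most often, splat it with DUP, then insert the rest individually.  This
   standalone version works on plain integers (the real code compares rtxes
   with rtx_equal_p) and assumes n_elts <= 16, as the caller does.  */
static int
most_common_element_index (const long *elts, int n_elts)
{
  int matches[16][2] = {0};
  for (int i = 0; i < n_elts; i++)
    for (int j = 0; j <= i; j++)
      if (elts[i] == elts[j])
	{
	  matches[i][0] = j;	/* Earliest element equal to elts[i].  */
	  matches[j][1]++;	/* Number of occurrences of that element.  */
	  break;
	}

  int maxelement = 0, maxv = 0;
  for (int i = 0; i < n_elts; i++)
    if (matches[i][1] > maxv)
      {
	maxelement = i;
	maxv = matches[i][1];
      }
  return maxelement;
}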
18932
3a0afad0
PK
18933/* Emit RTL corresponding to:
18934 insr TARGET, ELEM. */
18935
18936static void
18937emit_insr (rtx target, rtx elem)
18938{
18939 machine_mode mode = GET_MODE (target);
18940 scalar_mode elem_mode = GET_MODE_INNER (mode);
18941 elem = force_reg (elem_mode, elem);
18942
18943 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
18944 gcc_assert (icode != CODE_FOR_nothing);
18945 emit_insn (GEN_FCN (icode) (target, target, elem));
18946}
18947
18948/* Subroutine of aarch64_sve_expand_vector_init for handling
18949 trailing constants.
18950 This function works as follows:
18951 (a) Create a new vector consisting of trailing constants.
18952 (b) Initialize TARGET with the constant vector using emit_move_insn.
18953 (c) Insert remaining elements in TARGET using insr.
 18954 NELTS is the total number of elements in the original vector, while
 18955 NELTS_REQD is the number of elements that are actually
18956 significant.
18957
 18958 ??? The heuristic used is to do the above only if the number of constants
 18959 is at least half the total number of elements. May need fine-tuning. */
18960
18961static bool
18962aarch64_sve_expand_vector_init_handle_trailing_constants
18963 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
18964{
18965 machine_mode mode = GET_MODE (target);
18966 scalar_mode elem_mode = GET_MODE_INNER (mode);
18967 int n_trailing_constants = 0;
18968
18969 for (int i = nelts_reqd - 1;
5da301cb 18970 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
3a0afad0
PK
18971 i--)
18972 n_trailing_constants++;
18973
18974 if (n_trailing_constants >= nelts_reqd / 2)
18975 {
5da301cb
RS
18976 /* Try to use the natural pattern of BUILDER to extend the trailing
18977 constant elements to a full vector. Replace any variables in the
18978 extra elements with zeros.
18979
18980 ??? It would be better if the builders supported "don't care"
18981 elements, with the builder filling in whichever elements
18982 give the most compact encoding. */
18983 rtx_vector_builder v (mode, nelts, 1);
3a0afad0 18984 for (int i = 0; i < nelts; i++)
5da301cb
RS
18985 {
18986 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
18987 if (!valid_for_const_vector_p (elem_mode, x))
18988 x = const0_rtx;
18989 v.quick_push (x);
18990 }
3a0afad0
PK
18991 rtx const_vec = v.build ();
18992 emit_move_insn (target, const_vec);
18993
18994 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
18995 emit_insr (target, builder.elt (i));
18996
18997 return true;
18998 }
18999
19000 return false;
19001}
19002
19003/* Subroutine of aarch64_sve_expand_vector_init.
19004 Works as follows:
19005 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
19006 (b) Skip trailing elements from BUILDER, which are the same as
19007 element NELTS_REQD - 1.
19008 (c) Insert earlier elements in reverse order in TARGET using insr. */
19009
19010static void
19011aarch64_sve_expand_vector_init_insert_elems (rtx target,
19012 const rtx_vector_builder &builder,
19013 int nelts_reqd)
19014{
19015 machine_mode mode = GET_MODE (target);
19016 scalar_mode elem_mode = GET_MODE_INNER (mode);
19017
19018 struct expand_operand ops[2];
19019 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
19020 gcc_assert (icode != CODE_FOR_nothing);
19021
19022 create_output_operand (&ops[0], target, mode);
19023 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
19024 expand_insn (icode, 2, ops);
19025
19026 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
19027 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
19028 emit_insr (target, builder.elt (i));
19029}
19030
19031/* Subroutine of aarch64_sve_expand_vector_init to handle case
19032 when all trailing elements of builder are same.
19033 This works as follows:
19034 (a) Use expand_insn interface to broadcast last vector element in TARGET.
19035 (b) Insert remaining elements in TARGET using insr.
19036
 19037 ??? The heuristic used is to do the above if the number of identical
 19038 trailing elements is at least 3/4 of the total number of elements,
 19039 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
19040
19041static bool
19042aarch64_sve_expand_vector_init_handle_trailing_same_elem
19043 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
19044{
19045 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
19046 if (ndups >= (3 * nelts_reqd) / 4)
19047 {
19048 aarch64_sve_expand_vector_init_insert_elems (target, builder,
19049 nelts_reqd - ndups + 1);
19050 return true;
19051 }
19052
19053 return false;
19054}
19055
19056/* Initialize register TARGET from BUILDER. NELTS is the constant number
19057 of elements in BUILDER.
19058
19059 The function tries to initialize TARGET from BUILDER if it fits one
19060 of the special cases outlined below.
19061
19062 Failing that, the function divides BUILDER into two sub-vectors:
19063 v_even = even elements of BUILDER;
19064 v_odd = odd elements of BUILDER;
19065
19066 and recursively calls itself with v_even and v_odd.
19067
19068 if (recursive call succeeded for v_even or v_odd)
19069 TARGET = zip (v_even, v_odd)
19070
19071 The function returns true if it managed to build TARGET from BUILDER
19072 with one of the special cases, false otherwise.
19073
19074 Example: {a, 1, b, 2, c, 3, d, 4}
19075
19076 The vector gets divided into:
19077 v_even = {a, b, c, d}
19078 v_odd = {1, 2, 3, 4}
19079
19080 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
 19081 initializes tmp2 from the constant vector v_odd using emit_move_insn.
19082
19083 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
19084 4 elements, so we construct tmp1 from v_even using insr:
19085 tmp1 = dup(d)
19086 insr tmp1, c
19087 insr tmp1, b
19088 insr tmp1, a
19089
19090 And finally:
19091 TARGET = zip (tmp1, tmp2)
19092 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
19093
19094static bool
19095aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
19096 int nelts, int nelts_reqd)
19097{
19098 machine_mode mode = GET_MODE (target);
19099
19100 /* Case 1: Vector contains trailing constants. */
19101
19102 if (aarch64_sve_expand_vector_init_handle_trailing_constants
19103 (target, builder, nelts, nelts_reqd))
19104 return true;
19105
19106 /* Case 2: Vector contains leading constants. */
19107
5da301cb 19108 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
3a0afad0
PK
19109 for (int i = 0; i < nelts_reqd; i++)
19110 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
19111 rev_builder.finalize ();
19112
19113 if (aarch64_sve_expand_vector_init_handle_trailing_constants
19114 (target, rev_builder, nelts, nelts_reqd))
19115 {
19116 emit_insn (gen_aarch64_sve_rev (mode, target, target));
19117 return true;
19118 }
19119
19120 /* Case 3: Vector contains trailing same element. */
19121
19122 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
19123 (target, builder, nelts_reqd))
19124 return true;
19125
19126 /* Case 4: Vector contains leading same element. */
19127
19128 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
19129 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
19130 {
19131 emit_insn (gen_aarch64_sve_rev (mode, target, target));
19132 return true;
19133 }
19134
 19135 /* Avoid recursing below 4 elements.
19136 ??? The threshold 4 may need fine-tuning. */
19137
19138 if (nelts_reqd <= 4)
19139 return false;
19140
5da301cb
RS
19141 rtx_vector_builder v_even (mode, nelts, 1);
19142 rtx_vector_builder v_odd (mode, nelts, 1);
3a0afad0
PK
19143
19144 for (int i = 0; i < nelts * 2; i += 2)
19145 {
19146 v_even.quick_push (builder.elt (i));
19147 v_odd.quick_push (builder.elt (i + 1));
19148 }
19149
19150 v_even.finalize ();
19151 v_odd.finalize ();
19152
19153 rtx tmp1 = gen_reg_rtx (mode);
19154 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
19155 nelts, nelts_reqd / 2);
19156
19157 rtx tmp2 = gen_reg_rtx (mode);
19158 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
19159 nelts, nelts_reqd / 2);
19160
19161 if (!did_even_p && !did_odd_p)
19162 return false;
19163
 19164 /* Initialize v_even and v_odd using INSR if they didn't match any of the
 19165 special cases, and then zip v_even and v_odd. */
19166
19167 if (!did_even_p)
19168 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
19169
19170 if (!did_odd_p)
19171 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
19172
19173 rtvec v = gen_rtvec (2, tmp1, tmp2);
19174 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
19175 return true;
19176}
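/* Illustrative sketch, not part of GCC: the even/odd decomposition used by
   the recursive routine above.  Splitting the requested elements into two
   interleaved sub-vectors lets each half be matched against the "trailing
   constants" / "trailing same element" cases, after which a ZIP1 of the two
   halves restores the original order.  The helper name is invented and
   NELTS is assumed to be even, as it is at every recursion step.  */
#include <stddef.h>

static void
split_even_odd (const long *elts, size_t nelts, long *v_even, long *v_odd)
{
  for (size_t i = 0; i < nelts; i += 2)
    {
      v_even[i / 2] = elts[i];
      v_odd[i / 2] = elts[i + 1];
    }
}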
19177
19178/* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
19179
19180void
19181aarch64_sve_expand_vector_init (rtx target, rtx vals)
19182{
19183 machine_mode mode = GET_MODE (target);
19184 int nelts = XVECLEN (vals, 0);
19185
5da301cb 19186 rtx_vector_builder v (mode, nelts, 1);
3a0afad0
PK
19187 for (int i = 0; i < nelts; i++)
19188 v.quick_push (XVECEXP (vals, 0, i));
19189 v.finalize ();
19190
 19191 /* If neither sub-vector of v could be initialized specially,
19192 then use INSR to insert all elements from v into TARGET.
19193 ??? This might not be optimal for vectors with large
19194 initializers like 16-element or above.
19195 For nelts < 4, it probably isn't useful to handle specially. */
19196
19197 if (nelts < 4
19198 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
19199 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
19200}
19201
b6c3aea1
RS
19202/* Check whether VALUE is a vector constant in which every element
19203 is either a power of 2 or a negated power of 2. If so, return
19204 a constant vector of log2s, and flip CODE between PLUS and MINUS
19205 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
19206
19207static rtx
19208aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
19209{
19210 if (GET_CODE (value) != CONST_VECTOR)
19211 return NULL_RTX;
19212
19213 rtx_vector_builder builder;
19214 if (!builder.new_unary_operation (GET_MODE (value), value, false))
19215 return NULL_RTX;
19216
19217 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
19218 /* 1 if the result of the multiplication must be negated,
19219 0 if it mustn't, or -1 if we don't yet care. */
19220 int negate = -1;
19221 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
19222 for (unsigned int i = 0; i < encoded_nelts; ++i)
19223 {
19224 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
19225 if (!CONST_SCALAR_INT_P (elt))
19226 return NULL_RTX;
19227 rtx_mode_t val (elt, int_mode);
19228 wide_int pow2 = wi::neg (val);
19229 if (val != pow2)
19230 {
19231 /* It matters whether we negate or not. Make that choice,
19232 and make sure that it's consistent with previous elements. */
19233 if (negate == !wi::neg_p (val))
19234 return NULL_RTX;
19235 negate = wi::neg_p (val);
19236 if (!negate)
19237 pow2 = val;
19238 }
19239 /* POW2 is now the value that we want to be a power of 2. */
19240 int shift = wi::exact_log2 (pow2);
19241 if (shift < 0)
19242 return NULL_RTX;
19243 builder.quick_push (gen_int_mode (shift, int_mode));
19244 }
19245 if (negate == -1)
19246 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
19247 code = PLUS;
19248 else if (negate == 1)
19249 code = code == PLUS ? MINUS : PLUS;
19250 return builder.build ();
19251}
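/* Illustrative sketch, not part of GCC: the per-element test behind
   aarch64_convert_mult_to_shift, for a single scalar multiplier.  The
   multiplier must be a power of two or a negated power of two; the output
   is the shift amount plus a flag saying whether the surrounding PLUS or
   MINUS must be flipped.  The helper name is invented.  */
#include <stdbool.h>
#include <stdint.h>

static bool
mult_to_shift_1 (int64_t mult, int *shift, bool *negate)
{
  uint64_t pow2 = mult < 0 ? -(uint64_t) mult : (uint64_t) mult;
  if (pow2 == 0 || (pow2 & (pow2 - 1)) != 0)
    return false;			/* Not a power of two.  */
  *negate = mult < 0;
  *shift = __builtin_ctzll (pow2);	/* Exact log2 of POW2.  */
  return true;
}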
19252
19253/* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
19254 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
19255 operands array, in the same order as for fma_optab. Return true if
19256 the function emitted all the necessary instructions, false if the caller
19257 should generate the pattern normally with the new OPERANDS array. */
19258
19259bool
19260aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
19261{
19262 machine_mode mode = GET_MODE (operands[0]);
19263 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
19264 {
19265 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
19266 NULL_RTX, true, OPTAB_DIRECT);
19267 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
19268 operands[3], product, operands[0], true,
19269 OPTAB_DIRECT);
19270 return true;
19271 }
19272 operands[2] = force_reg (mode, operands[2]);
19273 return false;
19274}
19275
19276/* Likewise, but for a conditional pattern. */
19277
19278bool
19279aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
19280{
19281 machine_mode mode = GET_MODE (operands[0]);
19282 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
19283 {
19284 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
19285 NULL_RTX, true, OPTAB_DIRECT);
19286 emit_insn (gen_cond (code, mode, operands[0], operands[1],
19287 operands[4], product, operands[5]));
19288 return true;
19289 }
19290 operands[3] = force_reg (mode, operands[3]);
19291 return false;
19292}
19293
43e9d192 19294static unsigned HOST_WIDE_INT
ef4bddc2 19295aarch64_shift_truncation_mask (machine_mode mode)
43e9d192 19296{
43cacb12
RS
19297 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
19298 return 0;
19299 return GET_MODE_UNIT_BITSIZE (mode) - 1;
43e9d192
IB
19300}
19301
43e9d192
IB
19302/* Select a format to encode pointers in exception handling data. */
19303int
19304aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
19305{
19306 int type;
19307 switch (aarch64_cmodel)
19308 {
19309 case AARCH64_CMODEL_TINY:
19310 case AARCH64_CMODEL_TINY_PIC:
19311 case AARCH64_CMODEL_SMALL:
19312 case AARCH64_CMODEL_SMALL_PIC:
1b1e81f8 19313 case AARCH64_CMODEL_SMALL_SPIC:
43e9d192
IB
19314 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
19315 for everything. */
19316 type = DW_EH_PE_sdata4;
19317 break;
19318 default:
19319 /* No assumptions here. 8-byte relocs required. */
19320 type = DW_EH_PE_sdata8;
19321 break;
19322 }
19323 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
19324}
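/* Worked example, derived from the cases above: for -mcmodel=small the
   encoding chosen for a global symbol reference is
   DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4, while the larger
   code models fall back to
   DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata8.  */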
19325
b07fc91c
SN
19326/* Output .variant_pcs for aarch64_vector_pcs function symbols. */
19327
19328static void
19329aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
19330{
c600df9a 19331 if (TREE_CODE (decl) == FUNCTION_DECL)
b07fc91c 19332 {
c600df9a
RS
19333 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
19334 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
19335 {
19336 fprintf (stream, "\t.variant_pcs\t");
19337 assemble_name (stream, name);
19338 fprintf (stream, "\n");
19339 }
b07fc91c
SN
19340 }
19341}
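/* For example, for a function "foo" whose ABI is ARM_PCS_SIMD (the
   aarch64_vector_pcs attribute), the directive emitted above would be:

	.variant_pcs	foo

   (illustrative symbol name only; the real one comes from NAME).  */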
19342
e1c1ecb0
KT
19343/* The last .arch and .tune assembly strings that we printed. */
19344static std::string aarch64_last_printed_arch_string;
19345static std::string aarch64_last_printed_tune_string;
19346
361fb3ee
KT
19347/* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
19348 by the function fndecl. */
19349
19350void
19351aarch64_declare_function_name (FILE *stream, const char* name,
19352 tree fndecl)
19353{
19354 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19355
19356 struct cl_target_option *targ_options;
19357 if (target_parts)
19358 targ_options = TREE_TARGET_OPTION (target_parts);
19359 else
19360 targ_options = TREE_TARGET_OPTION (target_option_current_node);
19361 gcc_assert (targ_options);
19362
19363 const struct processor *this_arch
19364 = aarch64_get_arch (targ_options->x_explicit_arch);
19365
28108a53 19366 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
054b4005 19367 std::string extension

19368 = aarch64_get_extension_string_for_isa_flags (isa_flags,
19369 this_arch->flags);
e1c1ecb0
KT
19370 /* Only update the assembler .arch string if it is distinct from the last
19371 such string we printed. */
19372 std::string to_print = this_arch->name + extension;
19373 if (to_print != aarch64_last_printed_arch_string)
19374 {
19375 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
19376 aarch64_last_printed_arch_string = to_print;
19377 }
361fb3ee
KT
19378
19379 /* Print the cpu name we're tuning for in the comments, might be
19380 useful to readers of the generated asm. Do it only when it changes
19381 from function to function and verbose assembly is requested. */
361fb3ee
KT
19382 const struct processor *this_tune
19383 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
19384
e1c1ecb0
KT
19385 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
19386 {
19387 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
19388 this_tune->name);
19389 aarch64_last_printed_tune_string = this_tune->name;
19390 }
361fb3ee 19391
b07fc91c
SN
19392 aarch64_asm_output_variant_pcs (stream, fndecl, name);
19393
361fb3ee
KT
19394 /* Don't forget the type directive for ELF. */
19395 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
19396 ASM_OUTPUT_LABEL (stream, name);
c292cfe5
SN
19397
19398 cfun->machine->label_is_assembled = true;
19399}
19400
19401/* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
19402 the function label and emit a BTI if necessary. */
19403
19404void
19405aarch64_print_patchable_function_entry (FILE *file,
19406 unsigned HOST_WIDE_INT patch_area_size,
19407 bool record_p)
19408{
19409 if (cfun->machine->label_is_assembled
19410 && aarch64_bti_enabled ()
19411 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
19412 {
19413 /* Remove the BTI that follows the patch area and insert a new BTI
19414 before the patch area right after the function label. */
19415 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
19416 if (insn
19417 && INSN_P (insn)
19418 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
19419 && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
19420 delete_insn (insn);
19421 asm_fprintf (file, "\thint\t34 // bti c\n");
19422 }
19423
19424 default_print_patchable_function_entry (file, patch_area_size, record_p);
361fb3ee
KT
19425}
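/* Illustrative layout only (assuming -fpatchable-function-entry=2 and BTI
   enabled): the BTI is re-emitted right after the function label so that
   the patch area does not separate it from the entry point:

	foo:
		hint	34	// bti c
		nop
		nop

   The number of NOPs comes from PATCH_AREA_SIZE.  */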
19426
b07fc91c
SN
19427/* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
19428
19429void
19430aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
19431{
19432 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
19433 const char *value = IDENTIFIER_POINTER (target);
19434 aarch64_asm_output_variant_pcs (stream, decl, name);
19435 ASM_OUTPUT_DEF (stream, name, value);
19436}
19437
19438/* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
19439 function symbol references. */
19440
19441void
e8c47069 19442aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
b07fc91c 19443{
e8c47069 19444 default_elf_asm_output_external (stream, decl, name);
b07fc91c
SN
19445 aarch64_asm_output_variant_pcs (stream, decl, name);
19446}
19447
8fc16d72
ST
19448/* Triggered after a .cfi_startproc directive is emitted into the assembly file.
19449 Used to output the .cfi_b_key_frame directive when signing the current
19450 function with the B key. */
19451
19452void
19453aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
19454{
2bdc7dcb 19455 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
19456 && aarch64_ra_sign_key == AARCH64_KEY_B)
19457 asm_fprintf (f, "\t.cfi_b_key_frame\n");
19458}
19459
e1c1ecb0
KT
19460/* Implements TARGET_ASM_FILE_START. Output the assembly header. */
19461
19462static void
19463aarch64_start_file (void)
19464{
19465 struct cl_target_option *default_options
19466 = TREE_TARGET_OPTION (target_option_default_node);
19467
19468 const struct processor *default_arch
19469 = aarch64_get_arch (default_options->x_explicit_arch);
28108a53 19470 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
e1c1ecb0 19471 std::string extension
19472 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
19473 default_arch->flags);
e1c1ecb0
KT
19474
19475 aarch64_last_printed_arch_string = default_arch->name + extension;
19476 aarch64_last_printed_tune_string = "";
19477 asm_fprintf (asm_out_file, "\t.arch %s\n",
19478 aarch64_last_printed_arch_string.c_str ());
19479
19480 default_file_start ();
19481}
19482
0462169c
SN
19483/* Emit load exclusive. */
19484
19485static void
ef4bddc2 19486aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
0462169c
SN
19487 rtx mem, rtx model_rtx)
19488{
4a2095eb
RH
19489 if (mode == TImode)
19490 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
19491 gen_highpart (DImode, rval),
19492 mem, model_rtx));
19493 else
19494 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
0462169c
SN
19495}
19496
19497/* Emit store exclusive. */
19498
19499static void
ef4bddc2 19500aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
4a2095eb 19501 rtx mem, rtx rval, rtx model_rtx)
0462169c 19502{
4a2095eb
RH
19503 if (mode == TImode)
19504 emit_insn (gen_aarch64_store_exclusive_pair
19505 (bval, mem, operand_subword (rval, 0, 0, TImode),
19506 operand_subword (rval, 1, 0, TImode), model_rtx));
19507 else
19508 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
0462169c
SN
19509}
19510
19511/* Mark the previous jump instruction as unlikely. */
19512
19513static void
19514aarch64_emit_unlikely_jump (rtx insn)
19515{
f370536c 19516 rtx_insn *jump = emit_jump_insn (insn);
5fa396ad 19517 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
0462169c
SN
19518}
19519
3950b229
RH
19520/* We store the names of the various atomic helpers in a 5x4 array.
19521 Return the libcall function given MODE, MODEL and NAMES. */
19522
19523rtx
 19524aarch64_atomic_ool_func (machine_mode mode, rtx model_rtx,
19525 const atomic_ool_names *names)
19526{
19527 memmodel model = memmodel_base (INTVAL (model_rtx));
19528 int mode_idx, model_idx;
19529
19530 switch (mode)
19531 {
19532 case E_QImode:
19533 mode_idx = 0;
19534 break;
19535 case E_HImode:
19536 mode_idx = 1;
19537 break;
19538 case E_SImode:
19539 mode_idx = 2;
19540 break;
19541 case E_DImode:
19542 mode_idx = 3;
19543 break;
19544 case E_TImode:
19545 mode_idx = 4;
19546 break;
19547 default:
19548 gcc_unreachable ();
19549 }
19550
19551 switch (model)
19552 {
19553 case MEMMODEL_RELAXED:
19554 model_idx = 0;
19555 break;
19556 case MEMMODEL_CONSUME:
19557 case MEMMODEL_ACQUIRE:
19558 model_idx = 1;
19559 break;
19560 case MEMMODEL_RELEASE:
19561 model_idx = 2;
19562 break;
19563 case MEMMODEL_ACQ_REL:
19564 case MEMMODEL_SEQ_CST:
19565 model_idx = 3;
19566 break;
19567 default:
19568 gcc_unreachable ();
19569 }
19570
19571 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
19572 VISIBILITY_HIDDEN);
19573}
19574
19575#define DEF0(B, N) \
19576 { "__aarch64_" #B #N "_relax", \
19577 "__aarch64_" #B #N "_acq", \
19578 "__aarch64_" #B #N "_rel", \
19579 "__aarch64_" #B #N "_acq_rel" }
19580
19581#define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
19582 { NULL, NULL, NULL, NULL }
19583#define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
19584
19585static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
19586const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
19587const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
19588const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
19589const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
19590const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
19591
19592#undef DEF0
19593#undef DEF4
19594#undef DEF5
19595
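/* For reference, DEF0 (cas, 4) above expands to the four libcall names
   "__aarch64_cas4_relax", "__aarch64_cas4_acq", "__aarch64_cas4_rel" and
   "__aarch64_cas4_acq_rel", indexed by the memory-model column chosen in
   aarch64_atomic_ool_func.  */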
0462169c
SN
19596/* Expand a compare and swap pattern. */
19597
19598void
19599aarch64_expand_compare_and_swap (rtx operands[])
19600{
d400fda3
RH
19601 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
19602 machine_mode mode, r_mode;
0462169c
SN
19603
19604 bval = operands[0];
19605 rval = operands[1];
19606 mem = operands[2];
19607 oldval = operands[3];
19608 newval = operands[4];
19609 is_weak = operands[5];
19610 mod_s = operands[6];
19611 mod_f = operands[7];
19612 mode = GET_MODE (mem);
0462169c
SN
19613
19614 /* Normally the succ memory model must be stronger than fail, but in the
19615 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
19616 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
46b35980
AM
19617 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
19618 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
19619 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
19620
d400fda3
RH
19621 r_mode = mode;
19622 if (mode == QImode || mode == HImode)
0462169c 19623 {
d400fda3
RH
19624 r_mode = SImode;
19625 rval = gen_reg_rtx (r_mode);
0462169c
SN
19626 }
19627
b0770c0f 19628 if (TARGET_LSE)
77f33f44
RH
19629 {
19630 /* The CAS insn requires oldval and rval overlap, but we need to
19631 have a copy of oldval saved across the operation to tell if
19632 the operation is successful. */
d400fda3
RH
19633 if (reg_overlap_mentioned_p (rval, oldval))
19634 rval = copy_to_mode_reg (r_mode, oldval);
77f33f44 19635 else
d400fda3
RH
19636 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
19637
77f33f44
RH
19638 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
19639 newval, mod_s));
d400fda3 19640 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
77f33f44 19641 }
3950b229
RH
19642 else if (TARGET_OUTLINE_ATOMICS)
19643 {
19644 /* Oldval must satisfy compare afterward. */
19645 if (!aarch64_plus_operand (oldval, mode))
19646 oldval = force_reg (mode, oldval);
19647 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
19648 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
19649 oldval, mode, newval, mode,
19650 XEXP (mem, 0), Pmode);
19651 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19652 }
b0770c0f 19653 else
19654 {
19655 /* The oldval predicate varies by mode. Test it and force to reg. */
19656 insn_code code = code_for_aarch64_compare_and_swap (mode);
19657 if (!insn_data[code].operand[2].predicate (oldval, mode))
19658 oldval = force_reg (mode, oldval);
0462169c 19659
d400fda3
RH
19660 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
19661 is_weak, mod_s, mod_f));
19662 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
19663 }
19664
19665 if (r_mode != mode)
19666 rval = gen_lowpart (mode, rval);
19667 emit_move_insn (operands[1], rval);
0462169c 19668
d400fda3 19669 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
f7df4a84 19670 emit_insn (gen_rtx_SET (bval, x));
0462169c
SN
19671}
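/* A minimal sketch (hypothetical user-level code, not part of the backend)
   of the kind of source whose expansion ends up in the expander above; the
   builtin used is the standard GCC __atomic_compare_exchange_n.  */

static inline bool
aarch64_example_cas (int *ptr, int *expected, int desired)
{
  return __atomic_compare_exchange_n (ptr, expected, desired,
				      /*weak=*/false,
				      __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}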
19672
f70fb3b6
MW
19673/* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
19674 sequence implementing an atomic operation. */
19675
19676static void
19677aarch64_emit_post_barrier (enum memmodel model)
19678{
19679 const enum memmodel base_model = memmodel_base (model);
19680
19681 if (is_mm_sync (model)
19682 && (base_model == MEMMODEL_ACQUIRE
19683 || base_model == MEMMODEL_ACQ_REL
19684 || base_model == MEMMODEL_SEQ_CST))
19685 {
19686 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
19687 }
19688}
19689
0462169c
SN
19690/* Split a compare and swap pattern. */
19691
19692void
19693aarch64_split_compare_and_swap (rtx operands[])
19694{
e5e07b68
WD
19695 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
19696 gcc_assert (epilogue_completed);
19697
b7e560de 19698 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
ef4bddc2 19699 machine_mode mode;
0462169c 19700 bool is_weak;
5d8a22a5 19701 rtx_code_label *label1, *label2;
ab876106 19702 enum memmodel model;
0462169c
SN
19703
19704 rval = operands[0];
19705 mem = operands[1];
19706 oldval = operands[2];
19707 newval = operands[3];
19708 is_weak = (operands[4] != const0_rtx);
ab876106 19709 model_rtx = operands[5];
0462169c
SN
19710 scratch = operands[7];
19711 mode = GET_MODE (mem);
ab876106 19712 model = memmodel_from_int (INTVAL (model_rtx));
0462169c 19713
17f47f86
KT
19714 /* When OLDVAL is zero and we want the strong version we can emit a tighter
19715 loop:
19716 .label1:
19717 LD[A]XR rval, [mem]
19718 CBNZ rval, .label2
19719 ST[L]XR scratch, newval, [mem]
19720 CBNZ scratch, .label1
19721 .label2:
19722 CMP rval, 0. */
b7e560de
RH
 19723 bool strong_zero_p = (!is_weak && !aarch64_track_speculation
 19724 && oldval == const0_rtx && mode != TImode);
17f47f86 19725
5d8a22a5 19726 label1 = NULL;
0462169c
SN
19727 if (!is_weak)
19728 {
19729 label1 = gen_label_rtx ();
19730 emit_label (label1);
19731 }
19732 label2 = gen_label_rtx ();
19733
ab876106
MW
19734 /* The initial load can be relaxed for a __sync operation since a final
19735 barrier will be emitted to stop code hoisting. */
19736 if (is_mm_sync (model))
b7e560de 19737 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
ab876106
MW
19738 else
19739 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
0462169c 19740
17f47f86 19741 if (strong_zero_p)
b7e560de 19742 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
17f47f86
KT
19743 else
19744 {
b7e560de
RH
19745 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19746 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
17f47f86 19747 }
b7e560de
RH
19748 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19749 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
19750 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
0462169c 19751
ab876106 19752 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
0462169c
SN
19753
19754 if (!is_weak)
19755 {
6e1eaca9
RE
19756 if (aarch64_track_speculation)
19757 {
19758 /* Emit an explicit compare instruction, so that we can correctly
19759 track the condition codes. */
19760 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
19761 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19762 }
19763 else
19764 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
19765
0462169c
SN
19766 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19767 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
f7df4a84 19768 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
0462169c
SN
19769 }
19770 else
b7e560de 19771 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
0462169c
SN
19772
19773 emit_label (label2);
b7e560de 19774
17f47f86
KT
19775 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
19776 to set the condition flags. If this is not used it will be removed by
19777 later passes. */
19778 if (strong_zero_p)
19779 aarch64_gen_compare_reg (NE, rval, const0_rtx);
19780
ab876106
MW
19781 /* Emit any final barrier needed for a __sync operation. */
19782 if (is_mm_sync (model))
19783 aarch64_emit_post_barrier (model);
0462169c 19784}
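/* For reference, the shape of the LL/SC sequence emitted above for the
   general strong case (illustrative only; the exact registers, the
   acquire/release variants and the CBNZ versus explicit-compare choice
   depend on the operands and on -mtrack-speculation):

	.label1:
	ld[a]xr	rval, [mem]
	cmp	rval, oldval
	b.ne	.label2
	st[l]xr	scratch, newval, [mem]
	cbnz	scratch, .label1
	.label2:
 */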
9cd7b720 19785
0462169c
SN
19786/* Split an atomic operation. */
19787
19788void
19789aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9cd7b720 19790 rtx value, rtx model_rtx, rtx cond)
0462169c 19791{
e5e07b68
WD
19792 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
19793 gcc_assert (epilogue_completed);
19794
ef4bddc2
RS
19795 machine_mode mode = GET_MODE (mem);
19796 machine_mode wmode = (mode == DImode ? DImode : SImode);
f70fb3b6
MW
19797 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
19798 const bool is_sync = is_mm_sync (model);
5d8a22a5
DM
19799 rtx_code_label *label;
19800 rtx x;
0462169c 19801
9cd7b720 19802 /* Split the atomic operation into a sequence. */
0462169c
SN
19803 label = gen_label_rtx ();
19804 emit_label (label);
19805
19806 if (new_out)
19807 new_out = gen_lowpart (wmode, new_out);
19808 if (old_out)
19809 old_out = gen_lowpart (wmode, old_out);
19810 else
19811 old_out = new_out;
19812 value = simplify_gen_subreg (wmode, value, mode, 0);
19813
f70fb3b6
MW
19814 /* The initial load can be relaxed for a __sync operation since a final
19815 barrier will be emitted to stop code hoisting. */
19816 if (is_sync)
19817 aarch64_emit_load_exclusive (mode, old_out, mem,
19818 GEN_INT (MEMMODEL_RELAXED));
19819 else
19820 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
0462169c
SN
19821
19822 switch (code)
19823 {
19824 case SET:
19825 new_out = value;
19826 break;
19827
19828 case NOT:
19829 x = gen_rtx_AND (wmode, old_out, value);
f7df4a84 19830 emit_insn (gen_rtx_SET (new_out, x));
0462169c 19831 x = gen_rtx_NOT (wmode, new_out);
f7df4a84 19832 emit_insn (gen_rtx_SET (new_out, x));
0462169c
SN
19833 break;
19834
19835 case MINUS:
19836 if (CONST_INT_P (value))
19837 {
19838 value = GEN_INT (-INTVAL (value));
19839 code = PLUS;
19840 }
19841 /* Fall through. */
19842
19843 default:
19844 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
f7df4a84 19845 emit_insn (gen_rtx_SET (new_out, x));
0462169c
SN
19846 break;
19847 }
19848
19849 aarch64_emit_store_exclusive (mode, cond, mem,
19850 gen_lowpart (mode, new_out), model_rtx);
19851
6e1eaca9
RE
19852 if (aarch64_track_speculation)
19853 {
19854 /* Emit an explicit compare instruction, so that we can correctly
19855 track the condition codes. */
19856 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
19857 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19858 }
19859 else
19860 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
19861
0462169c
SN
19862 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19863 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
f7df4a84 19864 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
f70fb3b6
MW
19865
19866 /* Emit any final barrier needed for a __sync operation. */
19867 if (is_sync)
19868 aarch64_emit_post_barrier (model);
0462169c
SN
19869}
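/* A minimal user-level sketch (hypothetical example, not part of the
   backend): a call such as the one below is what ultimately gets split into
   the load-exclusive / operate / store-exclusive loop above when neither
   the LSE instructions nor the out-of-line helpers are used.  */

static inline int
aarch64_example_fetch_add (int *ptr, int value)
{
  /* Returns the value of *ptr before the addition.  */
  return __atomic_fetch_add (ptr, value, __ATOMIC_ACQ_REL);
}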
19870
c2ec330c
AL
19871static void
19872aarch64_init_libfuncs (void)
19873{
19874 /* Half-precision float operations. The compiler handles all operations
19875 with NULL libfuncs by converting to SFmode. */
19876
19877 /* Conversions. */
19878 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
19879 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
19880
19881 /* Arithmetic. */
19882 set_optab_libfunc (add_optab, HFmode, NULL);
19883 set_optab_libfunc (sdiv_optab, HFmode, NULL);
19884 set_optab_libfunc (smul_optab, HFmode, NULL);
19885 set_optab_libfunc (neg_optab, HFmode, NULL);
19886 set_optab_libfunc (sub_optab, HFmode, NULL);
19887
19888 /* Comparisons. */
19889 set_optab_libfunc (eq_optab, HFmode, NULL);
19890 set_optab_libfunc (ne_optab, HFmode, NULL);
19891 set_optab_libfunc (lt_optab, HFmode, NULL);
19892 set_optab_libfunc (le_optab, HFmode, NULL);
19893 set_optab_libfunc (ge_optab, HFmode, NULL);
19894 set_optab_libfunc (gt_optab, HFmode, NULL);
19895 set_optab_libfunc (unord_optab, HFmode, NULL);
19896}
19897
43e9d192 19898/* Target hook for c_mode_for_suffix. */
ef4bddc2 19899static machine_mode
43e9d192
IB
19900aarch64_c_mode_for_suffix (char suffix)
19901{
19902 if (suffix == 'q')
19903 return TFmode;
19904
19905 return VOIDmode;
19906}
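/* For example, a literal written with the GNU 'q' suffix, such as:

     __float128 x = 1.0q;

   is given TFmode (IEEE binary128) by the hook above.  */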
19907
3520f7cc
JG
19908/* We can only represent floating point constants which will fit in
19909 "quarter-precision" values. These values are characterised by
 19910 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
 19911 by:
19912
19913 (-1)^s * (n/16) * 2^r
19914
19915 Where:
19916 's' is the sign bit.
19917 'n' is an integer in the range 16 <= n <= 31.
19918 'r' is an integer in the range -3 <= r <= 4. */
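/* As a worked example, 0.25 = (-1)^0 * (16/16) * 2^-2 (s = 0, n = 16,
   r = -2) is representable, as is 31.0 = (31/16) * 2^4, the largest
   representable magnitude; the smallest positive magnitude is
   (16/16) * 2^-3 = 0.125.  Values outside this set, e.g. 0.1 or 0.0,
   cannot be encoded as an FMOV immediate and must be loaded some
   other way.  */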
19919
19920/* Return true iff X can be represented by a quarter-precision
 19921 floating point immediate operand. Note, we cannot represent 0.0. */
19922bool
19923aarch64_float_const_representable_p (rtx x)
19924{
19925 /* This represents our current view of how many bits
19926 make up the mantissa. */
19927 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
ba96cdfb 19928 int exponent;
3520f7cc 19929 unsigned HOST_WIDE_INT mantissa, mask;
3520f7cc 19930 REAL_VALUE_TYPE r, m;
807e902e 19931 bool fail;
3520f7cc 19932
d29f7dd5 19933 x = unwrap_const_vec_duplicate (x);
3520f7cc
JG
19934 if (!CONST_DOUBLE_P (x))
19935 return false;
19936
a4518821
RS
19937 if (GET_MODE (x) == VOIDmode
19938 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
19939 return false;
19940
34a72c33 19941 r = *CONST_DOUBLE_REAL_VALUE (x);
3520f7cc
JG
19942
19943 /* We cannot represent infinities, NaNs or +/-zero. We won't
19944 know if we have +zero until we analyse the mantissa, but we
19945 can reject the other invalid values. */
19946 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
19947 || REAL_VALUE_MINUS_ZERO (r))
19948 return false;
19949
ba96cdfb 19950 /* Extract exponent. */
3520f7cc
JG
19951 r = real_value_abs (&r);
19952 exponent = REAL_EXP (&r);
19953
19954 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
19955 highest (sign) bit, with a fixed binary point at bit point_pos.
19956 m1 holds the low part of the mantissa, m2 the high part.
19957 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
19958 bits for the mantissa, this can fail (low bits will be lost). */
19959 real_ldexp (&m, &r, point_pos - exponent);
807e902e 19960 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
3520f7cc
JG
19961
19962 /* If the low part of the mantissa has bits set we cannot represent
19963 the value. */
d9074b29 19964 if (w.ulow () != 0)
19965 return false;
19966 /* We have rejected the lower HOST_WIDE_INT, so update our
19967 understanding of how many bits lie in the mantissa and
19968 look only at the high HOST_WIDE_INT. */
807e902e 19969 mantissa = w.elt (1);
3520f7cc
JG
19970 point_pos -= HOST_BITS_PER_WIDE_INT;
19971
19972 /* We can only represent values with a mantissa of the form 1.xxxx. */
19973 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
19974 if ((mantissa & mask) != 0)
19975 return false;
19976
19977 /* Having filtered unrepresentable values, we may now remove all
19978 but the highest 5 bits. */
19979 mantissa >>= point_pos - 5;
19980
19981 /* We cannot represent the value 0.0, so reject it. This is handled
19982 elsewhere. */
19983 if (mantissa == 0)
19984 return false;
19985
19986 /* Then, as bit 4 is always set, we can mask it off, leaving
19987 the mantissa in the range [0, 15]. */
19988 mantissa &= ~(1 << 4);
19989 gcc_assert (mantissa <= 15);
19990
19991 /* GCC internally does not use IEEE754-like encoding (where normalized
19992 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
19993 Our mantissa values are shifted 4 places to the left relative to
19994 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
19995 by 5 places to correct for GCC's representation. */
19996 exponent = 5 - exponent;
19997
19998 return (exponent >= 0 && exponent <= 7);
19999}
20000
ab6501d7
SD
20001/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
20002 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
20003 output MOVI/MVNI, ORR or BIC immediate. */
3520f7cc 20004char*
b187677b 20005aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
ab6501d7 20006 enum simd_immediate_check which)
3520f7cc 20007{
3ea63f60 20008 bool is_valid;
3520f7cc 20009 static char templ[40];
3520f7cc 20010 const char *mnemonic;
e4f0f84d 20011 const char *shift_op;
3520f7cc 20012 unsigned int lane_count = 0;
81c2dfb9 20013 char element_char;
3520f7cc 20014
b187677b 20015 struct simd_immediate_info info;
48063b9d
IB
20016
20017 /* This will return true to show const_vector is legal for use as either
 20018 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
20019 It will also update INFO to show how the immediate should be generated.
20020 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
b187677b 20021 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
3520f7cc
JG
20022 gcc_assert (is_valid);
20023
b187677b
RS
20024 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
20025 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
48063b9d 20026
b187677b 20027 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
3520f7cc 20028 {
1da83cce
RS
20029 gcc_assert (info.insn == simd_immediate_info::MOV
20030 && info.u.mov.shift == 0);
0d8e1702
KT
20031 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
20032 move immediate path. */
1da83cce
RS
20033 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
20034 info.u.mov.value = GEN_INT (0);
48063b9d
IB
20035 else
20036 {
83faf7d0 20037 const unsigned int buf_size = 20;
48063b9d 20038 char float_buf[buf_size] = {'\0'};
34a72c33 20039 real_to_decimal_for_mode (float_buf,
1da83cce 20040 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
b187677b 20041 buf_size, buf_size, 1, info.elt_mode);
48063b9d
IB
20042
20043 if (lane_count == 1)
20044 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
20045 else
20046 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
81c2dfb9 20047 lane_count, element_char, float_buf);
48063b9d
IB
20048 return templ;
20049 }
3520f7cc 20050 }
3520f7cc 20051
1da83cce 20052 gcc_assert (CONST_INT_P (info.u.mov.value));
ab6501d7
SD
20053
20054 if (which == AARCH64_CHECK_MOV)
20055 {
b187677b 20056 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
1da83cce
RS
20057 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
20058 ? "msl" : "lsl");
ab6501d7
SD
20059 if (lane_count == 1)
20060 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
20061 mnemonic, UINTVAL (info.u.mov.value));
20062 else if (info.u.mov.shift)
20063 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
20064 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
20065 element_char, UINTVAL (info.u.mov.value), shift_op,
20066 info.u.mov.shift);
20067 else
20068 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
20069 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
1da83cce 20070 element_char, UINTVAL (info.u.mov.value));
ab6501d7 20071 }
3520f7cc 20072 else
20073 {
20074 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
b187677b 20075 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
1da83cce 20076 if (info.u.mov.shift)
20077 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
20078 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
20079 element_char, UINTVAL (info.u.mov.value), "lsl",
20080 info.u.mov.shift);
20081 else
20082 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
20083 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
1da83cce 20084 element_char, UINTVAL (info.u.mov.value));
ab6501d7 20085 }
3520f7cc
JG
20086 return templ;
20087}
20088
b7342d25 20089char*
77e994c9 20090aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
b7342d25 20091{
a2170965
TC
20092
20093 /* If a floating point number was passed and we desire to use it in an
20094 integer mode do the conversion to integer. */
20095 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
20096 {
20097 unsigned HOST_WIDE_INT ival;
20098 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
20099 gcc_unreachable ();
20100 immediate = gen_int_mode (ival, mode);
20101 }
20102
ef4bddc2 20103 machine_mode vmode;
a2170965
TC
 20104 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
 20105 a 128-bit vector mode. */
20106 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
b7342d25 20107
a2170965 20108 vmode = aarch64_simd_container_mode (mode, width);
b7342d25 20109 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
b187677b 20110 return aarch64_output_simd_mov_immediate (v_op, width);
b7342d25
IB
20111}
20112
43cacb12
RS
20113/* Return the output string to use for moving immediate CONST_VECTOR
20114 into an SVE register. */
20115
20116char *
20117aarch64_output_sve_mov_immediate (rtx const_vector)
20118{
20119 static char templ[40];
20120 struct simd_immediate_info info;
20121 char element_char;
20122
20123 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
20124 gcc_assert (is_valid);
20125
20126 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
20127
1044fa32
RS
20128 machine_mode vec_mode = GET_MODE (const_vector);
20129 if (aarch64_sve_pred_mode_p (vec_mode))
20130 {
20131 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
0b1fe8cf
RS
20132 if (info.insn == simd_immediate_info::MOV)
20133 {
20134 gcc_assert (info.u.mov.value == const0_rtx);
20135 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
20136 }
1044fa32 20137 else
0b1fe8cf
RS
20138 {
20139 gcc_assert (info.insn == simd_immediate_info::PTRUE);
20140 unsigned int total_bytes;
20141 if (info.u.pattern == AARCH64_SV_ALL
20142 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
20143 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
20144 total_bytes / GET_MODE_SIZE (info.elt_mode));
20145 else
20146 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
20147 svpattern_token (info.u.pattern));
20148 }
1044fa32
RS
20149 return buf;
20150 }
20151
1da83cce 20152 if (info.insn == simd_immediate_info::INDEX)
43cacb12
RS
20153 {
20154 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
20155 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
1da83cce
RS
20156 element_char, INTVAL (info.u.index.base),
20157 INTVAL (info.u.index.step));
43cacb12
RS
20158 return templ;
20159 }
20160
20161 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
20162 {
1da83cce
RS
20163 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
20164 info.u.mov.value = GEN_INT (0);
43cacb12
RS
20165 else
20166 {
20167 const int buf_size = 20;
20168 char float_buf[buf_size] = {};
20169 real_to_decimal_for_mode (float_buf,
1da83cce 20170 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
43cacb12
RS
20171 buf_size, buf_size, 1, info.elt_mode);
20172
20173 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
20174 element_char, float_buf);
20175 return templ;
20176 }
20177 }
20178
20179 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
1da83cce 20180 element_char, INTVAL (info.u.mov.value));
43cacb12
RS
20181 return templ;
20182}
20183
624d0f07
RS
20184/* Return the asm template for a PTRUES. CONST_UNSPEC is the
20185 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
20186 pattern. */
20187
20188char *
20189aarch64_output_sve_ptrues (rtx const_unspec)
20190{
20191 static char templ[40];
20192
20193 struct simd_immediate_info info;
20194 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
20195 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
20196
20197 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
20198 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
20199 svpattern_token (info.u.pattern));
20200 return templ;
20201}
20202
88b08073
JG
20203/* Split operands into moves from op[1] + op[2] into op[0]. */
20204
20205void
20206aarch64_split_combinev16qi (rtx operands[3])
20207{
20208 unsigned int dest = REGNO (operands[0]);
20209 unsigned int src1 = REGNO (operands[1]);
20210 unsigned int src2 = REGNO (operands[2]);
ef4bddc2 20211 machine_mode halfmode = GET_MODE (operands[1]);
462a99aa 20212 unsigned int halfregs = REG_NREGS (operands[1]);
88b08073
JG
20213 rtx destlo, desthi;
20214
20215 gcc_assert (halfmode == V16QImode);
20216
20217 if (src1 == dest && src2 == dest + halfregs)
20218 {
20219 /* No-op move. Can't split to nothing; emit something. */
20220 emit_note (NOTE_INSN_DELETED);
20221 return;
20222 }
20223
20224 /* Preserve register attributes for variable tracking. */
20225 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
20226 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
20227 GET_MODE_SIZE (halfmode));
20228
20229 /* Special case of reversed high/low parts. */
20230 if (reg_overlap_mentioned_p (operands[2], destlo)
20231 && reg_overlap_mentioned_p (operands[1], desthi))
20232 {
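 /* Both destination halves alias an input: swap the two vectors in place
    with the classic triple-XOR trick, which needs no scratch register.  */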
20233 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
20234 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
20235 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
20236 }
20237 else if (!reg_overlap_mentioned_p (operands[2], destlo))
20238 {
20239 /* Try to avoid unnecessary moves if part of the result
20240 is in the right place already. */
20241 if (src1 != dest)
20242 emit_move_insn (destlo, operands[1]);
20243 if (src2 != dest + halfregs)
20244 emit_move_insn (desthi, operands[2]);
20245 }
20246 else
20247 {
20248 if (src2 != dest + halfregs)
20249 emit_move_insn (desthi, operands[2]);
20250 if (src1 != dest)
20251 emit_move_insn (destlo, operands[1]);
20252 }
20253}
20254
20255/* vec_perm support. */
20256
88b08073
JG
20257struct expand_vec_perm_d
20258{
20259 rtx target, op0, op1;
e3342de4 20260 vec_perm_indices perm;
ef4bddc2 20261 machine_mode vmode;
43cacb12 20262 unsigned int vec_flags;
88b08073
JG
20263 bool one_vector_p;
20264 bool testing_p;
20265};
20266
7efc03fd
DP
20267static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
20268
88b08073
JG
20269/* Generate a variable permutation. */
20270
20271static void
20272aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
20273{
ef4bddc2 20274 machine_mode vmode = GET_MODE (target);
88b08073
JG
20275 bool one_vector_p = rtx_equal_p (op0, op1);
20276
20277 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
20278 gcc_checking_assert (GET_MODE (op0) == vmode);
20279 gcc_checking_assert (GET_MODE (op1) == vmode);
20280 gcc_checking_assert (GET_MODE (sel) == vmode);
20281 gcc_checking_assert (TARGET_SIMD);
20282
20283 if (one_vector_p)
20284 {
20285 if (vmode == V8QImode)
20286 {
20287 /* Expand the argument to a V16QI mode by duplicating it. */
20288 rtx pair = gen_reg_rtx (V16QImode);
20289 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
20290 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
20291 }
20292 else
20293 {
20294 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
20295 }
20296 }
20297 else
20298 {
20299 rtx pair;
20300
20301 if (vmode == V8QImode)
20302 {
20303 pair = gen_reg_rtx (V16QImode);
20304 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
20305 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
20306 }
20307 else
20308 {
20309 pair = gen_reg_rtx (OImode);
20310 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
20311 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
20312 }
20313 }
20314}
20315
80940017
RS
20316/* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
20317 NELT is the number of elements in the vector. */
20318
88b08073 20319void
80940017
RS
20320aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
20321 unsigned int nelt)
88b08073 20322{
ef4bddc2 20323 machine_mode vmode = GET_MODE (target);
88b08073 20324 bool one_vector_p = rtx_equal_p (op0, op1);
f7c4e5b8 20325 rtx mask;
88b08073
JG
20326
20327 /* The TBL instruction does not use a modulo index, so we must take care
20328 of that ourselves. */
f7c4e5b8
AL
20329 mask = aarch64_simd_gen_const_vector_dup (vmode,
20330 one_vector_p ? nelt - 1 : 2 * nelt - 1);
88b08073
JG
20331 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
20332
f7c4e5b8
AL
20333 /* For big-endian, we also need to reverse the index within the vector
20334 (but not which vector). */
20335 if (BYTES_BIG_ENDIAN)
20336 {
20337 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
20338 if (!one_vector_p)
20339 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
20340 sel = expand_simple_binop (vmode, XOR, sel, mask,
20341 NULL, 0, OPTAB_LIB_WIDEN);
20342 }
88b08073
JG
20343 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
20344}
20345
43cacb12
RS
20346/* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
20347
20348static void
20349emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
20350{
20351 emit_insn (gen_rtx_SET (target,
20352 gen_rtx_UNSPEC (GET_MODE (target),
20353 gen_rtvec (2, op0, op1), code)));
20354}
20355
20356/* Expand an SVE vec_perm with the given operands. */
20357
20358void
20359aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
20360{
20361 machine_mode data_mode = GET_MODE (target);
20362 machine_mode sel_mode = GET_MODE (sel);
20363 /* Enforced by the pattern condition. */
20364 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
20365
20366 /* Note: vec_perm indices are supposed to wrap when they go beyond the
20367 size of the two value vectors, i.e. the upper bits of the indices
20368 are effectively ignored. SVE TBL instead produces 0 for any
20369 out-of-range indices, so we need to modulo all the vec_perm indices
20370 to ensure they are all in range. */
20371 rtx sel_reg = force_reg (sel_mode, sel);
20372
20373 /* Check if the sel only references the first values vector. */
20374 if (GET_CODE (sel) == CONST_VECTOR
20375 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
20376 {
20377 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
20378 return;
20379 }
20380
20381 /* Check if the two values vectors are the same. */
20382 if (rtx_equal_p (op0, op1))
20383 {
20384 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
20385 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
20386 NULL, 0, OPTAB_DIRECT);
20387 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
20388 return;
20389 }
20390
20391 /* Run TBL on for each value vector and combine the results. */
20392
20393 rtx res0 = gen_reg_rtx (data_mode);
20394 rtx res1 = gen_reg_rtx (data_mode);
20395 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
20396 if (GET_CODE (sel) != CONST_VECTOR
20397 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
20398 {
20399 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
20400 2 * nunits - 1);
20401 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
20402 NULL, 0, OPTAB_DIRECT);
20403 }
20404 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
20405 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
20406 NULL, 0, OPTAB_DIRECT);
20407 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
20408 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
20409 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
20410 else
20411 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
20412}
20413
cc4d934f
JG
20414/* Recognize patterns suitable for the TRN instructions. */
20415static bool
20416aarch64_evpc_trn (struct expand_vec_perm_d *d)
20417{
6a70badb
RS
20418 HOST_WIDE_INT odd;
20419 poly_uint64 nelt = d->perm.length ();
cc4d934f 20420 rtx out, in0, in1, x;
ef4bddc2 20421 machine_mode vmode = d->vmode;
cc4d934f
JG
20422
20423 if (GET_MODE_UNIT_SIZE (vmode) > 8)
20424 return false;
20425
20426 /* Note that these are little-endian tests.
20427 We correct for big-endian later. */
6a70badb
RS
20428 if (!d->perm[0].is_constant (&odd)
20429 || (odd != 0 && odd != 1)
20430 || !d->perm.series_p (0, 2, odd, 2)
20431 || !d->perm.series_p (1, 2, nelt + odd, 2))
cc4d934f 20432 return false;
cc4d934f
JG
20433
20434 /* Success! */
20435 if (d->testing_p)
20436 return true;
20437
20438 in0 = d->op0;
20439 in1 = d->op1;
43cacb12
RS
20440 /* We don't need a big-endian lane correction for SVE; see the comment
20441 at the head of aarch64-sve.md for details. */
20442 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
cc4d934f
JG
20443 {
20444 x = in0, in0 = in1, in1 = x;
20445 odd = !odd;
20446 }
20447 out = d->target;
20448
3f8334a5
RS
20449 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20450 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
cc4d934f
JG
20451 return true;
20452}
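/* For example, with V4SI inputs the permute vector {0, 4, 2, 6} is matched
   here as TRN1 and {1, 5, 3, 7} as TRN2 (little-endian element numbering,
   with indices 4-7 referring to the second input vector).  */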
20453
7efc03fd
DP
20454/* Try to re-encode the PERM constant so it combines odd and even elements.
20455 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
20456 We retry with this new constant with the full suite of patterns. */
20457static bool
20458aarch64_evpc_reencode (struct expand_vec_perm_d *d)
20459{
20460 expand_vec_perm_d newd;
20461 unsigned HOST_WIDE_INT nelt;
20462
20463 if (d->vec_flags != VEC_ADVSIMD)
20464 return false;
20465
20466 /* Get the new mode. Always twice the size of the inner
20467 and half the elements. */
20468 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
20469 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
20470 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
20471 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
20472
20473 if (new_mode == word_mode)
20474 return false;
20475
20476 /* to_constant is safe since this routine is specific to Advanced SIMD
20477 vectors. */
20478 nelt = d->perm.length ().to_constant ();
20479
20480 vec_perm_builder newpermconst;
20481 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
20482
20483 /* Convert the perm constant if we can. Require even, odd as the pairs. */
20484 for (unsigned int i = 0; i < nelt; i += 2)
20485 {
20486 poly_int64 elt0 = d->perm[i];
20487 poly_int64 elt1 = d->perm[i + 1];
20488 poly_int64 newelt;
20489 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
20490 return false;
20491 newpermconst.quick_push (newelt.to_constant ());
20492 }
20493 newpermconst.finalize ();
20494
20495 newd.vmode = new_mode;
20496 newd.vec_flags = VEC_ADVSIMD;
20497 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
20498 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
20499 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
20500 newd.testing_p = d->testing_p;
20501 newd.one_vector_p = d->one_vector_p;
20502
20503 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
20504 return aarch64_expand_vec_perm_const_1 (&newd);
20505}
20506
cc4d934f
JG
20507/* Recognize patterns suitable for the UZP instructions. */
20508static bool
20509aarch64_evpc_uzp (struct expand_vec_perm_d *d)
20510{
6a70badb 20511 HOST_WIDE_INT odd;
cc4d934f 20512 rtx out, in0, in1, x;
ef4bddc2 20513 machine_mode vmode = d->vmode;
cc4d934f
JG
20514
20515 if (GET_MODE_UNIT_SIZE (vmode) > 8)
20516 return false;
20517
20518 /* Note that these are little-endian tests.
20519 We correct for big-endian later. */
6a70badb
RS
20520 if (!d->perm[0].is_constant (&odd)
20521 || (odd != 0 && odd != 1)
326ac20e 20522 || !d->perm.series_p (0, 1, odd, 2))
cc4d934f 20523 return false;
cc4d934f
JG
20524
20525 /* Success! */
20526 if (d->testing_p)
20527 return true;
20528
20529 in0 = d->op0;
20530 in1 = d->op1;
43cacb12
RS
20531 /* We don't need a big-endian lane correction for SVE; see the comment
20532 at the head of aarch64-sve.md for details. */
20533 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
cc4d934f
JG
20534 {
20535 x = in0, in0 = in1, in1 = x;
20536 odd = !odd;
20537 }
20538 out = d->target;
20539
3f8334a5
RS
20540 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20541 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
cc4d934f
JG
20542 return true;
20543}
20544
20545/* Recognize patterns suitable for the ZIP instructions. */
20546static bool
20547aarch64_evpc_zip (struct expand_vec_perm_d *d)
20548{
6a70badb
RS
20549 unsigned int high;
20550 poly_uint64 nelt = d->perm.length ();
cc4d934f 20551 rtx out, in0, in1, x;
ef4bddc2 20552 machine_mode vmode = d->vmode;
cc4d934f
JG
20553
20554 if (GET_MODE_UNIT_SIZE (vmode) > 8)
20555 return false;
20556
20557 /* Note that these are little-endian tests.
20558 We correct for big-endian later. */
6a70badb
RS
20559 poly_uint64 first = d->perm[0];
20560 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
20561 || !d->perm.series_p (0, 2, first, 1)
20562 || !d->perm.series_p (1, 2, first + nelt, 1))
cc4d934f 20563 return false;
6a70badb 20564 high = maybe_ne (first, 0U);
cc4d934f
JG
20565
20566 /* Success! */
20567 if (d->testing_p)
20568 return true;
20569
20570 in0 = d->op0;
20571 in1 = d->op1;
43cacb12
RS
20572 /* We don't need a big-endian lane correction for SVE; see the comment
20573 at the head of aarch64-sve.md for details. */
20574 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
cc4d934f
JG
20575 {
20576 x = in0, in0 = in1, in1 = x;
20577 high = !high;
20578 }
20579 out = d->target;
20580
3f8334a5
RS
20581 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20582 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
cc4d934f
JG
20583 return true;
20584}
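/* For example, with V4SI inputs {0, 4, 1, 5} is matched here as ZIP1 and
   {2, 6, 3, 7} as ZIP2; the corresponding UZP1/UZP2 patterns recognised
   above are {0, 2, 4, 6} and {1, 3, 5, 7}.  */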
20585
ae0533da
AL
20586/* Recognize patterns for the EXT insn. */
20587
20588static bool
20589aarch64_evpc_ext (struct expand_vec_perm_d *d)
20590{
6a70badb 20591 HOST_WIDE_INT location;
ae0533da
AL
20592 rtx offset;
20593
6a70badb
RS
20594 /* The first element always refers to the first vector.
20595 Check if the extracted indices are increasing by one. */
43cacb12
RS
20596 if (d->vec_flags == VEC_SVE_PRED
20597 || !d->perm[0].is_constant (&location)
6a70badb 20598 || !d->perm.series_p (0, 1, location, 1))
326ac20e 20599 return false;
ae0533da 20600
ae0533da
AL
20601 /* Success! */
20602 if (d->testing_p)
20603 return true;
20604
b31e65bb 20605 /* The case where (location == 0) is a no-op for both big- and little-endian,
43cacb12 20606 and is removed by the mid-end at optimization levels -O1 and higher.
b31e65bb 20607
43cacb12
RS
20608 We don't need a big-endian lane correction for SVE; see the comment
20609 at the head of aarch64-sve.md for details. */
20610 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
ae0533da
AL
20611 {
20612 /* After setup, we want the high elements of the first vector (stored
20613 at the LSB end of the register), and the low elements of the second
20614 vector (stored at the MSB end of the register). So swap. */
cb5c6c29 20615 std::swap (d->op0, d->op1);
6a70badb
RS
20616 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
20617 to_constant () is safe since this is restricted to Advanced SIMD
20618 vectors. */
20619 location = d->perm.length ().to_constant () - location;
ae0533da
AL
20620 }
20621
20622 offset = GEN_INT (location);
3f8334a5
RS
20623 emit_set_insn (d->target,
20624 gen_rtx_UNSPEC (d->vmode,
20625 gen_rtvec (3, d->op0, d->op1, offset),
20626 UNSPEC_EXT));
ae0533da
AL
20627 return true;
20628}
20629
43cacb12
RS
20630/* Recognize patterns for the REV{64,32,16} insns, which reverse elements
20631 within each 64-bit, 32-bit or 16-bit granule. */
923fcec3
AL
20632
20633static bool
43cacb12 20634aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
923fcec3 20635{
6a70badb
RS
20636 HOST_WIDE_INT diff;
20637 unsigned int i, size, unspec;
43cacb12 20638 machine_mode pred_mode;
923fcec3 20639
43cacb12
RS
20640 if (d->vec_flags == VEC_SVE_PRED
20641 || !d->one_vector_p
20642 || !d->perm[0].is_constant (&diff)
20643 || !diff)
923fcec3
AL
20644 return false;
20645
6c3ce63b
RS
20646 if (d->vec_flags & VEC_SVE_DATA)
20647 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
20648 else
20649 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
20650 if (size == 64)
43cacb12
RS
20651 {
20652 unspec = UNSPEC_REV64;
20653 pred_mode = VNx2BImode;
20654 }
6c3ce63b 20655 else if (size == 32)
43cacb12
RS
20656 {
20657 unspec = UNSPEC_REV32;
20658 pred_mode = VNx4BImode;
20659 }
6c3ce63b 20660 else if (size == 16)
43cacb12
RS
20661 {
20662 unspec = UNSPEC_REV16;
20663 pred_mode = VNx8BImode;
20664 }
3f8334a5
RS
20665 else
20666 return false;
923fcec3 20667
326ac20e
RS
20668 unsigned int step = diff + 1;
20669 for (i = 0; i < step; ++i)
20670 if (!d->perm.series_p (i, step, diff - i, step))
20671 return false;
923fcec3
AL
20672
20673 /* Success! */
20674 if (d->testing_p)
20675 return true;
20676
6c3ce63b
RS
20677 if (d->vec_flags & VEC_SVE_DATA)
20678 {
20679 rtx pred = aarch64_ptrue_reg (pred_mode);
20680 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
20681 d->target, pred, d->op0));
d7a09c44 20682 return true;
43cacb12 20683 }
d7a09c44 20684 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
43cacb12
RS
20685 emit_set_insn (d->target, src);
20686 return true;
20687}
20688
20689/* Recognize patterns for the REV insn, which reverses elements within
20690 a full vector. */
20691
20692static bool
20693aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
20694{
20695 poly_uint64 nelt = d->perm.length ();
20696
28350fd1 20697 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
43cacb12
RS
20698 return false;
20699
20700 if (!d->perm.series_p (0, 1, nelt - 1, -1))
20701 return false;
20702
20703 /* Success! */
20704 if (d->testing_p)
20705 return true;
20706
20707 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
20708 emit_set_insn (d->target, src);
923fcec3
AL
20709 return true;
20710}
20711
91bd4114
JG
20712static bool
20713aarch64_evpc_dup (struct expand_vec_perm_d *d)
20714{
91bd4114
JG
20715 rtx out = d->target;
20716 rtx in0;
6a70badb 20717 HOST_WIDE_INT elt;
ef4bddc2 20718 machine_mode vmode = d->vmode;
91bd4114
JG
20719 rtx lane;
20720
43cacb12
RS
20721 if (d->vec_flags == VEC_SVE_PRED
20722 || d->perm.encoding ().encoded_nelts () != 1
6a70badb 20723 || !d->perm[0].is_constant (&elt))
20724 return false;
20725
6c3ce63b
RS
20726 if ((d->vec_flags & VEC_SVE_DATA)
20727 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
20728 return false;
20729
326ac20e
RS
20730 /* Success! */
20731 if (d->testing_p)
20732 return true;
20733
91bd4114
JG
20734 /* The generic preparation in aarch64_expand_vec_perm_const_1
20735 swaps the operand order and the permute indices if it finds
20736 d->perm[0] to be in the second operand. Thus, we can always
20737 use d->op0 and need not do any extra arithmetic to get the
20738 correct lane number. */
20739 in0 = d->op0;
f901401e 20740 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
91bd4114 20741
3f8334a5
RS
20742 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
20743 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
20744 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
91bd4114
JG
20745 return true;
20746}
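/* For example, the single-pattern permute {2, 2, 2, 2} on a V4SI input is
   matched here and becomes a lane duplicate such as "dup v0.4s, v1.s[2]"
   (illustrative register choice).  */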
20747
88b08073
JG
20748static bool
20749aarch64_evpc_tbl (struct expand_vec_perm_d *d)
20750{
43cacb12 20751 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
ef4bddc2 20752 machine_mode vmode = d->vmode;
6a70badb
RS
20753
20754 /* Make sure that the indices are constant. */
20755 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
20756 for (unsigned int i = 0; i < encoded_nelts; ++i)
20757 if (!d->perm[i].is_constant ())
20758 return false;
88b08073 20759
88b08073
JG
20760 if (d->testing_p)
20761 return true;
20762
20763 /* Generic code will try constant permutation twice. Once with the
20764 original mode and again with the elements lowered to QImode.
20765 So wait and don't do the selector expansion ourselves. */
20766 if (vmode != V8QImode && vmode != V16QImode)
20767 return false;
20768
6a70badb
RS
20769 /* to_constant is safe since this routine is specific to Advanced SIMD
20770 vectors. */
20771 unsigned int nelt = d->perm.length ().to_constant ();
20772 for (unsigned int i = 0; i < nelt; ++i)
20773 /* If big-endian and two vectors we end up with a weird mixed-endian
20774 mode on NEON. Reverse the index within each word but not the word
20775 itself. to_constant is safe because we checked is_constant above. */
20776 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
20777 ? d->perm[i].to_constant () ^ (nelt - 1)
20778 : d->perm[i].to_constant ());
bbcc9c00 20779
88b08073
JG
20780 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
20781 sel = force_reg (vmode, sel);
20782
20783 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
20784 return true;
20785}
20786
43cacb12
RS
20787/* Try to implement D using an SVE TBL instruction. */
20788
20789static bool
20790aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
20791{
20792 unsigned HOST_WIDE_INT nelt;
20793
20794 /* Permuting two variable-length vectors could overflow the
20795 index range. */
20796 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
20797 return false;
20798
20799 if (d->testing_p)
20800 return true;
20801
d083ee47 20802 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
43cacb12 20803 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
e25c95ef
RS
20804 if (d->one_vector_p)
20805 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
20806 else
20807 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
43cacb12
RS
20808 return true;
20809}
20810
9556ef20
PK
20811/* Try to implement D using SVE SEL instruction. */
20812
20813static bool
20814aarch64_evpc_sel (struct expand_vec_perm_d *d)
20815{
20816 machine_mode vmode = d->vmode;
20817 int unit_size = GET_MODE_UNIT_SIZE (vmode);
20818
20819 if (d->vec_flags != VEC_SVE_DATA
20820 || unit_size > 8)
20821 return false;
20822
20823 int n_patterns = d->perm.encoding ().npatterns ();
20824 poly_int64 vec_len = d->perm.length ();
20825
20826 for (int i = 0; i < n_patterns; ++i)
20827 if (!known_eq (d->perm[i], i)
20828 && !known_eq (d->perm[i], vec_len + i))
20829 return false;
20830
20831 for (int i = n_patterns; i < n_patterns * 2; i++)
20832 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
20833 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
20834 return false;
20835
20836 if (d->testing_p)
20837 return true;
20838
cc68f7c2 20839 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
9556ef20 20840
b2f5b380 20841 /* Build a predicate that is true when op0 elements should be used. */
9556ef20
PK
20842 rtx_vector_builder builder (pred_mode, n_patterns, 2);
20843 for (int i = 0; i < n_patterns * 2; i++)
20844 {
20845 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
20846 : CONST0_RTX (BImode);
20847 builder.quick_push (elem);
20848 }
20849
20850 rtx const_vec = builder.build ();
20851 rtx pred = force_reg (pred_mode, const_vec);
b2f5b380
RS
20852 /* TARGET = PRED ? OP0 : OP1. */
20853 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
9556ef20
PK
20854 return true;
20855}
20856
c9c87e6f
DP
20857/* Recognize patterns suitable for the INS instructions. */
20858static bool
20859aarch64_evpc_ins (struct expand_vec_perm_d *d)
20860{
20861 machine_mode mode = d->vmode;
20862 unsigned HOST_WIDE_INT nelt;
20863
20864 if (d->vec_flags != VEC_ADVSIMD)
20865 return false;
20866
20867 /* to_constant is safe since this routine is specific to Advanced SIMD
20868 vectors. */
20869 nelt = d->perm.length ().to_constant ();
20870 rtx insv = d->op0;
20871
20872 HOST_WIDE_INT idx = -1;
20873
20874 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
20875 {
20876 HOST_WIDE_INT elt;
20877 if (!d->perm[i].is_constant (&elt))
20878 return false;
20879 if (elt == (HOST_WIDE_INT) i)
20880 continue;
20881 if (idx != -1)
20882 {
20883 idx = -1;
20884 break;
20885 }
20886 idx = i;
20887 }
20888
20889 if (idx == -1)
20890 {
20891 insv = d->op1;
20892 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
20893 {
20894 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
20895 continue;
20896 if (idx != -1)
20897 return false;
20898 idx = i;
20899 }
20900
20901 if (idx == -1)
20902 return false;
20903 }
20904
20905 if (d->testing_p)
20906 return true;
20907
20908 gcc_assert (idx != -1);
20909
20910 unsigned extractindex = d->perm[idx].to_constant ();
20911 rtx extractv = d->op0;
20912 if (extractindex >= nelt)
20913 {
20914 extractv = d->op1;
20915 extractindex -= nelt;
20916 }
20917 gcc_assert (extractindex < nelt);
20918
20919 emit_move_insn (d->target, insv);
20920 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
20921 expand_operand ops[5];
20922 create_output_operand (&ops[0], d->target, mode);
20923 create_input_operand (&ops[1], d->target, mode);
20924 create_integer_operand (&ops[2], 1 << idx);
20925 create_input_operand (&ops[3], extractv, mode);
20926 create_integer_operand (&ops[4], extractindex);
20927 expand_insn (icode, 5, ops);
20928
20929 return true;
20930}
20931
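/* For example, the V4SImode permutation {0, 1, 5, 3} is recognized above
   with IDX == 2, INSV == OP0, EXTRACTV == OP1 and EXTRACTINDEX == 1,
   giving (roughly):

	mov	vT.16b, vOP0.16b
	ins	vT.s[2], vOP1.s[1]  */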
88b08073
JG
20932static bool
20933aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
20934{
20935 /* The pattern matching functions above are written to look for a small
20936 number to begin the sequence (0, 1, N/2). If we begin with an index
20937 from the second operand, we can swap the operands. */
6a70badb
RS
20938 poly_int64 nelt = d->perm.length ();
20939 if (known_ge (d->perm[0], nelt))
88b08073 20940 {
e3342de4 20941 d->perm.rotate_inputs (1);
cb5c6c29 20942 std::swap (d->op0, d->op1);
88b08073
JG
20943 }
20944
43cacb12
RS
20945 if ((d->vec_flags == VEC_ADVSIMD
20946 || d->vec_flags == VEC_SVE_DATA
6c3ce63b 20947 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
43cacb12
RS
20948 || d->vec_flags == VEC_SVE_PRED)
20949 && known_gt (nelt, 1))
cc4d934f 20950 {
43cacb12
RS
20951 if (aarch64_evpc_rev_local (d))
20952 return true;
20953 else if (aarch64_evpc_rev_global (d))
923fcec3
AL
20954 return true;
20955 else if (aarch64_evpc_ext (d))
ae0533da 20956 return true;
f901401e
AL
20957 else if (aarch64_evpc_dup (d))
20958 return true;
ae0533da 20959 else if (aarch64_evpc_zip (d))
cc4d934f
JG
20960 return true;
20961 else if (aarch64_evpc_uzp (d))
20962 return true;
20963 else if (aarch64_evpc_trn (d))
20964 return true;
9556ef20
PK
20965 else if (aarch64_evpc_sel (d))
20966 return true;
c9c87e6f
DP
20967 else if (aarch64_evpc_ins (d))
20968 return true;
7efc03fd
DP
20969 else if (aarch64_evpc_reencode (d))
20970 return true;
43cacb12
RS
20971 if (d->vec_flags == VEC_SVE_DATA)
20972 return aarch64_evpc_sve_tbl (d);
4ec8bb67 20973 else if (d->vec_flags == VEC_ADVSIMD)
43cacb12 20974 return aarch64_evpc_tbl (d);
cc4d934f 20975 }
88b08073
JG
20976 return false;
20977}
20978
f151c9e1 20979/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
88b08073 20980
f151c9e1
RS
20981static bool
20982aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
20983 rtx op1, const vec_perm_indices &sel)
88b08073
JG
20984{
20985 struct expand_vec_perm_d d;
88b08073 20986
326ac20e 20987 /* Check whether the mask can be applied to a single vector. */
e25c95ef
RS
20988 if (sel.ninputs () == 1
20989 || (op0 && rtx_equal_p (op0, op1)))
326ac20e
RS
20990 d.one_vector_p = true;
20991 else if (sel.all_from_input_p (0))
88b08073 20992 {
326ac20e
RS
20993 d.one_vector_p = true;
20994 op1 = op0;
88b08073 20995 }
326ac20e 20996 else if (sel.all_from_input_p (1))
88b08073 20997 {
88b08073 20998 d.one_vector_p = true;
326ac20e 20999 op0 = op1;
88b08073 21000 }
326ac20e
RS
21001 else
21002 d.one_vector_p = false;
88b08073 21003
326ac20e
RS
21004 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
21005 sel.nelts_per_input ());
21006 d.vmode = vmode;
43cacb12 21007 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
326ac20e
RS
21008 d.target = target;
21009 d.op0 = op0;
21010 d.op1 = op1;
21011 d.testing_p = !target;
e3342de4 21012
f151c9e1
RS
21013 if (!d.testing_p)
21014 return aarch64_expand_vec_perm_const_1 (&d);
88b08073 21015
326ac20e 21016 rtx_insn *last = get_last_insn ();
f151c9e1 21017 bool ret = aarch64_expand_vec_perm_const_1 (&d);
326ac20e 21018 gcc_assert (last == get_last_insn ());
88b08073
JG
21019
21020 return ret;
21021}
21022
73e3da51
RS
21023/* Generate a byte permute mask for a register of mode MODE,
21024 which has NUNITS units. */
21025
668046d1 21026rtx
73e3da51 21027aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
668046d1
DS
21028{
21029 /* We have to reverse each vector because we don't have
21030 a permuted load that can reverse-load according to ABI rules. */
21031 rtx mask;
21032 rtvec v = rtvec_alloc (16);
73e3da51
RS
21033 unsigned int i, j;
21034 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
668046d1
DS
21035
21036 gcc_assert (BYTES_BIG_ENDIAN);
21037 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
21038
21039 for (i = 0; i < nunits; i++)
21040 for (j = 0; j < usize; j++)
21041 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
21042 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
21043 return force_reg (V16QImode, mask);
21044}
21045
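/* For example, for V8HImode (NUNITS == 8, unit size 2) the mask built
   above is the byte vector {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10,
   13, 12, 15, 14}, i.e. a byte-permute selector that reverses the bytes
   within each 16-bit element.  */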
4a942af6 21046/* Expand an SVE integer comparison using the SVE equivalent of:
f22d7973 21047
4a942af6
RS
21048 (set TARGET (CODE OP0 OP1)). */
21049
21050void
21051aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
f22d7973 21052{
4a942af6
RS
21053 machine_mode pred_mode = GET_MODE (target);
21054 machine_mode data_mode = GET_MODE (op0);
00fa90d9
RS
21055 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
21056 op0, op1);
21057 if (!rtx_equal_p (target, res))
21058 emit_move_insn (target, res);
f22d7973
RS
21059}
21060
43cacb12
RS
21061/* Return the UNSPEC_COND_* code for comparison CODE. */
21062
21063static unsigned int
21064aarch64_unspec_cond_code (rtx_code code)
21065{
21066 switch (code)
21067 {
21068 case NE:
cb18e86d 21069 return UNSPEC_COND_FCMNE;
43cacb12 21070 case EQ:
cb18e86d 21071 return UNSPEC_COND_FCMEQ;
43cacb12 21072 case LT:
cb18e86d 21073 return UNSPEC_COND_FCMLT;
43cacb12 21074 case GT:
cb18e86d 21075 return UNSPEC_COND_FCMGT;
43cacb12 21076 case LE:
cb18e86d 21077 return UNSPEC_COND_FCMLE;
43cacb12 21078 case GE:
cb18e86d 21079 return UNSPEC_COND_FCMGE;
4a942af6
RS
21080 case UNORDERED:
21081 return UNSPEC_COND_FCMUO;
43cacb12
RS
21082 default:
21083 gcc_unreachable ();
21084 }
21085}
21086
f22d7973 21087/* Emit:
43cacb12 21088
4a942af6 21089 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
f22d7973 21090
4a942af6
RS
21091 where <X> is the operation associated with comparison CODE.
21092 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
f22d7973
RS
21093
21094static void
4a942af6
RS
21095aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
21096 bool known_ptrue_p, rtx op0, rtx op1)
43cacb12 21097{
4a942af6 21098 rtx flag = gen_int_mode (known_ptrue_p, SImode);
f22d7973 21099 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
4a942af6 21100 gen_rtvec (4, pred, flag, op0, op1),
f22d7973
RS
21101 aarch64_unspec_cond_code (code));
21102 emit_set_insn (target, unspec);
43cacb12
RS
21103}
21104
f22d7973 21105/* Emit the SVE equivalent of:
43cacb12 21106
4a942af6
RS
21107 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
21108 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
f22d7973 21109 (set TARGET (ior:PRED_MODE TMP1 TMP2))
43cacb12 21110
4a942af6
RS
21111 where <Xi> is the operation associated with comparison CODEi.
21112 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
43cacb12
RS
21113
21114static void
4a942af6
RS
21115aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
21116 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
43cacb12 21117{
4a942af6 21118 machine_mode pred_mode = GET_MODE (pred);
43cacb12 21119 rtx tmp1 = gen_reg_rtx (pred_mode);
4a942af6 21120 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
43cacb12 21121 rtx tmp2 = gen_reg_rtx (pred_mode);
4a942af6 21122 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
f22d7973 21123 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
43cacb12
RS
21124}
21125
f22d7973 21126/* Emit the SVE equivalent of:
43cacb12 21127
4a942af6 21128 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
f22d7973 21129 (set TARGET (not TMP))
43cacb12 21130
4a942af6
RS
21131 where <X> is the operation associated with comparison CODE.
21132 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
43cacb12
RS
21133
21134static void
4a942af6
RS
21135aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
21136 bool known_ptrue_p, rtx op0, rtx op1)
43cacb12 21137{
4a942af6 21138 machine_mode pred_mode = GET_MODE (pred);
f22d7973 21139 rtx tmp = gen_reg_rtx (pred_mode);
4a942af6 21140 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
f22d7973 21141 aarch64_emit_unop (target, one_cmpl_optab, tmp);
43cacb12
RS
21142}
21143
f22d7973 21144/* Expand an SVE floating-point comparison using the SVE equivalent of:
43cacb12 21145
f22d7973 21146 (set TARGET (CODE OP0 OP1))
43cacb12
RS
21147
21148 If CAN_INVERT_P is true, the caller can also handle inverted results;
21149 return true if the result is in fact inverted. */
21150
21151bool
21152aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
21153 rtx op0, rtx op1, bool can_invert_p)
21154{
21155 machine_mode pred_mode = GET_MODE (target);
21156 machine_mode data_mode = GET_MODE (op0);
21157
16de3637 21158 rtx ptrue = aarch64_ptrue_reg (pred_mode);
43cacb12
RS
21159 switch (code)
21160 {
21161 case UNORDERED:
21162 /* UNORDERED has no immediate form. */
21163 op1 = force_reg (data_mode, op1);
f22d7973 21164 /* fall through */
43cacb12
RS
21165 case LT:
21166 case LE:
21167 case GT:
21168 case GE:
21169 case EQ:
21170 case NE:
f22d7973
RS
21171 {
21172 /* There is native support for the comparison. */
4a942af6 21173 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
f22d7973
RS
21174 return false;
21175 }
43cacb12
RS
21176
21177 case LTGT:
21178 /* This is a trapping operation (LT or GT). */
4a942af6 21179 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
43cacb12
RS
21180 return false;
21181
21182 case UNEQ:
21183 if (!flag_trapping_math)
21184 {
21185 /* This would trap for signaling NaNs. */
21186 op1 = force_reg (data_mode, op1);
4a942af6
RS
21187 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
21188 ptrue, true, op0, op1);
43cacb12
RS
21189 return false;
21190 }
21191 /* fall through */
43cacb12
RS
21192 case UNLT:
21193 case UNLE:
21194 case UNGT:
21195 case UNGE:
f22d7973
RS
21196 if (flag_trapping_math)
21197 {
21198 /* Work out which elements are ordered. */
21199 rtx ordered = gen_reg_rtx (pred_mode);
21200 op1 = force_reg (data_mode, op1);
4a942af6
RS
21201 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
21202 ptrue, true, op0, op1);
f22d7973
RS
21203
21204 /* Test the opposite condition for the ordered elements,
21205 then invert the result. */
21206 if (code == UNEQ)
21207 code = NE;
21208 else
21209 code = reverse_condition_maybe_unordered (code);
21210 if (can_invert_p)
21211 {
4a942af6
RS
21212 aarch64_emit_sve_fp_cond (target, code,
21213 ordered, false, op0, op1);
f22d7973
RS
21214 return true;
21215 }
4a942af6
RS
21216 aarch64_emit_sve_invert_fp_cond (target, code,
21217 ordered, false, op0, op1);
f22d7973
RS
21218 return false;
21219 }
21220 break;
21221
21222 case ORDERED:
21223 /* ORDERED has no immediate form. */
21224 op1 = force_reg (data_mode, op1);
21225 break;
43cacb12
RS
21226
21227 default:
21228 gcc_unreachable ();
21229 }
f22d7973
RS
21230
21231 /* There is native support for the inverse comparison. */
21232 code = reverse_condition_maybe_unordered (code);
21233 if (can_invert_p)
21234 {
4a942af6 21235 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
f22d7973
RS
21236 return true;
21237 }
4a942af6 21238 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
f22d7973 21239 return false;
43cacb12
RS
21240}
21241
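/* As an illustration of the trapping-math path above, UNLT (A, B) is
   expanded as:

     ORDERED = NOT (FCMUO PTRUE, A, B)
     TMP     = FCMGE (ORDERED, A, B)	(GE reverses UNLT)
     TARGET  = NOT TMP			(or TMP itself if CAN_INVERT_P)

   so unordered lanes and ordered lanes with A < B both end up true, and
   the potentially trapping comparison is only performed on the ordered
   elements.  */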
21242/* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
21243 of the data being selected and CMP_MODE is the mode of the values being
21244 compared. */
21245
21246void
21247aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
21248 rtx *ops)
21249{
10116ec1 21250 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
43cacb12
RS
21251 rtx pred = gen_reg_rtx (pred_mode);
21252 if (FLOAT_MODE_P (cmp_mode))
21253 {
21254 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
21255 ops[4], ops[5], true))
21256 std::swap (ops[1], ops[2]);
21257 }
21258 else
21259 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
21260
d29f7dd5
RS
21261 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
21262 ops[1] = force_reg (data_mode, ops[1]);
21263 /* The "false" value can only be zero if the "true" value is a constant. */
21264 if (register_operand (ops[1], data_mode)
21265 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
21266 ops[2] = force_reg (data_mode, ops[2]);
21267
43cacb12
RS
21268 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
21269 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
21270}
21271
99e1629f
RS
21272/* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
21273 true. However, due to issues with register allocation it is preferable
21274 to avoid tying integer scalar and FP scalar modes. Executing integer
21275 operations in general registers is better than treating them as scalar
21276 vector operations. This reduces latency and avoids redundant int<->FP
21277 moves. So tie modes if they are either the same class, or vector modes
21278 with other vector modes, vector structs or any scalar mode. */
97e1ad78 21279
99e1629f 21280static bool
ef4bddc2 21281aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
97e1ad78
JG
21282{
21283 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
21284 return true;
21285
21286 /* We specifically want to allow elements of "structure" modes to
21287 be tieable to the structure. This more general condition allows
43cacb12
RS
21288 other rarer situations too. The reason we don't extend this to
21289 predicate modes is that there are no predicate structure modes
21290 nor any specific instructions for extracting part of a predicate
21291 register. */
21292 if (aarch64_vector_data_mode_p (mode1)
21293 && aarch64_vector_data_mode_p (mode2))
61f17a5c
WD
21294 return true;
21295
21296 /* Also allow any scalar modes with vectors. */
21297 if (aarch64_vector_mode_supported_p (mode1)
21298 || aarch64_vector_mode_supported_p (mode2))
97e1ad78
JG
21299 return true;
21300
21301 return false;
21302}
21303
e2c75eea
JG
21304/* Return a new RTX holding the result of moving POINTER forward by
21305 AMOUNT bytes. */
21306
21307static rtx
6a70badb 21308aarch64_move_pointer (rtx pointer, poly_int64 amount)
e2c75eea
JG
21309{
21310 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
21311
21312 return adjust_automodify_address (pointer, GET_MODE (pointer),
21313 next, amount);
21314}
21315
21316/* Return a new RTX holding the result of moving POINTER forward by the
21317 size of the mode it points to. */
21318
21319static rtx
21320aarch64_progress_pointer (rtx pointer)
21321{
6a70badb 21322 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
e2c75eea
JG
21323}
21324
21325/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
21326 MODE bytes. */
21327
21328static void
21329aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
ef4bddc2 21330 machine_mode mode)
e2c75eea 21331{
7cda9e08
SD
21332 /* Handle 256-bit memcpy separately. We do this by emitting two adjacent
21333 128-bit copies using V4SImode so that we can use Q registers. */
21334 if (known_eq (GET_MODE_BITSIZE (mode), 256))
21335 {
21336 mode = V4SImode;
21337 rtx reg1 = gen_reg_rtx (mode);
21338 rtx reg2 = gen_reg_rtx (mode);
21339 /* "Cast" the pointers to the correct mode. */
21340 *src = adjust_address (*src, mode, 0);
21341 *dst = adjust_address (*dst, mode, 0);
21342 /* Emit the memcpy. */
21343 emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
21344 aarch64_progress_pointer (*src)));
21345 emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
21346 aarch64_progress_pointer (*dst), reg2));
21347 /* Move the pointers forward. */
21348 *src = aarch64_move_pointer (*src, 32);
21349 *dst = aarch64_move_pointer (*dst, 32);
21350 return;
21351 }
21352
e2c75eea
JG
21353 rtx reg = gen_reg_rtx (mode);
21354
21355 /* "Cast" the pointers to the correct mode. */
21356 *src = adjust_address (*src, mode, 0);
21357 *dst = adjust_address (*dst, mode, 0);
21358 /* Emit the memcpy. */
21359 emit_move_insn (reg, *src);
21360 emit_move_insn (*dst, reg);
21361 /* Move the pointers forward. */
21362 *src = aarch64_progress_pointer (*src);
21363 *dst = aarch64_progress_pointer (*dst);
21364}
21365
76715c32 21366/* Expand cpymem, as if from a __builtin_memcpy. Return true if
e2c75eea
JG
21367 we succeed, otherwise return false. */
21368
21369bool
76715c32 21370aarch64_expand_cpymem (rtx *operands)
e2c75eea 21371{
1d77928f 21372 int mode_bits;
e2c75eea
JG
21373 rtx dst = operands[0];
21374 rtx src = operands[1];
21375 rtx base;
1d77928f 21376 machine_mode cur_mode = BLKmode;
e2c75eea 21377
1d77928f 21378 /* Only expand fixed-size copies. */
e2c75eea
JG
21379 if (!CONST_INT_P (operands[2]))
21380 return false;
21381
1d77928f 21382 unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
e2c75eea 21383
1d77928f
WD
21384 /* Inline up to 256 bytes when optimizing for speed. */
21385 unsigned HOST_WIDE_INT max_copy_size = 256;
21386
21387 if (optimize_function_for_size_p (cfun))
21388 max_copy_size = 128;
e2c75eea 21389
1d77928f
WD
21390 int copy_bits = 256;
21391
21392 /* Default to 256-bit LDP/STP on large copies; for small copies, when SIMD
21393 is unavailable, or when 256-bit LDP/STP is slow, fall back to 128-bit chunks. */
21394 if (size <= 24
21395 || !TARGET_SIMD
21396 || (aarch64_tune_params.extra_tuning_flags
21397 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
21398 {
21399 copy_bits = 128;
21400 max_copy_size = max_copy_size / 2;
21401 }
21402
21403 if (size > max_copy_size)
21404 return false;
0f801e0b 21405
e2c75eea
JG
21406 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
21407 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
21408
21409 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
21410 src = adjust_automodify_address (src, VOIDmode, base, 0);
21411
1d77928f
WD
21412 /* Convert size to bits to make the rest of the code simpler. */
21413 int n = size * BITS_PER_UNIT;
f7e1d19d 21414
89c52e5e 21415 while (n > 0)
e2c75eea 21416 {
89c52e5e
TC
21417 /* Find the largest mode in which to do the copy without over-reading
21418 or over-writing. */
21419 opt_scalar_int_mode mode_iter;
21420 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
1d77928f 21421 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
89c52e5e 21422 cur_mode = mode_iter.require ();
e2c75eea 21423
89c52e5e 21424 gcc_assert (cur_mode != BLKmode);
e2c75eea 21425
89c52e5e 21426 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
1d77928f
WD
21427
21428 /* Prefer Q-register accesses for the last bytes. */
21429 if (mode_bits == 128 && copy_bits == 256)
21430 cur_mode = V4SImode;
21431
89c52e5e 21432 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
e2c75eea 21433
89c52e5e 21434 n -= mode_bits;
e2c75eea 21435
1d77928f
WD
21436 /* Emit trailing copies using overlapping unaligned accesses - this is
21437 smaller and faster. */
21438 if (n > 0 && n < copy_bits / 2)
89c52e5e 21439 {
1d77928f 21440 machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
f7e1d19d 21441 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
1d77928f 21442 gcc_assert (n_bits <= mode_bits);
89c52e5e
TC
21443 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
21444 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
21445 n = n_bits;
e2c75eea
JG
21446 }
21447 }
21448
21449 return true;
21450}
21451
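/* For example, a 15-byte copy (size <= 24, so COPY_BITS is 128) is
   expanded above as an 8-byte copy at offset 0 followed by an overlapping
   8-byte copy at offset 7, rather than 8 + 4 + 2 + 1 separate copies.  */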
54bbde55
SD
21452/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
21453 SRC is a register we have created with the duplicated value to be set. */
21454static void
21455aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
21456 machine_mode mode)
21457{
21458 /* If we are writing 128 bits or 256 bits, we can do that straight from
21459 the SIMD register we prepared. */
21460 if (known_eq (GET_MODE_BITSIZE (mode), 256))
21461 {
21462 mode = GET_MODE (src);
21463 /* "Cast" the *dst to the correct mode. */
21464 *dst = adjust_address (*dst, mode, 0);
21465 /* Emit the memset. */
21466 emit_insn (aarch64_gen_store_pair (mode, *dst, src,
21467 aarch64_progress_pointer (*dst), src));
21468
21469 /* Move the pointers forward. */
21470 *dst = aarch64_move_pointer (*dst, 32);
21471 return;
21472 }
21473 if (known_eq (GET_MODE_BITSIZE (mode), 128))
21474 {
21475 /* "Cast" the *dst to the correct mode. */
21476 *dst = adjust_address (*dst, GET_MODE (src), 0);
21477 /* Emit the memset. */
21478 emit_move_insn (*dst, src);
21479 /* Move the pointers forward. */
21480 *dst = aarch64_move_pointer (*dst, 16);
21481 return;
21482 }
21483 /* For smaller sizes, we have to extract the right amount from src. */
21484 rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
21485
21486 /* "Cast" the *dst to the correct mode. */
21487 *dst = adjust_address (*dst, mode, 0);
21488 /* Emit the memset. */
21489 emit_move_insn (*dst, reg);
21490 /* Move the pointer forward. */
21491 *dst = aarch64_progress_pointer (*dst);
21492}
21493
21494/* Expand setmem, as if from a __builtin_memset. Return true if
21495 we succeed, otherwise return false. */
21496
21497bool
21498aarch64_expand_setmem (rtx *operands)
21499{
21500 int n, mode_bits;
21501 unsigned HOST_WIDE_INT len;
21502 rtx dst = operands[0];
21503 rtx val = operands[2], src;
21504 rtx base;
21505 machine_mode cur_mode = BLKmode, next_mode;
21506
21507 /* We can't do anything smart if the amount to copy is not constant. */
21508 if (!CONST_INT_P (operands[1]))
21509 return false;
21510
21511 bool speed_p = !optimize_function_for_size_p (cfun);
21512
21513 /* Default the maximum to 256 bytes. */
21514 unsigned max_set_size = 256;
21515
21516 /* In case we are optimizing for size or if the core does not
21517 want to use STP Q regs, lower the max_set_size. */
21518 max_set_size = (!speed_p
21519 || (aarch64_tune_params.extra_tuning_flags
21520 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
21521 ? max_set_size / 2 : max_set_size;
21522
21523 len = INTVAL (operands[1]);
21524
21525 /* Upper bound check. */
21526 if (len > max_set_size)
21527 return false;
21528
21529 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
21530 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
21531
21532 /* Prepare the val using a DUP/MOVI v0.16B, val. */
21533 src = expand_vector_broadcast (V16QImode, val);
21534 src = force_reg (V16QImode, src);
21535
21536 /* Convert len to bits to make the rest of the code simpler. */
21537 n = len * BITS_PER_UNIT;
21538
21539 /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
21540 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. setmem expand
21541 pattern is only turned on for TARGET_SIMD. */
21542 const int copy_limit = (speed_p
21543 && (aarch64_tune_params.extra_tuning_flags
21544 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
21545 ? GET_MODE_BITSIZE (TImode) : 256;
21546
21547 while (n > 0)
21548 {
21549 /* Find the largest mode in which to do the copy without
21550 overwriting. */
21551 opt_scalar_int_mode mode_iter;
21552 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
21553 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
21554 cur_mode = mode_iter.require ();
21555
21556 gcc_assert (cur_mode != BLKmode);
21557
21558 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
21559 aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
21560
21561 n -= mode_bits;
21562
21563 /* Do certain trailing copies as overlapping if it's going to be
21564 cheaper, i.e. fewer instructions. For instance, for a 15-byte copy
21565 it's more efficient to do two overlapping 8-byte copies than
21566 8 + 4 + 2 + 1. */
21567 if (n > 0 && n < copy_limit / 2)
21568 {
21569 next_mode = smallest_mode_for_size (n, MODE_INT);
21570 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
21571 gcc_assert (n_bits <= mode_bits);
21572 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
21573 n = n_bits;
21574 }
21575 }
21576
21577 return true;
21578}
21579
21580
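/* For example, a 15-byte memset is expanded above as a DUP of the value
   into a SIMD register, an 8-byte store at offset 0 and an overlapping
   8-byte store at offset 7, instead of 8 + 4 + 2 + 1 separate stores.  */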
141a3ccf
KT
21581/* Split a DImode store of a CONST_INT SRC to MEM DST as two
21582 SImode stores. Handle the case when the constant has identical
21583 bottom and top halves. This is beneficial when the two stores can be
21584 merged into an STP and we avoid synthesising potentially expensive
21585 immediates twice. Return true if such a split is possible. */
21586
21587bool
21588aarch64_split_dimode_const_store (rtx dst, rtx src)
21589{
21590 rtx lo = gen_lowpart (SImode, src);
21591 rtx hi = gen_highpart_mode (SImode, DImode, src);
21592
21593 bool size_p = optimize_function_for_size_p (cfun);
21594
21595 if (!rtx_equal_p (lo, hi))
21596 return false;
21597
21598 unsigned int orig_cost
21599 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
21600 unsigned int lo_cost
21601 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
21602
21603 /* We want to transform:
21604 MOV x1, 49370
21605 MOVK x1, 0x140, lsl 16
21606 MOVK x1, 0xc0da, lsl 32
21607 MOVK x1, 0x140, lsl 48
21608 STR x1, [x0]
21609 into:
21610 MOV w1, 49370
21611 MOVK w1, 0x140, lsl 16
21612 STP w1, w1, [x0]
21613 So we want to perform this only when we save two instructions
21614 or more. When optimizing for size, however, accept any code size
21615 savings we can. */
21616 if (size_p && orig_cost <= lo_cost)
21617 return false;
21618
21619 if (!size_p
21620 && (orig_cost <= lo_cost + 1))
21621 return false;
21622
21623 rtx mem_lo = adjust_address (dst, SImode, 0);
21624 if (!aarch64_mem_pair_operand (mem_lo, SImode))
21625 return false;
21626
21627 rtx tmp_reg = gen_reg_rtx (SImode);
21628 aarch64_expand_mov_immediate (tmp_reg, lo);
21629 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
21630 /* Don't emit an explicit store pair as this may not be always profitable.
21631 Let the sched-fusion logic decide whether to merge them. */
21632 emit_move_insn (mem_lo, tmp_reg);
21633 emit_move_insn (mem_hi, tmp_reg);
21634
21635 return true;
21636}
21637
30c46053
MC
21638/* Generate RTL for a conditional branch with rtx comparison CODE in
21639 mode CC_MODE. The destination of the unlikely conditional branch
21640 is LABEL_REF. */
21641
21642void
21643aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
21644 rtx label_ref)
21645{
21646 rtx x;
21647 x = gen_rtx_fmt_ee (code, VOIDmode,
21648 gen_rtx_REG (cc_mode, CC_REGNUM),
21649 const0_rtx);
21650
21651 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
21652 gen_rtx_LABEL_REF (VOIDmode, label_ref),
21653 pc_rtx);
21654 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
21655}
21656
21657/* Generate DImode scratch registers for 128-bit (TImode) addition.
21658
21659 OP1 represents the TImode destination operand 1
21660 OP2 represents the TImode destination operand 2
21661 LOW_DEST represents the low half (DImode) of TImode operand 0
21662 LOW_IN1 represents the low half (DImode) of TImode operand 1
21663 LOW_IN2 represents the low half (DImode) of TImode operand 2
21664 HIGH_DEST represents the high half (DImode) of TImode operand 0
21665 HIGH_IN1 represents the high half (DImode) of TImode operand 1
21666 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
21667
21668void
21669aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
21670 rtx *low_in1, rtx *low_in2,
21671 rtx *high_dest, rtx *high_in1,
21672 rtx *high_in2)
21673{
21674 *low_dest = gen_reg_rtx (DImode);
21675 *low_in1 = gen_lowpart (DImode, op1);
21676 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
21677 subreg_lowpart_offset (DImode, TImode));
21678 *high_dest = gen_reg_rtx (DImode);
21679 *high_in1 = gen_highpart (DImode, op1);
21680 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
21681 subreg_highpart_offset (DImode, TImode));
21682}
21683
21684/* Generate DImode scratch registers for 128-bit (TImode) subtraction.
21685
21686 This function differs from 'aarch64_addti_scratch_regs' in that
21687 OP1 can be an immediate constant (zero). We must call
21688 subreg_highpart_offset with DImode and TImode arguments, otherwise
21689 VOIDmode will be used for the const_int which generates an internal
21690 error from subreg_size_highpart_offset which does not expect a size of zero.
21691
21692 OP1 represents the TImode destination operand 1
21693 OP2 represents the TImode destination operand 2
21694 LOW_DEST represents the low half (DImode) of TImode operand 0
21695 LOW_IN1 represents the low half (DImode) of TImode operand 1
21696 LOW_IN2 represents the low half (DImode) of TImode operand 2
21697 HIGH_DEST represents the high half (DImode) of TImode operand 0
21698 HIGH_IN1 represents the high half (DImode) of TImode operand 1
21699 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
21700
21701
21702void
21703aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
21704 rtx *low_in1, rtx *low_in2,
21705 rtx *high_dest, rtx *high_in1,
21706 rtx *high_in2)
21707{
21708 *low_dest = gen_reg_rtx (DImode);
21709 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
21710 subreg_lowpart_offset (DImode, TImode));
21711
21712 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
21713 subreg_lowpart_offset (DImode, TImode));
21714 *high_dest = gen_reg_rtx (DImode);
21715
21716 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
21717 subreg_highpart_offset (DImode, TImode));
21718 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
21719 subreg_highpart_offset (DImode, TImode));
21720}
21721
21722/* Generate RTL for 128-bit (TImode) subtraction with overflow.
21723
21724 OP0 represents the TImode destination operand 0
21725 LOW_DEST represents the low half (DImode) of TImode operand 0
21726 LOW_IN1 represents the low half (DImode) of TImode operand 1
21727 LOW_IN2 represents the low half (DImode) of TImode operand 2
21728 HIGH_DEST represents the high half (DImode) of TImode operand 0
21729 HIGH_IN1 represents the high half (DImode) of TImode operand 1
a58fe3c5
RE
21730 HIGH_IN2 represents the high half (DImode) of TImode operand 2
21731 UNSIGNED_P is true if the operation is being performed on unsigned
21732 values. */
30c46053
MC
21733void
21734aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
21735 rtx low_in2, rtx high_dest, rtx high_in1,
a58fe3c5 21736 rtx high_in2, bool unsigned_p)
30c46053
MC
21737{
21738 if (low_in2 == const0_rtx)
21739 {
21740 low_dest = low_in1;
a58fe3c5
RE
21741 high_in2 = force_reg (DImode, high_in2);
21742 if (unsigned_p)
21743 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
21744 else
21745 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
30c46053
MC
21746 }
21747 else
21748 {
d80f0a8d
JJ
21749 if (aarch64_plus_immediate (low_in2, DImode))
21750 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
21751 GEN_INT (-INTVAL (low_in2))));
21752 else
30c46053 21753 {
d80f0a8d
JJ
21754 low_in2 = force_reg (DImode, low_in2);
21755 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
30c46053 21756 }
d80f0a8d 21757 high_in2 = force_reg (DImode, high_in2);
a58fe3c5
RE
21758
21759 if (unsigned_p)
21760 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
21761 else
21762 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
30c46053
MC
21763 }
21764
21765 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
21766 emit_move_insn (gen_highpart (DImode, op0), high_dest);
21767
21768}
21769
a3125fc2
CL
21770/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
21771
21772static unsigned HOST_WIDE_INT
21773aarch64_asan_shadow_offset (void)
21774{
10078f3e
AP
21775 if (TARGET_ILP32)
21776 return (HOST_WIDE_INT_1 << 29);
21777 else
21778 return (HOST_WIDE_INT_1 << 36);
a3125fc2
CL
21779}
21780
5f3bc026 21781static rtx
cb4347e8 21782aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
5f3bc026
ZC
21783 int code, tree treeop0, tree treeop1)
21784{
c8012fbc
WD
21785 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
21786 rtx op0, op1;
5f3bc026 21787 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
c8012fbc 21788 insn_code icode;
5f3bc026
ZC
21789 struct expand_operand ops[4];
21790
5f3bc026
ZC
21791 start_sequence ();
21792 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
21793
21794 op_mode = GET_MODE (op0);
21795 if (op_mode == VOIDmode)
21796 op_mode = GET_MODE (op1);
21797
21798 switch (op_mode)
21799 {
4e10a5a7
RS
21800 case E_QImode:
21801 case E_HImode:
21802 case E_SImode:
5f3bc026
ZC
21803 cmp_mode = SImode;
21804 icode = CODE_FOR_cmpsi;
21805 break;
21806
4e10a5a7 21807 case E_DImode:
5f3bc026
ZC
21808 cmp_mode = DImode;
21809 icode = CODE_FOR_cmpdi;
21810 break;
21811
4e10a5a7 21812 case E_SFmode:
786e3c06
WD
21813 cmp_mode = SFmode;
21814 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
21815 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
21816 break;
21817
4e10a5a7 21818 case E_DFmode:
786e3c06
WD
21819 cmp_mode = DFmode;
21820 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
21821 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
21822 break;
21823
5f3bc026
ZC
21824 default:
21825 end_sequence ();
21826 return NULL_RTX;
21827 }
21828
c8012fbc
WD
21829 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
21830 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
5f3bc026
ZC
21831 if (!op0 || !op1)
21832 {
21833 end_sequence ();
21834 return NULL_RTX;
21835 }
21836 *prep_seq = get_insns ();
21837 end_sequence ();
21838
c8012fbc
WD
21839 create_fixed_operand (&ops[0], op0);
21840 create_fixed_operand (&ops[1], op1);
5f3bc026
ZC
21841
21842 start_sequence ();
c8012fbc 21843 if (!maybe_expand_insn (icode, 2, ops))
5f3bc026
ZC
21844 {
21845 end_sequence ();
21846 return NULL_RTX;
21847 }
21848 *gen_seq = get_insns ();
21849 end_sequence ();
21850
c8012fbc
WD
21851 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
21852 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
5f3bc026
ZC
21853}
21854
21855static rtx
cb4347e8
TS
21856aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
21857 int cmp_code, tree treeop0, tree treeop1, int bit_code)
5f3bc026 21858{
c8012fbc
WD
21859 rtx op0, op1, target;
21860 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
5f3bc026 21861 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
c8012fbc 21862 insn_code icode;
5f3bc026 21863 struct expand_operand ops[6];
c8012fbc 21864 int aarch64_cond;
5f3bc026 21865
cb4347e8 21866 push_to_sequence (*prep_seq);
5f3bc026
ZC
21867 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
21868
21869 op_mode = GET_MODE (op0);
21870 if (op_mode == VOIDmode)
21871 op_mode = GET_MODE (op1);
21872
21873 switch (op_mode)
21874 {
4e10a5a7
RS
21875 case E_QImode:
21876 case E_HImode:
21877 case E_SImode:
5f3bc026 21878 cmp_mode = SImode;
5f3bc026
ZC
21879 break;
21880
4e10a5a7 21881 case E_DImode:
5f3bc026 21882 cmp_mode = DImode;
5f3bc026
ZC
21883 break;
21884
4e10a5a7 21885 case E_SFmode:
786e3c06
WD
21886 cmp_mode = SFmode;
21887 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
786e3c06
WD
21888 break;
21889
4e10a5a7 21890 case E_DFmode:
786e3c06
WD
21891 cmp_mode = DFmode;
21892 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
786e3c06
WD
21893 break;
21894
5f3bc026
ZC
21895 default:
21896 end_sequence ();
21897 return NULL_RTX;
21898 }
21899
865257c4
RS
21900 icode = code_for_ccmp (cc_mode, cmp_mode);
21901
5f3bc026
ZC
21902 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
21903 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
21904 if (!op0 || !op1)
21905 {
21906 end_sequence ();
21907 return NULL_RTX;
21908 }
21909 *prep_seq = get_insns ();
21910 end_sequence ();
21911
21912 target = gen_rtx_REG (cc_mode, CC_REGNUM);
c8012fbc 21913 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
5f3bc026 21914
c8012fbc
WD
21915 if (bit_code != AND)
21916 {
865257c4
RS
21917 /* Treat the ccmp patterns as canonical and use them where possible,
21918 but fall back to ccmp_rev patterns if there's no other option. */
21919 rtx_code prev_code = GET_CODE (prev);
21920 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
21921 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
21922 && !(prev_code == EQ
21923 || prev_code == NE
21924 || prev_code == ORDERED
21925 || prev_code == UNORDERED))
21926 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
21927 else
21928 {
21929 rtx_code code = reverse_condition (prev_code);
21930 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
21931 }
c8012fbc
WD
21932 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
21933 }
21934
21935 create_fixed_operand (&ops[0], XEXP (prev, 0));
5f3bc026
ZC
21936 create_fixed_operand (&ops[1], target);
21937 create_fixed_operand (&ops[2], op0);
21938 create_fixed_operand (&ops[3], op1);
c8012fbc
WD
21939 create_fixed_operand (&ops[4], prev);
21940 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
5f3bc026 21941
cb4347e8 21942 push_to_sequence (*gen_seq);
5f3bc026
ZC
21943 if (!maybe_expand_insn (icode, 6, ops))
21944 {
21945 end_sequence ();
21946 return NULL_RTX;
21947 }
21948
21949 *gen_seq = get_insns ();
21950 end_sequence ();
21951
c8012fbc 21952 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
5f3bc026
ZC
21953}
21954
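/* Taken together, the two hooks above let the middle end expand a chain
   such as "a < b && c == d" into a compare followed by a conditional
   compare, roughly:

	cmp	w0, w1
	ccmp	w2, w3, #<nzcv>, lt
	b.eq	...

   where the <nzcv> immediate (left schematic here) supplies the flag
   values to use when the first condition fails, so that the final branch
   condition then evaluates to false.  */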
21955#undef TARGET_GEN_CCMP_FIRST
21956#define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
21957
21958#undef TARGET_GEN_CCMP_NEXT
21959#define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
21960
6a569cdd
KT
21961/* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
21962 instruction fusion of some sort. */
21963
21964static bool
21965aarch64_macro_fusion_p (void)
21966{
b175b679 21967 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
6a569cdd
KT
21968}
21969
21970
21971/* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
21972 should be kept together during scheduling. */
21973
21974static bool
21975aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
21976{
21977 rtx set_dest;
21978 rtx prev_set = single_set (prev);
21979 rtx curr_set = single_set (curr);
21980 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
21981 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
21982
21983 if (!aarch64_macro_fusion_p ())
21984 return false;
21985
d7b03373 21986 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
6a569cdd
KT
21987 {
21988 /* We are trying to match:
21989 prev (mov) == (set (reg r0) (const_int imm16))
21990 curr (movk) == (set (zero_extract (reg r0)
21991 (const_int 16)
21992 (const_int 16))
21993 (const_int imm16_1)) */
21994
21995 set_dest = SET_DEST (curr_set);
21996
21997 if (GET_CODE (set_dest) == ZERO_EXTRACT
21998 && CONST_INT_P (SET_SRC (curr_set))
21999 && CONST_INT_P (SET_SRC (prev_set))
22000 && CONST_INT_P (XEXP (set_dest, 2))
22001 && INTVAL (XEXP (set_dest, 2)) == 16
22002 && REG_P (XEXP (set_dest, 0))
22003 && REG_P (SET_DEST (prev_set))
22004 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
22005 {
22006 return true;
22007 }
22008 }
22009
d7b03373 22010 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
9bbe08fe
KT
22011 {
22012
22013 /* We're trying to match:
22014 prev (adrp) == (set (reg r1)
22015 (high (symbol_ref ("SYM"))))
22016 curr (add) == (set (reg r0)
22017 (lo_sum (reg r1)
22018 (symbol_ref ("SYM"))))
22019 Note that r0 need not necessarily be the same as r1, especially
22020 during pre-regalloc scheduling. */
22021
22022 if (satisfies_constraint_Ush (SET_SRC (prev_set))
22023 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
22024 {
22025 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
22026 && REG_P (XEXP (SET_SRC (curr_set), 0))
22027 && REGNO (XEXP (SET_SRC (curr_set), 0))
22028 == REGNO (SET_DEST (prev_set))
22029 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
22030 XEXP (SET_SRC (curr_set), 1)))
22031 return true;
22032 }
22033 }
22034
d7b03373 22035 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
cd0cb232
KT
22036 {
22037
22038 /* We're trying to match:
22039 prev (movk) == (set (zero_extract (reg r0)
22040 (const_int 16)
22041 (const_int 32))
22042 (const_int imm16_1))
22043 curr (movk) == (set (zero_extract (reg r0)
22044 (const_int 16)
22045 (const_int 48))
22046 (const_int imm16_2)) */
22047
22048 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
22049 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
22050 && REG_P (XEXP (SET_DEST (prev_set), 0))
22051 && REG_P (XEXP (SET_DEST (curr_set), 0))
22052 && REGNO (XEXP (SET_DEST (prev_set), 0))
22053 == REGNO (XEXP (SET_DEST (curr_set), 0))
22054 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
22055 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
22056 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
22057 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
22058 && CONST_INT_P (SET_SRC (prev_set))
22059 && CONST_INT_P (SET_SRC (curr_set)))
22060 return true;
22061
22062 }
d7b03373 22063 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
d8354ad7
KT
22064 {
22065 /* We're trying to match:
22066 prev (adrp) == (set (reg r0)
22067 (high (symbol_ref ("SYM"))))
22068 curr (ldr) == (set (reg r1)
22069 (mem (lo_sum (reg r0)
22070 (symbol_ref ("SYM")))))
22071 or
22072 curr (ldr) == (set (reg r1)
22073 (zero_extend (mem
22074 (lo_sum (reg r0)
22075 (symbol_ref ("SYM")))))) */
22076 if (satisfies_constraint_Ush (SET_SRC (prev_set))
22077 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
22078 {
22079 rtx curr_src = SET_SRC (curr_set);
22080
22081 if (GET_CODE (curr_src) == ZERO_EXTEND)
22082 curr_src = XEXP (curr_src, 0);
22083
22084 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
22085 && REG_P (XEXP (XEXP (curr_src, 0), 0))
22086 && REGNO (XEXP (XEXP (curr_src, 0), 0))
22087 == REGNO (SET_DEST (prev_set))
22088 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
22089 XEXP (SET_SRC (prev_set), 0)))
22090 return true;
22091 }
22092 }
cd0cb232 22093
a4f3fa71 22094 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
d7b03373 22095 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
a4f3fa71
WD
22096 && prev_set && curr_set && any_condjump_p (curr)
22097 && GET_CODE (SET_SRC (prev_set)) == COMPARE
22098 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
22099 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
22100 return true;
22101
22102 /* Fuse flag-setting ALU instructions and conditional branch. */
22103 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
3759108f
AP
22104 && any_condjump_p (curr))
22105 {
509f819a
N
22106 unsigned int condreg1, condreg2;
22107 rtx cc_reg_1;
22108 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
22109 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
22110
22111 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
22112 && prev
22113 && modified_in_p (cc_reg_1, prev))
22114 {
f8a27206
AP
22115 enum attr_type prev_type = get_attr_type (prev);
22116
509f819a
N
22117 /* FIXME: this misses some instructions that are considered simple
22118 arithmetic instructions for ThunderX. Simple shifts are missed here. */
22119 if (prev_type == TYPE_ALUS_SREG
22120 || prev_type == TYPE_ALUS_IMM
22121 || prev_type == TYPE_LOGICS_REG
22122 || prev_type == TYPE_LOGICS_IMM)
22123 return true;
22124 }
3759108f
AP
22125 }
22126
a4f3fa71 22127 /* Fuse ALU instructions and CBZ/CBNZ. */
bee7e0fc
AP
22128 if (prev_set
22129 && curr_set
a4f3fa71 22130 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
00c7c57f
JB
22131 && any_condjump_p (curr))
22132 {
22133 /* We're trying to match:
22134 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
22135 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
22136 (const_int 0))
22137 (label_ref ("SYM"))
22138 (pc)) */
22139 if (SET_DEST (curr_set) == (pc_rtx)
22140 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
22141 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
22142 && REG_P (SET_DEST (prev_set))
22143 && REGNO (SET_DEST (prev_set))
22144 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
22145 {
22146 /* Fuse ALU operations followed by conditional branch instruction. */
22147 switch (get_attr_type (prev))
22148 {
22149 case TYPE_ALU_IMM:
22150 case TYPE_ALU_SREG:
22151 case TYPE_ADC_REG:
22152 case TYPE_ADC_IMM:
22153 case TYPE_ADCS_REG:
22154 case TYPE_ADCS_IMM:
22155 case TYPE_LOGIC_REG:
22156 case TYPE_LOGIC_IMM:
22157 case TYPE_CSEL:
22158 case TYPE_ADR:
22159 case TYPE_MOV_IMM:
22160 case TYPE_SHIFT_REG:
22161 case TYPE_SHIFT_IMM:
22162 case TYPE_BFM:
22163 case TYPE_RBIT:
22164 case TYPE_REV:
22165 case TYPE_EXTEND:
22166 return true;
22167
22168 default:;
22169 }
22170 }
22171 }
22172
6a569cdd
KT
22173 return false;
22174}
22175
f2879a90
KT
22176/* Return true iff the instruction fusion described by OP is enabled. */
22177
22178bool
22179aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
22180{
22181 return (aarch64_tune_params.fusible_ops & op) != 0;
22182}
22183
350013bc
BC
22184/* If MEM is in the form of [base+offset], extract the two parts
22185 of the address and store them in BASE and OFFSET, otherwise return false
22186 after clearing BASE and OFFSET. */
22187
22188bool
22189extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
22190{
22191 rtx addr;
22192
22193 gcc_assert (MEM_P (mem));
22194
22195 addr = XEXP (mem, 0);
22196
22197 if (REG_P (addr))
22198 {
22199 *base = addr;
22200 *offset = const0_rtx;
22201 return true;
22202 }
22203
22204 if (GET_CODE (addr) == PLUS
22205 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
22206 {
22207 *base = XEXP (addr, 0);
22208 *offset = XEXP (addr, 1);
22209 return true;
22210 }
22211
22212 *base = NULL_RTX;
22213 *offset = NULL_RTX;
22214
22215 return false;
22216}
22217
22218/* Types for scheduling fusion. */
22219enum sched_fusion_type
22220{
22221 SCHED_FUSION_NONE = 0,
22222 SCHED_FUSION_LD_SIGN_EXTEND,
22223 SCHED_FUSION_LD_ZERO_EXTEND,
22224 SCHED_FUSION_LD,
22225 SCHED_FUSION_ST,
22226 SCHED_FUSION_NUM
22227};
22228
22229 /* If INSN is a load or store whose address is in the form of [base+offset],
22230 extract the two parts and store them in BASE and OFFSET. Return the
22231 scheduling fusion type of this INSN. */
22232
22233static enum sched_fusion_type
22234fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
22235{
22236 rtx x, dest, src;
22237 enum sched_fusion_type fusion = SCHED_FUSION_LD;
22238
22239 gcc_assert (INSN_P (insn));
22240 x = PATTERN (insn);
22241 if (GET_CODE (x) != SET)
22242 return SCHED_FUSION_NONE;
22243
22244 src = SET_SRC (x);
22245 dest = SET_DEST (x);
22246
abc52318
KT
22247 machine_mode dest_mode = GET_MODE (dest);
22248
22249 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
350013bc
BC
22250 return SCHED_FUSION_NONE;
22251
22252 if (GET_CODE (src) == SIGN_EXTEND)
22253 {
22254 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
22255 src = XEXP (src, 0);
3793ecc1 22256 if (!MEM_P (src) || GET_MODE (src) != SImode)
350013bc
BC
22257 return SCHED_FUSION_NONE;
22258 }
22259 else if (GET_CODE (src) == ZERO_EXTEND)
22260 {
22261 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
22262 src = XEXP (src, 0);
3793ecc1 22263 if (!MEM_P (src) || GET_MODE (src) != SImode)
350013bc
BC
22264 return SCHED_FUSION_NONE;
22265 }
22266
3793ecc1 22267 if (MEM_P (src) && REG_P (dest))
350013bc 22268 extract_base_offset_in_addr (src, base, offset);
3793ecc1 22269 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
350013bc
BC
22270 {
22271 fusion = SCHED_FUSION_ST;
22272 extract_base_offset_in_addr (dest, base, offset);
22273 }
22274 else
22275 return SCHED_FUSION_NONE;
22276
22277 if (*base == NULL_RTX || *offset == NULL_RTX)
22278 fusion = SCHED_FUSION_NONE;
22279
22280 return fusion;
22281}
22282
22283/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
22284
22285 Currently we only support fusing ldr and str instructions, so FUSION_PRI
22286 and PRI are only calculated for these instructions. For other instructions,
22287 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
22288 types of instruction fusion can be added by returning different priorities.
22289
22290 It's important that irrelevant instructions get the largest FUSION_PRI. */
22291
22292static void
22293aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
22294 int *fusion_pri, int *pri)
22295{
22296 int tmp, off_val;
22297 rtx base, offset;
22298 enum sched_fusion_type fusion;
22299
22300 gcc_assert (INSN_P (insn));
22301
22302 tmp = max_pri - 1;
22303 fusion = fusion_load_store (insn, &base, &offset);
22304 if (fusion == SCHED_FUSION_NONE)
22305 {
22306 *pri = tmp;
22307 *fusion_pri = tmp;
22308 return;
22309 }
22310
22311 /* Set FUSION_PRI according to fusion type and base register. */
22312 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
22313
22314 /* Calculate PRI. */
22315 tmp /= 2;
22316
22317 /* INSN with smaller offset goes first. */
22318 off_val = (int)(INTVAL (offset));
22319 if (off_val >= 0)
22320 tmp -= (off_val & 0xfffff);
22321 else
22322 tmp += ((- off_val) & 0xfffff);
22323
22324 *pri = tmp;
22325 return;
22326}
22327
9bca63d4
WD
22328/* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
22329 Adjust priority of sha1h instructions so they are scheduled before
22330 other SHA1 instructions. */
22331
22332static int
22333aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
22334{
22335 rtx x = PATTERN (insn);
22336
22337 if (GET_CODE (x) == SET)
22338 {
22339 x = SET_SRC (x);
22340
22341 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
22342 return priority + 10;
22343 }
22344
22345 return priority;
22346}
22347
350013bc
BC
22348/* Given OPERANDS of consecutive load/store, check if we can merge
22349 them into ldp/stp. LOAD is true if they are load instructions.
22350 MODE is the mode of memory operands. */
22351
22352bool
22353aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
b8506a8a 22354 machine_mode mode)
350013bc
BC
22355{
22356 HOST_WIDE_INT offval_1, offval_2, msize;
22357 enum reg_class rclass_1, rclass_2;
22358 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
22359
22360 if (load)
22361 {
22362 mem_1 = operands[1];
22363 mem_2 = operands[3];
22364 reg_1 = operands[0];
22365 reg_2 = operands[2];
22366 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
22367 if (REGNO (reg_1) == REGNO (reg_2))
22368 return false;
22369 }
22370 else
22371 {
22372 mem_1 = operands[0];
22373 mem_2 = operands[2];
22374 reg_1 = operands[1];
22375 reg_2 = operands[3];
22376 }
22377
bf84ac44
AP
22378 /* The mems cannot be volatile. */
22379 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
22380 return false;
22381
54700e2e
AP
22382 /* If we have SImode and slow unaligned ldp,
22383 require the alignment to be at least 8 bytes. */
22384 if (mode == SImode
22385 && (aarch64_tune_params.extra_tuning_flags
22386 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
22387 && !optimize_size
22388 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
22389 return false;
22390
350013bc
BC
22391 /* Check if the addresses are in the form of [base+offset]. */
22392 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
22393 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
22394 return false;
22395 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
22396 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
22397 return false;
22398
22399 /* Check if the bases are the same. */
22400 if (!rtx_equal_p (base_1, base_2))
22401 return false;
22402
dfe1da23
JW
22403 /* The operands must be of the same size. */
22404 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
22405 GET_MODE_SIZE (GET_MODE (mem_2))));
22406
350013bc
BC
22407 offval_1 = INTVAL (offset_1);
22408 offval_2 = INTVAL (offset_2);
6a70badb
RS
22409 /* We should only be trying this for fixed-sized modes. There is no
22410 SVE LDP/STP instruction. */
22411 msize = GET_MODE_SIZE (mode).to_constant ();
350013bc
BC
22412 /* Check if the offsets are consecutive. */
22413 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
22414 return false;
22415
22416 /* Check if the addresses are clobbered by load. */
22417 if (load)
22418 {
22419 if (reg_mentioned_p (reg_1, mem_1))
22420 return false;
22421
22422 /* In increasing order, the last load can clobber the address. */
22423 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
9b56ec11 22424 return false;
350013bc
BC
22425 }
22426
9b56ec11
JW
22427 /* One of the memory accesses must be a mempair operand.
22428 If it is not the first one, they need to be swapped by the
22429 peephole. */
22430 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
22431 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
22432 return false;
22433
350013bc
BC
22434 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
22435 rclass_1 = FP_REGS;
22436 else
22437 rclass_1 = GENERAL_REGS;
22438
22439 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
22440 rclass_2 = FP_REGS;
22441 else
22442 rclass_2 = GENERAL_REGS;
22443
22444 /* Check if the registers are of the same class. */
22445 if (rclass_1 != rclass_2)
22446 return false;
22447
22448 return true;
22449}
22450
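/* For example, the consecutive loads

	ldr	w0, [x2]
	ldr	w1, [x2, #4]

   pass the checks above (same base, consecutive offsets, distinct
   registers of the same class) and can be merged by the peephole into

	ldp	w0, w1, [x2]  */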
9b56ec11
JW
22451/* Given OPERANDS of consecutive load/store that can be merged,
22452 swap them if they are not in ascending order. */
22453void
22454aarch64_swap_ldrstr_operands (rtx* operands, bool load)
22455{
22456 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
22457 HOST_WIDE_INT offval_1, offval_2;
22458
22459 if (load)
22460 {
22461 mem_1 = operands[1];
22462 mem_2 = operands[3];
22463 }
22464 else
22465 {
22466 mem_1 = operands[0];
22467 mem_2 = operands[2];
22468 }
22469
22470 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
22471 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
22472
22473 offval_1 = INTVAL (offset_1);
22474 offval_2 = INTVAL (offset_2);
22475
22476 if (offval_1 > offval_2)
22477 {
22478 /* Irrespective of whether this is a load or a store,
22479 we do the same swap. */
22480 std::swap (operands[0], operands[2]);
22481 std::swap (operands[1], operands[3]);
22482 }
22483}
22484
d0b51297
JW
22485/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
22486 comparison between the two. */
22487int
22488aarch64_host_wide_int_compare (const void *x, const void *y)
22489{
22490 return wi::cmps (* ((const HOST_WIDE_INT *) x),
22491 * ((const HOST_WIDE_INT *) y));
22492}
22493
22494/* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
22495 other pointing to a REG rtx containing an offset, compare the offsets
22496 of the two pairs.
22497
22498 Return:
22499
22500 1 iff offset (X) > offset (Y)
22501 0 iff offset (X) == offset (Y)
22502 -1 iff offset (X) < offset (Y) */
22503int
22504aarch64_ldrstr_offset_compare (const void *x, const void *y)
22505{
22506 const rtx * operands_1 = (const rtx *) x;
22507 const rtx * operands_2 = (const rtx *) y;
22508 rtx mem_1, mem_2, base, offset_1, offset_2;
22509
22510 if (MEM_P (operands_1[0]))
22511 mem_1 = operands_1[0];
22512 else
22513 mem_1 = operands_1[1];
22514
22515 if (MEM_P (operands_2[0]))
22516 mem_2 = operands_2[0];
22517 else
22518 mem_2 = operands_2[1];
22519
22520 /* Extract the offsets. */
22521 extract_base_offset_in_addr (mem_1, &base, &offset_1);
22522 extract_base_offset_in_addr (mem_2, &base, &offset_2);
22523
22524 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
22525
22526 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
22527}
22528
350013bc
BC
22529/* Given OPERANDS of consecutive load/store, check if we can merge
22530 them into ldp/stp by adjusting the offset. LOAD is true if they
22531 are load instructions. MODE is the mode of memory operands.
22532
22533 Given below consecutive stores:
22534
22535 str w1, [xb, 0x100]
22536 str w1, [xb, 0x104]
22537 str w1, [xb, 0x108]
22538 str w1, [xb, 0x10c]
22539
22540 Though the offsets are out of the range supported by stp, we can
22541 still pair them after adjusting the offset, like:
22542
22543 add scratch, xb, 0x100
22544 stp w1, w1, [scratch]
22545 stp w1, w1, [scratch, 0x8]
22546
22547 The peephole patterns detecting this opportunity should guarantee
 22548 the scratch register is available. */
22549
22550bool
22551aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
cd91a084 22552 machine_mode mode)
350013bc 22553{
34d7854d
JW
22554 const int num_insns = 4;
22555 enum reg_class rclass;
22556 HOST_WIDE_INT offvals[num_insns], msize;
22557 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
350013bc
BC
22558
22559 if (load)
22560 {
34d7854d
JW
22561 for (int i = 0; i < num_insns; i++)
22562 {
22563 reg[i] = operands[2 * i];
22564 mem[i] = operands[2 * i + 1];
22565
22566 gcc_assert (REG_P (reg[i]));
22567 }
d0b51297
JW
22568
22569 /* Do not attempt to merge the loads if the loads clobber each other. */
22570 for (int i = 0; i < 8; i += 2)
22571 for (int j = i + 2; j < 8; j += 2)
22572 if (reg_overlap_mentioned_p (operands[i], operands[j]))
22573 return false;
350013bc
BC
22574 }
22575 else
34d7854d
JW
22576 for (int i = 0; i < num_insns; i++)
22577 {
22578 mem[i] = operands[2 * i];
22579 reg[i] = operands[2 * i + 1];
22580 }
350013bc 22581
34d7854d
JW
22582 /* Skip if memory operand is by itself valid for ldp/stp. */
22583 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
bf84ac44
AP
22584 return false;
22585
34d7854d
JW
22586 for (int i = 0; i < num_insns; i++)
22587 {
22588 /* The mems cannot be volatile. */
22589 if (MEM_VOLATILE_P (mem[i]))
22590 return false;
22591
22592 /* Check if the addresses are in the form of [base+offset]. */
22593 extract_base_offset_in_addr (mem[i], base + i, offset + i);
22594 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
22595 return false;
22596 }
22597
363b395b
JW
 22598 /* Check if the registers are of the same class. */
22599 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
22600 ? FP_REGS : GENERAL_REGS;
22601
22602 for (int i = 1; i < num_insns; i++)
22603 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
22604 {
22605 if (rclass != FP_REGS)
22606 return false;
22607 }
22608 else
22609 {
22610 if (rclass != GENERAL_REGS)
22611 return false;
22612 }
22613
22614 /* Only the last register in the order in which they occur
22615 may be clobbered by the load. */
22616 if (rclass == GENERAL_REGS && load)
22617 for (int i = 0; i < num_insns - 1; i++)
34d7854d
JW
22618 if (reg_mentioned_p (reg[i], mem[i]))
22619 return false;
350013bc
BC
22620
 22621 /* Check if the bases are the same. */
34d7854d
JW
22622 for (int i = 0; i < num_insns - 1; i++)
22623 if (!rtx_equal_p (base[i], base[i + 1]))
22624 return false;
22625
22626 for (int i = 0; i < num_insns; i++)
22627 offvals[i] = INTVAL (offset[i]);
350013bc 22628
cd91a084 22629 msize = GET_MODE_SIZE (mode).to_constant ();
d0b51297
JW
22630
22631 /* Check if the offsets can be put in the right order to do a ldp/stp. */
34d7854d
JW
22632 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
22633 aarch64_host_wide_int_compare);
d0b51297
JW
22634
22635 if (!(offvals[1] == offvals[0] + msize
22636 && offvals[3] == offvals[2] + msize))
350013bc
BC
22637 return false;
22638
d0b51297
JW
22639 /* Check that offsets are within range of each other. The ldp/stp
22640 instructions have 7 bit immediate offsets, so use 0x80. */
22641 if (offvals[2] - offvals[0] >= msize * 0x80)
22642 return false;
350013bc 22643
d0b51297
JW
22644 /* The offsets must be aligned with respect to each other. */
22645 if (offvals[0] % msize != offvals[2] % msize)
22646 return false;
22647
54700e2e
AP
22648 /* If we have SImode and slow unaligned ldp,
 22649 check that the alignment is at least 8 bytes. */
22650 if (mode == SImode
22651 && (aarch64_tune_params.extra_tuning_flags
34d7854d 22652 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
54700e2e 22653 && !optimize_size
34d7854d 22654 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
54700e2e
AP
22655 return false;
22656
350013bc
BC
22657 return true;
22658}
22659
22660/* Given OPERANDS of consecutive load/store, this function pairs them
d0b51297
JW
22661 into LDP/STP after adjusting the offset. It depends on the fact
22662 that the operands can be sorted so the offsets are correct for STP.
350013bc
BC
22663 MODE is the mode of memory operands. CODE is the rtl operator
22664 which should be applied to all memory operands, it's SIGN_EXTEND,
22665 ZERO_EXTEND or UNKNOWN. */
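/* As a concrete illustration of the base-offset heuristic below: for the
   four SImode stores at offsets 0x100, 0x104, 0x108 and 0x10c shown
   earlier, msize is 4, the midpoint 0x104 is bumped by msize to 0x108
   (the negative range of LDP/STP is one larger than the positive range),
   and the emitted sequence is roughly:

     add scratch, xb, 0x108
     stp w1, w1, [scratch, -8]
     stp w1, w1, [scratch]  */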
22666
22667bool
22668aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
cd91a084 22669 machine_mode mode, RTX_CODE code)
350013bc 22670{
d0b51297 22671 rtx base, offset_1, offset_3, t1, t2;
350013bc 22672 rtx mem_1, mem_2, mem_3, mem_4;
d0b51297
JW
22673 rtx temp_operands[8];
22674 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
22675 stp_off_upper_limit, stp_off_lower_limit, msize;
9b56ec11 22676
d0b51297
JW
22677 /* We make changes on a copy as we may still bail out. */
22678 for (int i = 0; i < 8; i ++)
22679 temp_operands[i] = operands[i];
9b56ec11 22680
d0b51297
JW
22681 /* Sort the operands. */
22682 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
9b56ec11 22683
f6af9c21
RE
22684 /* Copy the memory operands so that if we have to bail for some
22685 reason the original addresses are unchanged. */
350013bc
BC
22686 if (load)
22687 {
f6af9c21
RE
22688 mem_1 = copy_rtx (temp_operands[1]);
22689 mem_2 = copy_rtx (temp_operands[3]);
22690 mem_3 = copy_rtx (temp_operands[5]);
22691 mem_4 = copy_rtx (temp_operands[7]);
350013bc
BC
22692 }
22693 else
22694 {
f6af9c21
RE
22695 mem_1 = copy_rtx (temp_operands[0]);
22696 mem_2 = copy_rtx (temp_operands[2]);
22697 mem_3 = copy_rtx (temp_operands[4]);
22698 mem_4 = copy_rtx (temp_operands[6]);
350013bc
BC
22699 gcc_assert (code == UNKNOWN);
22700 }
22701
9b56ec11 22702 extract_base_offset_in_addr (mem_1, &base, &offset_1);
d0b51297
JW
22703 extract_base_offset_in_addr (mem_3, &base, &offset_3);
22704 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
22705 && offset_3 != NULL_RTX);
350013bc 22706
d0b51297 22707 /* Adjust offset so it can fit in LDP/STP instruction. */
cd91a084 22708 msize = GET_MODE_SIZE (mode).to_constant ();
d0b51297
JW
22709 stp_off_upper_limit = msize * (0x40 - 1);
22710 stp_off_lower_limit = - msize * 0x40;
350013bc 22711
d0b51297
JW
22712 off_val_1 = INTVAL (offset_1);
22713 off_val_3 = INTVAL (offset_3);
22714
22715 /* The base offset is optimally half way between the two STP/LDP offsets. */
22716 if (msize <= 4)
22717 base_off = (off_val_1 + off_val_3) / 2;
22718 else
 22719 /* However, due to issues with negative LDP/STP offset generation for
 22720 larger modes (DF, DI and vector modes), we must not use negative
 22721 addresses smaller than 9 signed unadjusted bits can store. This
 22722 provides the most range in this case. */
22723 base_off = off_val_1;
22724
22725 /* Adjust the base so that it is aligned with the addresses but still
22726 optimal. */
22727 if (base_off % msize != off_val_1 % msize)
22728 /* Fix the offset, bearing in mind we want to make it bigger not
22729 smaller. */
22730 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22731 else if (msize <= 4)
22732 /* The negative range of LDP/STP is one larger than the positive range. */
22733 base_off += msize;
22734
22735 /* Check if base offset is too big or too small. We can attempt to resolve
22736 this issue by setting it to the maximum value and seeing if the offsets
22737 still fit. */
22738 if (base_off >= 0x1000)
350013bc 22739 {
d0b51297
JW
22740 base_off = 0x1000 - 1;
22741 /* We must still make sure that the base offset is aligned with respect
700d4cb0 22742 to the address. But it may not be made any bigger. */
d0b51297 22743 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
350013bc
BC
22744 }
22745
d0b51297
JW
22746 /* Likewise for the case where the base is too small. */
22747 if (base_off <= -0x1000)
350013bc 22748 {
d0b51297
JW
22749 base_off = -0x1000 + 1;
22750 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
350013bc
BC
22751 }
22752
d0b51297
JW
22753 /* Offset of the first STP/LDP. */
22754 new_off_1 = off_val_1 - base_off;
22755
22756 /* Offset of the second STP/LDP. */
22757 new_off_3 = off_val_3 - base_off;
350013bc 22758
d0b51297
JW
22759 /* The offsets must be within the range of the LDP/STP instructions. */
22760 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
22761 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
350013bc
BC
22762 return false;
22763
d0b51297
JW
22764 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
22765 new_off_1), true);
22766 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
22767 new_off_1 + msize), true);
22768 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
22769 new_off_3), true);
22770 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
22771 new_off_3 + msize), true);
22772
22773 if (!aarch64_mem_pair_operand (mem_1, mode)
22774 || !aarch64_mem_pair_operand (mem_3, mode))
22775 return false;
350013bc
BC
22776
22777 if (code == ZERO_EXTEND)
22778 {
22779 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
22780 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
22781 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
22782 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
22783 }
22784 else if (code == SIGN_EXTEND)
22785 {
22786 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
22787 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
22788 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
22789 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
22790 }
22791
22792 if (load)
22793 {
d0b51297 22794 operands[0] = temp_operands[0];
350013bc 22795 operands[1] = mem_1;
d0b51297 22796 operands[2] = temp_operands[2];
350013bc 22797 operands[3] = mem_2;
d0b51297 22798 operands[4] = temp_operands[4];
350013bc 22799 operands[5] = mem_3;
d0b51297 22800 operands[6] = temp_operands[6];
350013bc
BC
22801 operands[7] = mem_4;
22802 }
22803 else
22804 {
22805 operands[0] = mem_1;
d0b51297 22806 operands[1] = temp_operands[1];
350013bc 22807 operands[2] = mem_2;
d0b51297 22808 operands[3] = temp_operands[3];
350013bc 22809 operands[4] = mem_3;
d0b51297 22810 operands[5] = temp_operands[5];
350013bc 22811 operands[6] = mem_4;
d0b51297 22812 operands[7] = temp_operands[7];
350013bc
BC
22813 }
22814
22815 /* Emit adjusting instruction. */
d0b51297 22816 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
350013bc 22817 /* Emit ldp/stp instructions. */
f7df4a84
RS
22818 t1 = gen_rtx_SET (operands[0], operands[1]);
22819 t2 = gen_rtx_SET (operands[2], operands[3]);
350013bc 22820 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
f7df4a84
RS
22821 t1 = gen_rtx_SET (operands[4], operands[5]);
22822 t2 = gen_rtx_SET (operands[6], operands[7]);
350013bc
BC
22823 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
22824 return true;
22825}
22826
76a34e3f
RS
22827/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
22828 it isn't worth branching around empty masked ops (including masked
22829 stores). */
22830
22831static bool
22832aarch64_empty_mask_is_expensive (unsigned)
22833{
22834 return false;
22835}
22836
1b1e81f8
JW
 22837/* Return 1 if a pseudo register should be created and used to hold
 22838 the GOT address for PIC code. */
22839
22840bool
22841aarch64_use_pseudo_pic_reg (void)
22842{
22843 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
22844}
22845
7b841a12
JW
22846/* Implement TARGET_UNSPEC_MAY_TRAP_P. */
22847
22848static int
22849aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
22850{
22851 switch (XINT (x, 1))
22852 {
22853 case UNSPEC_GOTSMALLPIC:
22854 case UNSPEC_GOTSMALLPIC28K:
22855 case UNSPEC_GOTTINYPIC:
22856 return 0;
22857 default:
22858 break;
22859 }
22860
22861 return default_unspec_may_trap_p (x, flags);
22862}
22863
39252973
KT
22864
22865/* If X is a positive CONST_DOUBLE with a value that is a power of 2
22866 return the log2 of that value. Otherwise return -1. */
22867
22868int
22869aarch64_fpconst_pow_of_2 (rtx x)
22870{
22871 const REAL_VALUE_TYPE *r;
22872
22873 if (!CONST_DOUBLE_P (x))
22874 return -1;
22875
22876 r = CONST_DOUBLE_REAL_VALUE (x);
22877
22878 if (REAL_VALUE_NEGATIVE (*r)
22879 || REAL_VALUE_ISNAN (*r)
22880 || REAL_VALUE_ISINF (*r)
22881 || !real_isinteger (r, DFmode))
22882 return -1;
22883
22884 return exact_log2 (real_to_integer (r));
22885}
22886
188d0079
JH
 22887/* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
 22888 power of 2 (i.e. 1/2^n), return the number of float bits n; e.g. for
 22889 X == 1/2^n, return n. Otherwise return -1. */
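/* For example, a CONST_DOUBLE of 0.125 is 1/2^3, so this returns 3; the
   result is only accepted for n between 1 and 32, and anything else
   yields -1. */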
22890
22891int
22892aarch64_fpconst_pow2_recip (rtx x)
22893{
22894 REAL_VALUE_TYPE r0;
22895
22896 if (!CONST_DOUBLE_P (x))
22897 return -1;
22898
22899 r0 = *CONST_DOUBLE_REAL_VALUE (x);
22900 if (exact_real_inverse (DFmode, &r0)
22901 && !REAL_VALUE_NEGATIVE (r0))
22902 {
22903 int ret = exact_log2 (real_to_integer (&r0));
22904 if (ret >= 1 && ret <= 32)
22905 return ret;
22906 }
22907 return -1;
22908}
22909
39252973
KT
22910/* If X is a vector of equal CONST_DOUBLE values and that value is
22911 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
22912
22913int
22914aarch64_vec_fpconst_pow_of_2 (rtx x)
22915{
6a70badb
RS
22916 int nelts;
22917 if (GET_CODE (x) != CONST_VECTOR
22918 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
39252973
KT
22919 return -1;
22920
22921 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
22922 return -1;
22923
22924 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
22925 if (firstval <= 0)
22926 return -1;
22927
6a70badb 22928 for (int i = 1; i < nelts; i++)
39252973
KT
22929 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
22930 return -1;
22931
22932 return firstval;
22933}
22934
11e554b3
JG
22935/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
22936 to float.
22937
22938 __fp16 always promotes through this hook.
22939 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
22940 through the generic excess precision logic rather than here. */
22941
c2ec330c
AL
22942static tree
22943aarch64_promoted_type (const_tree t)
22944{
11e554b3
JG
22945 if (SCALAR_FLOAT_TYPE_P (t)
22946 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
c2ec330c 22947 return float_type_node;
11e554b3 22948
c2ec330c
AL
22949 return NULL_TREE;
22950}
ee62a5a6
RS
22951
22952/* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
22953
22954static bool
9acc9cbe 22955aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
ee62a5a6
RS
22956 optimization_type opt_type)
22957{
22958 switch (op)
22959 {
22960 case rsqrt_optab:
9acc9cbe 22961 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
ee62a5a6
RS
22962
22963 default:
22964 return true;
22965 }
22966}
22967
43cacb12
RS
22968/* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
22969
22970static unsigned int
22971aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
22972 int *offset)
22973{
22974 /* Polynomial invariant 1 == (VG / 2) - 1. */
22975 gcc_assert (i == 1);
22976 *factor = 2;
22977 *offset = 1;
22978 return AARCH64_DWARF_VG;
22979}
22980
11e554b3
JG
 22981/* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
22982 if MODE is HFmode, and punt to the generic implementation otherwise. */
22983
22984static bool
7c5bd57a 22985aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
11e554b3
JG
22986{
22987 return (mode == HFmode
22988 ? true
22989 : default_libgcc_floating_mode_supported_p (mode));
22990}
22991
2e5f8203
JG
22992/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
22993 if MODE is HFmode, and punt to the generic implementation otherwise. */
22994
22995static bool
18e2a8b8 22996aarch64_scalar_mode_supported_p (scalar_mode mode)
2e5f8203
JG
22997{
22998 return (mode == HFmode
22999 ? true
23000 : default_scalar_mode_supported_p (mode));
23001}
23002
11e554b3
JG
23003/* Set the value of FLT_EVAL_METHOD.
23004 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
23005
23006 0: evaluate all operations and constants, whose semantic type has at
23007 most the range and precision of type float, to the range and
23008 precision of float; evaluate all other operations and constants to
23009 the range and precision of the semantic type;
23010
 23011 N, where _FloatN is a supported interchange floating type:
23012 evaluate all operations and constants, whose semantic type has at
23013 most the range and precision of _FloatN type, to the range and
23014 precision of the _FloatN type; evaluate all other operations and
23015 constants to the range and precision of the semantic type;
23016
23017 If we have the ARMv8.2-A extensions then we support _Float16 in native
23018 precision, so we should set this to 16. Otherwise, we support the type,
23019 but want to evaluate expressions in float precision, so set this to
23020 0. */
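/* For instance, given "_Float16 x, y; float z = x * y;", a target with the
   ARMv8.2-A FP16 instructions (TARGET_FP_F16INST) can do the multiplication
   directly in _Float16 (FLT_EVAL_METHOD 16), whereas without them the
   operands are promoted and the multiplication is done in float
   (FLT_EVAL_METHOD 0). */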
23021
23022static enum flt_eval_method
23023aarch64_excess_precision (enum excess_precision_type type)
23024{
23025 switch (type)
23026 {
23027 case EXCESS_PRECISION_TYPE_FAST:
23028 case EXCESS_PRECISION_TYPE_STANDARD:
23029 /* We can calculate either in 16-bit range and precision or
23030 32-bit range and precision. Make that decision based on whether
23031 we have native support for the ARMv8.2-A 16-bit floating-point
23032 instructions or not. */
23033 return (TARGET_FP_F16INST
23034 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
23035 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
23036 case EXCESS_PRECISION_TYPE_IMPLICIT:
23037 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
23038 default:
23039 gcc_unreachable ();
23040 }
23041 return FLT_EVAL_METHOD_UNPREDICTABLE;
23042}
23043
b48d6421
KT
23044/* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
23045 scheduled for speculative execution. Reject the long-running division
23046 and square-root instructions. */
23047
23048static bool
23049aarch64_sched_can_speculate_insn (rtx_insn *insn)
23050{
23051 switch (get_attr_type (insn))
23052 {
23053 case TYPE_SDIV:
23054 case TYPE_UDIV:
23055 case TYPE_FDIVS:
23056 case TYPE_FDIVD:
23057 case TYPE_FSQRTS:
23058 case TYPE_FSQRTD:
23059 case TYPE_NEON_FP_SQRT_S:
23060 case TYPE_NEON_FP_SQRT_D:
23061 case TYPE_NEON_FP_SQRT_S_Q:
23062 case TYPE_NEON_FP_SQRT_D_Q:
23063 case TYPE_NEON_FP_DIV_S:
23064 case TYPE_NEON_FP_DIV_D:
23065 case TYPE_NEON_FP_DIV_S_Q:
23066 case TYPE_NEON_FP_DIV_D_Q:
23067 return false;
23068 default:
23069 return true;
23070 }
23071}
23072
43cacb12
RS
23073/* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
23074
23075static int
23076aarch64_compute_pressure_classes (reg_class *classes)
23077{
23078 int i = 0;
23079 classes[i++] = GENERAL_REGS;
23080 classes[i++] = FP_REGS;
23081 /* PR_REGS isn't a useful pressure class because many predicate pseudo
23082 registers need to go in PR_LO_REGS at some point during their
23083 lifetime. Splitting it into two halves has the effect of making
23084 all predicates count against PR_LO_REGS, so that we try whenever
23085 possible to restrict the number of live predicates to 8. This
23086 greatly reduces the amount of spilling in certain loops. */
23087 classes[i++] = PR_LO_REGS;
23088 classes[i++] = PR_HI_REGS;
23089 return i;
23090}
23091
23092/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
23093
23094static bool
23095aarch64_can_change_mode_class (machine_mode from,
23096 machine_mode to, reg_class_t)
23097{
76607e7e
RS
23098 unsigned int from_flags = aarch64_classify_vector_mode (from);
23099 unsigned int to_flags = aarch64_classify_vector_mode (to);
23100
23101 bool from_sve_p = (from_flags & VEC_ANY_SVE);
23102 bool to_sve_p = (to_flags & VEC_ANY_SVE);
23103
23104 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
23105 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
23106
38e62001
RS
23107 bool from_pred_p = (from_flags & VEC_SVE_PRED);
23108 bool to_pred_p = (to_flags & VEC_SVE_PRED);
23109
23110 /* Don't allow changes between predicate modes and other modes.
23111 Only predicate registers can hold predicate modes and only
23112 non-predicate registers can hold non-predicate modes, so any
23113 attempt to mix them would require a round trip through memory. */
23114 if (from_pred_p != to_pred_p)
23115 return false;
23116
76607e7e
RS
23117 /* Don't allow changes between partial SVE modes and other modes.
23118 The contents of partial SVE modes are distributed evenly across
23119 the register, whereas GCC expects them to be clustered together. */
23120 if (from_partial_sve_p != to_partial_sve_p)
23121 return false;
23122
23123 /* Similarly reject changes between partial SVE modes that have
23124 different patterns of significant and insignificant bits. */
23125 if (from_partial_sve_p
23126 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
23127 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
23128 return false;
23129
38e62001
RS
23130 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
23131 {
23132 /* Don't allow changes between SVE modes and other modes that might
23133 be bigger than 128 bits. In particular, OImode, CImode and XImode
23134 divide into 128-bit quantities while SVE modes divide into
23135 BITS_PER_SVE_VECTOR quantities. */
23136 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
23137 return false;
23138 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
23139 return false;
23140 }
23141
002092be
RS
23142 if (BYTES_BIG_ENDIAN)
23143 {
002092be
RS
23144 /* Don't allow changes between SVE data modes and non-SVE modes.
23145 See the comment at the head of aarch64-sve.md for details. */
23146 if (from_sve_p != to_sve_p)
23147 return false;
23148
23149 /* Don't allow changes in element size: lane 0 of the new vector
23150 would not then be lane 0 of the old vector. See the comment
23151 above aarch64_maybe_expand_sve_subreg_move for a more detailed
23152 description.
23153
23154 In the worst case, this forces a register to be spilled in
23155 one mode and reloaded in the other, which handles the
23156 endianness correctly. */
23157 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
23158 return false;
23159 }
43cacb12
RS
23160 return true;
23161}
23162
5cce8171
RS
23163/* Implement TARGET_EARLY_REMAT_MODES. */
23164
23165static void
23166aarch64_select_early_remat_modes (sbitmap modes)
23167{
23168 /* SVE values are not normally live across a call, so it should be
23169 worth doing early rematerialization even in VL-specific mode. */
23170 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
5c38705d
RS
23171 if (aarch64_sve_mode_p ((machine_mode) i))
23172 bitmap_set_bit (modes, i);
5cce8171
RS
23173}
23174
c0111dc4
RE
23175/* Override the default target speculation_safe_value. */
23176static rtx
23177aarch64_speculation_safe_value (machine_mode mode,
23178 rtx result, rtx val, rtx failval)
23179{
23180 /* Maybe we should warn if falling back to hard barriers. They are
 23181 likely to be noticeably more expensive than the alternative below. */
23182 if (!aarch64_track_speculation)
23183 return default_speculation_safe_value (mode, result, val, failval);
23184
23185 if (!REG_P (val))
23186 val = copy_to_mode_reg (mode, val);
23187
23188 if (!aarch64_reg_or_zero (failval, mode))
23189 failval = copy_to_mode_reg (mode, failval);
23190
21cebf90 23191 emit_insn (gen_despeculate_copy (mode, result, val, failval));
c0111dc4
RE
23192 return result;
23193}
23194
2d56d6ba
KT
23195/* Implement TARGET_ESTIMATED_POLY_VALUE.
23196 Look into the tuning structure for an estimate.
64432b68
KT
23197 KIND specifies the type of requested estimate: min, max or likely.
23198 For cores with a known SVE width all three estimates are the same.
23199 For generic SVE tuning we want to distinguish the maximum estimate from
23200 the minimum and likely ones.
23201 The likely estimate is the same as the minimum in that case to give a
23202 conservative behavior of auto-vectorizing with SVE when it is a win
23203 even for 128-bit SVE.
23204 When SVE width information is available VAL.coeffs[1] is multiplied by
23205 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
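/* For example, under these rules a poly_int64 of 2 + 2x (coeffs {2, 2}) is
   estimated as 2 for POLY_VALUE_MIN and POLY_VALUE_LIKELY and as
   2 + 2 * 15 = 32 for POLY_VALUE_MAX with generic SVE tuning, while a
   core-specific sve_width of 256 gives 2 + 2 * (256 - 128) / 128 = 4 for
   all three kinds. */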
2d56d6ba
KT
23206
23207static HOST_WIDE_INT
64432b68
KT
23208aarch64_estimated_poly_value (poly_int64 val,
23209 poly_value_estimate_kind kind
23210 = POLY_VALUE_LIKELY)
2d56d6ba
KT
23211{
23212 enum aarch64_sve_vector_bits_enum width_source
23213 = aarch64_tune_params.sve_width;
23214
64432b68
KT
23215 /* If there is no core-specific information then the minimum and likely
23216 values are based on 128-bit vectors and the maximum is based on
23217 the architectural maximum of 2048 bits. */
2d56d6ba 23218 if (width_source == SVE_SCALABLE)
64432b68
KT
23219 switch (kind)
23220 {
23221 case POLY_VALUE_MIN:
23222 case POLY_VALUE_LIKELY:
23223 return val.coeffs[0];
23224 case POLY_VALUE_MAX:
23225 return val.coeffs[0] + val.coeffs[1] * 15;
23226 }
2d56d6ba 23227
64432b68 23228 /* If the core provides width information, use that. */
2d56d6ba
KT
23229 HOST_WIDE_INT over_128 = width_source - 128;
23230 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
23231}
23232
d9186814
SE
23233
23234/* Return true for types that could be supported as SIMD return or
23235 argument types. */
23236
23237static bool
23238supported_simd_type (tree t)
23239{
23240 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
23241 {
23242 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
23243 return s == 1 || s == 2 || s == 4 || s == 8;
23244 }
23245 return false;
23246}
23247
23248/* Return true for types that currently are supported as SIMD return
23249 or argument types. */
23250
23251static bool
23252currently_supported_simd_type (tree t, tree b)
23253{
23254 if (COMPLEX_FLOAT_TYPE_P (t))
23255 return false;
23256
23257 if (TYPE_SIZE (t) != TYPE_SIZE (b))
23258 return false;
23259
23260 return supported_simd_type (t);
23261}
23262
23263/* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
23264
23265static int
23266aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
23267 struct cgraph_simd_clone *clonei,
23268 tree base_type, int num)
23269{
23270 tree t, ret_type, arg_type;
abe93733
YY
23271 unsigned int elt_bits, count;
23272 unsigned HOST_WIDE_INT const_simdlen;
23273 poly_uint64 vec_bits;
d9186814
SE
23274
23275 if (!TARGET_SIMD)
23276 return 0;
23277
abe93733
YY
 23278 /* For now, SVE simdclones won't produce illegal simdlen, so only check
23279 const simdlens here. */
23280 if (maybe_ne (clonei->simdlen, 0U)
23281 && clonei->simdlen.is_constant (&const_simdlen)
23282 && (const_simdlen < 2
23283 || const_simdlen > 1024
23284 || (const_simdlen & (const_simdlen - 1)) != 0))
d9186814
SE
23285 {
23286 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
abe93733 23287 "unsupported simdlen %wd", const_simdlen);
d9186814
SE
23288 return 0;
23289 }
23290
23291 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
23292 if (TREE_CODE (ret_type) != VOID_TYPE
23293 && !currently_supported_simd_type (ret_type, base_type))
23294 {
23295 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
23296 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23297 "GCC does not currently support mixed size types "
23298 "for %<simd%> functions");
23299 else if (supported_simd_type (ret_type))
23300 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23301 "GCC does not currently support return type %qT "
23302 "for %<simd%> functions", ret_type);
23303 else
23304 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23305 "unsupported return type %qT for %<simd%> functions",
23306 ret_type);
23307 return 0;
23308 }
23309
23310 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
23311 {
23312 arg_type = TREE_TYPE (t);
23313
23314 if (!currently_supported_simd_type (arg_type, base_type))
23315 {
23316 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
23317 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23318 "GCC does not currently support mixed size types "
23319 "for %<simd%> functions");
23320 else
23321 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23322 "GCC does not currently support argument type %qT "
23323 "for %<simd%> functions", arg_type);
23324 return 0;
23325 }
23326 }
23327
23328 clonei->vecsize_mangle = 'n';
23329 clonei->mask_mode = VOIDmode;
23330 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
abe93733 23331 if (known_eq (clonei->simdlen, 0U))
d9186814
SE
23332 {
23333 count = 2;
23334 vec_bits = (num == 0 ? 64 : 128);
abe93733 23335 clonei->simdlen = exact_div (vec_bits, elt_bits);
d9186814
SE
23336 }
23337 else
23338 {
23339 count = 1;
23340 vec_bits = clonei->simdlen * elt_bits;
abe93733
YY
 23341 /* For now, SVE simdclones won't produce illegal simdlen, so only check
23342 const simdlens here. */
23343 if (clonei->simdlen.is_constant (&const_simdlen)
23344 && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
d9186814
SE
23345 {
23346 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
abe93733
YY
23347 "GCC does not currently support simdlen %wd for type %qT",
23348 const_simdlen, base_type);
d9186814
SE
23349 return 0;
23350 }
23351 }
23352 clonei->vecsize_int = vec_bits;
23353 clonei->vecsize_float = vec_bits;
23354 return count;
23355}
23356
23357/* Implement TARGET_SIMD_CLONE_ADJUST. */
23358
23359static void
23360aarch64_simd_clone_adjust (struct cgraph_node *node)
23361{
23362 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
23363 use the correct ABI. */
23364
23365 tree t = TREE_TYPE (node->decl);
23366 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
23367 TYPE_ATTRIBUTES (t));
23368}
23369
23370/* Implement TARGET_SIMD_CLONE_USABLE. */
23371
23372static int
23373aarch64_simd_clone_usable (struct cgraph_node *node)
23374{
23375 switch (node->simdclone->vecsize_mangle)
23376 {
23377 case 'n':
23378 if (!TARGET_SIMD)
23379 return -1;
23380 return 0;
23381 default:
23382 gcc_unreachable ();
23383 }
23384}
23385
497f281c
SE
23386/* Implement TARGET_COMP_TYPE_ATTRIBUTES */
23387
23388static int
23389aarch64_comp_type_attributes (const_tree type1, const_tree type2)
23390{
31427b97
RS
23391 auto check_attr = [&](const char *name) {
23392 tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
23393 tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
23394 if (!attr1 && !attr2)
23395 return true;
23396
23397 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
23398 };
23399
23400 if (!check_attr ("aarch64_vector_pcs"))
23401 return 0;
23402 if (!check_attr ("Advanced SIMD type"))
497f281c
SE
23403 return 0;
23404 return 1;
23405}
23406
3bac1e20
SE
23407/* Implement TARGET_GET_MULTILIB_ABI_NAME */
23408
23409static const char *
23410aarch64_get_multilib_abi_name (void)
23411{
23412 if (TARGET_BIG_END)
23413 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
23414 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
23415}
23416
e76c8e56
JJ
 23417/* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
 23418 global-variable-based guard, use the default; otherwise
 23419 return a null tree. */
23420static tree
23421aarch64_stack_protect_guard (void)
23422{
23423 if (aarch64_stack_protector_guard == SSP_GLOBAL)
23424 return default_stack_protect_guard ();
23425
23426 return NULL_TREE;
23427}
23428
98698967
SMW
23429/* Return the diagnostic message string if conversion from FROMTYPE to
23430 TOTYPE is not allowed, NULL otherwise. */
23431
23432static const char *
23433aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
23434{
23435 if (element_mode (fromtype) != element_mode (totype))
23436 {
 23437 /* Do not allow conversions to/from BFmode scalar types. */
23438 if (TYPE_MODE (fromtype) == BFmode)
23439 return N_("invalid conversion from type %<bfloat16_t%>");
23440 if (TYPE_MODE (totype) == BFmode)
23441 return N_("invalid conversion to type %<bfloat16_t%>");
23442 }
23443
23444 /* Conversion allowed. */
23445 return NULL;
23446}
23447
23448/* Return the diagnostic message string if the unary operation OP is
23449 not permitted on TYPE, NULL otherwise. */
23450
23451static const char *
23452aarch64_invalid_unary_op (int op, const_tree type)
23453{
23454 /* Reject all single-operand operations on BFmode except for &. */
23455 if (element_mode (type) == BFmode && op != ADDR_EXPR)
23456 return N_("operation not permitted on type %<bfloat16_t%>");
23457
23458 /* Operation allowed. */
23459 return NULL;
23460}
23461
23462/* Return the diagnostic message string if the binary operation OP is
23463 not permitted on TYPE1 and TYPE2, NULL otherwise. */
23464
23465static const char *
23466aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
23467 const_tree type2)
23468{
23469 /* Reject all 2-operand operations on BFmode. */
23470 if (element_mode (type1) == BFmode
23471 || element_mode (type2) == BFmode)
23472 return N_("operation not permitted on type %<bfloat16_t%>");
23473
38e62001
RS
23474 if (VECTOR_TYPE_P (type1)
23475 && VECTOR_TYPE_P (type2)
23476 && !TYPE_INDIVISIBLE_P (type1)
23477 && !TYPE_INDIVISIBLE_P (type2)
23478 && (aarch64_sve::builtin_type_p (type1)
23479 != aarch64_sve::builtin_type_p (type2)))
23480 return N_("cannot combine GNU and SVE vectors in a binary operation");
23481
98698967
SMW
23482 /* Operation allowed. */
23483 return NULL;
23484}
23485
3bd87832
MM
23486/* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
23487 compiler that we automatically ignore the top byte of our pointers, which
23488 allows using -fsanitize=hwaddress. */
23489bool
23490aarch64_can_tag_addresses ()
23491{
23492 return !TARGET_ILP32;
23493}
23494
32efff9f
SD
23495/* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
23496 section at the end if needed. */
23497#define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
23498#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
23499#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
23500void
23501aarch64_file_end_indicate_exec_stack ()
23502{
23503 file_end_indicate_exec_stack ();
23504
23505 unsigned feature_1_and = 0;
23506 if (aarch64_bti_enabled ())
23507 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
23508
23509 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
23510 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
23511
23512 if (feature_1_and)
23513 {
23514 /* Generate .note.gnu.property section. */
23515 switch_to_section (get_section (".note.gnu.property",
23516 SECTION_NOTYPE, NULL));
23517
23518 /* PT_NOTE header: namesz, descsz, type.
23519 namesz = 4 ("GNU\0")
23520 descsz = 16 (Size of the program property array)
23521 [(12 + padding) * Number of array elements]
23522 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
23523 assemble_align (POINTER_SIZE);
23524 assemble_integer (GEN_INT (4), 4, 32, 1);
23525 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
23526 assemble_integer (GEN_INT (5), 4, 32, 1);
23527
23528 /* PT_NOTE name. */
23529 assemble_string ("GNU", 4);
23530
23531 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
23532 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
23533 datasz = 4
23534 data = feature_1_and. */
23535 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
23536 assemble_integer (GEN_INT (4), 4, 32, 1);
23537 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
23538
23539 /* Pad the size of the note to the required alignment. */
23540 assemble_align (POINTER_SIZE);
23541 }
23542}
23543#undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
23544#undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
23545#undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
e76c8e56 23546
be178ecd
MM
23547/* Helper function for straight line speculation.
23548 Return what barrier should be emitted for straight line speculation
23549 mitigation.
23550 When not mitigating against straight line speculation this function returns
23551 an empty string.
23552 When mitigating against straight line speculation, use:
23553 * SB when the v8.5-A SB extension is enabled.
23554 * DSB+ISB otherwise. */
23555const char *
23556aarch64_sls_barrier (int mitigation_required)
23557{
23558 return mitigation_required
23559 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
23560 : "";
23561}
23562
96b7f495
MM
23563static GTY (()) tree aarch64_sls_shared_thunks[30];
23564static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
23565const char *indirect_symbol_names[30] = {
23566 "__call_indirect_x0",
23567 "__call_indirect_x1",
23568 "__call_indirect_x2",
23569 "__call_indirect_x3",
23570 "__call_indirect_x4",
23571 "__call_indirect_x5",
23572 "__call_indirect_x6",
23573 "__call_indirect_x7",
23574 "__call_indirect_x8",
23575 "__call_indirect_x9",
23576 "__call_indirect_x10",
23577 "__call_indirect_x11",
23578 "__call_indirect_x12",
23579 "__call_indirect_x13",
23580 "__call_indirect_x14",
23581 "__call_indirect_x15",
23582 "", /* "__call_indirect_x16", */
23583 "", /* "__call_indirect_x17", */
23584 "__call_indirect_x18",
23585 "__call_indirect_x19",
23586 "__call_indirect_x20",
23587 "__call_indirect_x21",
23588 "__call_indirect_x22",
23589 "__call_indirect_x23",
23590 "__call_indirect_x24",
23591 "__call_indirect_x25",
23592 "__call_indirect_x26",
23593 "__call_indirect_x27",
23594 "__call_indirect_x28",
23595 "__call_indirect_x29",
23596};
23597
23598/* Function to create a BLR thunk. This thunk is used to mitigate straight
23599 line speculation. Instead of a simple BLR that can be speculated past,
23600 we emit a BL to this thunk, and this thunk contains a BR to the relevant
 23601 register. These thunks have the relevant speculation barriers put after
23602 their indirect branch so that speculation is blocked.
23603
23604 We use such a thunk so the speculation barriers are kept off the
23605 architecturally executed path in order to reduce the performance overhead.
23606
23607 When optimizing for size we use stubs shared by the linked object.
23608 When optimizing for performance we emit stubs for each function in the hope
23609 that the branch predictor can better train on jumps specific for a given
23610 function. */
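/* Concretely, when BLR hardening is enabled (aarch64_harden_sls_blr_p),
   an indirect call such as "blr x1" is emitted as "bl __call_indirect_x1"
   (or a BL to a per-function label when optimizing for speed), and the
   stub itself is roughly:

     mov x16, x1
     br x16
     <speculation barrier: sb, or dsb sy; isb>  */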
23611rtx
23612aarch64_sls_create_blr_label (int regnum)
23613{
23614 gcc_assert (STUB_REGNUM_P (regnum));
23615 if (optimize_function_for_size_p (cfun))
23616 {
23617 /* For the thunks shared between different functions in this compilation
23618 unit we use a named symbol -- this is just for users to more easily
23619 understand the generated assembly. */
23620 aarch64_sls_shared_thunks_needed = true;
23621 const char *thunk_name = indirect_symbol_names[regnum];
23622 if (aarch64_sls_shared_thunks[regnum] == NULL)
23623 {
23624 /* Build a decl representing this function stub and record it for
23625 later. We build a decl here so we can use the GCC machinery for
23626 handling sections automatically (through `get_named_section` and
23627 `make_decl_one_only`). That saves us a lot of trouble handling
23628 the specifics of different output file formats. */
23629 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
23630 get_identifier (thunk_name),
23631 build_function_type_list (void_type_node,
23632 NULL_TREE));
23633 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
23634 NULL_TREE, void_type_node);
23635 TREE_PUBLIC (decl) = 1;
23636 TREE_STATIC (decl) = 1;
23637 DECL_IGNORED_P (decl) = 1;
23638 DECL_ARTIFICIAL (decl) = 1;
23639 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
23640 resolve_unique_section (decl, 0, false);
23641 aarch64_sls_shared_thunks[regnum] = decl;
23642 }
23643
23644 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
23645 }
23646
23647 if (cfun->machine->call_via[regnum] == NULL)
23648 cfun->machine->call_via[regnum]
23649 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
23650 return cfun->machine->call_via[regnum];
23651}
23652
23653/* Helper function for aarch64_sls_emit_blr_function_thunks and
23654 aarch64_sls_emit_shared_blr_thunks below. */
23655static void
23656aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
23657{
23658 /* Save in x16 and branch to that function so this transformation does
23659 not prevent jumping to `BTI c` instructions. */
23660 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
23661 asm_fprintf (out_file, "\tbr\tx16\n");
23662}
23663
23664/* Emit all BLR stubs for this particular function.
23665 Here we emit all the BLR stubs needed for the current function. Since we
23666 emit these stubs in a consecutive block we know there will be no speculation
23667 gadgets between each stub, and hence we only emit a speculation barrier at
23668 the end of the stub sequences.
23669
23670 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
23671void
23672aarch64_sls_emit_blr_function_thunks (FILE *out_file)
23673{
23674 if (! aarch64_harden_sls_blr_p ())
23675 return;
23676
23677 bool any_functions_emitted = false;
23678 /* We must save and restore the current function section since this assembly
23679 is emitted at the end of the function. This means it can be emitted *just
23680 after* the cold section of a function. That cold part would be emitted in
23681 a different section. That switch would trigger a `.cfi_endproc` directive
23682 to be emitted in the original section and a `.cfi_startproc` directive to
23683 be emitted in the new section. Switching to the original section without
23684 restoring would mean that the `.cfi_endproc` emitted as a function ends
23685 would happen in a different section -- leaving an unmatched
23686 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
23687 in the standard text section. */
23688 section *save_text_section = in_section;
23689 switch_to_section (function_section (current_function_decl));
23690 for (int regnum = 0; regnum < 30; ++regnum)
23691 {
23692 rtx specu_label = cfun->machine->call_via[regnum];
23693 if (specu_label == NULL)
23694 continue;
23695
23696 targetm.asm_out.print_operand (out_file, specu_label, 0);
23697 asm_fprintf (out_file, ":\n");
23698 aarch64_sls_emit_function_stub (out_file, regnum);
23699 any_functions_emitted = true;
23700 }
23701 if (any_functions_emitted)
 23702 /* Can use the SB if need be here, since this stub will only be used
23703 by the current function, and hence for the current target. */
23704 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
23705 switch_to_section (save_text_section);
23706}
23707
23708/* Emit shared BLR stubs for the current compilation unit.
23709 Over the course of compiling this unit we may have converted some BLR
23710 instructions to a BL to a shared stub function. This is where we emit those
23711 stub functions.
23712 This function is for the stubs shared between different functions in this
23713 compilation unit. We share when optimizing for size instead of speed.
23714
23715 This function is called through the TARGET_ASM_FILE_END hook. */
23716void
23717aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
23718{
23719 if (! aarch64_sls_shared_thunks_needed)
23720 return;
23721
23722 for (int regnum = 0; regnum < 30; ++regnum)
23723 {
23724 tree decl = aarch64_sls_shared_thunks[regnum];
23725 if (!decl)
23726 continue;
23727
23728 const char *name = indirect_symbol_names[regnum];
23729 switch_to_section (get_named_section (decl, NULL, 0));
23730 ASM_OUTPUT_ALIGN (out_file, 2);
23731 targetm.asm_out.globalize_label (out_file, name);
23732 /* Only emits if the compiler is configured for an assembler that can
23733 handle visibility directives. */
23734 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
23735 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
23736 ASM_OUTPUT_LABEL (out_file, name);
23737 aarch64_sls_emit_function_stub (out_file, regnum);
23738 /* Use the most conservative target to ensure it can always be used by any
23739 function in the translation unit. */
23740 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
23741 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
23742 }
23743}
23744
23745/* Implement TARGET_ASM_FILE_END. */
23746void
23747aarch64_asm_file_end ()
23748{
23749 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
23750 /* Since this function will be called for the ASM_FILE_END hook, we ensure
23751 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
23752 for FreeBSD) still gets called. */
23753#ifdef TARGET_ASM_FILE_END
23754 TARGET_ASM_FILE_END ();
23755#endif
23756}
23757
23758const char *
23759aarch64_indirect_call_asm (rtx addr)
23760{
23761 gcc_assert (REG_P (addr));
23762 if (aarch64_harden_sls_blr_p ())
23763 {
23764 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
23765 output_asm_insn ("bl\t%0", &stub_label);
23766 }
23767 else
23768 output_asm_insn ("blr\t%0", &addr);
23769 return "";
23770}
23771
51b86113
DM
23772/* Target-specific selftests. */
23773
23774#if CHECKING_P
23775
23776namespace selftest {
23777
23778/* Selftest for the RTL loader.
23779 Verify that the RTL loader copes with a dump from
23780 print_rtx_function. This is essentially just a test that class
23781 function_reader can handle a real dump, but it also verifies
23782 that lookup_reg_by_dump_name correctly handles hard regs.
23783 The presence of hard reg names in the dump means that the test is
23784 target-specific, hence it is in this file. */
23785
23786static void
23787aarch64_test_loading_full_dump ()
23788{
23789 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
23790
23791 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
23792
23793 rtx_insn *insn_1 = get_insn_by_uid (1);
23794 ASSERT_EQ (NOTE, GET_CODE (insn_1));
23795
23796 rtx_insn *insn_15 = get_insn_by_uid (15);
23797 ASSERT_EQ (INSN, GET_CODE (insn_15));
23798 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
23799
23800 /* Verify crtl->return_rtx. */
23801 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
23802 ASSERT_EQ (0, REGNO (crtl->return_rtx));
23803 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
23804}
23805
23806/* Run all target-specific selftests. */
23807
23808static void
23809aarch64_run_selftests (void)
23810{
23811 aarch64_test_loading_full_dump ();
23812}
23813
23814} // namespace selftest
23815
23816#endif /* #if CHECKING_P */
23817
cd0b2d36
RR
23818#undef TARGET_STACK_PROTECT_GUARD
23819#define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
23820
43e9d192
IB
23821#undef TARGET_ADDRESS_COST
23822#define TARGET_ADDRESS_COST aarch64_address_cost
23823
 23824/* This hook determines whether unnamed bitfields affect the alignment
23825 of the containing structure. The hook returns true if the structure
23826 should inherit the alignment requirements of an unnamed bitfield's
23827 type. */
23828#undef TARGET_ALIGN_ANON_BITFIELD
23829#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
23830
23831#undef TARGET_ASM_ALIGNED_DI_OP
23832#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
23833
23834#undef TARGET_ASM_ALIGNED_HI_OP
23835#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
23836
23837#undef TARGET_ASM_ALIGNED_SI_OP
23838#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
23839
23840#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
23841#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
23842 hook_bool_const_tree_hwi_hwi_const_tree_true
23843
e1c1ecb0
KT
23844#undef TARGET_ASM_FILE_START
23845#define TARGET_ASM_FILE_START aarch64_start_file
23846
43e9d192
IB
23847#undef TARGET_ASM_OUTPUT_MI_THUNK
23848#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
23849
23850#undef TARGET_ASM_SELECT_RTX_SECTION
23851#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
23852
23853#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
23854#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
23855
c292cfe5
SN
23856#undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
23857#define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
23858
43e9d192
IB
23859#undef TARGET_BUILD_BUILTIN_VA_LIST
23860#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
23861
23862#undef TARGET_CALLEE_COPIES
7256c719 23863#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
43e9d192
IB
23864
23865#undef TARGET_CAN_ELIMINATE
23866#define TARGET_CAN_ELIMINATE aarch64_can_eliminate
23867
1fd8d40c
KT
23868#undef TARGET_CAN_INLINE_P
23869#define TARGET_CAN_INLINE_P aarch64_can_inline_p
23870
43e9d192
IB
23871#undef TARGET_CANNOT_FORCE_CONST_MEM
23872#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
23873
50487d79
EM
23874#undef TARGET_CASE_VALUES_THRESHOLD
23875#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
23876
43e9d192
IB
23877#undef TARGET_CONDITIONAL_REGISTER_USAGE
23878#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
23879
38e62001
RS
23880#undef TARGET_MEMBER_TYPE_FORCES_BLK
23881#define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
23882
43e9d192
IB
23883/* Only the least significant bit is used for initialization guard
23884 variables. */
23885#undef TARGET_CXX_GUARD_MASK_BIT
23886#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
23887
23888#undef TARGET_C_MODE_FOR_SUFFIX
23889#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
23890
23891#ifdef TARGET_BIG_ENDIAN_DEFAULT
23892#undef TARGET_DEFAULT_TARGET_FLAGS
23893#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
23894#endif
23895
23896#undef TARGET_CLASS_MAX_NREGS
23897#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
23898
119103ca
JG
23899#undef TARGET_BUILTIN_DECL
23900#define TARGET_BUILTIN_DECL aarch64_builtin_decl
23901
a6fc00da
BH
23902#undef TARGET_BUILTIN_RECIPROCAL
23903#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
23904
11e554b3
JG
23905#undef TARGET_C_EXCESS_PRECISION
23906#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
23907
43e9d192
IB
23908#undef TARGET_EXPAND_BUILTIN
23909#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
23910
23911#undef TARGET_EXPAND_BUILTIN_VA_START
23912#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
23913
9697e620
JG
23914#undef TARGET_FOLD_BUILTIN
23915#define TARGET_FOLD_BUILTIN aarch64_fold_builtin
23916
43e9d192
IB
23917#undef TARGET_FUNCTION_ARG
23918#define TARGET_FUNCTION_ARG aarch64_function_arg
23919
23920#undef TARGET_FUNCTION_ARG_ADVANCE
23921#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
23922
23923#undef TARGET_FUNCTION_ARG_BOUNDARY
23924#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
23925
76b0cbf8
RS
23926#undef TARGET_FUNCTION_ARG_PADDING
23927#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
23928
43cacb12
RS
23929#undef TARGET_GET_RAW_RESULT_MODE
23930#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
23931#undef TARGET_GET_RAW_ARG_MODE
23932#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
23933
43e9d192
IB
23934#undef TARGET_FUNCTION_OK_FOR_SIBCALL
23935#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
23936
23937#undef TARGET_FUNCTION_VALUE
23938#define TARGET_FUNCTION_VALUE aarch64_function_value
23939
23940#undef TARGET_FUNCTION_VALUE_REGNO_P
23941#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
23942
fc72cba7
AL
23943#undef TARGET_GIMPLE_FOLD_BUILTIN
23944#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
0ac198d3 23945
43e9d192
IB
23946#undef TARGET_GIMPLIFY_VA_ARG_EXPR
23947#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
23948
23949#undef TARGET_INIT_BUILTINS
23950#define TARGET_INIT_BUILTINS aarch64_init_builtins
23951
c64f7d37
WD
23952#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
23953#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
23954 aarch64_ira_change_pseudo_allocno_class
23955
43e9d192
IB
23956#undef TARGET_LEGITIMATE_ADDRESS_P
23957#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
23958
23959#undef TARGET_LEGITIMATE_CONSTANT_P
23960#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
23961
491ec060
WD
23962#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
23963#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
23964 aarch64_legitimize_address_displacement
23965
43e9d192
IB
23966#undef TARGET_LIBGCC_CMP_RETURN_MODE
23967#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
23968
11e554b3
JG
23969#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
23970#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
23971aarch64_libgcc_floating_mode_supported_p
23972
ac2b960f
YZ
23973#undef TARGET_MANGLE_TYPE
23974#define TARGET_MANGLE_TYPE aarch64_mangle_type
23975
98698967
SMW
23976#undef TARGET_INVALID_CONVERSION
23977#define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
23978
23979#undef TARGET_INVALID_UNARY_OP
23980#define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
23981
23982#undef TARGET_INVALID_BINARY_OP
23983#define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
23984
65ef05d0
RS
23985#undef TARGET_VERIFY_TYPE_CONTEXT
23986#define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
23987
43e9d192
IB
23988#undef TARGET_MEMORY_MOVE_COST
23989#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
23990
26e0ff94
WD
23991#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
23992#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
23993
43e9d192
IB
23994#undef TARGET_MUST_PASS_IN_STACK
23995#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
23996
23997/* This target hook should return true if accesses to volatile bitfields
23998 should use the narrowest mode possible. It should return false if these
23999 accesses should use the bitfield container type. */
24000#undef TARGET_NARROW_VOLATILE_BITFIELD
24001#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
24002
24003#undef TARGET_OPTION_OVERRIDE
24004#define TARGET_OPTION_OVERRIDE aarch64_override_options
24005
24006#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
24007#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
24008 aarch64_override_options_after_change
24009
29a14a1a
MK
24010#undef TARGET_OFFLOAD_OPTIONS
24011#define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
24012
361fb3ee
KT
24013#undef TARGET_OPTION_SAVE
24014#define TARGET_OPTION_SAVE aarch64_option_save
24015
24016#undef TARGET_OPTION_RESTORE
24017#define TARGET_OPTION_RESTORE aarch64_option_restore
24018
24019#undef TARGET_OPTION_PRINT
24020#define TARGET_OPTION_PRINT aarch64_option_print
24021
5a2c8331
KT
24022#undef TARGET_OPTION_VALID_ATTRIBUTE_P
24023#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
24024
d78006d9
KT
24025#undef TARGET_SET_CURRENT_FUNCTION
24026#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
24027
43e9d192
IB
24028#undef TARGET_PASS_BY_REFERENCE
24029#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
24030
24031#undef TARGET_PREFERRED_RELOAD_CLASS
24032#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
24033
cee66c68
WD
24034#undef TARGET_SCHED_REASSOCIATION_WIDTH
24035#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
24036
c2ec330c
AL
24037#undef TARGET_PROMOTED_TYPE
24038#define TARGET_PROMOTED_TYPE aarch64_promoted_type
24039
43e9d192
IB
24040#undef TARGET_SECONDARY_RELOAD
24041#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
24042
24043#undef TARGET_SHIFT_TRUNCATION_MASK
24044#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
24045
24046#undef TARGET_SETUP_INCOMING_VARARGS
24047#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
24048
24049#undef TARGET_STRUCT_VALUE_RTX
24050#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
24051
24052#undef TARGET_REGISTER_MOVE_COST
24053#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
24054
24055#undef TARGET_RETURN_IN_MEMORY
24056#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
24057
24058#undef TARGET_RETURN_IN_MSB
24059#define TARGET_RETURN_IN_MSB aarch64_return_in_msb
24060
24061#undef TARGET_RTX_COSTS
7cc2145f 24062#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
43e9d192 24063
#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_VARIABLE_ISSUE
#define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_COMPATIBLE_VECTOR_TYPES_P
#define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
  aarch64_autovectorize_vector_modes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095
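
/* A minimal sketch of the range reasoning above (the operands are purely
   illustrative): the byte-sized load immediates are

     ldrb  w0, [x1, #4095]   (unsigned scaled offset: 0 to 4095)
     ldurb w0, [x1, #-256]   (signed unscaled offset: -256 to 255)

   so keeping anchors within [-256, 4095] of the anchored data preserves
   single-instruction addressing even for byte accesses.  */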

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_RELATED_MODE
#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
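
/* A quick sketch of the arithmetic behind the constant above: "bit 2"
   corresponds to the mask 1 << 2 == 4, so a pointer that refers to a
   nested-function descriptor rather than to code carries that bit, while
   the architecturally reserved bits 0 and 1 are left untouched.  */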

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_INSN_CALLEE_ABI
#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#undef TARGET_FNTYPE_ABI
#define TARGET_FNTYPE_ABI aarch64_fntype_abi

#undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
#define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true

#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust

#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END aarch64_asm_file_end

#undef TARGET_ASM_FUNCTION_EPILOGUE
#define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"