/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2021 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#define INCLUDE_ALGORITHM
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "attribs.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "cgraph.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "calls.h"
#include "varasm.h"
#include "output.h"
#include "flags.h"
#include "explow.h"
#include "expr.h"
#include "reload.h"
#include "langhooks.h"
#include "opts.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "dumpfile.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "cfgrtl.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"
#include "intl.h"
#include "expmed.h"
#include "function-abi.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-loop-niter.h"
#include "fractional-cost.h"
#include "rtlanal.h"
#include "tree-dfa.h"

/* This file should be included last.  */
#include "target-def.h"

/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)

/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN, INDEX, PTRUE };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);
  simd_immediate_info (scalar_int_mode, aarch64_svpattern);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  union
  {
    /* For MOV and MVN.  */
    struct
    {
      /* The value of each element.  */
      rtx value;

      /* The kind of shift modifier to use, and the number of bits to shift.
	 This is (LSL, 0) if no shift is needed.  */
      modifier_type modifier;
      unsigned int shift;
    } mov;

    /* For INDEX.  */
    struct
    {
      /* The value of the first element and the step to be added for each
	 subsequent element.  */
      rtx base, step;
    } index;

    /* For PTRUE.  */
    aarch64_svpattern pattern;
  } u;
};
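/* Note: of the insn_type values above, MOV and MVN describe immediates
   that can be moved (or bitwise-inverted and moved) directly into each
   element, while INDEX (a linear series BASE + I * STEP) and PTRUE
   (a predicate pattern) correspond to SVE-only constant forms.  */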

/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), insn (MOV)
{
  u.mov.value = value_in;
  u.mov.modifier = LSL;
  u.mov.shift = 0;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), insn (insn_in)
{
  u.mov.value = gen_int_mode (value_in, elt_mode_in);
  u.mov.modifier = modifier_in;
  u.mov.shift = shift_in;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to BASE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
  : elt_mode (elt_mode_in), insn (INDEX)
{
  u.index.base = base_in;
  u.index.step = step_in;
}

/* Construct a predicate that controls elements of mode ELT_MODE_IN
   and has PTRUE pattern PATTERN_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       aarch64_svpattern pattern_in)
  : elt_mode (elt_mode_in), insn (PTRUE)
{
  u.pattern = pattern_in;
}

namespace {

/* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64.  */
class pure_scalable_type_info
{
public:
  /* Represents the result of analyzing a type.  All values are nonzero,
     in the possibly forlorn hope that accidental conversions to bool
     trigger a warning.  */
  enum analysis_result
  {
    /* The type does not have an ABI identity; i.e. it doesn't contain
       at least one object whose type is a Fundamental Data Type.  */
    NO_ABI_IDENTITY = 1,

    /* The type is definitely a Pure Scalable Type.  */
    IS_PST,

    /* The type is definitely not a Pure Scalable Type.  */
    ISNT_PST,

    /* It doesn't matter for PCS purposes whether the type is a Pure
       Scalable Type or not, since the type will be handled the same
       way regardless.

       Specifically, this means that if the type is a Pure Scalable Type,
       there aren't enough argument registers to hold it, and so it will
       need to be passed or returned in memory.  If the type isn't a
       Pure Scalable Type, it's too big to be passed or returned in core
       or SIMD&FP registers, and so again will need to go in memory.  */
    DOESNT_MATTER
  };

  /* Aggregates of 17 bytes or more are normally passed and returned
     in memory, so aggregates of that size can safely be analyzed as
     DOESNT_MATTER.  We need to be able to collect enough pieces to
     represent a PST that is smaller than that.  Since predicates are
     2 bytes in size for -msve-vector-bits=128, that means we need to be
     able to store at least 8 pieces.

     We also need to be able to store enough pieces to represent
     a single vector in each vector argument register and a single
     predicate in each predicate argument register.  This means that
     we need at least 12 pieces.  */
  static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
  static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");

  /* Describes one piece of a PST.  Each piece is one of:

     - a single Scalable Vector Type (SVT)
     - a single Scalable Predicate Type (SPT)
     - a PST containing 2, 3 or 4 SVTs, with no padding

     It either represents a single built-in type or a PST formed from
     multiple homogeneous built-in types.  */
  struct piece
  {
    rtx get_rtx (unsigned int, unsigned int) const;

    /* The number of vector and predicate registers that the piece
       occupies.  One of the two is always zero.  */
    unsigned int num_zr;
    unsigned int num_pr;

    /* The mode of the registers described above.  */
    machine_mode mode;

    /* If this piece is formed from multiple homogeneous built-in types,
       this is the mode of the built-in types, otherwise it is MODE.  */
    machine_mode orig_mode;

    /* The offset in bytes of the piece from the start of the type.  */
    poly_uint64_pod offset;
  };

  /* Divides types analyzed as IS_PST into individual pieces.  The pieces
     are in memory order.  */
  auto_vec<piece, MAX_PIECES> pieces;

  unsigned int num_zr () const;
  unsigned int num_pr () const;

  rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;

  analysis_result analyze (const_tree);
  bool analyze_registers (const_tree);

private:
  analysis_result analyze_array (const_tree);
  analysis_result analyze_record (const_tree);
  void add_piece (const piece &);
};
}

/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
#endif

static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_return_in_memory_1 (const_tree);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *, bool);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
					    aarch64_addr_query_type);
static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);

/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
uint64_t aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;

#define BRANCH_PROTECT_STR_MAX 255
char *accepted_branch_protection_string = NULL;

static enum aarch64_parse_opt_result
aarch64_parse_branch_protection (const char*, char**);

/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};
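/* Both tables above are built with the usual .def "X macro" idiom: each
   entry in aarch64-fusion-pairs.def resp. aarch64-tuning-flags.def expands
   to a { name, flag } row, bracketed by explicit "none"/"all" entries and
   a null terminator, so the tables can be scanned when parsing the
   corresponding -moverride strings.  */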
8dec06f2 356
43e9d192
IB
357/* Tuning parameters. */
358
43e9d192
IB
359static const struct cpu_addrcost_table generic_addrcost_table =
360{
67747367 361 {
2fae724a 362 1, /* hi */
bd95e655
JG
363 0, /* si */
364 0, /* di */
2fae724a 365 1, /* ti */
67747367 366 },
bd95e655
JG
367 0, /* pre_modify */
368 0, /* post_modify */
6b8b0c8e
RS
369 0, /* post_modify_ld3_st3 */
370 0, /* post_modify_ld4_st4 */
bd95e655 371 0, /* register_offset */
783879e6
EM
372 0, /* register_sextend */
373 0, /* register_zextend */
bd95e655 374 0 /* imm_offset */
43e9d192
IB
375};
376
5ec1ae3b
EM
377static const struct cpu_addrcost_table exynosm1_addrcost_table =
378{
379 {
380 0, /* hi */
381 0, /* si */
382 0, /* di */
383 2, /* ti */
384 },
385 0, /* pre_modify */
386 0, /* post_modify */
6b8b0c8e
RS
387 0, /* post_modify_ld3_st3 */
388 0, /* post_modify_ld4_st4 */
5ec1ae3b
EM
389 1, /* register_offset */
390 1, /* register_sextend */
391 2, /* register_zextend */
392 0, /* imm_offset */
393};
394
381e27aa
PT
395static const struct cpu_addrcost_table xgene1_addrcost_table =
396{
381e27aa 397 {
bd95e655
JG
398 1, /* hi */
399 0, /* si */
400 0, /* di */
401 1, /* ti */
381e27aa 402 },
bd95e655 403 1, /* pre_modify */
52ddefd8 404 1, /* post_modify */
6b8b0c8e
RS
405 1, /* post_modify_ld3_st3 */
406 1, /* post_modify_ld4_st4 */
bd95e655 407 0, /* register_offset */
783879e6
EM
408 1, /* register_sextend */
409 1, /* register_zextend */
bd95e655 410 0, /* imm_offset */
381e27aa
PT
411};
412
d1261ac6 413static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
ad611a4c
VP
414{
415 {
5f407e57
AP
416 1, /* hi */
417 1, /* si */
418 1, /* di */
ad611a4c
VP
419 2, /* ti */
420 },
421 0, /* pre_modify */
422 0, /* post_modify */
6b8b0c8e
RS
423 0, /* post_modify_ld3_st3 */
424 0, /* post_modify_ld4_st4 */
ad611a4c
VP
425 2, /* register_offset */
426 3, /* register_sextend */
427 3, /* register_zextend */
428 0, /* imm_offset */
429};
430
fa477e45
AY
431static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
432{
433 {
434 1, /* hi */
435 1, /* si */
436 1, /* di */
437 2, /* ti */
438 },
439 0, /* pre_modify */
440 0, /* post_modify */
6b8b0c8e
RS
441 0, /* post_modify_ld3_st3 */
442 0, /* post_modify_ld4_st4 */
fa477e45
AY
443 2, /* register_offset */
444 3, /* register_sextend */
445 3, /* register_zextend */
446 0, /* imm_offset */
447};
448
910f72e7
SZ
449static const struct cpu_addrcost_table tsv110_addrcost_table =
450{
451 {
452 1, /* hi */
453 0, /* si */
454 0, /* di */
455 1, /* ti */
456 },
457 0, /* pre_modify */
458 0, /* post_modify */
6b8b0c8e
RS
459 0, /* post_modify_ld3_st3 */
460 0, /* post_modify_ld4_st4 */
910f72e7
SZ
461 0, /* register_offset */
462 1, /* register_sextend */
463 1, /* register_zextend */
464 0, /* imm_offset */
465};
466
8d39ea2f
LM
467static const struct cpu_addrcost_table qdf24xx_addrcost_table =
468{
469 {
470 1, /* hi */
471 1, /* si */
472 1, /* di */
473 2, /* ti */
474 },
475 1, /* pre_modify */
476 1, /* post_modify */
6b8b0c8e
RS
477 1, /* post_modify_ld3_st3 */
478 1, /* post_modify_ld4_st4 */
8d39ea2f 479 3, /* register_offset */
31508b39 480 3, /* register_sextend */
8d39ea2f
LM
481 3, /* register_zextend */
482 2, /* imm_offset */
483};
484
3f325179
QJ
485static const struct cpu_addrcost_table a64fx_addrcost_table =
486{
487 {
488 1, /* hi */
489 1, /* si */
490 1, /* di */
491 2, /* ti */
492 },
493 0, /* pre_modify */
494 0, /* post_modify */
6b8b0c8e
RS
495 0, /* post_modify_ld3_st3 */
496 0, /* post_modify_ld4_st4 */
3f325179
QJ
497 2, /* register_offset */
498 3, /* register_sextend */
499 3, /* register_zextend */
500 0, /* imm_offset */
501};
502
6b8b0c8e
RS
503static const struct cpu_addrcost_table neoversev1_addrcost_table =
504{
505 {
506 1, /* hi */
507 0, /* si */
508 0, /* di */
509 1, /* ti */
510 },
511 0, /* pre_modify */
512 0, /* post_modify */
513 3, /* post_modify_ld3_st3 */
514 3, /* post_modify_ld4_st4 */
515 0, /* register_offset */
516 0, /* register_sextend */
517 0, /* register_zextend */
518 0 /* imm_offset */
519};
520
43e9d192
IB
521static const struct cpu_regmove_cost generic_regmove_cost =
522{
bd95e655 523 1, /* GP2GP */
3969c510
WD
524 /* Avoid the use of slow int<->fp moves for spilling by setting
525 their cost higher than memmov_cost. */
bd95e655
JG
526 5, /* GP2FP */
527 5, /* FP2GP */
528 2 /* FP2FP */
43e9d192
IB
529};
530
e4a9c55a
WD
531static const struct cpu_regmove_cost cortexa57_regmove_cost =
532{
bd95e655 533 1, /* GP2GP */
e4a9c55a
WD
534 /* Avoid the use of slow int<->fp moves for spilling by setting
535 their cost higher than memmov_cost. */
bd95e655
JG
536 5, /* GP2FP */
537 5, /* FP2GP */
538 2 /* FP2FP */
e4a9c55a
WD
539};
540
541static const struct cpu_regmove_cost cortexa53_regmove_cost =
542{
bd95e655 543 1, /* GP2GP */
e4a9c55a
WD
544 /* Avoid the use of slow int<->fp moves for spilling by setting
545 their cost higher than memmov_cost. */
bd95e655
JG
546 5, /* GP2FP */
547 5, /* FP2GP */
548 2 /* FP2FP */
e4a9c55a
WD
549};
550
5ec1ae3b
EM
551static const struct cpu_regmove_cost exynosm1_regmove_cost =
552{
553 1, /* GP2GP */
554 /* Avoid the use of slow int<->fp moves for spilling by setting
555 their cost higher than memmov_cost (actual, 4 and 9). */
556 9, /* GP2FP */
557 9, /* FP2GP */
558 1 /* FP2FP */
559};
560
d1bcc29f
AP
561static const struct cpu_regmove_cost thunderx_regmove_cost =
562{
bd95e655
JG
563 2, /* GP2GP */
564 2, /* GP2FP */
565 6, /* FP2GP */
566 4 /* FP2FP */
d1bcc29f
AP
567};
568
381e27aa
PT
569static const struct cpu_regmove_cost xgene1_regmove_cost =
570{
bd95e655 571 1, /* GP2GP */
381e27aa
PT
572 /* Avoid the use of slow int<->fp moves for spilling by setting
573 their cost higher than memmov_cost. */
bd95e655
JG
574 8, /* GP2FP */
575 8, /* FP2GP */
576 2 /* FP2FP */
381e27aa
PT
577};
578
ee446d9f
JW
579static const struct cpu_regmove_cost qdf24xx_regmove_cost =
580{
581 2, /* GP2GP */
582 /* Avoid the use of int<->fp moves for spilling. */
583 6, /* GP2FP */
584 6, /* FP2GP */
585 4 /* FP2FP */
586};
587
d1261ac6 588static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
ad611a4c
VP
589{
590 1, /* GP2GP */
591 /* Avoid the use of int<->fp moves for spilling. */
2aeccecb
AY
592 5, /* GP2FP */
593 6, /* FP2GP */
594 3, /* FP2FP */
ad611a4c
VP
595};
596
fa477e45
AY
597static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
598{
599 1, /* GP2GP */
600 /* Avoid the use of int<->fp moves for spilling. */
601 4, /* GP2FP */
602 5, /* FP2GP */
603 4 /* FP2FP */
604};
605
910f72e7
SZ
606static const struct cpu_regmove_cost tsv110_regmove_cost =
607{
608 1, /* GP2GP */
609 /* Avoid the use of slow int<->fp moves for spilling by setting
610 their cost higher than memmov_cost. */
611 2, /* GP2FP */
612 3, /* FP2GP */
613 2 /* FP2FP */
614};
615
3f325179
QJ
616static const struct cpu_regmove_cost a64fx_regmove_cost =
617{
618 1, /* GP2GP */
619 /* Avoid the use of slow int<->fp moves for spilling by setting
620 their cost higher than memmov_cost. */
621 5, /* GP2FP */
622 7, /* FP2GP */
623 2 /* FP2FP */
624};
625
76e4f444
KT
626/* Generic costs for Advanced SIMD vector operations. */
627static const advsimd_vec_cost generic_advsimd_vector_cost =
628{
629 1, /* int_stmt_cost */
630 1, /* fp_stmt_cost */
b1a831f0
RS
631 0, /* ld2_st2_permute_cost */
632 0, /* ld3_st3_permute_cost */
633 0, /* ld4_st4_permute_cost */
76e4f444 634 2, /* permute_cost */
e253bb8b
RS
635 2, /* reduc_i8_cost */
636 2, /* reduc_i16_cost */
637 2, /* reduc_i32_cost */
638 2, /* reduc_i64_cost */
639 2, /* reduc_f16_cost */
640 2, /* reduc_f32_cost */
641 2, /* reduc_f64_cost */
d1ff0847 642 2, /* store_elt_extra_cost */
76e4f444
KT
643 2, /* vec_to_scalar_cost */
644 1, /* scalar_to_vec_cost */
645 1, /* align_load_cost */
646 1, /* unalign_load_cost */
647 1, /* unalign_store_cost */
648 1 /* store_cost */
649};
650
651/* Generic costs for SVE vector operations. */
652static const sve_vec_cost generic_sve_vector_cost =
653{
1282988b
RS
654 {
655 1, /* int_stmt_cost */
656 1, /* fp_stmt_cost */
b1a831f0
RS
657 0, /* ld2_st2_permute_cost */
658 0, /* ld3_st3_permute_cost */
659 0, /* ld4_st4_permute_cost */
1282988b
RS
660 2, /* permute_cost */
661 2, /* reduc_i8_cost */
662 2, /* reduc_i16_cost */
663 2, /* reduc_i32_cost */
664 2, /* reduc_i64_cost */
665 2, /* reduc_f16_cost */
666 2, /* reduc_f32_cost */
667 2, /* reduc_f64_cost */
d1ff0847 668 2, /* store_elt_extra_cost */
1282988b
RS
669 2, /* vec_to_scalar_cost */
670 1, /* scalar_to_vec_cost */
671 1, /* align_load_cost */
672 1, /* unalign_load_cost */
673 1, /* unalign_store_cost */
674 1 /* store_cost */
675 },
676 2, /* clast_cost */
677 2, /* fadda_f16_cost */
678 2, /* fadda_f32_cost */
7c679969 679 2, /* fadda_f64_cost */
78770e0e
RS
680 4, /* gather_load_x32_cost */
681 2, /* gather_load_x64_cost */
7c679969 682 1 /* scatter_store_elt_cost */
76e4f444
KT
683};
684
8990e73a 685/* Generic costs for vector insn classes. */
8990e73a
TB
686static const struct cpu_vector_cost generic_vector_cost =
687{
cd8ae5ed
AP
688 1, /* scalar_int_stmt_cost */
689 1, /* scalar_fp_stmt_cost */
bd95e655
JG
690 1, /* scalar_load_cost */
691 1, /* scalar_store_cost */
bd95e655 692 3, /* cond_taken_branch_cost */
76e4f444
KT
693 1, /* cond_not_taken_branch_cost */
694 &generic_advsimd_vector_cost, /* advsimd */
1205a8ca
RS
695 &generic_sve_vector_cost, /* sve */
696 nullptr /* issue_info */
76e4f444
KT
697};
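/* In each cpu_vector_cost, the advsimd and sve pointers supply per-ISA
   cost tables and issue_info optionally describes issue rates; a null
   pointer simply means that no SVE- or issue-specific data has been
   provided for that CPU.  */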

static const advsimd_vec_cost a64fx_advsimd_vector_cost =
{
  2, /* int_stmt_cost */
  5, /* fp_stmt_cost */
  0, /* ld2_st2_permute_cost */
  0, /* ld3_st3_permute_cost */
  0, /* ld4_st4_permute_cost */
  3, /* permute_cost */
  13, /* reduc_i8_cost */
  13, /* reduc_i16_cost */
  13, /* reduc_i32_cost */
  13, /* reduc_i64_cost */
  13, /* reduc_f16_cost */
  13, /* reduc_f32_cost */
  13, /* reduc_f64_cost */
  13, /* store_elt_extra_cost */
  13, /* vec_to_scalar_cost */
  4, /* scalar_to_vec_cost */
  6, /* align_load_cost */
  6, /* unalign_load_cost */
  1, /* unalign_store_cost */
  1 /* store_cost */
};

static const sve_vec_cost a64fx_sve_vector_cost =
{
  {
    2, /* int_stmt_cost */
    5, /* fp_stmt_cost */
    0, /* ld2_st2_permute_cost */
    0, /* ld3_st3_permute_cost */
    0, /* ld4_st4_permute_cost */
    3, /* permute_cost */
    13, /* reduc_i8_cost */
    13, /* reduc_i16_cost */
    13, /* reduc_i32_cost */
    13, /* reduc_i64_cost */
    13, /* reduc_f16_cost */
    13, /* reduc_f32_cost */
    13, /* reduc_f64_cost */
    13, /* store_elt_extra_cost */
    13, /* vec_to_scalar_cost */
    4, /* scalar_to_vec_cost */
    6, /* align_load_cost */
    6, /* unalign_load_cost */
    1, /* unalign_store_cost */
    1 /* store_cost */
  },
  13, /* clast_cost */
  13, /* fadda_f16_cost */
  13, /* fadda_f32_cost */
  13, /* fadda_f64_cost */
  64, /* gather_load_x32_cost */
  32, /* gather_load_x64_cost */
  1 /* scatter_store_elt_cost */
};

static const struct cpu_vector_cost a64fx_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  5, /* scalar_fp_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  3, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &a64fx_advsimd_vector_cost, /* advsimd */
  &a64fx_sve_vector_cost, /* sve */
  nullptr /* issue_info */
};

static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
{
  1, /* int_stmt_cost */
  3, /* fp_stmt_cost */
  0, /* ld2_st2_permute_cost */
  0, /* ld3_st3_permute_cost */
  0, /* ld4_st4_permute_cost */
  2, /* permute_cost */
  1, /* reduc_i8_cost */
  1, /* reduc_i16_cost */
  1, /* reduc_i32_cost */
  1, /* reduc_i64_cost */
  1, /* reduc_f16_cost */
  1, /* reduc_f32_cost */
  1, /* reduc_f64_cost */
  1, /* store_elt_extra_cost */
  1, /* vec_to_scalar_cost */
  1, /* scalar_to_vec_cost */
  1, /* align_load_cost */
  1, /* unalign_load_cost */
  1, /* unalign_store_cost */
  1 /* store_cost */
};

/* QDF24XX costs for vector insn classes.  */
static const struct cpu_vector_cost qdf24xx_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  1, /* scalar_load_cost */
  1, /* scalar_store_cost */
  3, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &qdf24xx_advsimd_vector_cost, /* advsimd */
  nullptr, /* sve */
  nullptr /* issue_info */
};


static const advsimd_vec_cost thunderx_advsimd_vector_cost =
{
  4, /* int_stmt_cost */
  1, /* fp_stmt_cost */
  0, /* ld2_st2_permute_cost */
  0, /* ld3_st3_permute_cost */
  0, /* ld4_st4_permute_cost */
  4, /* permute_cost */
  2, /* reduc_i8_cost */
  2, /* reduc_i16_cost */
  2, /* reduc_i32_cost */
  2, /* reduc_i64_cost */
  2, /* reduc_f16_cost */
  2, /* reduc_f32_cost */
  2, /* reduc_f64_cost */
  2, /* store_elt_extra_cost */
  2, /* vec_to_scalar_cost */
  2, /* scalar_to_vec_cost */
  3, /* align_load_cost */
  5, /* unalign_load_cost */
  5, /* unalign_store_cost */
  1 /* store_cost */
};

/* ThunderX costs for vector insn classes.  */
static const struct cpu_vector_cost thunderx_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  3, /* scalar_load_cost */
  1, /* scalar_store_cost */
  3, /* cond_taken_branch_cost */
  3, /* cond_not_taken_branch_cost */
  &thunderx_advsimd_vector_cost, /* advsimd */
  nullptr, /* sve */
  nullptr /* issue_info */
};

static const advsimd_vec_cost tsv110_advsimd_vector_cost =
{
  2, /* int_stmt_cost */
  2, /* fp_stmt_cost */
  0, /* ld2_st2_permute_cost */
  0, /* ld3_st3_permute_cost */
  0, /* ld4_st4_permute_cost */
  2, /* permute_cost */
  3, /* reduc_i8_cost */
  3, /* reduc_i16_cost */
  3, /* reduc_i32_cost */
  3, /* reduc_i64_cost */
  3, /* reduc_f16_cost */
  3, /* reduc_f32_cost */
  3, /* reduc_f64_cost */
  3, /* store_elt_extra_cost */
  3, /* vec_to_scalar_cost */
  2, /* scalar_to_vec_cost */
  5, /* align_load_cost */
  5, /* unalign_load_cost */
  1, /* unalign_store_cost */
  1 /* store_cost */
};

static const struct cpu_vector_cost tsv110_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  1, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &tsv110_advsimd_vector_cost, /* advsimd */
  nullptr, /* sve */
  nullptr /* issue_info */
};

static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
{
  2, /* int_stmt_cost */
  2, /* fp_stmt_cost */
  0, /* ld2_st2_permute_cost */
  0, /* ld3_st3_permute_cost */
  0, /* ld4_st4_permute_cost */
  3, /* permute_cost */
  8, /* reduc_i8_cost */
  8, /* reduc_i16_cost */
  8, /* reduc_i32_cost */
  8, /* reduc_i64_cost */
  8, /* reduc_f16_cost */
  8, /* reduc_f32_cost */
  8, /* reduc_f64_cost */
  8, /* store_elt_extra_cost */
  8, /* vec_to_scalar_cost */
  8, /* scalar_to_vec_cost */
  4, /* align_load_cost */
  4, /* unalign_load_cost */
  1, /* unalign_store_cost */
  1 /* store_cost */
};

/* Cortex-A57 costs for vector insn classes.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  1, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &cortexa57_advsimd_vector_cost, /* advsimd */
  nullptr, /* sve */
  nullptr /* issue_info */
};

static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
{
  3, /* int_stmt_cost */
  3, /* fp_stmt_cost */
  0, /* ld2_st2_permute_cost */
  0, /* ld3_st3_permute_cost */
  0, /* ld4_st4_permute_cost */
  3, /* permute_cost */
  3, /* reduc_i8_cost */
  3, /* reduc_i16_cost */
  3, /* reduc_i32_cost */
  3, /* reduc_i64_cost */
  3, /* reduc_f16_cost */
  3, /* reduc_f32_cost */
  3, /* reduc_f64_cost */
  3, /* store_elt_extra_cost */
  3, /* vec_to_scalar_cost */
  3, /* scalar_to_vec_cost */
  5, /* align_load_cost */
  5, /* unalign_load_cost */
  1, /* unalign_store_cost */
  1 /* store_cost */
};

static const struct cpu_vector_cost exynosm1_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  1, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &exynosm1_advsimd_vector_cost, /* advsimd */
  nullptr, /* sve */
  nullptr /* issue_info */
};

static const advsimd_vec_cost xgene1_advsimd_vector_cost =
{
  2, /* int_stmt_cost */
  2, /* fp_stmt_cost */
  0, /* ld2_st2_permute_cost */
  0, /* ld3_st3_permute_cost */
  0, /* ld4_st4_permute_cost */
  2, /* permute_cost */
  4, /* reduc_i8_cost */
  4, /* reduc_i16_cost */
  4, /* reduc_i32_cost */
  4, /* reduc_i64_cost */
  4, /* reduc_f16_cost */
  4, /* reduc_f32_cost */
  4, /* reduc_f64_cost */
  4, /* store_elt_extra_cost */
  4, /* vec_to_scalar_cost */
  4, /* scalar_to_vec_cost */
  10, /* align_load_cost */
  10, /* unalign_load_cost */
  2, /* unalign_store_cost */
  2 /* store_cost */
};

/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost xgene1_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  2, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &xgene1_advsimd_vector_cost, /* advsimd */
  nullptr, /* sve */
  nullptr /* issue_info */
};

static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
{
  4, /* int_stmt_cost */
  5, /* fp_stmt_cost */
  0, /* ld2_st2_permute_cost */
  0, /* ld3_st3_permute_cost */
  0, /* ld4_st4_permute_cost */
  10, /* permute_cost */
  6, /* reduc_i8_cost */
  6, /* reduc_i16_cost */
  6, /* reduc_i32_cost */
  6, /* reduc_i64_cost */
  6, /* reduc_f16_cost */
  6, /* reduc_f32_cost */
  6, /* reduc_f64_cost */
  6, /* store_elt_extra_cost */
  6, /* vec_to_scalar_cost */
  5, /* scalar_to_vec_cost */
  4, /* align_load_cost */
  4, /* unalign_load_cost */
  1, /* unalign_store_cost */
  1 /* store_cost */
};

/* Costs for vector insn classes for Vulcan.  */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  6, /* scalar_fp_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  2, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &thunderx2t99_advsimd_vector_cost, /* advsimd */
  nullptr, /* sve */
  nullptr /* issue_info */
};

static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
{
  5, /* int_stmt_cost */
  5, /* fp_stmt_cost */
  0, /* ld2_st2_permute_cost */
  0, /* ld3_st3_permute_cost */
  0, /* ld4_st4_permute_cost */
  10, /* permute_cost */
  5, /* reduc_i8_cost */
  5, /* reduc_i16_cost */
  5, /* reduc_i32_cost */
  5, /* reduc_i64_cost */
  5, /* reduc_f16_cost */
  5, /* reduc_f32_cost */
  5, /* reduc_f64_cost */
  5, /* store_elt_extra_cost */
  5, /* vec_to_scalar_cost */
  5, /* scalar_to_vec_cost */
  4, /* align_load_cost */
  4, /* unalign_load_cost */
  4, /* unalign_store_cost */
  4 /* store_cost */
};

static const struct cpu_vector_cost thunderx3t110_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  5, /* scalar_fp_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  2, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &thunderx3t110_advsimd_vector_cost, /* advsimd */
  nullptr, /* sve */
  nullptr /* issue_info */
};

static const advsimd_vec_cost ampere1_advsimd_vector_cost =
{
  3, /* int_stmt_cost */
  3, /* fp_stmt_cost */
  0, /* ld2_st2_permute_cost */
  0, /* ld3_st3_permute_cost */
  0, /* ld4_st4_permute_cost */
  2, /* permute_cost */
  12, /* reduc_i8_cost */
  9, /* reduc_i16_cost */
  6, /* reduc_i32_cost */
  5, /* reduc_i64_cost */
  9, /* reduc_f16_cost */
  6, /* reduc_f32_cost */
  5, /* reduc_f64_cost */
  8, /* store_elt_extra_cost */
  6, /* vec_to_scalar_cost */
  7, /* scalar_to_vec_cost */
  5, /* align_load_cost */
  5, /* unalign_load_cost */
  2, /* unalign_store_cost */
  2 /* store_cost */
};

/* Ampere-1 costs for vector insn classes.  */
static const struct cpu_vector_cost ampere1_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  1, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &ampere1_advsimd_vector_cost, /* advsimd */
  nullptr, /* sve */
  nullptr /* issue_info */
};

/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
{
  1,  /* Predictable.  */
  3   /* Unpredictable.  */
};

/* Generic approximation modes.  */
static const cpu_approx_modes generic_approx_modes =
{
  AARCH64_APPROX_NONE, /* division */
  AARCH64_APPROX_NONE, /* sqrt */
  AARCH64_APPROX_NONE /* recip_sqrt */
};

/* Approximation modes for Exynos M1.  */
static const cpu_approx_modes exynosm1_approx_modes =
{
  AARCH64_APPROX_NONE, /* division */
  AARCH64_APPROX_ALL, /* sqrt */
  AARCH64_APPROX_ALL /* recip_sqrt */
};

/* Approximation modes for X-Gene 1.  */
static const cpu_approx_modes xgene1_approx_modes =
{
  AARCH64_APPROX_NONE, /* division */
  AARCH64_APPROX_NONE, /* sqrt */
  AARCH64_APPROX_ALL /* recip_sqrt */
};

/* Generic prefetch settings (which disable prefetch).  */
static const cpu_prefetch_tune generic_prefetch_tune =
{
  0, /* num_slots */
  -1, /* l1_cache_size */
  -1, /* l1_cache_line_size */
  -1, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  -1 /* default_opt_level */
};
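/* In the cpu_prefetch_tune tables, cache sizes are given in kilobytes and
   line sizes in bytes; a value of -1 leaves the corresponding --param at
   its compiler default.  */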

static const cpu_prefetch_tune exynosm1_prefetch_tune =
{
  0, /* num_slots */
  -1, /* l1_cache_size */
  64, /* l1_cache_line_size */
  -1, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  -1 /* default_opt_level */
};

static const cpu_prefetch_tune qdf24xx_prefetch_tune =
{
  4, /* num_slots */
  32, /* l1_cache_size */
  64, /* l1_cache_line_size */
  512, /* l2_cache_size */
  false, /* prefetch_dynamic_strides */
  2048, /* minimum_stride */
  3 /* default_opt_level */
};

static const cpu_prefetch_tune thunderxt88_prefetch_tune =
{
  8, /* num_slots */
  32, /* l1_cache_size */
  128, /* l1_cache_line_size */
  16*1024, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  3 /* default_opt_level */
};

static const cpu_prefetch_tune thunderx_prefetch_tune =
{
  8, /* num_slots */
  32, /* l1_cache_size */
  128, /* l1_cache_line_size */
  -1, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  -1 /* default_opt_level */
};

static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
{
  8, /* num_slots */
  32, /* l1_cache_size */
  64, /* l1_cache_line_size */
  256, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  -1 /* default_opt_level */
};

static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
{
  8, /* num_slots */
  32, /* l1_cache_size */
  64, /* l1_cache_line_size */
  256, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  -1 /* default_opt_level */
};

static const cpu_prefetch_tune tsv110_prefetch_tune =
{
  0, /* num_slots */
  64, /* l1_cache_size */
  64, /* l1_cache_line_size */
  512, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  -1 /* default_opt_level */
};

static const cpu_prefetch_tune xgene1_prefetch_tune =
{
  8, /* num_slots */
  32, /* l1_cache_size */
  64, /* l1_cache_line_size */
  256, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  -1 /* default_opt_level */
};

static const cpu_prefetch_tune a64fx_prefetch_tune =
{
  8, /* num_slots */
  64, /* l1_cache_size */
  256, /* l1_cache_line_size */
  32768, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  -1 /* default_opt_level */
};

static const cpu_prefetch_tune ampere1_prefetch_tune =
{
  0, /* num_slots */
  64, /* l1_cache_size */
  64, /* l1_cache_line_size */
  2048, /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1, /* minimum_stride */
  -1 /* default_opt_level */
};

static const struct tune_params generic_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  2, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
  "16:12", /* function_align. */
  "4", /* jump_align. */
  "8", /* loop_align. */
  2, /* int_reassoc_width. */
  4, /* fp_reassoc_width. */
  1, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  0, /* max_case_values. */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
  /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
     Neoverse V1.  It does not have a noticeable effect on A64FX and should
     have at most a very minor effect on SVE2 cores.  */
  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */
  &generic_prefetch_tune
};

static const struct tune_params cortexa35_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  1, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  "16", /* function_align. */
  "4", /* jump_align. */
  "8", /* loop_align. */
  2, /* int_reassoc_width. */
  4, /* fp_reassoc_width. */
  1, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  0, /* max_case_values. */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
  &generic_prefetch_tune
};

static const struct tune_params cortexa53_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  2, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  "16", /* function_align. */
  "4", /* jump_align. */
  "8", /* loop_align. */
  2, /* int_reassoc_width. */
  4, /* fp_reassoc_width. */
  1, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  0, /* max_case_values. */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
  &generic_prefetch_tune
};

static const struct tune_params cortexa57_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  3, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
  "16", /* function_align. */
  "4", /* jump_align. */
  "8", /* loop_align. */
  2, /* int_reassoc_width. */
  4, /* fp_reassoc_width. */
  1, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  0, /* max_case_values. */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
  &generic_prefetch_tune
};

static const struct tune_params cortexa72_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  3, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
  "16", /* function_align. */
  "4", /* jump_align. */
  "8", /* loop_align. */
  2, /* int_reassoc_width. */
  4, /* fp_reassoc_width. */
  1, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  0, /* max_case_values. */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
  &generic_prefetch_tune
};

static const struct tune_params cortexa73_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost. */
  2, /* issue_rate. */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  "16", /* function_align. */
  "4", /* jump_align. */
  "8", /* loop_align. */
  2, /* int_reassoc_width. */
  4, /* fp_reassoc_width. */
  1, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  0, /* max_case_values. */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
  &generic_prefetch_tune
};



static const struct tune_params exynosm1_tunings =
{
  &exynosm1_extra_costs,
  &exynosm1_addrcost_table,
  &exynosm1_regmove_cost,
  &exynosm1_vector_cost,
  &generic_branch_cost,
  &exynosm1_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  3, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
  "4", /* function_align. */
  "4", /* jump_align. */
  "4", /* loop_align. */
  2, /* int_reassoc_width. */
  4, /* fp_reassoc_width. */
  1, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  48, /* max_case_values. */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
  &exynosm1_prefetch_tune
};

static const struct tune_params thunderxt88_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  6, /* memmov_cost */
  2, /* issue_rate */
  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
  "8", /* function_align. */
  "8", /* jump_align. */
  "8", /* loop_align. */
  2, /* int_reassoc_width. */
  4, /* fp_reassoc_width. */
  1, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  0, /* max_case_values. */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
  &thunderxt88_prefetch_tune
};

static const struct tune_params thunderx_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  6, /* memmov_cost */
  2, /* issue_rate */
  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
  "8", /* function_align. */
  "8", /* jump_align. */
  "8", /* loop_align. */
  2, /* int_reassoc_width. */
  4, /* fp_reassoc_width. */
  1, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  0, /* max_case_values. */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
  &thunderx_prefetch_tune
};

static const struct tune_params tsv110_tunings =
{
  &tsv110_extra_costs,
  &tsv110_addrcost_table,
  &tsv110_regmove_cost,
  &tsv110_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  4, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
  "16", /* function_align. */
  "4", /* jump_align. */
  "8", /* loop_align. */
  2, /* int_reassoc_width. */
  4, /* fp_reassoc_width. */
  1, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  0, /* max_case_values. */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
  &tsv110_prefetch_tune
};

static const struct tune_params xgene1_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  6, /* memmov_cost */
  4, /* issue_rate */
  AARCH64_FUSE_NOTHING, /* fusible_ops */
  "16", /* function_align. */
  "16", /* jump_align. */
  "16", /* loop_align. */
  2, /* int_reassoc_width. */
  4, /* fp_reassoc_width. */
  1, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  17, /* max_case_values. */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
  &xgene1_prefetch_tune
};

static const struct tune_params emag_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  SVE_NOT_IMPLEMENTED,
  6, /* memmov_cost */
  4, /* issue_rate */
  AARCH64_FUSE_NOTHING, /* fusible_ops */
  "16", /* function_align. */
  "16", /* jump_align. */
  "16", /* loop_align. */
  2, /* int_reassoc_width. */
  4, /* fp_reassoc_width. */
  1, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  17, /* max_case_values. */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
  &xgene1_prefetch_tune
};

static const struct tune_params qdf24xx_tunings =
{
  &qdf24xx_extra_costs,
  &qdf24xx_addrcost_table,
  &qdf24xx_regmove_cost,
  &qdf24xx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  4, /* issue_rate */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
  "16", /* function_align. */
  "8", /* jump_align. */
  "16", /* loop_align. */
  2, /* int_reassoc_width. */
  4, /* fp_reassoc_width. */
  1, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  0, /* max_case_values. */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
  AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
  &qdf24xx_prefetch_tune
};

/* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
   for now.  */
static const struct tune_params saphira_tunings =
{
  &generic_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  4, /* issue_rate */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
  "16", /* function_align. */
  "8", /* jump_align. */
  "16", /* loop_align. */
  2, /* int_reassoc_width. */
  4, /* fp_reassoc_width. */
  1, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  0, /* max_case_values. */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
  &generic_prefetch_tune
};

static const struct tune_params thunderx2t99_tunings =
{
  &thunderx2t99_extra_costs,
  &thunderx2t99_addrcost_table,
  &thunderx2t99_regmove_cost,
  &thunderx2t99_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost. */
  4, /* issue_rate. */
  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
  "16", /* function_align. */
  "8", /* jump_align. */
  "16", /* loop_align. */
  3, /* int_reassoc_width. */
  2, /* fp_reassoc_width. */
  2, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  0, /* max_case_values. */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
  &thunderx2t99_prefetch_tune
};

static const struct tune_params thunderx3t110_tunings =
{
  &thunderx3t110_extra_costs,
  &thunderx3t110_addrcost_table,
  &thunderx3t110_regmove_cost,
  &thunderx3t110_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost. */
  6, /* issue_rate. */
  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
  "16", /* function_align. */
  "8", /* jump_align. */
  "16", /* loop_align. */
  3, /* int_reassoc_width. */
  2, /* fp_reassoc_width. */
  2, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  0, /* max_case_values. */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
  &thunderx3t110_prefetch_tune
};

static const struct tune_params neoversen1_tunings =
{
  &cortexa76_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  3, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
  "32:16", /* function_align. */
  "4", /* jump_align. */
  "32:16", /* loop_align. */
  2, /* int_reassoc_width. */
  4, /* fp_reassoc_width. */
  2, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  0, /* max_case_values. */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
  &generic_prefetch_tune
};

static const struct tune_params ampere1_tunings =
{
  &ampere1_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &ampere1_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  4, /* issue_rate */
  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
   AARCH64_FUSE_CMP_BRANCH),
  /* fusible_ops */
  "32", /* function_align. */
  "4", /* jump_align. */
  "32:16", /* loop_align. */
  2, /* int_reassoc_width. */
  4, /* fp_reassoc_width. */
  2, /* vec_reassoc_width. */
  2, /* min_div_recip_mul_sf. */
  2, /* min_div_recip_mul_df. */
  0, /* max_case_values. */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
  &ampere1_prefetch_tune
};

static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
{
  2, /* int_stmt_cost */
  2, /* fp_stmt_cost */
  4, /* ld2_st2_permute_cost */
  4, /* ld3_st3_permute_cost */
  5, /* ld4_st4_permute_cost */
  3, /* permute_cost */
  4, /* reduc_i8_cost */
  4, /* reduc_i16_cost */
  2, /* reduc_i32_cost */
  2, /* reduc_i64_cost */
  6, /* reduc_f16_cost */
  3, /* reduc_f32_cost */
  2, /* reduc_f64_cost */
  2, /* store_elt_extra_cost */
  /* This value is just inherited from the Cortex-A57 table.  */
  8, /* vec_to_scalar_cost */
  /* This depends very much on what the scalar value is and
     where it comes from.  E.g. some constants take two dependent
     instructions or a load, while others might be moved from a GPR.
     4 seems to be a reasonable compromise in practice.  */
  4, /* scalar_to_vec_cost */
  4, /* align_load_cost */
  4, /* unalign_load_cost */
  /* Although stores have a latency of 2 and compete for the
     vector pipes, in practice it's better not to model that.  */
  1, /* unalign_store_cost */
  1 /* store_cost */
};

static const sve_vec_cost neoversev1_sve_vector_cost =
{
  {
    2, /* int_stmt_cost */
    2, /* fp_stmt_cost */
    4, /* ld2_st2_permute_cost */
    7, /* ld3_st3_permute_cost */
    8, /* ld4_st4_permute_cost */
    3, /* permute_cost */
    /* Theoretically, a reduction involving 31 scalar ADDs could
       complete in ~9 cycles and would have a cost of 31.  [SU]ADDV
       completes in 14 cycles, so give it a cost of 31 + 5.  */
    36, /* reduc_i8_cost */
    /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7.  */
    22, /* reduc_i16_cost */
    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7.  */
    14, /* reduc_i32_cost */
    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8.  */
1801 11, /* reduc_i64_cost */
1802 /* Theoretically, a reduction involving 15 scalar FADDs could
1803 complete in ~9 cycles and would have a cost of 30. FADDV
1804 completes in 13 cycles, so give it a cost of 30 + 4. */
1805 34, /* reduc_f16_cost */
1806 /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */
1807 19, /* reduc_f32_cost */
1808 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */
1809 11, /* reduc_f64_cost */
1810 2, /* store_elt_extra_cost */
1811 /* This value is just inherited from the Cortex-A57 table. */
1812 8, /* vec_to_scalar_cost */
1813 /* See the comment above the Advanced SIMD versions. */
1814 4, /* scalar_to_vec_cost */
1815 4, /* align_load_cost */
1816 4, /* unalign_load_cost */
1817 /* Although stores have a latency of 2 and compete for the
1818 vector pipes, in practice it's better not to model that. */
1819 1, /* unalign_store_cost */
1820 1 /* store_cost */
1821 },
1822 3, /* clast_cost */
1823 19, /* fadda_f16_cost */
1824 11, /* fadda_f32_cost */
1825 8, /* fadda_f64_cost */
78770e0e
RS
1826 32, /* gather_load_x32_cost */
1827 16, /* gather_load_x64_cost */
14bd21c2
RS
1828 3 /* scatter_store_elt_cost */
1829};
1830
1205a8ca
RS
1831static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
1832{
1833 3, /* loads_stores_per_cycle */
1834 2, /* stores_per_cycle */
1835 4, /* general_ops_per_cycle */
1836 0, /* fp_simd_load_general_ops */
1837 1 /* fp_simd_store_general_ops */
1838};
1839
1840static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
1841{
1842 {
1843 3, /* loads_stores_per_cycle */
1844 2, /* stores_per_cycle */
1845 4, /* general_ops_per_cycle */
1846 0, /* fp_simd_load_general_ops */
1847 1 /* fp_simd_store_general_ops */
1848 },
1849 2, /* ld2_st2_general_ops */
1850 2, /* ld3_st3_general_ops */
1851 3 /* ld4_st4_general_ops */
1852};
1853
1854static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
1855{
1856 {
1857 {
1858 2, /* loads_per_cycle */
1859 2, /* stores_per_cycle */
1860 2, /* general_ops_per_cycle */
1861 0, /* fp_simd_load_general_ops */
1862 1 /* fp_simd_store_general_ops */
1863 },
1864 2, /* ld2_st2_general_ops */
1865 2, /* ld3_st3_general_ops */
1866 3 /* ld4_st4_general_ops */
1867 },
1868 1, /* pred_ops_per_cycle */
1869 2, /* while_pred_ops */
1870 2, /* int_cmp_pred_ops */
1871 1, /* fp_cmp_pred_ops */
1872 1, /* gather_scatter_pair_general_ops */
1873 1 /* gather_scatter_pair_pred_ops */
1874};
1875
1876static const aarch64_vec_issue_info neoversev1_vec_issue_info =
1877{
1878 &neoversev1_scalar_issue_info,
1879 &neoversev1_advsimd_issue_info,
1880 &neoversev1_sve_issue_info
1881};
1882
14bd21c2
RS
1883/* Neoverse V1 costs for vector insn classes. */
1884static const struct cpu_vector_cost neoversev1_vector_cost =
1885{
1886 1, /* scalar_int_stmt_cost */
1887 2, /* scalar_fp_stmt_cost */
1888 4, /* scalar_load_cost */
1889 1, /* scalar_store_cost */
1890 1, /* cond_taken_branch_cost */
1891 1, /* cond_not_taken_branch_cost */
1892 &neoversev1_advsimd_vector_cost, /* advsimd */
1205a8ca
RS
1893 &neoversev1_sve_vector_cost, /* sve */
1894 &neoversev1_vec_issue_info /* issue_info */
14bd21c2
RS
1895};
1896
c8c77ed7
KT
1897static const struct tune_params neoversev1_tunings =
1898{
5c5a67e6 1899 &cortexa76_extra_costs,
6b8b0c8e 1900 &neoversev1_addrcost_table,
c8c77ed7 1901 &generic_regmove_cost,
14bd21c2 1902 &neoversev1_vector_cost,
c8c77ed7
KT
1903 &generic_branch_cost,
1904 &generic_approx_modes,
1905 SVE_256, /* sve_width */
1906 4, /* memmov_cost */
1907 3, /* issue_rate */
1908 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1909 "32:16", /* function_align. */
1910 "4", /* jump_align. */
1911 "32:16", /* loop_align. */
1912 2, /* int_reassoc_width. */
1913 4, /* fp_reassoc_width. */
1914 2, /* vec_reassoc_width. */
1915 2, /* min_div_recip_mul_sf. */
1916 2, /* min_div_recip_mul_df. */
1917 0, /* max_case_values. */
1918 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
14bd21c2 1919 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
3b924b0d 1920 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
c437d334
WD
1921 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
1922 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
c8c77ed7
KT
1923 &generic_prefetch_tune
1924};
1925
048039c4
RS
1926static const sve_vec_cost neoverse512tvb_sve_vector_cost =
1927{
1928 {
1929 2, /* int_stmt_cost */
1930 2, /* fp_stmt_cost */
1931 4, /* ld2_st2_permute_cost */
1932 5, /* ld3_st3_permute_cost */
1933 5, /* ld4_st4_permute_cost */
1934 3, /* permute_cost */
1935 /* Theoretically, a reduction involving 15 scalar ADDs could
1936 complete in ~5 cycles and would have a cost of 15. Assume that
1937 [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */
1938 21, /* reduc_i8_cost */
1939 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
1940 13, /* reduc_i16_cost */
1941 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
1942 9, /* reduc_i32_cost */
1943 /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */
1944 8, /* reduc_i64_cost */
1945 /* Theoretically, a reduction involving 7 scalar FADDs could
1946 complete in ~6 cycles and would have a cost of 14. Assume that
1947 FADDV completes in 8 cycles and so give it a cost of 14 + 2. */
1948 16, /* reduc_f16_cost */
1949 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
1950 8, /* reduc_f32_cost */
1951 /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */
1952 4, /* reduc_f64_cost */
1953 2, /* store_elt_extra_cost */
1954 /* This value is just inherited from the Cortex-A57 table. */
1955 8, /* vec_to_scalar_cost */
1956 /* This depends very much on what the scalar value is and
1957 where it comes from. E.g. some constants take two dependent
1958 instructions or a load, while others might be moved from a GPR.
1959 4 seems to be a reasonable compromise in practice. */
1960 4, /* scalar_to_vec_cost */
1961 4, /* align_load_cost */
1962 4, /* unalign_load_cost */
1963 /* Although stores generally have a latency of 2 and compete for the
1964 vector pipes, in practice it's better not to model that. */
1965 1, /* unalign_store_cost */
1966 1 /* store_cost */
1967 },
1968 3, /* clast_cost */
1969 10, /* fadda_f16_cost */
1970 6, /* fadda_f32_cost */
1971 4, /* fadda_f64_cost */
1972 /* A strided Advanced SIMD x64 load would take two parallel FP loads
1973 (6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
1974 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
1975 (cost 8) and a vec_construct (cost 2). Add a full vector operation
1976 (cost 2) to that, to avoid the difference being lost in rounding.
1977
1978 There is no easy comparison between a strided Advanced SIMD x32 load
1979 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
1980 operation more than a 64-bit gather. */
1981 14, /* gather_load_x32_cost */
1982 12, /* gather_load_x64_cost */
1983 3 /* scatter_store_elt_cost */
1984};
1985
1986static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
1987{
1988 {
1989 {
1990 3, /* loads_per_cycle */
1991 2, /* stores_per_cycle */
1992 4, /* general_ops_per_cycle */
1993 0, /* fp_simd_load_general_ops */
1994 1 /* fp_simd_store_general_ops */
1995 },
1996 2, /* ld2_st2_general_ops */
1997 2, /* ld3_st3_general_ops */
1998 3 /* ld4_st4_general_ops */
1999 },
2000 2, /* pred_ops_per_cycle */
2001 2, /* while_pred_ops */
2002 2, /* int_cmp_pred_ops */
2003 1, /* fp_cmp_pred_ops */
2004 1, /* gather_scatter_pair_general_ops */
2005 1 /* gather_scatter_pair_pred_ops */
2006};
2007
2008static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
2009{
2010 &neoversev1_scalar_issue_info,
2011 &neoversev1_advsimd_issue_info,
2012 &neoverse512tvb_sve_issue_info
2013};
2014
2015static const struct cpu_vector_cost neoverse512tvb_vector_cost =
2016{
2017 1, /* scalar_int_stmt_cost */
2018 2, /* scalar_fp_stmt_cost */
2019 4, /* scalar_load_cost */
2020 1, /* scalar_store_cost */
2021 1, /* cond_taken_branch_cost */
2022 1, /* cond_not_taken_branch_cost */
2023 &neoversev1_advsimd_vector_cost, /* advsimd */
2024 &neoverse512tvb_sve_vector_cost, /* sve */
2025 &neoverse512tvb_vec_issue_info /* issue_info */
2026};
2027
2028static const struct tune_params neoverse512tvb_tunings =
2029{
2030 &cortexa76_extra_costs,
2031 &neoversev1_addrcost_table,
2032 &generic_regmove_cost,
2033 &neoverse512tvb_vector_cost,
2034 &generic_branch_cost,
2035 &generic_approx_modes,
2036 SVE_128 | SVE_256, /* sve_width */
2037 4, /* memmov_cost */
2038 3, /* issue_rate */
2039 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2040 "32:16", /* function_align. */
2041 "4", /* jump_align. */
2042 "32:16", /* loop_align. */
2043 2, /* int_reassoc_width. */
2044 4, /* fp_reassoc_width. */
2045 2, /* vec_reassoc_width. */
2046 2, /* min_div_recip_mul_sf. */
2047 2, /* min_div_recip_mul_df. */
2048 0, /* max_case_values. */
2049 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2050 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2051 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2052 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2053 &generic_prefetch_tune
2054};
2055
25095d1e
KT
2056static const struct tune_params neoversen2_tunings =
2057{
5c5a67e6 2058 &cortexa76_extra_costs,
25095d1e
KT
2059 &generic_addrcost_table,
2060 &generic_regmove_cost,
2061 &cortexa57_vector_cost,
2062 &generic_branch_cost,
2063 &generic_approx_modes,
2064 SVE_128, /* sve_width */
2065 4, /* memmov_cost */
2066 3, /* issue_rate */
2067 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2068 "32:16", /* function_align. */
2069 "4", /* jump_align. */
2070 "32:16", /* loop_align. */
2071 2, /* int_reassoc_width. */
2072 4, /* fp_reassoc_width. */
2073 2, /* vec_reassoc_width. */
2074 2, /* min_div_recip_mul_sf. */
2075 2, /* min_div_recip_mul_df. */
2076 0, /* max_case_values. */
2077 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
c437d334 2078 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
25095d1e
KT
2079 &generic_prefetch_tune
2080};
2081
02f21aea
QJ
2082static const struct tune_params a64fx_tunings =
2083{
3f325179
QJ
2084 &a64fx_extra_costs,
2085 &a64fx_addrcost_table,
2086 &a64fx_regmove_cost,
2087 &a64fx_vector_cost,
02f21aea
QJ
2088 &generic_branch_cost,
2089 &generic_approx_modes,
2090 SVE_512, /* sve_width */
2091 4, /* memmov_cost */
2092 7, /* issue_rate */
2093 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2094 "32", /* function_align. */
2095 "16", /* jump_align. */
2096 "32", /* loop_align. */
2097 4, /* int_reassoc_width. */
2098 2, /* fp_reassoc_width. */
2099 2, /* vec_reassoc_width. */
2100 2, /* min_div_recip_mul_sf. */
2101 2, /* min_div_recip_mul_df. */
2102 0, /* max_case_values. */
2103 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2104 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
2105 &a64fx_prefetch_tune
2106};
2107
8dec06f2
JG
2108/* Support for fine-grained override of the tuning structures. */
2109struct aarch64_tuning_override_function
2110{
2111 const char* name;
2112 void (*parse_override)(const char*, struct tune_params*);
2113};
2114
2115static void aarch64_parse_fuse_string (const char*, struct tune_params*);
2116static void aarch64_parse_tune_string (const char*, struct tune_params*);
886f092f 2117static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
8dec06f2
JG
2118
2119static const struct aarch64_tuning_override_function
2120aarch64_tuning_override_functions[] =
2121{
2122 { "fuse", aarch64_parse_fuse_string },
2123 { "tune", aarch64_parse_tune_string },
886f092f 2124 { "sve_width", aarch64_parse_sve_width_string },
8dec06f2
JG
2125 { NULL, NULL }
2126};
2127
43e9d192
IB
2128/* A processor implementing AArch64. */
2129struct processor
2130{
2131 const char *const name;
46806c44
KT
2132 enum aarch64_processor ident;
2133 enum aarch64_processor sched_core;
393ae126 2134 enum aarch64_arch arch;
0c6caaf8 2135 unsigned architecture_version;
28108a53 2136 const uint64_t flags;
43e9d192
IB
2137 const struct tune_params *const tune;
2138};
2139
393ae126
KT
2140/* Architectures implementing AArch64. */
2141static const struct processor all_architectures[] =
2142{
2143#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
2144 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
2145#include "aarch64-arches.def"
393ae126
KT
2146 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
2147};
2148
43e9d192
IB
2149/* Processor cores implementing AArch64. */
2150static const struct processor all_cores[] =
2151{
e8fcc9fa 2152#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
393ae126
KT
2153 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
2154 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
2155 FLAGS, &COSTS##_tunings},
43e9d192 2156#include "aarch64-cores.def"
393ae126
KT
2157 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
2158 AARCH64_FL_FOR_ARCH8, &generic_tunings},
2159 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
43e9d192
IB
2160};
2161
43e9d192 2162
361fb3ee
KT
2163/* Target specification. These are populated by the -march, -mtune, -mcpu
2164 handling code or by target attributes. */
43e9d192
IB
2165static const struct processor *selected_arch;
2166static const struct processor *selected_cpu;
2167static const struct processor *selected_tune;
2168
8fc16d72
ST
2169enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
2170
b175b679
JG
2171/* The current tuning set. */
2172struct tune_params aarch64_tune_params = generic_tunings;
2173
c600df9a
RS
2174/* Check whether an 'aarch64_vector_pcs' attribute is valid. */
2175
2176static tree
2177handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
2178 int, bool *no_add_attrs)
2179{
2180 /* Since we set fn_type_req to true, the caller should have checked
2181 this for us. */
2182 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
2183 switch ((arm_pcs) fntype_abi (*node).id ())
2184 {
2185 case ARM_PCS_AAPCS64:
2186 case ARM_PCS_SIMD:
2187 return NULL_TREE;
2188
2189 case ARM_PCS_SVE:
2190 error ("the %qE attribute cannot be applied to an SVE function type",
2191 name);
2192 *no_add_attrs = true;
2193 return NULL_TREE;
2194
2195 case ARM_PCS_TLSDESC:
2196 case ARM_PCS_UNKNOWN:
2197 break;
2198 }
2199 gcc_unreachable ();
2200}
2201
a0d0b980
SE
2202/* Table of machine attributes. */
2203static const struct attribute_spec aarch64_attribute_table[] =
2204{
2205 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
2206 affects_type_identity, handler, exclude } */
c600df9a
RS
2207 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
2208 handle_aarch64_vector_pcs_attribute, NULL },
38e62001
RS
2209 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
2210 aarch64_sve::handle_arm_sve_vector_bits_attribute,
2211 NULL },
31427b97 2212 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
683e93d1 2213 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
5002dae3 2214 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
a0d0b980
SE
2215 { NULL, 0, 0, false, false, false, false, NULL, NULL }
2216};
2217
43e9d192
IB
2218#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
2219
2220/* An ISA extension in the co-processor and main instruction set space. */
2221struct aarch64_option_extension
2222{
2223 const char *const name;
2224 const unsigned long flags_on;
2225 const unsigned long flags_off;
2226};
2227
43e9d192
IB
2228typedef enum aarch64_cond_code
2229{
2230 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
2231 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
2232 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
2233}
2234aarch64_cc;
2235
2236#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
2237
efac62a3
ST
2238struct aarch64_branch_protect_type
2239{
2240 /* The type's name that the user passes to the branch-protection option
2241 string. */
2242 const char* name;
2243 /* Function to handle the protection type and set global variables.
2244 First argument is the string token corresponding with this type and the
2245 second argument is the next token in the option string.
2246 Return values:
 2247 * AARCH64_PARSE_OK: Handling was successful.
 2248 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the caller
 2249 should print an error.
 2250 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints its
2251 own error. */
2252 enum aarch64_parse_opt_result (*handler)(char*, char*);
2253 /* A list of types that can follow this type in the option string. */
2254 const aarch64_branch_protect_type* subtypes;
2255 unsigned int num_subtypes;
2256};
2257
2258static enum aarch64_parse_opt_result
2259aarch64_handle_no_branch_protection (char* str, char* rest)
2260{
2261 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
30afdf34 2262 aarch64_enable_bti = 0;
efac62a3
ST
2263 if (rest)
2264 {
2265 error ("unexpected %<%s%> after %<%s%>", rest, str);
2266 return AARCH64_PARSE_INVALID_FEATURE;
2267 }
2268 return AARCH64_PARSE_OK;
2269}
2270
2271static enum aarch64_parse_opt_result
2272aarch64_handle_standard_branch_protection (char* str, char* rest)
2273{
2274 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
8fc16d72 2275 aarch64_ra_sign_key = AARCH64_KEY_A;
30afdf34 2276 aarch64_enable_bti = 1;
efac62a3
ST
2277 if (rest)
2278 {
2279 error ("unexpected %<%s%> after %<%s%>", rest, str);
2280 return AARCH64_PARSE_INVALID_FEATURE;
2281 }
2282 return AARCH64_PARSE_OK;
2283}
2284
2285static enum aarch64_parse_opt_result
2286aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
2287 char* rest ATTRIBUTE_UNUSED)
2288{
2289 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
8fc16d72 2290 aarch64_ra_sign_key = AARCH64_KEY_A;
efac62a3
ST
2291 return AARCH64_PARSE_OK;
2292}
2293
2294static enum aarch64_parse_opt_result
2295aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
2296 char* rest ATTRIBUTE_UNUSED)
2297{
2298 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
2299 return AARCH64_PARSE_OK;
2300}
2301
8fc16d72
ST
2302static enum aarch64_parse_opt_result
2303aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
2304 char* rest ATTRIBUTE_UNUSED)
2305{
2306 aarch64_ra_sign_key = AARCH64_KEY_B;
2307 return AARCH64_PARSE_OK;
2308}
2309
30afdf34
SD
2310static enum aarch64_parse_opt_result
2311aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
2312 char* rest ATTRIBUTE_UNUSED)
2313{
2314 aarch64_enable_bti = 1;
2315 return AARCH64_PARSE_OK;
2316}
2317
efac62a3
ST
2318static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
2319 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
8fc16d72 2320 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
efac62a3
ST
2321 { NULL, NULL, NULL, 0 }
2322};
2323
2324static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
2325 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
2326 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
2327 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
2328 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
30afdf34 2329 { "bti", aarch64_handle_bti_protection, NULL, 0 },
efac62a3
ST
2330 { NULL, NULL, NULL, 0 }
2331};
2332
43e9d192
IB
2333/* The condition codes of the processor, and the inverse function. */
2334static const char * const aarch64_condition_codes[] =
2335{
2336 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
2337 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
2338};
2339
57d6f4d0
RS
2340/* The preferred condition codes for SVE conditions. */
2341static const char *const aarch64_sve_condition_codes[] =
2342{
2343 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
2344 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
2345};
2346
0b1fe8cf
RS
2347/* Return the assembly token for svpattern value VALUE. */
2348
2349static const char *
2350svpattern_token (enum aarch64_svpattern pattern)
2351{
2352 switch (pattern)
2353 {
2354#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
2355 AARCH64_FOR_SVPATTERN (CASE)
2356#undef CASE
2357 case AARCH64_NUM_SVPATTERNS:
2358 break;
2359 }
2360 gcc_unreachable ();
2361}
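/* For example (illustrative, assuming the usual entries in the
   AARCH64_FOR_SVPATTERN list), svpattern_token (AARCH64_SV_ALL) returns
   "all" and svpattern_token (AARCH64_SV_VL4) returns "vl4"; the returned
   string is printed verbatim as the pattern operand of PTRUE-style
   instructions.  */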
2362
38e62001
RS
2363/* Return the location of a piece that is known to be passed or returned
2364 in registers. FIRST_ZR is the first unused vector argument register
2365 and FIRST_PR is the first unused predicate argument register. */
2366
2367rtx
2368pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
2369 unsigned int first_pr) const
2370{
2371 gcc_assert (VECTOR_MODE_P (mode)
2372 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
2373 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
2374
2375 if (num_zr > 0 && num_pr == 0)
2376 return gen_rtx_REG (mode, first_zr);
2377
2378 if (num_zr == 0 && num_pr == 1)
2379 return gen_rtx_REG (mode, first_pr);
2380
2381 gcc_unreachable ();
2382}
2383
2384/* Return the total number of vector registers required by the PST. */
2385
2386unsigned int
2387pure_scalable_type_info::num_zr () const
2388{
2389 unsigned int res = 0;
2390 for (unsigned int i = 0; i < pieces.length (); ++i)
2391 res += pieces[i].num_zr;
2392 return res;
2393}
2394
2395/* Return the total number of predicate registers required by the PST. */
2396
2397unsigned int
2398pure_scalable_type_info::num_pr () const
2399{
2400 unsigned int res = 0;
2401 for (unsigned int i = 0; i < pieces.length (); ++i)
2402 res += pieces[i].num_pr;
2403 return res;
2404}
2405
2406/* Return the location of a PST that is known to be passed or returned
2407 in registers. FIRST_ZR is the first unused vector argument register
2408 and FIRST_PR is the first unused predicate argument register. */
2409
2410rtx
2411pure_scalable_type_info::get_rtx (machine_mode mode,
2412 unsigned int first_zr,
2413 unsigned int first_pr) const
2414{
2415 /* Try to return a single REG if possible. This leads to better
2416 code generation; it isn't required for correctness. */
2417 if (mode == pieces[0].mode)
2418 {
2419 gcc_assert (pieces.length () == 1);
2420 return pieces[0].get_rtx (first_zr, first_pr);
2421 }
2422
2423 /* Build up a PARALLEL that contains the individual pieces. */
2424 rtvec rtxes = rtvec_alloc (pieces.length ());
2425 for (unsigned int i = 0; i < pieces.length (); ++i)
2426 {
2427 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
2428 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
2429 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
2430 first_zr += pieces[i].num_zr;
2431 first_pr += pieces[i].num_pr;
2432 }
2433 return gen_rtx_PARALLEL (mode, rtxes);
2434}
2435
2436/* Analyze whether TYPE is a Pure Scalable Type according to the rules
2437 in the AAPCS64. */
2438
2439pure_scalable_type_info::analysis_result
2440pure_scalable_type_info::analyze (const_tree type)
2441{
2442 /* Prevent accidental reuse. */
2443 gcc_assert (pieces.is_empty ());
2444
2445 /* No code will be generated for erroneous types, so we won't establish
2446 an ABI mapping. */
2447 if (type == error_mark_node)
2448 return NO_ABI_IDENTITY;
2449
2450 /* Zero-sized types disappear in the language->ABI mapping. */
2451 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
2452 return NO_ABI_IDENTITY;
2453
2454 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
2455 piece p = {};
2456 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
2457 {
2458 machine_mode mode = TYPE_MODE_RAW (type);
2459 gcc_assert (VECTOR_MODE_P (mode)
2460 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
2461
2462 p.mode = p.orig_mode = mode;
2463 add_piece (p);
2464 return IS_PST;
2465 }
2466
2467 /* Check for user-defined PSTs. */
2468 if (TREE_CODE (type) == ARRAY_TYPE)
2469 return analyze_array (type);
2470 if (TREE_CODE (type) == RECORD_TYPE)
2471 return analyze_record (type);
2472
2473 return ISNT_PST;
2474}
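/* Illustrative examples of the classification above (the SVE ACLE type
   names are assumptions, not taken from this file):

     - svfloat32_t        -> IS_PST, one piece with num_zr == 1, num_pr == 0
     - svbool_t           -> IS_PST, one piece with num_zr == 0, num_pr == 1
     - svfloat32_t[3]     -> handled by analyze_array; IS_PST with 3 pieces
     - struct { int x; }  -> handled by analyze_record; ISNT_PST
     - an empty struct    -> NO_ABI_IDENTITY (it vanishes in the mapping)  */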
2475
2476/* Analyze a type that is known not to be passed or returned in memory.
2477 Return true if it has an ABI identity and is a Pure Scalable Type. */
2478
2479bool
2480pure_scalable_type_info::analyze_registers (const_tree type)
2481{
2482 analysis_result result = analyze (type);
2483 gcc_assert (result != DOESNT_MATTER);
2484 return result == IS_PST;
2485}
2486
2487/* Subroutine of analyze for handling ARRAY_TYPEs. */
2488
2489pure_scalable_type_info::analysis_result
2490pure_scalable_type_info::analyze_array (const_tree type)
2491{
2492 /* Analyze the element type. */
2493 pure_scalable_type_info element_info;
2494 analysis_result result = element_info.analyze (TREE_TYPE (type));
2495 if (result != IS_PST)
2496 return result;
2497
2498 /* An array of unknown, flexible or variable length will be passed and
2499 returned by reference whatever we do. */
2500 tree nelts_minus_one = array_type_nelts (type);
2501 if (!tree_fits_uhwi_p (nelts_minus_one))
2502 return DOESNT_MATTER;
2503
2504 /* Likewise if the array is constant-sized but too big to be interesting.
2505 The double checks against MAX_PIECES are to protect against overflow. */
2506 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
2507 if (count > MAX_PIECES)
2508 return DOESNT_MATTER;
2509 count += 1;
2510 if (count * element_info.pieces.length () > MAX_PIECES)
2511 return DOESNT_MATTER;
2512
2513 /* The above checks should have weeded out elements of unknown size. */
2514 poly_uint64 element_bytes;
2515 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
2516 gcc_unreachable ();
2517
2518 /* Build up the list of individual vectors and predicates. */
2519 gcc_assert (!element_info.pieces.is_empty ());
2520 for (unsigned int i = 0; i < count; ++i)
2521 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
2522 {
2523 piece p = element_info.pieces[j];
2524 p.offset += i * element_bytes;
2525 add_piece (p);
2526 }
2527 return IS_PST;
2528}
2529
2530/* Subroutine of analyze for handling RECORD_TYPEs. */
2531
2532pure_scalable_type_info::analysis_result
2533pure_scalable_type_info::analyze_record (const_tree type)
2534{
2535 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
2536 {
2537 if (TREE_CODE (field) != FIELD_DECL)
2538 continue;
2539
2540 /* Zero-sized fields disappear in the language->ABI mapping. */
2541 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
2542 continue;
2543
2544 /* All fields with an ABI identity must be PSTs for the record as
2545 a whole to be a PST. If any individual field is too big to be
2546 interesting then the record is too. */
2547 pure_scalable_type_info field_info;
2548 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
2549 if (subresult == NO_ABI_IDENTITY)
2550 continue;
2551 if (subresult != IS_PST)
2552 return subresult;
2553
2554 /* Since all previous fields are PSTs, we ought to be able to track
2555 the field offset using poly_ints. */
2556 tree bitpos = bit_position (field);
2557 gcc_assert (poly_int_tree_p (bitpos));
2558
2559 /* For the same reason, it shouldn't be possible to create a PST field
2560 whose offset isn't byte-aligned. */
2561 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
2562 BITS_PER_UNIT);
2563
2564 /* Punt if the record is too big to be interesting. */
2565 poly_uint64 bytepos;
2566 if (!wide_bytepos.to_uhwi (&bytepos)
2567 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
2568 return DOESNT_MATTER;
2569
2570 /* Add the individual vectors and predicates in the field to the
2571 record's list. */
2572 gcc_assert (!field_info.pieces.is_empty ());
2573 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
2574 {
2575 piece p = field_info.pieces[i];
2576 p.offset += bytepos;
2577 add_piece (p);
2578 }
2579 }
2580 /* Empty structures disappear in the language->ABI mapping. */
2581 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
2582}
2583
2584/* Add P to the list of pieces in the type. */
2585
2586void
2587pure_scalable_type_info::add_piece (const piece &p)
2588{
2589 /* Try to fold the new piece into the previous one to form a
2590 single-mode PST. For example, if we see three consecutive vectors
2591 of the same mode, we can represent them using the corresponding
2592 3-tuple mode.
2593
2594 This is purely an optimization. */
2595 if (!pieces.is_empty ())
2596 {
2597 piece &prev = pieces.last ();
2598 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
2599 unsigned int nelems1, nelems2;
2600 if (prev.orig_mode == p.orig_mode
2601 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
2602 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
2603 GET_MODE_NUNITS (p.orig_mode), &nelems1)
2604 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
2605 GET_MODE_NUNITS (p.orig_mode), &nelems2)
2606 && targetm.array_mode (p.orig_mode,
2607 nelems1 + nelems2).exists (&prev.mode))
2608 {
2609 prev.num_zr += p.num_zr;
2610 prev.num_pr += p.num_pr;
2611 return;
2612 }
2613 }
2614 pieces.quick_push (p);
2615}
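/* Folding example (illustrative): if the list currently ends with a
   VNx4SImode piece at offset 0 and another VNx4SImode piece arrives at
   offset BYTES_PER_SVE_VECTOR, and targetm.array_mode (VNx4SImode, 2)
   yields VNx8SImode, the two are merged into one VNx8SImode piece with
   num_zr == 2, exactly as described in the comment above.  */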
2616
2617/* Return true if at least one possible value of type TYPE includes at
2618 least one object of Pure Scalable Type, in the sense of the AAPCS64.
2619
2620 This is a relatively expensive test for some types, so it should
2621 generally be made as late as possible. */
2622
2623static bool
2624aarch64_some_values_include_pst_objects_p (const_tree type)
2625{
2626 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
2627 return false;
2628
2629 if (aarch64_sve::builtin_type_p (type))
2630 return true;
2631
2632 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
2633 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
2634
2635 if (RECORD_OR_UNION_TYPE_P (type))
2636 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
2637 if (TREE_CODE (field) == FIELD_DECL
2638 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
2639 return true;
2640
2641 return false;
2642}
2643
002ffd3c
RS
2644/* Return the descriptor of the SIMD ABI. */
2645
2646static const predefined_function_abi &
2647aarch64_simd_abi (void)
2648{
2649 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
2650 if (!simd_abi.initialized_p ())
2651 {
2652 HARD_REG_SET full_reg_clobbers
2653 = default_function_abi.full_reg_clobbers ();
2654 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
2655 if (FP_SIMD_SAVED_REGNUM_P (regno))
2656 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2657 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
2658 }
2659 return simd_abi;
2660}
2661
c600df9a
RS
2662/* Return the descriptor of the SVE PCS. */
2663
2664static const predefined_function_abi &
2665aarch64_sve_abi (void)
2666{
2667 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
2668 if (!sve_abi.initialized_p ())
2669 {
2670 HARD_REG_SET full_reg_clobbers
2671 = default_function_abi.full_reg_clobbers ();
2672 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
2673 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
cb26919c 2674 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
c600df9a
RS
2675 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2676 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
2677 }
2678 return sve_abi;
2679}
2680
74b27d8e
RS
2681/* If X is an UNSPEC_SALT_ADDR expression, return the address that it
2682 wraps, otherwise return X itself. */
2683
2684static rtx
2685strip_salt (rtx x)
2686{
2687 rtx search = x;
2688 if (GET_CODE (search) == CONST)
2689 search = XEXP (search, 0);
2690 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
2691 x = XVECEXP (search, 0, 0);
2692 return x;
2693}
2694
2695/* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
2696 expression. */
2697
2698static rtx
2699strip_offset_and_salt (rtx addr, poly_int64 *offset)
2700{
2701 return strip_salt (strip_offset (addr, offset));
2702}
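/* For instance (illustrative), if ADDR is a salted symbolic address of the
   form (const (unspec [(symbol_ref "x")] UNSPEC_SALT_ADDR)), possibly with
   a constant offset added, the helpers above hand back the bare
   (symbol_ref "x") and record the offset in *OFFSET, which is what most of
   the address-classification code below wants to inspect.  */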
2703
973d2e01
TP
 2704/* Generate code for a conditional branch whose target is more than 1 MiB away, beyond the range of a single conditional branch instruction. */
2705const char *
2706aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
2707 const char * branch_format)
2708{
2709 rtx_code_label * tmp_label = gen_label_rtx ();
2710 char label_buf[256];
2711 char buffer[128];
2712 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
2713 CODE_LABEL_NUMBER (tmp_label));
2714 const char *label_ptr = targetm.strip_name_encoding (label_buf);
2715 rtx dest_label = operands[pos_label];
2716 operands[pos_label] = tmp_label;
2717
2718 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
2719 output_asm_insn (buffer, operands);
2720
2721 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
2722 operands[pos_label] = dest_label;
2723 output_asm_insn (buffer, operands);
2724 return "";
2725}
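/* The emitted sequence therefore looks roughly like this (illustrative;
   the actual mnemonic comes from BRANCH_FORMAT supplied by the caller):

       <inverted conditional branch>  .Ltmp     ; short-range branch
       b       <original destination>           ; +/-128 MiB range
   .Ltmp:

   i.e. the conditional branch only has to skip one instruction, while the
   unconditional B reaches the far-away label.  */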
2726
261fb553 2727void
fc29dfc9 2728aarch64_err_no_fpadvsimd (machine_mode mode)
261fb553 2729{
261fb553 2730 if (TARGET_GENERAL_REGS_ONLY)
fc29dfc9
SE
2731 if (FLOAT_MODE_P (mode))
2732 error ("%qs is incompatible with the use of floating-point types",
2733 "-mgeneral-regs-only");
2734 else
2735 error ("%qs is incompatible with the use of vector types",
2736 "-mgeneral-regs-only");
261fb553 2737 else
fc29dfc9
SE
2738 if (FLOAT_MODE_P (mode))
2739 error ("%qs feature modifier is incompatible with the use of"
2740 " floating-point types", "+nofp");
2741 else
2742 error ("%qs feature modifier is incompatible with the use of"
2743 " vector types", "+nofp");
261fb553
AL
2744}
2745
c0e0174b
RS
2746/* Report when we try to do something that requires SVE when SVE is disabled.
2747 This is an error of last resort and isn't very high-quality. It usually
2748 involves attempts to measure the vector length in some way. */
2749static void
2750aarch64_report_sve_required (void)
2751{
2752 static bool reported_p = false;
2753
2754 /* Avoid reporting a slew of messages for a single oversight. */
2755 if (reported_p)
2756 return;
2757
2758 error ("this operation requires the SVE ISA extension");
2759 inform (input_location, "you can enable SVE using the command-line"
2760 " option %<-march%>, or by using the %<target%>"
2761 " attribute or pragma");
2762 reported_p = true;
2763}
2764
183bfdaf
RS
2765/* Return true if REGNO is P0-P15 or one of the special FFR-related
2766 registers. */
2767inline bool
2768pr_or_ffr_regnum_p (unsigned int regno)
2769{
2770 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
2771}
2772
c64f7d37 2773/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
2eb2847e
WD
2774 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
2775 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
2776 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
2777 and GENERAL_REGS is lower than the memory cost (in this case the best class
2778 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
2779 cost results in bad allocations with many redundant int<->FP moves which
2780 are expensive on various cores.
2781 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
2782 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
2783 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
2784 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
31e2b5a3
WD
2785 The result of this is that it is no longer inefficient to have a higher
2786 memory move cost than the register move cost.
2787*/
c64f7d37
WD
2788
2789static reg_class_t
31e2b5a3
WD
2790aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
2791 reg_class_t best_class)
c64f7d37 2792{
b8506a8a 2793 machine_mode mode;
c64f7d37 2794
67e5c59a
RS
2795 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
2796 || !reg_class_subset_p (FP_REGS, allocno_class))
c64f7d37
WD
2797 return allocno_class;
2798
67e5c59a
RS
2799 if (!reg_class_subset_p (GENERAL_REGS, best_class)
2800 || !reg_class_subset_p (FP_REGS, best_class))
31e2b5a3
WD
2801 return best_class;
2802
c64f7d37
WD
2803 mode = PSEUDO_REGNO_MODE (regno);
2804 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
2805}
2806
26e0ff94 2807static unsigned int
b8506a8a 2808aarch64_min_divisions_for_recip_mul (machine_mode mode)
26e0ff94 2809{
50093a33 2810 if (GET_MODE_UNIT_SIZE (mode) == 4)
b175b679
JG
2811 return aarch64_tune_params.min_div_recip_mul_sf;
2812 return aarch64_tune_params.min_div_recip_mul_df;
26e0ff94
WD
2813}
2814
b5b33e11 2815/* Return the reassociation width of treeop OPC with mode MODE. */
cee66c68 2816static int
b5b33e11 2817aarch64_reassociation_width (unsigned opc, machine_mode mode)
cee66c68
WD
2818{
2819 if (VECTOR_MODE_P (mode))
b175b679 2820 return aarch64_tune_params.vec_reassoc_width;
cee66c68 2821 if (INTEGRAL_MODE_P (mode))
b175b679 2822 return aarch64_tune_params.int_reassoc_width;
b5b33e11
WD
2823 /* Avoid reassociating floating point addition so we emit more FMAs. */
2824 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
b175b679 2825 return aarch64_tune_params.fp_reassoc_width;
cee66c68
WD
2826 return 1;
2827}
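/* Example (reading off the code above): an integer addition chain may be
   split into int_reassoc_width parallel chains, a floating-point
   multiplication chain into fp_reassoc_width chains, but a floating-point
   addition chain (PLUS_EXPR) reports a width of 1 so that the FMA-forming
   passes still see a simple linear chain.  */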
2828
43e9d192
IB
2829/* Provide a mapping from gcc register numbers to dwarf register numbers. */
2830unsigned
2831aarch64_dbx_register_number (unsigned regno)
2832{
2833 if (GP_REGNUM_P (regno))
2834 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
2835 else if (regno == SP_REGNUM)
2836 return AARCH64_DWARF_SP;
2837 else if (FP_REGNUM_P (regno))
2838 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
43cacb12
RS
2839 else if (PR_REGNUM_P (regno))
2840 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
2841 else if (regno == VG_REGNUM)
2842 return AARCH64_DWARF_VG;
43e9d192
IB
2843
2844 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
2845 equivalent DWARF register. */
2846 return DWARF_FRAME_REGISTERS;
2847}
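/* Examples, following the macros used above: x7 maps to
   AARCH64_DWARF_R0 + 7, v3 to AARCH64_DWARF_V0 + 3, p2 to
   AARCH64_DWARF_P0 + 2, and the stack pointer to AARCH64_DWARF_SP; any
   other register (e.g. the condition flags) reports DWARF_FRAME_REGISTERS
   to indicate that there is no DWARF equivalent.  */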
2848
d29f7dd5
RS
2849/* If X is a CONST_DOUBLE, return its bit representation as a constant
2850 integer, otherwise return X unmodified. */
2851static rtx
2852aarch64_bit_representation (rtx x)
2853{
2854 if (CONST_DOUBLE_P (x))
2855 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
2856 return x;
2857}
2858
3b924b0d
RS
2859/* Return an estimate for the number of quadwords in an SVE vector. This is
2860 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
2861static unsigned int
2862aarch64_estimated_sve_vq ()
2863{
2864 return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
2865}
2866
43cacb12
RS
2867/* Return true if MODE is an SVE predicate mode. */
2868static bool
2869aarch64_sve_pred_mode_p (machine_mode mode)
2870{
2871 return (TARGET_SVE
2872 && (mode == VNx16BImode
2873 || mode == VNx8BImode
2874 || mode == VNx4BImode
2875 || mode == VNx2BImode));
2876}
2877
2878/* Three mutually-exclusive flags describing a vector or predicate type. */
2879const unsigned int VEC_ADVSIMD = 1;
2880const unsigned int VEC_SVE_DATA = 2;
2881const unsigned int VEC_SVE_PRED = 4;
2882/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
2883 a structure of 2, 3 or 4 vectors. */
2884const unsigned int VEC_STRUCT = 8;
550a3380
RS
2885/* Can be used in combination with VEC_SVE_DATA to indicate that the
2886 vector has fewer significant bytes than a full SVE vector. */
2887const unsigned int VEC_PARTIAL = 16;
43cacb12
RS
2888/* Useful combinations of the above. */
2889const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
2890const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
2891
2892/* Return a set of flags describing the vector properties of mode MODE.
2893 Ignore modes that are not supported by the current target. */
2894static unsigned int
2895aarch64_classify_vector_mode (machine_mode mode)
2896{
43cacb12
RS
2897 if (aarch64_sve_pred_mode_p (mode))
2898 return VEC_SVE_PRED;
2899
806f69cd
RS
2900 /* Make the decision based on the mode's enum value rather than its
2901 properties, so that we keep the correct classification regardless
2902 of -msve-vector-bits. */
2903 switch (mode)
43cacb12 2904 {
550a3380
RS
2905 /* Partial SVE QI vectors. */
2906 case E_VNx2QImode:
2907 case E_VNx4QImode:
2908 case E_VNx8QImode:
2909 /* Partial SVE HI vectors. */
2910 case E_VNx2HImode:
2911 case E_VNx4HImode:
2912 /* Partial SVE SI vector. */
2913 case E_VNx2SImode:
cc68f7c2
RS
2914 /* Partial SVE HF vectors. */
2915 case E_VNx2HFmode:
2916 case E_VNx4HFmode:
6c3ce63b
RS
2917 /* Partial SVE BF vectors. */
2918 case E_VNx2BFmode:
2919 case E_VNx4BFmode:
cc68f7c2
RS
2920 /* Partial SVE SF vector. */
2921 case E_VNx2SFmode:
550a3380
RS
2922 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
2923
806f69cd
RS
2924 case E_VNx16QImode:
2925 case E_VNx8HImode:
2926 case E_VNx4SImode:
2927 case E_VNx2DImode:
02fcd8ac 2928 case E_VNx8BFmode:
806f69cd
RS
2929 case E_VNx8HFmode:
2930 case E_VNx4SFmode:
2931 case E_VNx2DFmode:
2932 return TARGET_SVE ? VEC_SVE_DATA : 0;
2933
2934 /* x2 SVE vectors. */
2935 case E_VNx32QImode:
2936 case E_VNx16HImode:
2937 case E_VNx8SImode:
2938 case E_VNx4DImode:
02fcd8ac 2939 case E_VNx16BFmode:
806f69cd
RS
2940 case E_VNx16HFmode:
2941 case E_VNx8SFmode:
2942 case E_VNx4DFmode:
2943 /* x3 SVE vectors. */
2944 case E_VNx48QImode:
2945 case E_VNx24HImode:
2946 case E_VNx12SImode:
2947 case E_VNx6DImode:
02fcd8ac 2948 case E_VNx24BFmode:
806f69cd
RS
2949 case E_VNx24HFmode:
2950 case E_VNx12SFmode:
2951 case E_VNx6DFmode:
2952 /* x4 SVE vectors. */
2953 case E_VNx64QImode:
2954 case E_VNx32HImode:
2955 case E_VNx16SImode:
2956 case E_VNx8DImode:
02fcd8ac 2957 case E_VNx32BFmode:
806f69cd
RS
2958 case E_VNx32HFmode:
2959 case E_VNx16SFmode:
2960 case E_VNx8DFmode:
2961 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
2962
66f206b8
JW
2963 case E_OImode:
2964 case E_CImode:
2965 case E_XImode:
2966 return TARGET_SIMD ? VEC_ADVSIMD | VEC_STRUCT : 0;
2967
2968 /* Structures of 64-bit Advanced SIMD vectors. */
2969 case E_V2x8QImode:
2970 case E_V2x4HImode:
2971 case E_V2x2SImode:
2972 case E_V2x1DImode:
2973 case E_V2x4BFmode:
2974 case E_V2x4HFmode:
2975 case E_V2x2SFmode:
2976 case E_V2x1DFmode:
2977 case E_V3x8QImode:
2978 case E_V3x4HImode:
2979 case E_V3x2SImode:
2980 case E_V3x1DImode:
2981 case E_V3x4BFmode:
2982 case E_V3x4HFmode:
2983 case E_V3x2SFmode:
2984 case E_V3x1DFmode:
2985 case E_V4x8QImode:
2986 case E_V4x4HImode:
2987 case E_V4x2SImode:
2988 case E_V4x1DImode:
2989 case E_V4x4BFmode:
2990 case E_V4x4HFmode:
2991 case E_V4x2SFmode:
2992 case E_V4x1DFmode:
2993 return TARGET_SIMD ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
2994
2995 /* Structures of 128-bit Advanced SIMD vectors. */
2996 case E_V2x16QImode:
2997 case E_V2x8HImode:
2998 case E_V2x4SImode:
2999 case E_V2x2DImode:
3000 case E_V2x8BFmode:
3001 case E_V2x8HFmode:
3002 case E_V2x4SFmode:
3003 case E_V2x2DFmode:
3004 case E_V3x16QImode:
3005 case E_V3x8HImode:
3006 case E_V3x4SImode:
3007 case E_V3x2DImode:
3008 case E_V3x8BFmode:
3009 case E_V3x8HFmode:
3010 case E_V3x4SFmode:
3011 case E_V3x2DFmode:
3012 case E_V4x16QImode:
3013 case E_V4x8HImode:
3014 case E_V4x4SImode:
3015 case E_V4x2DImode:
3016 case E_V4x8BFmode:
3017 case E_V4x8HFmode:
3018 case E_V4x4SFmode:
3019 case E_V4x2DFmode:
3020 return TARGET_SIMD ? VEC_ADVSIMD | VEC_STRUCT : 0;
3021
806f69cd
RS
3022 /* 64-bit Advanced SIMD vectors. */
3023 case E_V8QImode:
3024 case E_V4HImode:
3025 case E_V2SImode:
3026 /* ...E_V1DImode doesn't exist. */
3027 case E_V4HFmode:
abbe1ed2 3028 case E_V4BFmode:
806f69cd
RS
3029 case E_V2SFmode:
3030 case E_V1DFmode:
3031 /* 128-bit Advanced SIMD vectors. */
3032 case E_V16QImode:
3033 case E_V8HImode:
3034 case E_V4SImode:
3035 case E_V2DImode:
3036 case E_V8HFmode:
abbe1ed2 3037 case E_V8BFmode:
806f69cd
RS
3038 case E_V4SFmode:
3039 case E_V2DFmode:
3040 return TARGET_SIMD ? VEC_ADVSIMD : 0;
3041
3042 default:
3043 return 0;
43cacb12 3044 }
43cacb12
RS
3045}
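/* A few examples of the classification above, assuming the relevant target
   features are enabled: V16QImode is VEC_ADVSIMD, VNx4SImode is
   VEC_SVE_DATA, VNx2SImode is VEC_SVE_DATA | VEC_PARTIAL (32-bit elements
   in 64-bit containers), V2x16QImode is VEC_ADVSIMD | VEC_STRUCT, and
   VNx8BImode is VEC_SVE_PRED.  Without the corresponding feature the
   function returns 0.  */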
3046
66f206b8
JW
3047/* Return true if MODE is any of the Advanced SIMD structure modes. */
3048bool
3049aarch64_advsimd_struct_mode_p (machine_mode mode)
3050{
3051 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3052 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
3053}
3054
3055/* Return true if MODE is an Advanced SIMD D-register structure mode. */
3056static bool
3057aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
3058{
3059 return (aarch64_classify_vector_mode (mode)
3060 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
3061}
3062
3063/* Return true if MODE is an Advanced SIMD Q-register structure mode. */
3064static bool
3065aarch64_advsimd_full_struct_mode_p (machine_mode mode)
3066{
3067 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
3068}
3069
43cacb12
RS
3070/* Return true if MODE is any of the data vector modes, including
3071 structure modes. */
43e9d192 3072static bool
43cacb12 3073aarch64_vector_data_mode_p (machine_mode mode)
43e9d192 3074{
43cacb12 3075 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
43e9d192
IB
3076}
3077
5c38705d
RS
3078/* Return true if MODE is any form of SVE mode, including predicates,
3079 vectors and structures. */
3080bool
3081aarch64_sve_mode_p (machine_mode mode)
3082{
3083 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
3084}
3085
43cacb12
RS
3086/* Return true if MODE is an SVE data vector mode; either a single vector
3087 or a structure of vectors. */
43e9d192 3088static bool
43cacb12 3089aarch64_sve_data_mode_p (machine_mode mode)
43e9d192 3090{
43cacb12 3091 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
43e9d192
IB
3092}
3093
550a3380
RS
3094/* Return the number of defined bytes in one constituent vector of
3095 SVE mode MODE, which has vector flags VEC_FLAGS. */
3096static poly_int64
3097aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
3098{
3099 if (vec_flags & VEC_PARTIAL)
3100 /* A single partial vector. */
3101 return GET_MODE_SIZE (mode);
3102
3103 if (vec_flags & VEC_SVE_DATA)
3104 /* A single vector or a tuple. */
3105 return BYTES_PER_SVE_VECTOR;
3106
3107 /* A single predicate. */
3108 gcc_assert (vec_flags & VEC_SVE_PRED);
3109 return BYTES_PER_SVE_PRED;
3110}
3111
05783fe6
RS
3112/* If MODE holds an array of vectors, return the number of vectors
3113 in the array, otherwise return 1. */
3114
3115static unsigned int
3116aarch64_ldn_stn_vectors (machine_mode mode)
3117{
3118 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3119 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
3120 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
3121 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
3122 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
3123 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
3124 return exact_div (GET_MODE_SIZE (mode),
3125 BYTES_PER_SVE_VECTOR).to_constant ();
3126 return 1;
3127}
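/* For example (reading off the cases above): V3x8QImode is a partial
   Advanced SIMD structure of 24 bytes, so this returns 24 / 8 = 3;
   V2x4SImode is a full structure of 32 bytes, giving 32 / 16 = 2; a plain
   V4SImode falls through to 1.  */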
3128
66f206b8
JW
3129/* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
3130 corresponding vector structure mode. */
3131static opt_machine_mode
3132aarch64_advsimd_vector_array_mode (machine_mode mode,
3133 unsigned HOST_WIDE_INT nelems)
3134{
3135 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
3136 if (known_eq (GET_MODE_SIZE (mode), 8))
3137 flags |= VEC_PARTIAL;
3138
3139 machine_mode struct_mode;
3140 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
3141 if (aarch64_classify_vector_mode (struct_mode) == flags
3142 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
3143 && known_eq (GET_MODE_NUNITS (struct_mode),
3144 GET_MODE_NUNITS (mode) * nelems))
3145 return struct_mode;
3146 return opt_machine_mode ();
3147}
3148
3149/* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
3150
3151opt_machine_mode
3152aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
3153{
3154 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
3155 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
3156 machine_mode mode;
3157 FOR_EACH_MODE_IN_CLASS (mode, mclass)
3158 if (inner_mode == GET_MODE_INNER (mode)
3159 && known_eq (nunits, GET_MODE_NUNITS (mode))
3160 && aarch64_sve_data_mode_p (mode))
3161 return mode;
3162 return opt_machine_mode ();
3163}
3164
9f4cbab8
RS
3165/* Implement target hook TARGET_ARRAY_MODE. */
3166static opt_machine_mode
3167aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
3168{
3169 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
3170 && IN_RANGE (nelems, 2, 4))
66f206b8
JW
3171 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
3172 GET_MODE_NUNITS (mode) * nelems);
3173 if (aarch64_classify_vector_mode (mode) == VEC_ADVSIMD
3174 && IN_RANGE (nelems, 2, 4))
3175 return aarch64_advsimd_vector_array_mode (mode, nelems);
9f4cbab8
RS
3176
3177 return opt_machine_mode ();
3178}
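/* Example results (illustrative): aarch64_array_mode (VNx4SImode, 3)
   yields VNx12SImode via aarch64_sve_data_mode, while
   aarch64_array_mode (V16QImode, 2) is resolved by
   aarch64_advsimd_vector_array_mode to V2x16QImode; tuple sizes outside
   the range 2..4 return no mode.  */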
3179
43e9d192
IB
3180/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
3181static bool
ef4bddc2 3182aarch64_array_mode_supported_p (machine_mode mode,
43e9d192
IB
3183 unsigned HOST_WIDE_INT nelems)
3184{
3185 if (TARGET_SIMD
635e66fe
AL
3186 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
3187 || AARCH64_VALID_SIMD_DREG_MODE (mode))
43e9d192
IB
3188 && (nelems >= 2 && nelems <= 4))
3189 return true;
3190
3191 return false;
3192}
3193
cc68f7c2
RS
3194/* MODE is some form of SVE vector mode. For data modes, return the number
3195 of vector register bits that each element of MODE occupies, such as 64
3196 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
3197 in a 64-bit container). For predicate modes, return the number of
3198 data bits controlled by each significant predicate bit. */
3199
3200static unsigned int
3201aarch64_sve_container_bits (machine_mode mode)
3202{
3203 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3204 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
3205 ? BITS_PER_SVE_VECTOR
3206 : GET_MODE_BITSIZE (mode));
3207 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
3208}
3209
43cacb12
RS
3210/* Return the SVE predicate mode to use for elements that have
3211 ELEM_NBYTES bytes, if such a mode exists. */
3212
3213opt_machine_mode
3214aarch64_sve_pred_mode (unsigned int elem_nbytes)
3215{
3216 if (TARGET_SVE)
3217 {
3218 if (elem_nbytes == 1)
3219 return VNx16BImode;
3220 if (elem_nbytes == 2)
3221 return VNx8BImode;
3222 if (elem_nbytes == 4)
3223 return VNx4BImode;
3224 if (elem_nbytes == 8)
3225 return VNx2BImode;
3226 }
3227 return opt_machine_mode ();
3228}
3229
cc68f7c2
RS
3230/* Return the SVE predicate mode that should be used to control
3231 SVE mode MODE. */
3232
3233machine_mode
3234aarch64_sve_pred_mode (machine_mode mode)
3235{
3236 unsigned int bits = aarch64_sve_container_bits (mode);
3237 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
3238}
3239
43cacb12
RS
3240/* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
3241
3242static opt_machine_mode
10116ec1 3243aarch64_get_mask_mode (machine_mode mode)
43cacb12 3244{
10116ec1
RS
3245 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3246 if (vec_flags & VEC_SVE_DATA)
cc68f7c2 3247 return aarch64_sve_pred_mode (mode);
43cacb12 3248
10116ec1 3249 return default_get_mask_mode (mode);
43cacb12
RS
3250}
3251
1044fa32
RS
3252/* Return the integer element mode associated with SVE mode MODE. */
3253
3254static scalar_int_mode
3255aarch64_sve_element_int_mode (machine_mode mode)
3256{
cc68f7c2
RS
3257 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3258 ? BITS_PER_SVE_VECTOR
3259 : GET_MODE_BITSIZE (mode));
3260 unsigned int elt_bits = vector_element_size (vector_bits,
1044fa32
RS
3261 GET_MODE_NUNITS (mode));
3262 return int_mode_for_size (elt_bits, 0).require ();
3263}
3264
cc68f7c2
RS
3265/* Return an integer element mode that contains exactly
3266 aarch64_sve_container_bits (MODE) bits. This is wider than
3267 aarch64_sve_element_int_mode if MODE is a partial vector,
3268 otherwise it's the same. */
3269
3270static scalar_int_mode
3271aarch64_sve_container_int_mode (machine_mode mode)
3272{
3273 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
3274}
3275
d7a09c44 3276/* Return the integer vector mode associated with SVE mode MODE.
d083ee47 3277 Unlike related_int_vector_mode, this can handle the case in which
d7a09c44
RS
3278 MODE is a predicate (and thus has a different total size). */
3279
624d0f07 3280machine_mode
d7a09c44
RS
3281aarch64_sve_int_mode (machine_mode mode)
3282{
3283 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
3284 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
3285}
3286
74166aab
RS
3287/* Implement TARGET_VECTORIZE_RELATED_MODE. */
3288
3289static opt_machine_mode
3290aarch64_vectorize_related_mode (machine_mode vector_mode,
3291 scalar_mode element_mode,
3292 poly_uint64 nunits)
3293{
3294 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
3295
cc68f7c2
RS
3296 /* If we're operating on SVE vectors, try to return an SVE mode. */
3297 poly_uint64 sve_nunits;
3298 if ((vec_flags & VEC_SVE_DATA)
3299 && multiple_p (BYTES_PER_SVE_VECTOR,
3300 GET_MODE_SIZE (element_mode), &sve_nunits))
3301 {
3302 machine_mode sve_mode;
3303 if (maybe_ne (nunits, 0U))
3304 {
3305 /* Try to find a full or partial SVE mode with exactly
3306 NUNITS units. */
3307 if (multiple_p (sve_nunits, nunits)
3308 && aarch64_sve_data_mode (element_mode,
3309 nunits).exists (&sve_mode))
3310 return sve_mode;
3311 }
3312 else
3313 {
3314 /* Take the preferred number of units from the number of bytes
3315 that fit in VECTOR_MODE. We always start by "autodetecting"
3316 a full vector mode with preferred_simd_mode, so vectors
3317 chosen here will also be full vector modes. Then
3318 autovectorize_vector_modes tries smaller starting modes
3319 and thus smaller preferred numbers of units. */
3320 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
3321 if (aarch64_sve_data_mode (element_mode,
3322 sve_nunits).exists (&sve_mode))
3323 return sve_mode;
3324 }
3325 }
3326
74166aab
RS
3327 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
3328 if ((vec_flags & VEC_ADVSIMD)
3329 && known_eq (nunits, 0U)
3330 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
3331 && maybe_ge (GET_MODE_BITSIZE (element_mode)
3332 * GET_MODE_NUNITS (vector_mode), 128U))
3333 {
3334 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
3335 if (VECTOR_MODE_P (res))
3336 return res;
3337 }
3338
3339 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
3340}
3341
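/* Two illustrative cases of the hook above: for SVE, asking for an
   HImode-element relative of VNx4SImode with NUNITS == 0 gives VNx8HImode
   (eight halfwords per 128-bit granule); for Advanced SIMD, asking for a
   DFmode-element relative of V2SImode gives the single 128-bit V2DFmode
   rather than two 64-bit vectors, per the preference noted above.  */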
b41d1f6e
RS
3342/* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
3343 prefer to use the first arithmetic operand as the else value if
3344 the else value doesn't matter, since that exactly matches the SVE
3345 destructive merging form. For ternary operations we could either
3346 pick the first operand and use FMAD-like instructions or the last
3347 operand and use FMLA-like instructions; the latter seems more
3348 natural. */
6a86928d
RS
3349
3350static tree
b41d1f6e 3351aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
6a86928d 3352{
b41d1f6e 3353 return nops == 3 ? ops[2] : ops[0];
6a86928d
RS
3354}
3355
c43f4279 3356/* Implement TARGET_HARD_REGNO_NREGS. */
43e9d192 3357
c43f4279 3358static unsigned int
ef4bddc2 3359aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
43e9d192 3360{
6a70badb
RS
3361 /* ??? Logically we should only need to provide a value when
3362 HARD_REGNO_MODE_OK says that the combination is valid,
3363 but at the moment we need to handle all modes. Just ignore
3364 any runtime parts for registers that can't store them. */
3365 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
43e9d192
IB
3366 switch (aarch64_regno_regclass (regno))
3367 {
3368 case FP_REGS:
3369 case FP_LO_REGS:
163b1f6a 3370 case FP_LO8_REGS:
550a3380
RS
3371 {
3372 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3373 if (vec_flags & VEC_SVE_DATA)
3374 return exact_div (GET_MODE_SIZE (mode),
3375 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
66f206b8
JW
3376 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
3377 return GET_MODE_SIZE (mode).to_constant () / 8;
550a3380
RS
3378 return CEIL (lowest_size, UNITS_PER_VREG);
3379 }
43cacb12
RS
3380 case PR_REGS:
3381 case PR_LO_REGS:
3382 case PR_HI_REGS:
183bfdaf
RS
3383 case FFR_REGS:
3384 case PR_AND_FFR_REGS:
43cacb12 3385 return 1;
43e9d192 3386 default:
6a70badb 3387 return CEIL (lowest_size, UNITS_PER_WORD);
43e9d192
IB
3388 }
3389 gcc_unreachable ();
3390}
3391
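/* For example: a 16-byte TImode value needs CEIL (16, UNITS_PER_WORD) == 2
   general registers but only CEIL (16, UNITS_PER_VREG) == 1 FP register,
   while an SVE data mode such as VNx4SImode always occupies exactly one
   Z register, because its size and the vector length scale together.  */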
f939c3e6 3392/* Implement TARGET_HARD_REGNO_MODE_OK. */
43e9d192 3393
f939c3e6 3394static bool
ef4bddc2 3395aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
43e9d192 3396{
dd159a41
PW
3397 if (mode == V8DImode)
3398 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
3399 && multiple_p (regno - R0_REGNUM, 2);
3400
43e9d192
IB
3401 if (GET_MODE_CLASS (mode) == MODE_CC)
3402 return regno == CC_REGNUM;
3403
43cacb12
RS
3404 if (regno == VG_REGNUM)
3405 /* This must have the same size as _Unwind_Word. */
3406 return mode == DImode;
3407
3408 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3409 if (vec_flags & VEC_SVE_PRED)
183bfdaf 3410 return pr_or_ffr_regnum_p (regno);
43cacb12 3411
183bfdaf
RS
3412 if (pr_or_ffr_regnum_p (regno))
3413 return false;
43cacb12 3414
9259db42
YZ
3415 if (regno == SP_REGNUM)
3416 /* The purpose of comparing with ptr_mode is to support the
3417 global register variable associated with the stack pointer
3418 register via the syntax of asm ("wsp") in ILP32. */
3419 return mode == Pmode || mode == ptr_mode;
3420
3421 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
43e9d192
IB
3422 return mode == Pmode;
3423
563cc649
RH
3424 if (GP_REGNUM_P (regno))
3425 {
aa1a2795
RS
3426 if (vec_flags & VEC_ANY_SVE)
3427 return false;
563cc649
RH
3428 if (known_le (GET_MODE_SIZE (mode), 8))
3429 return true;
aa1a2795 3430 if (known_le (GET_MODE_SIZE (mode), 16))
563cc649
RH
3431 return (regno & 1) == 0;
3432 }
3433 else if (FP_REGNUM_P (regno))
43e9d192 3434 {
43cacb12 3435 if (vec_flags & VEC_STRUCT)
4edd6298 3436 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
43e9d192 3437 else
43cacb12 3438 return !VECTOR_MODE_P (mode) || vec_flags != 0;
43e9d192
IB
3439 }
3440
f939c3e6 3441 return false;
43e9d192
IB
3442}
3443
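/* For example: a TImode value is accepted in a general register only at an
   even register number (it occupies a pair), any SVE mode is rejected for
   general registers, and V8DImode is only accepted at even-numbered
   registers in the range x0-x22.  */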
c600df9a
RS
3444/* Return true if a function with type FNTYPE returns its value in
3445 SVE vector or predicate registers. */
3446
3447static bool
3448aarch64_returns_value_in_sve_regs_p (const_tree fntype)
3449{
c600df9a 3450 tree return_type = TREE_TYPE (fntype);
38e62001
RS
3451
3452 pure_scalable_type_info pst_info;
3453 switch (pst_info.analyze (return_type))
3454 {
3455 case pure_scalable_type_info::IS_PST:
3456 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
3457 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
3458
3459 case pure_scalable_type_info::DOESNT_MATTER:
3460 gcc_assert (aarch64_return_in_memory_1 (return_type));
3461 return false;
3462
3463 case pure_scalable_type_info::NO_ABI_IDENTITY:
3464 case pure_scalable_type_info::ISNT_PST:
3465 return false;
3466 }
3467 gcc_unreachable ();
c600df9a
RS
3468}
3469
3470/* Return true if a function with type FNTYPE takes arguments in
3471 SVE vector or predicate registers. */
3472
3473static bool
3474aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
3475{
3476 CUMULATIVE_ARGS args_so_far_v;
3477 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
3478 NULL_TREE, 0, true);
3479 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
3480
3481 for (tree chain = TYPE_ARG_TYPES (fntype);
3482 chain && chain != void_list_node;
3483 chain = TREE_CHAIN (chain))
3484 {
3485 tree arg_type = TREE_VALUE (chain);
3486 if (arg_type == error_mark_node)
3487 return false;
3488
3489 function_arg_info arg (arg_type, /*named=*/true);
3490 apply_pass_by_reference_rules (&args_so_far_v, arg);
38e62001
RS
3491 pure_scalable_type_info pst_info;
3492 if (pst_info.analyze_registers (arg.type))
3493 {
3494 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
3495 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
3496 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
3497 return true;
3498 }
c600df9a
RS
3499
3500 targetm.calls.function_arg_advance (args_so_far, arg);
3501 }
3502 return false;
3503}
3504
002ffd3c
RS
3505/* Implement TARGET_FNTYPE_ABI. */
3506
3507static const predefined_function_abi &
3508aarch64_fntype_abi (const_tree fntype)
3509{
3510 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
3511 return aarch64_simd_abi ();
c600df9a
RS
3512
3513 if (aarch64_returns_value_in_sve_regs_p (fntype)
3514 || aarch64_takes_arguments_in_sve_regs_p (fntype))
3515 return aarch64_sve_abi ();
3516
002ffd3c
RS
3517 return default_function_abi;
3518}
3519
482b2b43
RS
3520/* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
3521
3522static bool
3523aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
3524{
3525 return (aarch64_sve::builtin_type_p (type1)
3526 == aarch64_sve::builtin_type_p (type2));
3527}
3528
c600df9a 3529/* Return true if we should emit CFI for register REGNO. */
a0d0b980
SE
3530
3531static bool
c600df9a 3532aarch64_emit_cfi_for_reg_p (unsigned int regno)
a0d0b980 3533{
c600df9a
RS
3534 return (GP_REGNUM_P (regno)
3535 || !default_function_abi.clobbers_full_reg_p (regno));
a0d0b980
SE
3536}
3537
c600df9a 3538/* Return the mode we should use to save and restore register REGNO. */
a0d0b980
SE
3539
3540static machine_mode
c600df9a 3541aarch64_reg_save_mode (unsigned int regno)
a0d0b980 3542{
c600df9a
RS
3543 if (GP_REGNUM_P (regno))
3544 return DImode;
3545
3546 if (FP_REGNUM_P (regno))
3547 switch (crtl->abi->id ())
3548 {
3549 case ARM_PCS_AAPCS64:
3550 /* Only the low 64 bits are saved by the base PCS. */
3551 return DFmode;
3552
3553 case ARM_PCS_SIMD:
3554 /* The vector PCS saves the low 128 bits (which is the full
3555 register on non-SVE targets). */
3556 return TFmode;
3557
3558 case ARM_PCS_SVE:
3559 /* Use vectors of DImode for registers that need frame
 3560	   information, so that the first 64 bits of the save slot
3561 are always the equivalent of what storing D<n> would give. */
3562 if (aarch64_emit_cfi_for_reg_p (regno))
3563 return VNx2DImode;
3564
3565 /* Use vectors of bytes otherwise, so that the layout is
3566 endian-agnostic, and so that we can use LDR and STR for
3567 big-endian targets. */
3568 return VNx16QImode;
3569
3570 case ARM_PCS_TLSDESC:
3571 case ARM_PCS_UNKNOWN:
3572 break;
3573 }
3574
3575 if (PR_REGNUM_P (regno))
3576 /* Save the full predicate register. */
3577 return VNx16BImode;
3578
3579 gcc_unreachable ();
a0d0b980
SE
3580}
3581
5a5a3bc5 3582/* Implement TARGET_INSN_CALLEE_ABI. */
b3650d40 3583
5a5a3bc5
RS
3584const predefined_function_abi &
3585aarch64_insn_callee_abi (const rtx_insn *insn)
b3650d40 3586{
08cc4d92
RS
3587 rtx pat = PATTERN (insn);
3588 gcc_assert (GET_CODE (pat) == PARALLEL);
3589 rtx unspec = XVECEXP (pat, 0, 1);
3590 gcc_assert (GET_CODE (unspec) == UNSPEC
3591 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
3592 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
b3650d40
SE
3593}
3594
80ec73f4
RS
3595/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
3596 the lower 64 bits of a 128-bit register. Tell the compiler the callee
3597 clobbers the top 64 bits when restoring the bottom 64 bits. */
3598
3599static bool
6ee2cc70
RS
3600aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
3601 unsigned int regno,
473574ee 3602 machine_mode mode)
80ec73f4 3603{
c600df9a 3604 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
51051f47 3605 {
51051f47
RS
3606 poly_int64 per_register_size = GET_MODE_SIZE (mode);
3607 unsigned int nregs = hard_regno_nregs (regno, mode);
3608 if (nregs > 1)
3609 per_register_size = exact_div (per_register_size, nregs);
bb6ce448
RS
3610 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
3611 return maybe_gt (per_register_size, 16);
3612 return maybe_gt (per_register_size, 8);
51051f47
RS
3613 }
3614 return false;
473574ee
SE
3615}
3616
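/* For example, a V4SImode value live in v8 across a call to a base-PCS
   (AAPCS64) function is partially clobbered, because only the low 8 bytes
   of each FP register are preserved; across a call to an
   aarch64_vector_pcs function the same value survives, since that ABI
   preserves the low 16 bytes.  */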
43cacb12
RS
3617/* Implement REGMODE_NATURAL_SIZE. */
3618poly_uint64
3619aarch64_regmode_natural_size (machine_mode mode)
3620{
3621 /* The natural size for SVE data modes is one SVE data vector,
3622 and similarly for predicates. We can't independently modify
3623 anything smaller than that. */
3624 /* ??? For now, only do this for variable-width SVE registers.
3625 Doing it for constant-sized registers breaks lower-subreg.c. */
3626 /* ??? And once that's fixed, we should probably have similar
3627 code for Advanced SIMD. */
3628 if (!aarch64_sve_vg.is_constant ())
3629 {
3630 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3631 if (vec_flags & VEC_SVE_PRED)
3632 return BYTES_PER_SVE_PRED;
3633 if (vec_flags & VEC_SVE_DATA)
3634 return BYTES_PER_SVE_VECTOR;
3635 }
3636 return UNITS_PER_WORD;
3637}
3638
73d9ac6a 3639/* Implement HARD_REGNO_CALLER_SAVE_MODE. */
ef4bddc2 3640machine_mode
43cacb12
RS
3641aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
3642 machine_mode mode)
3643{
3644 /* The predicate mode determines which bits are significant and
3645 which are "don't care". Decreasing the number of lanes would
3646 lose data while increasing the number of lanes would make bits
3647 unnecessarily significant. */
3648 if (PR_REGNUM_P (regno))
3649 return mode;
6a70badb
RS
3650 if (known_ge (GET_MODE_SIZE (mode), 4))
3651 return mode;
73d9ac6a 3652 else
6a70badb 3653 return SImode;
73d9ac6a
IB
3654}
3655
231c52ae
ST
3656/* Return true if I's bits are consecutive ones from the MSB. */
3657bool
3658aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
3659{
3660 return exact_log2 (-i) != HOST_WIDE_INT_M1;
3661}
3662
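/* For example, 0xffffffffffff0000 qualifies: its negation is 0x10000, a
   power of two, so exact_log2 succeeds.  0xff00ff0000000000 does not, and
   neither does zero (exact_log2 (0) is -1).  */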
58e17cf8
RS
3663/* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
3664 that strcpy from constants will be faster. */
3665
3666static HOST_WIDE_INT
3667aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
3668{
3669 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
3670 return MAX (align, BITS_PER_WORD);
3671 return align;
3672}
3673
43e9d192
IB
3674/* Return true if calls to DECL should be treated as
 3675	   long-calls (i.e. called via a register).  */
3676static bool
3677aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
3678{
3679 return false;
3680}
3681
3682/* Return true if calls to symbol-ref SYM should be treated as
 3683	   long-calls (i.e. called via a register).  */
3684bool
3685aarch64_is_long_call_p (rtx sym)
3686{
3687 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
3688}
3689
b60d63cb
JW
3690/* Return true if calls to symbol-ref SYM should not go through
3691 plt stubs. */
3692
3693bool
3694aarch64_is_noplt_call_p (rtx sym)
3695{
3696 const_tree decl = SYMBOL_REF_DECL (sym);
3697
3698 if (flag_pic
3699 && decl
3700 && (!flag_plt
3701 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
3702 && !targetm.binds_local_p (decl))
3703 return true;
3704
3705 return false;
3706}
3707
43e9d192
IB
3708/* Emit an insn that's a simple single-set. Both the operands must be
3709 known to be valid. */
827ab47a 3710inline static rtx_insn *
43e9d192
IB
3711emit_set_insn (rtx x, rtx y)
3712{
f7df4a84 3713 return emit_insn (gen_rtx_SET (x, y));
43e9d192
IB
3714}
3715
3716/* X and Y are two things to compare using CODE. Emit the compare insn and
 3717	   return the rtx for the CC register in the proper mode.  */
3718rtx
3719aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
3720{
4a2095eb
RH
3721 machine_mode cmp_mode = GET_MODE (x);
3722 machine_mode cc_mode;
3723 rtx cc_reg;
43e9d192 3724
4a2095eb
RH
3725 if (cmp_mode == TImode)
3726 {
3727 gcc_assert (code == NE);
3728
3729 cc_mode = CCmode;
3730 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3731
3732 rtx x_lo = operand_subword (x, 0, 0, TImode);
3733 rtx y_lo = operand_subword (y, 0, 0, TImode);
3734 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
3735
3736 rtx x_hi = operand_subword (x, 1, 0, TImode);
3737 rtx y_hi = operand_subword (y, 1, 0, TImode);
865257c4
RS
3738 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
3739 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
3740 GEN_INT (AARCH64_EQ)));
4a2095eb
RH
3741 }
3742 else
3743 {
3744 cc_mode = SELECT_CC_MODE (code, x, y);
3745 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3746 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
3747 }
43e9d192
IB
3748 return cc_reg;
3749}
3750
d400fda3
RH
3751/* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
3752
3753static rtx
3754aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
3755 machine_mode y_mode)
3756{
3757 if (y_mode == E_QImode || y_mode == E_HImode)
3758 {
3759 if (CONST_INT_P (y))
df562b12
JJ
3760 {
3761 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
3762 y_mode = SImode;
3763 }
d400fda3
RH
3764 else
3765 {
3766 rtx t, cc_reg;
3767 machine_mode cc_mode;
3768
3769 t = gen_rtx_ZERO_EXTEND (SImode, y);
3770 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
3771 cc_mode = CC_SWPmode;
3772 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3773 emit_set_insn (cc_reg, t);
3774 return cc_reg;
3775 }
3776 }
3777
846f78d4
PK
3778 if (!aarch64_plus_operand (y, y_mode))
3779 y = force_reg (y_mode, y);
3780
d400fda3
RH
3781 return aarch64_gen_compare_reg (code, x, y);
3782}
3783
43e9d192
IB
3784/* Build the SYMBOL_REF for __tls_get_addr. */
3785
3786static GTY(()) rtx tls_get_addr_libfunc;
3787
3788rtx
3789aarch64_tls_get_addr (void)
3790{
3791 if (!tls_get_addr_libfunc)
3792 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
3793 return tls_get_addr_libfunc;
3794}
3795
3796/* Return the TLS model to use for ADDR. */
3797
3798static enum tls_model
3799tls_symbolic_operand_type (rtx addr)
3800{
3801 enum tls_model tls_kind = TLS_MODEL_NONE;
74b27d8e
RS
3802 poly_int64 offset;
3803 addr = strip_offset_and_salt (addr, &offset);
3793ecc1 3804 if (SYMBOL_REF_P (addr))
43e9d192
IB
3805 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
3806
3807 return tls_kind;
3808}
3809
3810/* We'll allow lo_sum's in addresses in our legitimate addresses
3811 so that combine would take care of combining addresses where
3812 necessary, but for generation purposes, we'll generate the address
 3813	   as:
 3814	   RTL                                Absolute
 3815	   tmp = hi (symbol_ref);             adrp  x1, foo
 3816	   dest = lo_sum (tmp, symbol_ref);   add   dest, x1, :lo12:foo
 3817	                                      nop
 3818
 3819	   PIC                                TLS
 3820	   adrp x1, :got:foo                  adrp tmp, :tlsgd:foo
 3821	   ldr  x1, [:got_lo12:foo]           add  dest, tmp, :tlsgd_lo12:foo
 3822	                                      bl   __tls_get_addr
 3823	   nop
3824
3825 Load TLS symbol, depending on TLS mechanism and TLS access model.
3826
3827 Global Dynamic - Traditional TLS:
3828 adrp tmp, :tlsgd:imm
3829 add dest, tmp, #:tlsgd_lo12:imm
3830 bl __tls_get_addr
3831
3832 Global Dynamic - TLS Descriptors:
3833 adrp dest, :tlsdesc:imm
3834 ldr tmp, [dest, #:tlsdesc_lo12:imm]
3835 add dest, dest, #:tlsdesc_lo12:imm
3836 blr tmp
3837 mrs tp, tpidr_el0
3838 add dest, dest, tp
3839
3840 Initial Exec:
3841 mrs tp, tpidr_el0
3842 adrp tmp, :gottprel:imm
3843 ldr dest, [tmp, #:gottprel_lo12:imm]
3844 add dest, dest, tp
3845
3846 Local Exec:
3847 mrs tp, tpidr_el0
0699caae
RL
3848 add t0, tp, #:tprel_hi12:imm, lsl #12
3849 add t0, t0, #:tprel_lo12_nc:imm
43e9d192
IB
3850*/
3851
3852static void
3853aarch64_load_symref_appropriately (rtx dest, rtx imm,
3854 enum aarch64_symbol_type type)
3855{
3856 switch (type)
3857 {
3858 case SYMBOL_SMALL_ABSOLUTE:
3859 {
28514dda 3860 /* In ILP32, the mode of dest can be either SImode or DImode. */
43e9d192 3861 rtx tmp_reg = dest;
ef4bddc2 3862 machine_mode mode = GET_MODE (dest);
28514dda
YZ
3863
3864 gcc_assert (mode == Pmode || mode == ptr_mode);
3865
43e9d192 3866 if (can_create_pseudo_p ())
28514dda 3867 tmp_reg = gen_reg_rtx (mode);
43e9d192 3868
28514dda 3869 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
43e9d192
IB
3870 emit_insn (gen_add_losym (dest, tmp_reg, imm));
3871 return;
3872 }
3873
a5350ddc 3874 case SYMBOL_TINY_ABSOLUTE:
f7df4a84 3875 emit_insn (gen_rtx_SET (dest, imm));
a5350ddc
CSS
3876 return;
3877
1b1e81f8
JW
3878 case SYMBOL_SMALL_GOT_28K:
3879 {
3880 machine_mode mode = GET_MODE (dest);
3881 rtx gp_rtx = pic_offset_table_rtx;
53021678
JW
3882 rtx insn;
3883 rtx mem;
1b1e81f8
JW
3884
 3885	 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
 3886	    here before RTL expansion.  Tree IVOPTS will generate RTL patterns
 3887	    to decide rtx costs, in which case pic_offset_table_rtx is not
 3888	    initialized.  In that case there is no need to generate the first adrp
026c3cfd 3889	    instruction, as the final cost of a global variable access is
1b1e81f8
JW
 3890	    one instruction.  */
3891 if (gp_rtx != NULL)
3892 {
 3893	    /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
 3894	       use the page base as the GOT base, the first page may be wasted;
 3895	       in the worst case there is only 28K of space for the GOT).
 3896
 3897	       The generated instruction sequence for accessing a global variable
 3898	       is:
3899
a3957742 3900 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1b1e81f8
JW
3901
 3902	       Only one instruction is needed.  But we must initialize
 3903	       pic_offset_table_rtx properly.  We generate an initialization insn
 3904	       for every global access, and allow CSE to remove all redundant ones.
 3905
 3906	       The final instruction sequence will look like the following
 3907	       for multiple global variable accesses.
3908
a3957742 3909 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1b1e81f8 3910
a3957742
JW
3911 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
3912 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
3913 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
3914 ... */
1b1e81f8
JW
3915
3916 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
3917 crtl->uses_pic_offset_table = 1;
3918 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
3919
3920 if (mode != GET_MODE (gp_rtx))
4ba8f0a3
AP
3921 gp_rtx = gen_lowpart (mode, gp_rtx);
3922
1b1e81f8
JW
3923 }
3924
3925 if (mode == ptr_mode)
3926 {
3927 if (mode == DImode)
53021678 3928 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1b1e81f8 3929 else
53021678
JW
3930 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
3931
3932 mem = XVECEXP (SET_SRC (insn), 0, 0);
1b1e81f8
JW
3933 }
3934 else
3935 {
3936 gcc_assert (mode == Pmode);
53021678
JW
3937
3938 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
3939 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1b1e81f8
JW
3940 }
3941
53021678
JW
 3942	  /* The operand is expected to be a MEM.  Whenever the related insn
 3943	     pattern is changed, the above code which calculates MEM should be
 3944	     updated accordingly.  */
3793ecc1 3945 gcc_assert (MEM_P (mem));
53021678
JW
3946 MEM_READONLY_P (mem) = 1;
3947 MEM_NOTRAP_P (mem) = 1;
3948 emit_insn (insn);
1b1e81f8
JW
3949 return;
3950 }
3951
6642bdb4 3952 case SYMBOL_SMALL_GOT_4G:
a195c727
WD
3953 emit_insn (gen_rtx_SET (dest, imm));
3954 return;
43e9d192
IB
3955
3956 case SYMBOL_SMALL_TLSGD:
3957 {
5d8a22a5 3958 rtx_insn *insns;
87ca615a
AP
3959 /* The return type of __tls_get_addr is the C pointer type
3960 so use ptr_mode. */
3961 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
3962 rtx tmp_reg = dest;
3963
3964 if (GET_MODE (dest) != ptr_mode)
3965 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
43e9d192
IB
3966
3967 start_sequence ();
87ca615a 3968 if (ptr_mode == SImode)
23b88fda
N
3969 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
3970 else
3971 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
43e9d192
IB
3972 insns = get_insns ();
3973 end_sequence ();
3974
3975 RTL_CONST_CALL_P (insns) = 1;
87ca615a
AP
3976 emit_libcall_block (insns, tmp_reg, result, imm);
3977 /* Convert back to the mode of the dest adding a zero_extend
3978 from SImode (ptr_mode) to DImode (Pmode). */
3979 if (dest != tmp_reg)
3980 convert_move (dest, tmp_reg, true);
43e9d192
IB
3981 return;
3982 }
3983
3984 case SYMBOL_SMALL_TLSDESC:
3985 {
ef4bddc2 3986 machine_mode mode = GET_MODE (dest);
621ad2de 3987 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
43e9d192
IB
3988 rtx tp;
3989
621ad2de
AP
3990 gcc_assert (mode == Pmode || mode == ptr_mode);
3991
2876a13f
JW
3992 /* In ILP32, the got entry is always of SImode size. Unlike
3993 small GOT, the dest is fixed at reg 0. */
3994 if (TARGET_ILP32)
3995 emit_insn (gen_tlsdesc_small_si (imm));
621ad2de 3996 else
2876a13f 3997 emit_insn (gen_tlsdesc_small_di (imm));
43e9d192 3998 tp = aarch64_load_tp (NULL);
621ad2de
AP
3999
4000 if (mode != Pmode)
4001 tp = gen_lowpart (mode, tp);
4002
2876a13f 4003 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
241dbd9d
QZ
4004 if (REG_P (dest))
4005 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
43e9d192
IB
4006 return;
4007 }
4008
79496620 4009 case SYMBOL_SMALL_TLSIE:
43e9d192 4010 {
621ad2de
AP
 4011	/* In ILP32, the mode of dest can be either SImode or DImode,
 4012	   while the GOT entry is always of SImode size.  The mode of
 4013	   dest depends on how dest is used: if dest is assigned to a
 4014	   pointer (e.g. stored in memory), it has SImode; it may have
 4015	   DImode if dest is dereferenced to access memory.
 4016	   This is why we have to handle three different tlsie_small
 4017	   patterns here (two patterns for ILP32).  */
ef4bddc2 4018 machine_mode mode = GET_MODE (dest);
621ad2de 4019 rtx tmp_reg = gen_reg_rtx (mode);
43e9d192 4020 rtx tp = aarch64_load_tp (NULL);
621ad2de
AP
4021
4022 if (mode == ptr_mode)
4023 {
4024 if (mode == DImode)
4025 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
4026 else
4027 {
4028 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
4029 tp = gen_lowpart (mode, tp);
4030 }
4031 }
4032 else
4033 {
4034 gcc_assert (mode == Pmode);
4035 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
4036 }
4037
f7df4a84 4038 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
241dbd9d
QZ
4039 if (REG_P (dest))
4040 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
43e9d192
IB
4041 return;
4042 }
4043
cbf5629e 4044 case SYMBOL_TLSLE12:
d18ba284 4045 case SYMBOL_TLSLE24:
cbf5629e
JW
4046 case SYMBOL_TLSLE32:
4047 case SYMBOL_TLSLE48:
43e9d192 4048 {
cbf5629e 4049 machine_mode mode = GET_MODE (dest);
43e9d192 4050 rtx tp = aarch64_load_tp (NULL);
e6f7f0e9 4051
cbf5629e
JW
4052 if (mode != Pmode)
4053 tp = gen_lowpart (mode, tp);
4054
4055 switch (type)
4056 {
4057 case SYMBOL_TLSLE12:
4058 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
4059 (dest, tp, imm));
4060 break;
4061 case SYMBOL_TLSLE24:
4062 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
4063 (dest, tp, imm));
4064 break;
4065 case SYMBOL_TLSLE32:
4066 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
4067 (dest, imm));
4068 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4069 (dest, dest, tp));
4070 break;
4071 case SYMBOL_TLSLE48:
4072 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
4073 (dest, imm));
4074 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4075 (dest, dest, tp));
4076 break;
4077 default:
4078 gcc_unreachable ();
4079 }
e6f7f0e9 4080
241dbd9d
QZ
4081 if (REG_P (dest))
4082 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
43e9d192
IB
4083 return;
4084 }
4085
87dd8ab0 4086 case SYMBOL_TINY_GOT:
d91480de
D
4087 {
4088 rtx insn;
4089 machine_mode mode = GET_MODE (dest);
4090
4091 if (mode == ptr_mode)
4092 insn = gen_ldr_got_tiny (mode, dest, imm);
4093 else
4094 {
4095 gcc_assert (mode == Pmode);
4096 insn = gen_ldr_got_tiny_sidi (dest, imm);
4097 }
4098
4099 emit_insn (insn);
4100 return;
4101 }
87dd8ab0 4102
5ae7caad
JW
4103 case SYMBOL_TINY_TLSIE:
4104 {
4105 machine_mode mode = GET_MODE (dest);
4106 rtx tp = aarch64_load_tp (NULL);
4107
4108 if (mode == ptr_mode)
4109 {
4110 if (mode == DImode)
4111 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
4112 else
4113 {
4114 tp = gen_lowpart (mode, tp);
4115 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
4116 }
4117 }
4118 else
4119 {
4120 gcc_assert (mode == Pmode);
4121 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
4122 }
4123
241dbd9d
QZ
4124 if (REG_P (dest))
4125 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
5ae7caad
JW
4126 return;
4127 }
4128
43e9d192
IB
4129 default:
4130 gcc_unreachable ();
4131 }
4132}
4133
4134/* Emit a move from SRC to DEST. Assume that the move expanders can
4135 handle all moves if !can_create_pseudo_p (). The distinction is
4136 important because, unlike emit_move_insn, the move expanders know
4137 how to force Pmode objects into the constant pool even when the
4138 constant pool address is not itself legitimate. */
4139static rtx
4140aarch64_emit_move (rtx dest, rtx src)
4141{
4142 return (can_create_pseudo_p ()
4143 ? emit_move_insn (dest, src)
4144 : emit_move_insn_1 (dest, src));
4145}
4146
f22d7973
RS
4147/* Apply UNOPTAB to OP and store the result in DEST. */
4148
4149static void
4150aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
4151{
4152 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
4153 if (dest != tmp)
4154 emit_move_insn (dest, tmp);
4155}
4156
4157/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
4158
4159static void
4160aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
4161{
4162 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
4163 OPTAB_DIRECT);
4164 if (dest != tmp)
4165 emit_move_insn (dest, tmp);
4166}
4167
030d03b8
RE
4168/* Split a 128-bit move operation into two 64-bit move operations,
4169 taking care to handle partial overlap of register to register
4170 copies. Special cases are needed when moving between GP regs and
4171 FP regs. SRC can be a register, constant or memory; DST a register
4172 or memory. If either operand is memory it must not have any side
4173 effects. */
43e9d192
IB
4174void
4175aarch64_split_128bit_move (rtx dst, rtx src)
4176{
030d03b8
RE
4177 rtx dst_lo, dst_hi;
4178 rtx src_lo, src_hi;
43e9d192 4179
ef4bddc2 4180 machine_mode mode = GET_MODE (dst);
12dc6974 4181
030d03b8
RE
4182 gcc_assert (mode == TImode || mode == TFmode);
4183 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
4184 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
43e9d192
IB
4185
4186 if (REG_P (dst) && REG_P (src))
4187 {
030d03b8
RE
4188 int src_regno = REGNO (src);
4189 int dst_regno = REGNO (dst);
43e9d192 4190
030d03b8 4191 /* Handle FP <-> GP regs. */
43e9d192
IB
4192 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
4193 {
030d03b8
RE
4194 src_lo = gen_lowpart (word_mode, src);
4195 src_hi = gen_highpart (word_mode, src);
4196
0016d8d9
RS
4197 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
4198 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
030d03b8 4199 return;
43e9d192
IB
4200 }
4201 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
4202 {
030d03b8
RE
4203 dst_lo = gen_lowpart (word_mode, dst);
4204 dst_hi = gen_highpart (word_mode, dst);
4205
0016d8d9
RS
4206 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
4207 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
030d03b8 4208 return;
43e9d192 4209 }
43e9d192
IB
4210 }
4211
030d03b8
RE
4212 dst_lo = gen_lowpart (word_mode, dst);
4213 dst_hi = gen_highpart (word_mode, dst);
4214 src_lo = gen_lowpart (word_mode, src);
4215 src_hi = gen_highpart_mode (word_mode, mode, src);
4216
4217 /* At most one pairing may overlap. */
4218 if (reg_overlap_mentioned_p (dst_lo, src_hi))
4219 {
4220 aarch64_emit_move (dst_hi, src_hi);
4221 aarch64_emit_move (dst_lo, src_lo);
4222 }
4223 else
4224 {
4225 aarch64_emit_move (dst_lo, src_lo);
4226 aarch64_emit_move (dst_hi, src_hi);
4227 }
43e9d192
IB
4228}
4229
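/* For example (little-endian, GP registers): copying a TImode value from
   {x0, x1} to {x1, x2} finds that the destination low word x1 overlaps the
   source high word, so the high halves are moved first (x2 = x1, then
   x1 = x0); copying from {x1, x2} to {x0, x1} has no such overlap and the
   low halves are moved first.  */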
d4f9e819
RS
4230/* Return true if we should split a move from 128-bit value SRC
4231 to 128-bit register DEST. */
4232
43e9d192
IB
4233bool
4234aarch64_split_128bit_move_p (rtx dst, rtx src)
4235{
d4f9e819
RS
4236 if (FP_REGNUM_P (REGNO (dst)))
4237 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
4238 /* All moves to GPRs need to be split. */
4239 return true;
43e9d192
IB
4240}
4241
8b033a8a
SN
4242/* Split a complex SIMD combine. */
4243
4244void
4245aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
4246{
ef4bddc2
RS
4247 machine_mode src_mode = GET_MODE (src1);
4248 machine_mode dst_mode = GET_MODE (dst);
8b033a8a
SN
4249
4250 gcc_assert (VECTOR_MODE_P (dst_mode));
a977dc0c
MC
4251 gcc_assert (register_operand (dst, dst_mode)
4252 && register_operand (src1, src_mode)
4253 && register_operand (src2, src_mode));
8b033a8a 4254
0016d8d9 4255 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
a977dc0c 4256 return;
8b033a8a
SN
4257}
4258
fd4842cd
SN
4259/* Split a complex SIMD move. */
4260
4261void
4262aarch64_split_simd_move (rtx dst, rtx src)
4263{
ef4bddc2
RS
4264 machine_mode src_mode = GET_MODE (src);
4265 machine_mode dst_mode = GET_MODE (dst);
fd4842cd
SN
4266
4267 gcc_assert (VECTOR_MODE_P (dst_mode));
4268
4269 if (REG_P (dst) && REG_P (src))
4270 {
4271 gcc_assert (VECTOR_MODE_P (src_mode));
0016d8d9 4272 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
fd4842cd
SN
4273 }
4274}
4275
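/* Return true if X (of mode XMODE) is equal to the zero-extension of Y
   from mode YMODE.  */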
ef22810a
RH
4276bool
4277aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
4278 machine_mode ymode, rtx y)
4279{
4280 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
4281 gcc_assert (r != NULL);
4282 return rtx_equal_p (x, r);
4283}
ef22810a 4284
678faefc
RS
4285/* Return TARGET if it is nonnull and a register of mode MODE.
4286 Otherwise, return a fresh register of mode MODE if we can,
4287 or TARGET reinterpreted as MODE if we can't. */
4288
4289static rtx
4290aarch64_target_reg (rtx target, machine_mode mode)
4291{
4292 if (target && REG_P (target) && GET_MODE (target) == mode)
4293 return target;
4294 if (!can_create_pseudo_p ())
4295 {
4296 gcc_assert (target);
4297 return gen_lowpart (mode, target);
4298 }
4299 return gen_reg_rtx (mode);
4300}
4301
4302/* Return a register that contains the constant in BUILDER, given that
4303 the constant is a legitimate move operand. Use TARGET as the register
4304 if it is nonnull and convenient. */
4305
4306static rtx
4307aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
4308{
4309 rtx src = builder.build ();
4310 target = aarch64_target_reg (target, GET_MODE (src));
4311 emit_insn (gen_rtx_SET (target, src));
4312 return target;
4313}
4314
43e9d192 4315static rtx
ef4bddc2 4316aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
43e9d192
IB
4317{
4318 if (can_create_pseudo_p ())
e18b4a81 4319 return force_reg (mode, value);
43e9d192
IB
4320 else
4321 {
f5470a77
RS
4322 gcc_assert (x);
4323 aarch64_emit_move (x, value);
43e9d192
IB
4324 return x;
4325 }
4326}
4327
0b1fe8cf
RS
4328/* Return true if predicate value X is a constant in which every element
4329 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
4330 value, i.e. as a predicate in which all bits are significant. */
4331
4332static bool
4333aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
4334{
568b9c0e 4335 if (!CONST_VECTOR_P (x))
0b1fe8cf
RS
4336 return false;
4337
4338 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
4339 GET_MODE_NUNITS (GET_MODE (x)));
4340 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
4341 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
4342 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
4343
4344 unsigned int nelts = const_vector_encoded_nelts (x);
4345 for (unsigned int i = 0; i < nelts; ++i)
4346 {
4347 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
4348 if (!CONST_INT_P (elt))
4349 return false;
4350
4351 builder.quick_push (elt);
4352 for (unsigned int j = 1; j < factor; ++j)
4353 builder.quick_push (const0_rtx);
4354 }
4355 builder.finalize ();
4356 return true;
4357}
4358
4359/* BUILDER contains a predicate constant of mode VNx16BI. Return the
4360 widest predicate element size it can have (that is, the largest size
4361 for which each element would still be 0 or 1). */
4362
4363unsigned int
4364aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
4365{
4366 /* Start with the most optimistic assumption: that we only need
4367 one bit per pattern. This is what we will use if only the first
4368 bit in each pattern is ever set. */
4369 unsigned int mask = GET_MODE_SIZE (DImode);
4370 mask |= builder.npatterns ();
4371
4372 /* Look for set bits. */
4373 unsigned int nelts = builder.encoded_nelts ();
4374 for (unsigned int i = 1; i < nelts; ++i)
4375 if (INTVAL (builder.elt (i)) != 0)
4376 {
4377 if (i & 1)
4378 return 1;
4379 mask |= i;
4380 }
4381 return mask & -mask;
4382}
4383
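/* For example, a PTRUE.S style constant is typically encoded as the
   repeating pattern {1, 0, 0, 0}; every set bit then sits at a multiple of
   four, so the function returns 4.  If any odd-numbered bit were set it
   would return 1 immediately, since only byte elements could represent
   that predicate.  */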
624d0f07
RS
4384/* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
4385 return that predicate mode, otherwise return opt_machine_mode (). */
4386
4387opt_machine_mode
4388aarch64_ptrue_all_mode (rtx x)
4389{
4390 gcc_assert (GET_MODE (x) == VNx16BImode);
568b9c0e 4391 if (!CONST_VECTOR_P (x)
624d0f07
RS
4392 || !CONST_VECTOR_DUPLICATE_P (x)
4393 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
4394 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
4395 return opt_machine_mode ();
4396
4397 unsigned int nelts = const_vector_encoded_nelts (x);
4398 for (unsigned int i = 1; i < nelts; ++i)
4399 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
4400 return opt_machine_mode ();
4401
4402 return aarch64_sve_pred_mode (nelts);
4403}
4404
0b1fe8cf
RS
4405/* BUILDER is a predicate constant of mode VNx16BI. Consider the value
4406 that the constant would have with predicate element size ELT_SIZE
4407 (ignoring the upper bits in each element) and return:
4408
4409 * -1 if all bits are set
4410 * N if the predicate has N leading set bits followed by all clear bits
4411 * 0 if the predicate does not have any of these forms. */
4412
4413int
4414aarch64_partial_ptrue_length (rtx_vector_builder &builder,
4415 unsigned int elt_size)
4416{
4417 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
4418 followed by set bits. */
4419 if (builder.nelts_per_pattern () == 3)
4420 return 0;
4421
4422 /* Skip over leading set bits. */
4423 unsigned int nelts = builder.encoded_nelts ();
4424 unsigned int i = 0;
4425 for (; i < nelts; i += elt_size)
4426 if (INTVAL (builder.elt (i)) == 0)
4427 break;
4428 unsigned int vl = i / elt_size;
4429
4430 /* Check for the all-true case. */
4431 if (i == nelts)
4432 return -1;
4433
4434 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
4435 repeating pattern of set bits followed by clear bits. */
4436 if (builder.nelts_per_pattern () != 2)
4437 return 0;
4438
4439 /* We have a "foreground" value and a duplicated "background" value.
4440 If the background might repeat and the last set bit belongs to it,
4441 we might have set bits followed by clear bits followed by set bits. */
4442 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
4443 return 0;
4444
4445 /* Make sure that the rest are all clear. */
4446 for (; i < nelts; i += elt_size)
4447 if (INTVAL (builder.elt (i)) != 0)
4448 return 0;
4449
4450 return vl;
4451}
4452
4453/* See if there is an svpattern that encodes an SVE predicate of mode
4454 PRED_MODE in which the first VL bits are set and the rest are clear.
4455 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
4456 A VL of -1 indicates an all-true vector. */
4457
4458aarch64_svpattern
4459aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
4460{
4461 if (vl < 0)
4462 return AARCH64_SV_ALL;
4463
4464 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
4465 return AARCH64_NUM_SVPATTERNS;
4466
4467 if (vl >= 1 && vl <= 8)
4468 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
4469
4470 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
4471 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
4472
4473 int max_vl;
4474 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
4475 {
4476 if (vl == (max_vl / 3) * 3)
4477 return AARCH64_SV_MUL3;
4478 /* These would only trigger for non-power-of-2 lengths. */
4479 if (vl == (max_vl & -4))
4480 return AARCH64_SV_MUL4;
4481 if (vl == (1 << floor_log2 (max_vl)))
4482 return AARCH64_SV_POW2;
4483 if (vl == max_vl)
4484 return AARCH64_SV_ALL;
4485 }
4486 return AARCH64_NUM_SVPATTERNS;
4487}
4488
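/* For example, a VL of 3 maps to AARCH64_SV_VL3, a VL of 64 maps to
   AARCH64_SV_VL64 and a VL of -1 maps to AARCH64_SV_ALL.  A length such as
   12 has no direct encoding; unless it happens to equal the POW2, MUL3,
   MUL4 or ALL count of a compile-time-constant vector length, the function
   returns AARCH64_NUM_SVPATTERNS and the caller must build the predicate
   some other way.  */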
34467289
RS
4489/* Return a VNx16BImode constant in which every sequence of ELT_SIZE
4490 bits has the lowest bit set and the upper bits clear. This is the
4491 VNx16BImode equivalent of a PTRUE for controlling elements of
4492 ELT_SIZE bytes. However, because the constant is VNx16BImode,
4493 all bits are significant, even the upper zeros. */
4494
4495rtx
4496aarch64_ptrue_all (unsigned int elt_size)
4497{
4498 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
4499 builder.quick_push (const1_rtx);
4500 for (unsigned int i = 1; i < elt_size; ++i)
4501 builder.quick_push (const0_rtx);
4502 return builder.build ();
4503}
4504
16de3637
RS
4505/* Return an all-true predicate register of mode MODE. */
4506
4507rtx
4508aarch64_ptrue_reg (machine_mode mode)
4509{
4510 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
678faefc
RS
4511 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
4512 return gen_lowpart (mode, reg);
16de3637
RS
4513}
4514
e7053b0c
RS
4515/* Return an all-false predicate register of mode MODE. */
4516
4517rtx
4518aarch64_pfalse_reg (machine_mode mode)
4519{
4520 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
678faefc
RS
4521 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
4522 return gen_lowpart (mode, reg);
4523}
4524
00fa90d9
RS
4525/* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
4526 for it. PRED2[0] is the predicate for the instruction whose result
4527 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
4528 for it. Return true if we can prove that the two predicates are
4529 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
4530 with PRED1[0] without changing behavior. */
4531
4532bool
4533aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
4534{
4535 machine_mode mode = GET_MODE (pred1[0]);
4536 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
4537 && mode == GET_MODE (pred2[0])
4538 && aarch64_sve_ptrue_flag (pred1[1], SImode)
4539 && aarch64_sve_ptrue_flag (pred2[1], SImode));
4540
4541 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
4542 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
4543 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
4544 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
4545 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
4546}
4547
4548/* Emit a comparison CMP between OP0 and OP1, both of which have mode
4549 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
4550 Use TARGET as the target register if nonnull and convenient. */
4551
4552static rtx
4553aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
4554 machine_mode data_mode, rtx op1, rtx op2)
4555{
4556 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
4557 expand_operand ops[5];
4558 create_output_operand (&ops[0], target, pred_mode);
4559 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
4560 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
4561 create_input_operand (&ops[3], op1, data_mode);
4562 create_input_operand (&ops[4], op2, data_mode);
4563 expand_insn (icode, 5, ops);
4564 return ops[0].value;
4565}
4566
678faefc
RS
4567/* Use a comparison to convert integer vector SRC into MODE, which is
4568 the corresponding SVE predicate mode. Use TARGET for the result
4569 if it's nonnull and convenient. */
4570
624d0f07 4571rtx
678faefc
RS
4572aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
4573{
4574 machine_mode src_mode = GET_MODE (src);
00fa90d9
RS
4575 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
4576 src, CONST0_RTX (src_mode));
e7053b0c
RS
4577}
4578
624d0f07
RS
4579/* Return the assembly token for svprfop value PRFOP. */
4580
4581static const char *
4582svprfop_token (enum aarch64_svprfop prfop)
4583{
4584 switch (prfop)
4585 {
4586#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
4587 AARCH64_FOR_SVPRFOP (CASE)
4588#undef CASE
4589 case AARCH64_NUM_SVPRFOPS:
4590 break;
4591 }
4592 gcc_unreachable ();
4593}
4594
4595/* Return the assembly string for an SVE prefetch operation with
4596 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
4597 and that SUFFIX is the format for the remaining operands. */
4598
4599char *
4600aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
4601 const char *suffix)
4602{
4603 static char buffer[128];
4604 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
4605 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
4606 mnemonic, svprfop_token (prfop), suffix);
4607 gcc_assert (written < sizeof (buffer));
4608 return buffer;
4609}
4610
4611/* Check whether we can calculate the number of elements in PATTERN
4612 at compile time, given that there are NELTS_PER_VQ elements per
4613 128-bit block. Return the value if so, otherwise return -1. */
4614
4615HOST_WIDE_INT
4616aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
4617{
4618 unsigned int vl, const_vg;
4619 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
4620 vl = 1 + (pattern - AARCH64_SV_VL1);
4621 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
4622 vl = 16 << (pattern - AARCH64_SV_VL16);
4623 else if (aarch64_sve_vg.is_constant (&const_vg))
4624 {
4625 /* There are two vector granules per quadword. */
4626 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
4627 switch (pattern)
4628 {
4629 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
4630 case AARCH64_SV_MUL4: return nelts & -4;
4631 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
4632 case AARCH64_SV_ALL: return nelts;
4633 default: gcc_unreachable ();
4634 }
4635 }
4636 else
4637 return -1;
4638
4639 /* There are two vector granules per quadword. */
4640 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
4641 if (known_le (vl, nelts_all))
4642 return vl;
4643
4644 /* Requesting more elements than are available results in a PFALSE. */
4645 if (known_gt (vl, nelts_all))
4646 return 0;
4647
4648 return -1;
4649}
4650
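/* For example, CNTW with pattern VL2 (NELTS_PER_VQ == 4) always folds to 2,
   because even the minimum 128-bit vector provides four 32-bit elements,
   whereas VL8 with the same element size can only be folded when the
   vector length is known at compile time.  */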
43cacb12
RS
4651/* Return true if we can move VALUE into a register using a single
4652 CNT[BHWD] instruction. */
4653
4654static bool
4655aarch64_sve_cnt_immediate_p (poly_int64 value)
4656{
4657 HOST_WIDE_INT factor = value.coeffs[0];
4658 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
4659 return (value.coeffs[1] == factor
4660 && IN_RANGE (factor, 2, 16 * 16)
4661 && (factor & 1) == 0
4662 && factor <= 16 * (factor & -factor));
4663}
4664
4665/* Likewise for rtx X. */
4666
4667bool
4668aarch64_sve_cnt_immediate_p (rtx x)
4669{
4670 poly_int64 value;
4671 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
4672}
4673
4674/* Return the asm string for an instruction with a CNT-like vector size
4675 operand (a vector pattern followed by a multiplier in the range [1, 16]).
4676 PREFIX is the mnemonic without the size suffix and OPERANDS is the
4677 first part of the operands template (the part that comes before the
139df05a
RS
4678 vector size itself). PATTERN is the pattern to use. FACTOR is the
4679 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
4680 in each quadword. If it is zero, we can use any element size. */
43cacb12
RS
4681
4682static char *
4683aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
139df05a 4684 aarch64_svpattern pattern,
43cacb12
RS
4685 unsigned int factor,
4686 unsigned int nelts_per_vq)
4687{
139df05a 4688 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
43cacb12
RS
4689
4690 if (nelts_per_vq == 0)
4691 /* There is some overlap in the ranges of the four CNT instructions.
4692 Here we always use the smallest possible element size, so that the
4693 multiplier is 1 whereever possible. */
4694 nelts_per_vq = factor & -factor;
4695 int shift = std::min (exact_log2 (nelts_per_vq), 4);
4696 gcc_assert (IN_RANGE (shift, 1, 4));
4697 char suffix = "dwhb"[shift - 1];
4698
4699 factor >>= shift;
4700 unsigned int written;
139df05a 4701 if (pattern == AARCH64_SV_ALL && factor == 1)
43cacb12
RS
4702 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
4703 prefix, suffix, operands);
139df05a
RS
4704 else if (factor == 1)
4705 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
4706 prefix, suffix, operands, svpattern_token (pattern));
43cacb12 4707 else
139df05a
RS
4708 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
4709 prefix, suffix, operands, svpattern_token (pattern),
4710 factor);
43cacb12
RS
4711 gcc_assert (written < sizeof (buffer));
4712 return buffer;
4713}
4714
4715/* Return the asm string for an instruction with a CNT-like vector size
4716 operand (a vector pattern followed by a multiplier in the range [1, 16]).
4717 PREFIX is the mnemonic without the size suffix and OPERANDS is the
4718 first part of the operands template (the part that comes before the
4719 vector size itself). X is the value of the vector size operand,
139df05a
RS
4720 as a polynomial integer rtx; we need to convert this into an "all"
4721 pattern with a multiplier. */
43cacb12
RS
4722
4723char *
4724aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
4725 rtx x)
4726{
4727 poly_int64 value = rtx_to_poly_int64 (x);
4728 gcc_assert (aarch64_sve_cnt_immediate_p (value));
139df05a 4729 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
43cacb12
RS
4730 value.coeffs[1], 0);
4731}
4732
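/* A worked example of the two routines above: the poly_int64 value 6 + 6x
   (three times the number of doublewords in a vector) passes
   aarch64_sve_cnt_immediate_p, since the coefficient 6 is even, in range
   and no greater than 16 * 2.  The only per-quadword element count that
   divides 6 is 2, so with a prefix such as "cnt" and operands "%x0" the
   value is printed as "cntd\t%x0, all, mul #3".  */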
624d0f07
RS
4733/* Return the asm string for an instruction with a CNT-like vector size
4734 operand (a vector pattern followed by a multiplier in the range [1, 16]).
4735 PREFIX is the mnemonic without the size suffix and OPERANDS is the
4736 first part of the operands template (the part that comes before the
4737 vector size itself). CNT_PAT[0..2] are the operands of the
4738 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
4739
4740char *
4741aarch64_output_sve_cnt_pat_immediate (const char *prefix,
4742 const char *operands, rtx *cnt_pat)
4743{
4744 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
4745 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
4746 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
4747 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
4748 factor, nelts_per_vq);
4749}
4750
0fdc30bc
RS
4751/* Return true if we can add X using a single SVE INC or DEC instruction. */
4752
4753bool
4754aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
4755{
4756 poly_int64 value;
4757 return (poly_int_rtx_p (x, &value)
4758 && (aarch64_sve_cnt_immediate_p (value)
4759 || aarch64_sve_cnt_immediate_p (-value)));
4760}
4761
4762/* Return the asm string for adding SVE INC/DEC immediate OFFSET to
4763 operand 0. */
4764
4765char *
4766aarch64_output_sve_scalar_inc_dec (rtx offset)
4767{
4768 poly_int64 offset_value = rtx_to_poly_int64 (offset);
4769 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
4770 if (offset_value.coeffs[1] > 0)
139df05a 4771 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
0fdc30bc
RS
4772 offset_value.coeffs[1], 0);
4773 else
139df05a 4774 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
0fdc30bc
RS
4775 -offset_value.coeffs[1], 0);
4776}
4777
43cacb12
RS
4778/* Return true if we can add VALUE to a register using a single ADDVL
4779 or ADDPL instruction. */
4780
4781static bool
4782aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
4783{
4784 HOST_WIDE_INT factor = value.coeffs[0];
4785 if (factor == 0 || value.coeffs[1] != factor)
4786 return false;
4787 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
4788 and a value of 16 is one vector width. */
4789 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
4790 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
4791}
4792
4793/* Likewise for rtx X. */
4794
4795bool
4796aarch64_sve_addvl_addpl_immediate_p (rtx x)
4797{
4798 poly_int64 value;
4799 return (poly_int_rtx_p (x, &value)
4800 && aarch64_sve_addvl_addpl_immediate_p (value));
4801}
4802
0fdc30bc
RS
4803/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
4804 to operand 1 and storing the result in operand 0. */
43cacb12
RS
4805
4806char *
0fdc30bc 4807aarch64_output_sve_addvl_addpl (rtx offset)
43cacb12
RS
4808{
4809 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
4810 poly_int64 offset_value = rtx_to_poly_int64 (offset);
4811 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
4812
43cacb12
RS
4813 int factor = offset_value.coeffs[1];
4814 if ((factor & 15) == 0)
4815 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
4816 else
4817 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
4818 return buffer;
4819}
4820
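/* For example, an offset of 32 + 32x (two full vector lengths in bytes) has
   factor 32, a multiple of 16 in range, and is printed as
   "addvl\t%x0, %x1, #2"; an offset of 6 + 6x (three predicate widths) has
   factor 6 and is printed as "addpl\t%x0, %x1, #3".  */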
4821/* Return true if X is a valid immediate for an SVE vector INC or DEC
4822 instruction. If it is, store the number of elements in each vector
4823 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
4824 factor in *FACTOR_OUT (if nonnull). */
4825
4826bool
0fdc30bc
RS
4827aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
4828 unsigned int *nelts_per_vq_out)
43cacb12
RS
4829{
4830 rtx elt;
4831 poly_int64 value;
4832
4833 if (!const_vec_duplicate_p (x, &elt)
4834 || !poly_int_rtx_p (elt, &value))
4835 return false;
4836
4837 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
4838 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
4839 /* There's no vector INCB. */
4840 return false;
4841
4842 HOST_WIDE_INT factor = value.coeffs[0];
4843 if (value.coeffs[1] != factor)
4844 return false;
4845
4846 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
4847 if ((factor % nelts_per_vq) != 0
4848 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
4849 return false;
4850
4851 if (factor_out)
4852 *factor_out = factor;
4853 if (nelts_per_vq_out)
4854 *nelts_per_vq_out = nelts_per_vq;
4855 return true;
4856}
4857
4858/* Return true if X is a valid immediate for an SVE vector INC or DEC
4859 instruction. */
4860
4861bool
0fdc30bc 4862aarch64_sve_vector_inc_dec_immediate_p (rtx x)
43cacb12 4863{
0fdc30bc 4864 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
43cacb12
RS
4865}
4866
4867/* Return the asm template for an SVE vector INC or DEC instruction.
4868 OPERANDS gives the operands before the vector count and X is the
4869 value of the vector count operand itself. */
4870
4871char *
0fdc30bc 4872aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
43cacb12
RS
4873{
4874 int factor;
4875 unsigned int nelts_per_vq;
0fdc30bc 4876 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
43cacb12
RS
4877 gcc_unreachable ();
4878 if (factor < 0)
139df05a
RS
4879 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
4880 -factor, nelts_per_vq);
43cacb12 4881 else
139df05a
RS
4882 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
4883 factor, nelts_per_vq);
43cacb12 4884}
43e9d192 4885
82614948
RR
4886static int
4887aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
77e994c9 4888 scalar_int_mode mode)
43e9d192 4889{
43e9d192 4890 int i;
9a4865db
WD
4891 unsigned HOST_WIDE_INT val, val2, mask;
4892 int one_match, zero_match;
4893 int num_insns;
43e9d192 4894
9a4865db
WD
4895 val = INTVAL (imm);
4896
4897 if (aarch64_move_imm (val, mode))
43e9d192 4898 {
82614948 4899 if (generate)
f7df4a84 4900 emit_insn (gen_rtx_SET (dest, imm));
9a4865db 4901 return 1;
43e9d192
IB
4902 }
4903
9de00935
TC
4904 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
4905 (with XXXX non-zero). In that case check to see if the move can be done in
4906 a smaller mode. */
4907 val2 = val & 0xffffffff;
4908 if (mode == DImode
4909 && aarch64_move_imm (val2, SImode)
4910 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
4911 {
4912 if (generate)
4913 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4914
4915 /* Check if we have to emit a second instruction by checking to see
4916 if any of the upper 32 bits of the original DI mode value is set. */
4917 if (val == val2)
4918 return 1;
4919
4920 i = (val >> 48) ? 48 : 32;
4921
4922 if (generate)
4923 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4924 GEN_INT ((val >> i) & 0xffff)));
4925
4926 return 2;
4927 }
4928
9a4865db 4929 if ((val >> 32) == 0 || mode == SImode)
43e9d192 4930 {
82614948
RR
4931 if (generate)
4932 {
9a4865db
WD
4933 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4934 if (mode == SImode)
4935 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4936 GEN_INT ((val >> 16) & 0xffff)));
4937 else
4938 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4939 GEN_INT ((val >> 16) & 0xffff)));
82614948 4940 }
9a4865db 4941 return 2;
43e9d192
IB
4942 }
4943
4944 /* Remaining cases are all for DImode. */
4945
43e9d192 4946 mask = 0xffff;
9a4865db
WD
4947 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4948 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4949 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4950 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
43e9d192 4951
62c8d76c 4952 if (zero_match != 2 && one_match != 2)
43e9d192 4953 {
62c8d76c
WD
4954 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
4955 For a 64-bit bitmask try whether changing 16 bits to all ones or
4956 zeroes creates a valid bitmask. To check any repeated bitmask,
4957 try using 16 bits from the other 32-bit half of val. */
43e9d192 4958
62c8d76c 4959 for (i = 0; i < 64; i += 16, mask <<= 16)
43e9d192 4960 {
62c8d76c
WD
4961 val2 = val & ~mask;
4962 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4963 break;
4964 val2 = val | mask;
4965 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4966 break;
4967 val2 = val2 & ~mask;
4968 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
4969 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4970 break;
43e9d192 4971 }
62c8d76c 4972 if (i != 64)
43e9d192 4973 {
62c8d76c 4974 if (generate)
43e9d192 4975 {
62c8d76c
WD
4976 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4977 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
9a4865db 4978 GEN_INT ((val >> i) & 0xffff)));
43e9d192 4979 }
1312b1ba 4980 return 2;
43e9d192
IB
4981 }
4982 }
4983
9a4865db
WD
4984 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4985 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4986 otherwise skip zero bits. */
2c274197 4987
9a4865db 4988 num_insns = 1;
43e9d192 4989 mask = 0xffff;
9a4865db
WD
4990 val2 = one_match > zero_match ? ~val : val;
4991 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4992
4993 if (generate)
4994 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4995 ? (val | ~(mask << i))
4996 : (val & (mask << i)))));
4997 for (i += 16; i < 64; i += 16)
43e9d192 4998 {
9a4865db
WD
4999 if ((val2 & (mask << i)) == 0)
5000 continue;
5001 if (generate)
5002 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5003 GEN_INT ((val >> i) & 0xffff)));
5004 num_insns ++;
82614948
RR
5005 }
5006
5007 return num_insns;
5008}
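
/* Illustrative stand-alone sketch (editor's addition, not part of aarch64.c):
   it mirrors only the final fallback path above, which picks the more common
   16-bit background (all-zeros or all-ones) and then needs one MOVZ/MOVN plus
   one MOVK per 16-bit chunk that differs from that background.  The bitmask
   and 32-bit shortcuts are ignored, so this gives the 1-4 instruction upper
   bound rather than the exact count chosen by aarch64_internal_mov_immediate.
   The helper name and test values are invented for the example.  */
#include <stdint.h>
#include <stdio.h>

static int
movz_movk_count (uint64_t val)
{
  int zero_match = 0, one_match = 0;
  for (int i = 0; i < 64; i += 16)
    {
      uint16_t chunk = (val >> i) & 0xffff;
      zero_match += (chunk == 0);
      one_match += (chunk == 0xffff);
    }
  /* Skip whichever background is more common; the first instruction
     (MOVZ or MOVN) establishes it, each remaining chunk needs a MOVK.  */
  uint64_t diff = one_match > zero_match ? ~val : val;
  int num_insns = 0;
  for (int i = 0; i < 64; i += 16)
    if ((diff >> i) & 0xffff)
      num_insns++;
  return num_insns ? num_insns : 1;   /* 0 and ~0 still need one MOV.  */
}

int
main (void)
{
  /* MOVZ #0x1234; MOVK #0xffff, lsl #32 -> 2.  */
  printf ("%d\n", movz_movk_count (0x0000ffff00001234ull));
  /* MOVN #0xa987, lsl #16; MOVK #0x1234, lsl #48 -> 2.  */
  printf ("%d\n", movz_movk_count (0x1234ffff5678ffffull));
  return 0;
}
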
5009
c0bb5bc5
WD
5010/* Return whether imm is a 128-bit immediate which is simple enough to
5011 expand inline. */
5012bool
5013aarch64_mov128_immediate (rtx imm)
5014{
3793ecc1 5015 if (CONST_INT_P (imm))
c0bb5bc5
WD
5016 return true;
5017
5018 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
5019
5020 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
5021 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
5022
5023 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
5024 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
5025}
5026
5027
43cacb12
RS
5028/* Return the number of temporary registers that aarch64_add_offset_1
5029 would need to add OFFSET to a register. */
5030
5031static unsigned int
5032aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
5033{
1bb3e2c0 5034 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
43cacb12
RS
5035}
5036
f5470a77
RS
5037/* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
5038 a non-polynomial OFFSET. MODE is the mode of the addition.
5039 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5040 be set and CFA adjustments added to the generated instructions.
5041
5042 TEMP1, if nonnull, is a register of mode MODE that can be used as a
5043 temporary if register allocation is already complete. This temporary
5044 register may overlap DEST but must not overlap SRC. If TEMP1 is known
5045 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
5046 the immediate again.
0100c5f9
RS
5047
5048 Since this function may be used to adjust the stack pointer, we must
5049 ensure that it cannot cause transient stack deallocation (for example
5050 by first incrementing SP and then decrementing when adjusting by a
5051 large immediate). */
5052
5053static void
f5470a77
RS
5054aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
5055 rtx src, HOST_WIDE_INT offset, rtx temp1,
5056 bool frame_related_p, bool emit_move_imm)
0100c5f9 5057{
f5470a77
RS
5058 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
5059 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
5060
42bc589e 5061 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
0100c5f9
RS
5062 rtx_insn *insn;
5063
f5470a77
RS
5064 if (!moffset)
5065 {
5066 if (!rtx_equal_p (dest, src))
5067 {
5068 insn = emit_insn (gen_rtx_SET (dest, src));
5069 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5070 }
5071 return;
5072 }
0100c5f9
RS
5073
5074 /* Single instruction adjustment. */
f5470a77 5075 if (aarch64_uimm12_shift (moffset))
0100c5f9 5076 {
f5470a77 5077 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
0100c5f9
RS
5078 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5079 return;
5080 }
5081
f5470a77
RS
5082 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
5083 and either:
5084
5085 a) the offset cannot be loaded by a 16-bit move or
5086 b) there is no spare register into which we can move it. */
5087 if (moffset < 0x1000000
5088 && ((!temp1 && !can_create_pseudo_p ())
5089 || !aarch64_move_imm (moffset, mode)))
0100c5f9 5090 {
f5470a77 5091 HOST_WIDE_INT low_off = moffset & 0xfff;
0100c5f9 5092
f5470a77
RS
5093 low_off = offset < 0 ? -low_off : low_off;
5094 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
0100c5f9 5095 RTX_FRAME_RELATED_P (insn) = frame_related_p;
f5470a77 5096 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
0100c5f9
RS
5097 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5098 return;
5099 }
5100
5101 /* Emit a move immediate if required and an addition/subtraction. */
0100c5f9 5102 if (emit_move_imm)
f5470a77
RS
5103 {
5104 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
7aa605c9
JJ
5105 temp1 = aarch64_force_temporary (mode, temp1,
5106 gen_int_mode (moffset, mode));
f5470a77
RS
5107 }
5108 insn = emit_insn (offset < 0
5109 ? gen_sub3_insn (dest, src, temp1)
5110 : gen_add3_insn (dest, src, temp1));
0100c5f9
RS
5111 if (frame_related_p)
5112 {
5113 RTX_FRAME_RELATED_P (insn) = frame_related_p;
f5470a77
RS
5114 rtx adj = plus_constant (mode, src, offset);
5115 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
0100c5f9
RS
5116 }
5117}
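
/* Illustrative stand-alone sketch (editor's addition, not part of aarch64.c)
   of the two-ADD split used above for adjustments below 2^24 that are not
   single ADD/SUB immediates.  AArch64 ADD/SUB immediates are 12 bits,
   optionally shifted left by 12, so the low 12 bits go in the first
   instruction and the remainder (a multiple of 4096) fits the shifted form.
   The offset value is just an example.  */
#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  long offset = -0x123456;             /* |offset| < 0x1000000.  */
  long moffset = labs (offset);
  long low_off = moffset & 0xfff;      /* First ADD/SUB immediate.  */
  low_off = offset < 0 ? -low_off : low_off;
  long high_off = offset - low_off;    /* Second immediate, 4096-aligned.  */
  printf ("add #%ld, then add #%ld (total %ld)\n",
          low_off, high_off, low_off + high_off);
  return 0;
}
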
5118
43cacb12
RS
5119/* Return the number of temporary registers that aarch64_add_offset
5120 would need to move OFFSET into a register or add OFFSET to a register;
5121 ADD_P is true if we want the latter rather than the former. */
5122
5123static unsigned int
5124aarch64_offset_temporaries (bool add_p, poly_int64 offset)
5125{
5126 /* This follows the same structure as aarch64_add_offset. */
5127 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
5128 return 0;
5129
5130 unsigned int count = 0;
5131 HOST_WIDE_INT factor = offset.coeffs[1];
5132 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
5133 poly_int64 poly_offset (factor, factor);
5134 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
5135 /* Need one register for the ADDVL/ADDPL result. */
5136 count += 1;
5137 else if (factor != 0)
5138 {
5139 factor = abs (factor);
5140 if (factor > 16 * (factor & -factor))
5141 /* Need one register for the CNT result and one for the multiplication
5142 factor. If necessary, the second temporary can be reused for the
5143 constant part of the offset. */
5144 return 2;
5145 /* Need one register for the CNT result (which might then
5146 be shifted). */
5147 count += 1;
5148 }
5149 return count + aarch64_add_offset_1_temporaries (constant);
5150}
5151
5152/* If X can be represented as a poly_int64, return the number
5153 of temporaries that are required to add it to a register.
5154 Return -1 otherwise. */
5155
5156int
5157aarch64_add_offset_temporaries (rtx x)
5158{
5159 poly_int64 offset;
5160 if (!poly_int_rtx_p (x, &offset))
5161 return -1;
5162 return aarch64_offset_temporaries (true, offset);
5163}
5164
f5470a77
RS
5165/* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
5166 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5167 be set and CFA adjustments added to the generated instructions.
5168
5169 TEMP1, if nonnull, is a register of mode MODE that can be used as a
5170 temporary if register allocation is already complete. This temporary
43cacb12
RS
5171 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
5172 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
5173 false to avoid emitting the immediate again.
5174
5175 TEMP2, if nonnull, is a second temporary register that doesn't
5176 overlap either DEST or REG.
f5470a77
RS
5177
5178 Since this function may be used to adjust the stack pointer, we must
5179 ensure that it cannot cause transient stack deallocation (for example
5180 by first incrementing SP and then decrementing when adjusting by a
5181 large immediate). */
5182
5183static void
5184aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
43cacb12
RS
5185 poly_int64 offset, rtx temp1, rtx temp2,
5186 bool frame_related_p, bool emit_move_imm = true)
0100c5f9 5187{
f5470a77
RS
5188 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
5189 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
43cacb12
RS
5190 gcc_assert (temp1 == NULL_RTX
5191 || !frame_related_p
5192 || !reg_overlap_mentioned_p (temp1, dest));
5193 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
5194
5195 /* Try using ADDVL or ADDPL to add the whole value. */
5196 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
5197 {
5198 rtx offset_rtx = gen_int_mode (offset, mode);
5199 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
5200 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5201 return;
5202 }
5203
5204 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
5205 SVE vector register, over and above the minimum size of 128 bits.
5206 This is equivalent to half the value returned by CNTD with a
5207 vector shape of ALL. */
5208 HOST_WIDE_INT factor = offset.coeffs[1];
5209 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
5210
5211 /* Try using ADDVL or ADDPL to add the VG-based part. */
5212 poly_int64 poly_offset (factor, factor);
5213 if (src != const0_rtx
5214 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
5215 {
5216 rtx offset_rtx = gen_int_mode (poly_offset, mode);
5217 if (frame_related_p)
5218 {
5219 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
5220 RTX_FRAME_RELATED_P (insn) = true;
5221 src = dest;
5222 }
5223 else
5224 {
5225 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
5226 src = aarch64_force_temporary (mode, temp1, addr);
5227 temp1 = temp2;
5228 temp2 = NULL_RTX;
5229 }
5230 }
5231 /* Otherwise use a CNT-based sequence. */
5232 else if (factor != 0)
5233 {
5234 /* Use a subtraction if we have a negative factor. */
5235 rtx_code code = PLUS;
5236 if (factor < 0)
5237 {
5238 factor = -factor;
5239 code = MINUS;
5240 }
5241
5242 /* Calculate CNTD * FACTOR / 2. First try to fold the division
5243 into the multiplication. */
5244 rtx val;
5245 int shift = 0;
5246 if (factor & 1)
5247 /* Use a right shift by 1. */
5248 shift = -1;
5249 else
5250 factor /= 2;
5251 HOST_WIDE_INT low_bit = factor & -factor;
5252 if (factor <= 16 * low_bit)
5253 {
5254 if (factor > 16 * 8)
5255 {
5256 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
5257 the value with the minimum multiplier and shift it into
5258 position. */
5259 int extra_shift = exact_log2 (low_bit);
5260 shift += extra_shift;
5261 factor >>= extra_shift;
5262 }
5263 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
5264 }
5265 else
5266 {
7d8bdfa7
RS
5267 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
5268 directly, since that should increase the chances of being
5269 able to use a shift and add sequence. If LOW_BIT itself
5270 is out of range, just use CNTD. */
5271 if (low_bit <= 16 * 8)
5272 factor /= low_bit;
5273 else
5274 low_bit = 1;
5275
5276 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
43cacb12
RS
5277 val = aarch64_force_temporary (mode, temp1, val);
5278
7d8bdfa7
RS
5279 if (can_create_pseudo_p ())
5280 {
5281 rtx coeff1 = gen_int_mode (factor, mode);
d7cea7ce 5282 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
7d8bdfa7
RS
5283 }
5284 else
43cacb12 5285 {
7d8bdfa7
RS
5286 /* Go back to using a negative multiplication factor if we have
5287 no register from which to subtract. */
5288 if (code == MINUS && src == const0_rtx)
5289 {
5290 factor = -factor;
5291 code = PLUS;
5292 }
5293 rtx coeff1 = gen_int_mode (factor, mode);
5294 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
5295 val = gen_rtx_MULT (mode, val, coeff1);
43cacb12 5296 }
43cacb12
RS
5297 }
5298
5299 if (shift > 0)
5300 {
5301 /* Multiply by 1 << SHIFT. */
5302 val = aarch64_force_temporary (mode, temp1, val);
5303 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
5304 }
5305 else if (shift == -1)
5306 {
5307 /* Divide by 2. */
5308 val = aarch64_force_temporary (mode, temp1, val);
5309 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
5310 }
5311
5312 /* Calculate SRC +/- CNTD * FACTOR / 2. */
5313 if (src != const0_rtx)
5314 {
5315 val = aarch64_force_temporary (mode, temp1, val);
5316 val = gen_rtx_fmt_ee (code, mode, src, val);
5317 }
5318 else if (code == MINUS)
5319 {
5320 val = aarch64_force_temporary (mode, temp1, val);
5321 val = gen_rtx_NEG (mode, val);
5322 }
5323
5324 if (constant == 0 || frame_related_p)
5325 {
5326 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
5327 if (frame_related_p)
5328 {
5329 RTX_FRAME_RELATED_P (insn) = true;
5330 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5331 gen_rtx_SET (dest, plus_constant (Pmode, src,
5332 poly_offset)));
5333 }
5334 src = dest;
5335 if (constant == 0)
5336 return;
5337 }
5338 else
5339 {
5340 src = aarch64_force_temporary (mode, temp1, val);
5341 temp1 = temp2;
5342 temp2 = NULL_RTX;
5343 }
5344
5345 emit_move_imm = true;
5346 }
f5470a77 5347
f5470a77
RS
5348 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
5349 frame_related_p, emit_move_imm);
0100c5f9
RS
5350}
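
/* Illustrative stand-alone sketch (editor's addition, not part of aarch64.c)
   of how the FACTOR/CONSTANT split above relates to the runtime value of a
   poly_int64 offset.  Coefficient 1 is scaled by the number of 128-bit
   blocks beyond the minimum vector length, so for a vector length of VL bits:

     runtime offset = coeffs[0] + coeffs[1] * (VL/128 - 1)
                    = (coeffs[0] - coeffs[1]) + coeffs[1] * (VL/128)
                    =  CONSTANT              +  FACTOR     *  VQ

   FACTOR multiples of VQ are what CNT/ADDVL-style instructions produce,
   and CONSTANT is handled by aarch64_add_offset_1.  The coefficients below
   (one SVE vector of bytes, 16*VQ, plus 32) are only an example.  */
#include <stdio.h>

int
main (void)
{
  long coeff0 = 48, coeff1 = 16;
  long factor = coeff1;
  long constant = coeff0 - factor;        /* 32 in this example.  */
  for (int vl_bits = 128; vl_bits <= 512; vl_bits *= 2)
    {
      long vq = vl_bits / 128;
      printf ("VL=%4d: %ld + %ld*VQ = %ld\n",
              vl_bits, constant, factor, constant + factor * vq);
    }
  return 0;
}
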
5351
43cacb12
RS
5352/* Like aarch64_add_offset, but the offset is given as an rtx rather
5353 than a poly_int64. */
5354
5355void
5356aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
5357 rtx offset_rtx, rtx temp1, rtx temp2)
5358{
5359 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
5360 temp1, temp2, false);
5361}
5362
f5470a77
RS
5363/* Add DELTA to the stack pointer, marking the instructions frame-related.
5364 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
5365 if TEMP1 already contains abs (DELTA). */
5366
0100c5f9 5367static inline void
43cacb12 5368aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
0100c5f9 5369{
f5470a77 5370 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
43cacb12 5371 temp1, temp2, true, emit_move_imm);
0100c5f9
RS
5372}
5373
f5470a77
RS
5374/* Subtract DELTA from the stack pointer, marking the instructions
5375 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
5376 if nonnull. */
5377
0100c5f9 5378static inline void
cd1bef27
JL
5379aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
5380 bool emit_move_imm = true)
0100c5f9 5381{
f5470a77 5382 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
cd1bef27 5383 temp1, temp2, frame_related_p, emit_move_imm);
0100c5f9 5384}
82614948 5385
43cacb12
RS
5386/* Set DEST to (vec_series BASE STEP). */
5387
5388static void
5389aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
82614948
RR
5390{
5391 machine_mode mode = GET_MODE (dest);
43cacb12
RS
5392 scalar_mode inner = GET_MODE_INNER (mode);
5393
5394 /* Each operand can be a register or an immediate in the range [-16, 15]. */
5395 if (!aarch64_sve_index_immediate_p (base))
5396 base = force_reg (inner, base);
5397 if (!aarch64_sve_index_immediate_p (step))
5398 step = force_reg (inner, step);
5399
5400 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
5401}
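
/* Illustrative stand-alone sketch (editor's addition, not part of aarch64.c):
   the lane values produced by (vec_series base step), i.e. the SVE INDEX
   instruction, are simply BASE + I * STEP.  The lane count of 16 is only
   illustrative; the real vector length is not fixed at compile time, and
   immediates for BASE and STEP must lie in [-16, 15] as noted above.  */
#include <stdio.h>

int
main (void)
{
  int base = -16, step = 3;
  for (int i = 0; i < 16; i++)
    printf ("lane %2d: %d\n", i, base + i * step);
  return 0;
}
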
82614948 5402
4aeb1ba7
RS
5403/* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
5404 register of mode MODE. Use TARGET for the result if it's nonnull
5405 and convenient.
5406
5407 The two vector modes must have the same element mode. The behavior
5408 is to duplicate architectural lane N of SRC into architectural lanes
5409 N + I * STEP of the result. On big-endian targets, architectural
5410 lane 0 of an Advanced SIMD vector is the last element of the vector
5411 in memory layout, so for big-endian targets this operation has the
5412 effect of reversing SRC before duplicating it. Callers need to
5413 account for this. */
43cacb12 5414
4aeb1ba7
RS
5415rtx
5416aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
5417{
5418 machine_mode src_mode = GET_MODE (src);
5419 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
5420 insn_code icode = (BYTES_BIG_ENDIAN
5421 ? code_for_aarch64_vec_duplicate_vq_be (mode)
5422 : code_for_aarch64_vec_duplicate_vq_le (mode));
5423
5424 unsigned int i = 0;
5425 expand_operand ops[3];
5426 create_output_operand (&ops[i++], target, mode);
5427 create_output_operand (&ops[i++], src, src_mode);
5428 if (BYTES_BIG_ENDIAN)
5429 {
5430 /* Create a PARALLEL describing the reversal of SRC. */
5431 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
5432 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
5433 nelts_per_vq - 1, -1);
5434 create_fixed_operand (&ops[i++], sel);
43cacb12 5435 }
4aeb1ba7
RS
5436 expand_insn (icode, i, ops);
5437 return ops[0].value;
5438}
5439
5440/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
5441 the memory image into DEST. Return true on success. */
43cacb12 5442
4aeb1ba7
RS
5443static bool
5444aarch64_expand_sve_ld1rq (rtx dest, rtx src)
5445{
5446 src = force_const_mem (GET_MODE (src), src);
43cacb12
RS
5447 if (!src)
5448 return false;
5449
5450 /* Make sure that the address is legitimate. */
4aeb1ba7 5451 if (!aarch64_sve_ld1rq_operand_p (src))
43cacb12
RS
5452 {
5453 rtx addr = force_reg (Pmode, XEXP (src, 0));
5454 src = replace_equiv_address (src, addr);
5455 }
5456
947b1372 5457 machine_mode mode = GET_MODE (dest);
cc68f7c2 5458 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
16de3637 5459 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4aeb1ba7 5460 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
43cacb12
RS
5461 return true;
5462}
5463
a065e0bb
RS
5464/* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
5465 by N "background" values. Try to move it into TARGET using:
5466
5467 PTRUE PRED.<T>, VL<N>
5468 MOV TRUE.<T>, #<foreground>
5469 MOV FALSE.<T>, #<background>
5470 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
5471
5472 The PTRUE is always a single instruction but the MOVs might need a
5473 longer sequence. If the background value is zero (as it often is),
5474 the sequence can sometimes collapse to a PTRUE followed by a
5475 zero-predicated move.
5476
5477 Return the target on success, otherwise return null. */
5478
5479static rtx
5480aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
5481{
5482 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
5483
5484 /* Make sure that the PTRUE is valid. */
5485 machine_mode mode = GET_MODE (src);
5486 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5487 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5488 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
5489 == AARCH64_NUM_SVPATTERNS)
5490 return NULL_RTX;
5491
5492 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
5493 rtx_vector_builder true_builder (mode, npatterns, 1);
5494 rtx_vector_builder false_builder (mode, npatterns, 1);
5495 for (unsigned int i = 0; i < npatterns; ++i)
5496 {
5497 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5498 pred_builder.quick_push (CONST1_RTX (BImode));
5499 }
5500 for (unsigned int i = 0; i < npatterns; ++i)
5501 {
5502 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
5503 pred_builder.quick_push (CONST0_RTX (BImode));
5504 }
5505 expand_operand ops[4];
5506 create_output_operand (&ops[0], target, mode);
5507 create_input_operand (&ops[1], true_builder.build (), mode);
5508 create_input_operand (&ops[2], false_builder.build (), mode);
5509 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
5510 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
5511 return target;
5512}
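
/* Illustrative stand-alone sketch (editor's addition, not part of aarch64.c)
   of the lane-wise effect of the PTRUE/SEL sequence above for a constant
   with NPATTERNS foreground values followed by NPATTERNS background values:
   lanes below NPATTERNS take the foreground value, later lanes take the
   background value for their pattern.  The lane count and values are
   invented for the example.  */
#include <stdio.h>

int
main (void)
{
  enum { NPATTERNS = 2, LANES = 8 };
  int fg[NPATTERNS] = { 5, 6 };   /* "true" vector, MOV #<foreground>.  */
  int bg[NPATTERNS] = { 0, 0 };   /* "false" vector, MOV #<background>.  */
  for (int i = 0; i < LANES; i++)
    {
      int pred = i < NPATTERNS;   /* PTRUE PRED.<T>, VL<NPATTERNS>.  */
      printf ("lane %d: %d\n", i,
	      pred ? fg[i % NPATTERNS] : bg[i % NPATTERNS]);
    }
  return 0;
}
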
5513
4aeb1ba7
RS
5514/* Return a register containing CONST_VECTOR SRC, given that SRC has an
5515 SVE data mode and isn't a legitimate constant. Use TARGET for the
5516 result if convenient.
43cacb12 5517
4aeb1ba7
RS
5518 The returned register can have whatever mode seems most natural
5519 given the contents of SRC. */
5520
5521static rtx
5522aarch64_expand_sve_const_vector (rtx target, rtx src)
43cacb12
RS
5523{
5524 machine_mode mode = GET_MODE (src);
5525 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5526 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4aeb1ba7
RS
5527 scalar_mode elt_mode = GET_MODE_INNER (mode);
5528 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
cc68f7c2
RS
5529 unsigned int container_bits = aarch64_sve_container_bits (mode);
5530 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
5531
5532 if (nelts_per_pattern == 1
5533 && encoded_bits <= 128
5534 && container_bits != elt_bits)
5535 {
5536 /* We have a partial vector mode and a constant whose full-vector
5537 equivalent would occupy a repeating 128-bit sequence. Build that
5538 full-vector equivalent instead, so that we have the option of
5539 using LD1RQ and Advanced SIMD operations. */
5540 unsigned int repeat = container_bits / elt_bits;
5541 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
5542 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
5543 for (unsigned int i = 0; i < npatterns; ++i)
5544 for (unsigned int j = 0; j < repeat; ++j)
5545 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5546 target = aarch64_target_reg (target, full_mode);
5547 return aarch64_expand_sve_const_vector (target, builder.build ());
5548 }
4aeb1ba7
RS
5549
5550 if (nelts_per_pattern == 1 && encoded_bits == 128)
5551 {
5552 /* The constant is a duplicated quadword but can't be narrowed
5553 beyond a quadword. Get the memory image of the first quadword
5554 as a 128-bit vector and try using LD1RQ to load it from memory.
5555
5556 The effect for both endiannesses is to load memory lane N into
5557 architectural lanes N + I * STEP of the result. On big-endian
5558 targets, the layout of the 128-bit vector in an Advanced SIMD
5559 register would be different from its layout in an SVE register,
5560 but this 128-bit vector is a memory value only. */
5561 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5562 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
5563 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
5564 return target;
5565 }
5566
5567 if (nelts_per_pattern == 1 && encoded_bits < 128)
5568 {
5569 /* The vector is a repeating sequence of 64 bits or fewer.
5570 See if we can load them using an Advanced SIMD move and then
5571 duplicate it to fill a vector. This is better than using a GPR
5572 move because it keeps everything in the same register file. */
5573 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5574 rtx_vector_builder builder (vq_mode, npatterns, 1);
5575 for (unsigned int i = 0; i < npatterns; ++i)
5576 {
5577 /* We want memory lane N to go into architectural lane N,
5578 so reverse for big-endian targets. The DUP .Q pattern
5579 has a compensating reverse built-in. */
5580 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
5581 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
5582 }
5583 rtx vq_src = builder.build ();
5584 if (aarch64_simd_valid_immediate (vq_src, NULL))
5585 {
5586 vq_src = force_reg (vq_mode, vq_src);
5587 return aarch64_expand_sve_dupq (target, mode, vq_src);
5588 }
5589
5590 /* Get an integer representation of the repeating part of Advanced
5591 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
5592 which for big-endian targets is lane-swapped wrt a normal
5593 Advanced SIMD vector. This means that for both endiannesses,
5594 memory lane N of SVE vector SRC corresponds to architectural
5595 lane N of a register holding VQ_SRC. This in turn means that
5596 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
5597 as a single 128-bit value) and thus that memory lane 0 of SRC is
5598 in the lsb of the integer. Duplicating the integer therefore
5599 ensures that memory lane N of SRC goes into architectural lane
5600 N + I * INDEX of the SVE register. */
5601 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
5602 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
5603 if (elt_value)
5604 {
5605 /* Pretend that we had a vector of INT_MODE to start with. */
5606 elt_mode = int_mode;
5607 mode = aarch64_full_sve_mode (int_mode).require ();
5608
5609 /* If the integer can be moved into a general register by a
5610 single instruction, do that and duplicate the result. */
5611 if (CONST_INT_P (elt_value)
5612 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
5613 {
5614 elt_value = force_reg (elt_mode, elt_value);
5615 return expand_vector_broadcast (mode, elt_value);
5616 }
5617 }
5618 else if (npatterns == 1)
5619 /* We're duplicating a single value, but can't do better than
5620 force it to memory and load from there. This handles things
5621 like symbolic constants. */
5622 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
43cacb12 5623
4aeb1ba7 5624 if (elt_value)
8179efe0 5625 {
4aeb1ba7
RS
5626 /* Load the element from memory if we can, otherwise move it into
5627 a register and use a DUP. */
5628 rtx op = force_const_mem (elt_mode, elt_value);
5629 if (!op)
5630 op = force_reg (elt_mode, elt_value);
5631 return expand_vector_broadcast (mode, op);
8179efe0 5632 }
43cacb12
RS
5633 }
5634
4aeb1ba7
RS
5635 /* Try using INDEX. */
5636 rtx base, step;
5637 if (const_vec_series_p (src, &base, &step))
5638 {
5639 aarch64_expand_vec_series (target, base, step);
5640 return target;
5641 }
5642
5643 /* From here on, it's better to force the whole constant to memory
5644 if we can. */
5645 if (GET_MODE_NUNITS (mode).is_constant ())
5646 return NULL_RTX;
5647
a065e0bb
RS
5648 if (nelts_per_pattern == 2)
5649 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
5650 return res;
5651
43cacb12 5652 /* Expand each pattern individually. */
4aeb1ba7 5653 gcc_assert (npatterns > 1);
43cacb12
RS
5654 rtx_vector_builder builder;
5655 auto_vec<rtx, 16> vectors (npatterns);
5656 for (unsigned int i = 0; i < npatterns; ++i)
5657 {
5658 builder.new_vector (mode, 1, nelts_per_pattern);
5659 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
5660 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
5661 vectors.quick_push (force_reg (mode, builder.build ()));
5662 }
5663
5664 /* Use permutes to interleave the separate vectors. */
5665 while (npatterns > 1)
5666 {
5667 npatterns /= 2;
5668 for (unsigned int i = 0; i < npatterns; ++i)
5669 {
4aeb1ba7 5670 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
43cacb12
RS
5671 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
5672 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
5673 vectors[i] = tmp;
5674 }
5675 }
4aeb1ba7
RS
5676 gcc_assert (vectors[0] == target);
5677 return target;
43cacb12
RS
5678}
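
/* Illustrative stand-alone sketch (editor's addition, not part of aarch64.c)
   of the pairwise interleave performed by the ZIP1 loop above.  For
   simplicity each pattern here is a single repeated value, so vector I
   starts as a DUP of value 10+I; log2(NPATTERNS) rounds of zipping
   vectors[i] with vectors[i + npatterns/2] rebuild the interleaved
   constant.  Eight lanes are used for illustration.  */
#include <stdio.h>
#include <string.h>

enum { LANES = 8 };

/* ZIP1-style interleave of the low halves of A and B.  */
static void
zip (const int *a, const int *b, int *out)
{
  for (int i = 0; i < LANES / 2; i++)
    {
      out[2 * i] = a[i];
      out[2 * i + 1] = b[i];
    }
}

int
main (void)
{
  int npatterns = 4;
  int vectors[4][LANES];
  for (int p = 0; p < npatterns; p++)
    for (int j = 0; j < LANES; j++)
      vectors[p][j] = 10 + p;
  while (npatterns > 1)
    {
      npatterns /= 2;
      for (int i = 0; i < npatterns; i++)
	{
	  int tmp[LANES];
	  zip (vectors[i], vectors[i + npatterns], tmp);
	  memcpy (vectors[i], tmp, sizeof tmp);
	}
    }
  for (int j = 0; j < LANES; j++)   /* Prints 10 11 12 13 10 11 12 13.  */
    printf ("%d ", vectors[0][j]);
  printf ("\n");
  return 0;
}
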
5679
678faefc
RS
5680/* Use WHILE to set a predicate register of mode MODE in which the first
5681 VL bits are set and the rest are clear. Use TARGET for the register
5682 if it's nonnull and convenient. */
0b1fe8cf 5683
678faefc
RS
5684static rtx
5685aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
5686 unsigned int vl)
0b1fe8cf
RS
5687{
5688 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
678faefc 5689 target = aarch64_target_reg (target, mode);
6ad9571b 5690 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
624d0f07 5691 target, const0_rtx, limit));
678faefc
RS
5692 return target;
5693}
5694
2803bc3b
RS
5695static rtx
5696aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
5697
5698/* BUILDER is a constant predicate in which the index of every set bit
5699 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5700 by inverting every element at a multiple of ELT_SIZE and EORing the
5701 result with an ELT_SIZE PTRUE.
5702
5703 Return a register that contains the constant on success, otherwise
5704 return null. Use TARGET as the register if it is nonnull and
5705 convenient. */
5706
5707static rtx
5708aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
5709 unsigned int elt_size)
5710{
5711 /* Invert every element at a multiple of ELT_SIZE, keeping the
5712 other bits zero. */
5713 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
5714 builder.nelts_per_pattern ());
5715 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5716 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
5717 inv_builder.quick_push (const1_rtx);
5718 else
5719 inv_builder.quick_push (const0_rtx);
5720 inv_builder.finalize ();
5721
5722 /* See if we can load the constant cheaply. */
5723 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
5724 if (!inv)
5725 return NULL_RTX;
5726
5727 /* EOR the result with an ELT_SIZE PTRUE. */
5728 rtx mask = aarch64_ptrue_all (elt_size);
5729 mask = force_reg (VNx16BImode, mask);
26bebf57 5730 inv = gen_lowpart (VNx16BImode, inv);
2803bc3b
RS
5731 target = aarch64_target_reg (target, VNx16BImode);
5732 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
5733 return target;
5734}
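
/* Illustrative stand-alone sketch (editor's addition, not part of aarch64.c)
   of the invert-then-EOR idea above for the simplest case, ELT_SIZE == 1.
   If the predicate starts with a clear bit, its complement may be a cheap
   PTRUE VL<n> form; EORing that complement with an all-ones PTRUE restores
   the original.  The bit pattern below is invented for the example.  */
#include <stdio.h>

int
main (void)
{
  enum { BITS = 8 };
  int orig[BITS] = { 0, 0, 0, 1, 1, 1, 1, 1 };  /* not a PTRUE VL<n> form  */
  int inv[BITS], back[BITS];
  for (int i = 0; i < BITS; i++)
    inv[i] = !orig[i];		/* 1 1 1 0 0 0 0 0, i.e. PTRUE ..., VL3  */
  for (int i = 0; i < BITS; i++)
    back[i] = inv[i] ^ 1;	/* EOR with an all-ones PTRUE  */
  for (int i = 0; i < BITS; i++)
    printf ("%d%s", back[i], i + 1 == BITS ? "\n" : " ");
  return 0;
}
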
5735
5736/* BUILDER is a constant predicate in which the index of every set bit
5737 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5738 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
5739 register on success, otherwise return null. Use TARGET as the register
5740 if nonnull and convenient. */
5741
5742static rtx
5743aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
5744 unsigned int elt_size,
5745 unsigned int permute_size)
5746{
5747 /* We're going to split the constant into two new constants A and B,
5748 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
5749 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
5750
5751 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
5752 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
5753
5754 where _ indicates elements that will be discarded by the permute.
5755
5756 First calculate the ELT_SIZEs for A and B. */
5757 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
5758 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
5759 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
5760 if (INTVAL (builder.elt (i)) != 0)
5761 {
5762 if (i & permute_size)
5763 b_elt_size |= i - permute_size;
5764 else
5765 a_elt_size |= i;
5766 }
5767 a_elt_size &= -a_elt_size;
5768 b_elt_size &= -b_elt_size;
5769
5770 /* Now construct the vectors themselves. */
5771 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
5772 builder.nelts_per_pattern ());
5773 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
5774 builder.nelts_per_pattern ());
5775 unsigned int nelts = builder.encoded_nelts ();
5776 for (unsigned int i = 0; i < nelts; ++i)
5777 if (i & (elt_size - 1))
5778 {
5779 a_builder.quick_push (const0_rtx);
5780 b_builder.quick_push (const0_rtx);
5781 }
5782 else if ((i & permute_size) == 0)
5783 {
5784 /* The A and B elements are significant. */
5785 a_builder.quick_push (builder.elt (i));
5786 b_builder.quick_push (builder.elt (i + permute_size));
5787 }
5788 else
5789 {
5790 /* The A and B elements are going to be discarded, so pick whatever
5791 is likely to give a nice constant. We are targeting element
5792 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
5793 with the aim of each being a sequence of ones followed by
5794 a sequence of zeros. So:
5795
5796 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
5797 duplicate the last X_ELT_SIZE element, to extend the
5798 current sequence of ones or zeros.
5799
5800 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
5801 zero, so that the constant really does have X_ELT_SIZE and
5802 not a smaller size. */
5803 if (a_elt_size > permute_size)
5804 a_builder.quick_push (const0_rtx);
5805 else
5806 a_builder.quick_push (a_builder.elt (i - a_elt_size));
5807 if (b_elt_size > permute_size)
5808 b_builder.quick_push (const0_rtx);
5809 else
5810 b_builder.quick_push (b_builder.elt (i - b_elt_size));
5811 }
5812 a_builder.finalize ();
5813 b_builder.finalize ();
5814
5815 /* Try loading A into a register. */
5816 rtx_insn *last = get_last_insn ();
5817 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
5818 if (!a)
5819 return NULL_RTX;
5820
5821 /* Try loading B into a register. */
5822 rtx b = a;
5823 if (a_builder != b_builder)
5824 {
5825 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
5826 if (!b)
5827 {
5828 delete_insns_since (last);
5829 return NULL_RTX;
5830 }
5831 }
5832
8535755a
TC
5833 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
5834 operands but permutes them as though they had mode MODE. */
2803bc3b 5835 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
8535755a
TC
5836 target = aarch64_target_reg (target, GET_MODE (a));
5837 rtx type_reg = CONST0_RTX (mode);
5838 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
2803bc3b
RS
5839 return target;
5840}
5841
678faefc
RS
5842/* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
5843 constant in BUILDER into an SVE predicate register. Return the register
5844 on success, otherwise return null. Use TARGET for the register if
2803bc3b
RS
5845 nonnull and convenient.
5846
5847 ALLOW_RECURSE_P is true if we can use methods that would call this
5848 function recursively. */
678faefc
RS
5849
5850static rtx
2803bc3b
RS
5851aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
5852 bool allow_recurse_p)
678faefc
RS
5853{
5854 if (builder.encoded_nelts () == 1)
5855 /* A PFALSE or a PTRUE .B ALL. */
5856 return aarch64_emit_set_immediate (target, builder);
5857
5858 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
5859 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
5860 {
5861 /* If we can load the constant using PTRUE, use it as-is. */
5862 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
5863 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
5864 return aarch64_emit_set_immediate (target, builder);
5865
5866 /* Otherwise use WHILE to set the first VL bits. */
5867 return aarch64_sve_move_pred_via_while (target, mode, vl);
5868 }
5869
2803bc3b
RS
5870 if (!allow_recurse_p)
5871 return NULL_RTX;
5872
5873 /* Try inverting the vector in element size ELT_SIZE and then EORing
5874 the result with an ELT_SIZE PTRUE. */
5875 if (INTVAL (builder.elt (0)) == 0)
5876 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
5877 elt_size))
5878 return res;
5879
5880 /* Try using TRN1 to permute two simpler constants. */
5881 for (unsigned int i = elt_size; i <= 8; i *= 2)
5882 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
5883 elt_size, i))
5884 return res;
5885
678faefc
RS
5886 return NULL_RTX;
5887}
5888
5889/* Return an SVE predicate register that contains the VNx16BImode
5890 constant in BUILDER, without going through the move expanders.
5891
5892 The returned register can have whatever mode seems most natural
5893 given the contents of BUILDER. Use TARGET for the result if
5894 convenient. */
5895
5896static rtx
5897aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
5898{
5899 /* Try loading the constant using pure predicate operations. */
2803bc3b 5900 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
678faefc
RS
5901 return res;
5902
5903 /* Try forcing the constant to memory. */
5904 if (builder.full_nelts ().is_constant ())
5905 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
5906 {
5907 target = aarch64_target_reg (target, VNx16BImode);
5908 emit_move_insn (target, mem);
5909 return target;
5910 }
5911
5912 /* The last resort is to load the constant as an integer and then
5913 compare it against zero. Use -1 for set bits in order to increase
 5914 the chances of using SVE DUPM or an Advanced SIMD byte mask. */

5915 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
5916 builder.nelts_per_pattern ());
5917 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5918 int_builder.quick_push (INTVAL (builder.elt (i))
5919 ? constm1_rtx : const0_rtx);
5920 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
5921 int_builder.build ());
0b1fe8cf
RS
5922}
5923
4aeb1ba7 5924/* Set DEST to immediate IMM. */
43cacb12
RS
5925
5926void
4aeb1ba7 5927aarch64_expand_mov_immediate (rtx dest, rtx imm)
43cacb12
RS
5928{
5929 machine_mode mode = GET_MODE (dest);
82614948
RR
5930
5931 /* Check on what type of symbol it is. */
77e994c9 5932 scalar_int_mode int_mode;
3793ecc1
AC
5933 if ((SYMBOL_REF_P (imm)
5934 || LABEL_REF_P (imm)
43cacb12
RS
5935 || GET_CODE (imm) == CONST
5936 || GET_CODE (imm) == CONST_POLY_INT)
77e994c9 5937 && is_a <scalar_int_mode> (mode, &int_mode))
82614948 5938 {
43cacb12
RS
5939 rtx mem;
5940 poly_int64 offset;
5941 HOST_WIDE_INT const_offset;
82614948
RR
5942 enum aarch64_symbol_type sty;
5943
5944 /* If we have (const (plus symbol offset)), separate out the offset
5945 before we start classifying the symbol. */
43cacb12 5946 rtx base = strip_offset (imm, &offset);
82614948 5947
43cacb12
RS
5948 /* We must always add an offset involving VL separately, rather than
5949 folding it into the relocation. */
5950 if (!offset.is_constant (&const_offset))
5951 {
c0e0174b
RS
5952 if (!TARGET_SVE)
5953 {
5954 aarch64_report_sve_required ();
5955 return;
5956 }
43cacb12
RS
5957 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
5958 emit_insn (gen_rtx_SET (dest, imm));
5959 else
5960 {
5961 /* Do arithmetic on 32-bit values if the result is smaller
5962 than that. */
5963 if (partial_subreg_p (int_mode, SImode))
5964 {
5965 /* It is invalid to do symbol calculations in modes
5966 narrower than SImode. */
5967 gcc_assert (base == const0_rtx);
5968 dest = gen_lowpart (SImode, dest);
5969 int_mode = SImode;
5970 }
5971 if (base != const0_rtx)
5972 {
5973 base = aarch64_force_temporary (int_mode, dest, base);
5974 aarch64_add_offset (int_mode, dest, base, offset,
5975 NULL_RTX, NULL_RTX, false);
5976 }
5977 else
5978 aarch64_add_offset (int_mode, dest, base, offset,
5979 dest, NULL_RTX, false);
5980 }
5981 return;
5982 }
5983
5984 sty = aarch64_classify_symbol (base, const_offset);
82614948
RR
5985 switch (sty)
5986 {
5987 case SYMBOL_FORCE_TO_MEM:
e8beba1c
RS
5988 if (int_mode != ptr_mode)
5989 imm = convert_memory_address (ptr_mode, imm);
5990
43cacb12 5991 if (const_offset != 0
e8beba1c 5992 && targetm.cannot_force_const_mem (ptr_mode, imm))
82614948
RR
5993 {
5994 gcc_assert (can_create_pseudo_p ());
77e994c9 5995 base = aarch64_force_temporary (int_mode, dest, base);
43cacb12
RS
5996 aarch64_add_offset (int_mode, dest, base, const_offset,
5997 NULL_RTX, NULL_RTX, false);
82614948
RR
5998 return;
5999 }
b4f50fd4 6000
82614948
RR
6001 mem = force_const_mem (ptr_mode, imm);
6002 gcc_assert (mem);
b4f50fd4
RR
6003
6004 /* If we aren't generating PC relative literals, then
6005 we need to expand the literal pool access carefully.
6006 This is something that needs to be done in a number
6007 of places, so could well live as a separate function. */
9ee6540a 6008 if (!aarch64_pcrelative_literal_loads)
b4f50fd4
RR
6009 {
6010 gcc_assert (can_create_pseudo_p ());
6011 base = gen_reg_rtx (ptr_mode);
6012 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
00eee3fa
WD
6013 if (ptr_mode != Pmode)
6014 base = convert_memory_address (Pmode, base);
b4f50fd4
RR
6015 mem = gen_rtx_MEM (ptr_mode, base);
6016 }
6017
77e994c9
RS
6018 if (int_mode != ptr_mode)
6019 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
b4f50fd4 6020
f7df4a84 6021 emit_insn (gen_rtx_SET (dest, mem));
b4f50fd4 6022
82614948
RR
6023 return;
6024
6025 case SYMBOL_SMALL_TLSGD:
6026 case SYMBOL_SMALL_TLSDESC:
79496620 6027 case SYMBOL_SMALL_TLSIE:
1b1e81f8 6028 case SYMBOL_SMALL_GOT_28K:
6642bdb4 6029 case SYMBOL_SMALL_GOT_4G:
82614948 6030 case SYMBOL_TINY_GOT:
5ae7caad 6031 case SYMBOL_TINY_TLSIE:
43cacb12 6032 if (const_offset != 0)
82614948
RR
6033 {
6034 gcc_assert(can_create_pseudo_p ());
77e994c9 6035 base = aarch64_force_temporary (int_mode, dest, base);
43cacb12
RS
6036 aarch64_add_offset (int_mode, dest, base, const_offset,
6037 NULL_RTX, NULL_RTX, false);
82614948
RR
6038 return;
6039 }
6040 /* FALLTHRU */
6041
82614948
RR
6042 case SYMBOL_SMALL_ABSOLUTE:
6043 case SYMBOL_TINY_ABSOLUTE:
cbf5629e 6044 case SYMBOL_TLSLE12:
d18ba284 6045 case SYMBOL_TLSLE24:
cbf5629e
JW
6046 case SYMBOL_TLSLE32:
6047 case SYMBOL_TLSLE48:
82614948
RR
6048 aarch64_load_symref_appropriately (dest, imm, sty);
6049 return;
6050
6051 default:
6052 gcc_unreachable ();
6053 }
6054 }
6055
6056 if (!CONST_INT_P (imm))
6057 {
678faefc
RS
6058 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
6059 {
6060 /* Only the low bit of each .H, .S and .D element is defined,
6061 so we can set the upper bits to whatever we like. If the
6062 predicate is all-true in MODE, prefer to set all the undefined
6063 bits as well, so that we can share a single .B predicate for
6064 all modes. */
6065 if (imm == CONSTM1_RTX (mode))
6066 imm = CONSTM1_RTX (VNx16BImode);
6067
6068 /* All methods for constructing predicate modes wider than VNx16BI
6069 will set the upper bits of each element to zero. Expose this
6070 by moving such constants as a VNx16BI, so that all bits are
6071 significant and so that constants for different modes can be
6072 shared. The wider constant will still be available as a
6073 REG_EQUAL note. */
6074 rtx_vector_builder builder;
6075 if (aarch64_get_sve_pred_bits (builder, imm))
6076 {
6077 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6078 if (dest != res)
6079 emit_move_insn (dest, gen_lowpart (mode, res));
6080 return;
6081 }
6082 }
6083
43cacb12
RS
6084 if (GET_CODE (imm) == HIGH
6085 || aarch64_simd_valid_immediate (imm, NULL))
43cacb12 6086 {
4aeb1ba7
RS
6087 emit_insn (gen_rtx_SET (dest, imm));
6088 return;
43e9d192 6089 }
82614948 6090
568b9c0e 6091 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
4aeb1ba7
RS
6092 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
6093 {
6094 if (dest != res)
6095 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
6096 return;
6097 }
6098
6099 rtx mem = force_const_mem (mode, imm);
6100 gcc_assert (mem);
6101 emit_move_insn (dest, mem);
82614948 6102 return;
43e9d192 6103 }
82614948 6104
77e994c9
RS
6105 aarch64_internal_mov_immediate (dest, imm, true,
6106 as_a <scalar_int_mode> (mode));
43e9d192
IB
6107}
6108
74b27d8e
RS
6109/* Return the MEM rtx that provides the canary value that should be used
6110 for stack-smashing protection. MODE is the mode of the memory.
6111 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
6112 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
6113 indicates whether the caller is performing a SET or a TEST operation. */
6114
6115rtx
6116aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
6117 aarch64_salt_type salt_type)
6118{
6119 rtx addr;
6120 if (aarch64_stack_protector_guard == SSP_GLOBAL)
6121 {
6122 gcc_assert (MEM_P (decl_rtl));
6123 addr = XEXP (decl_rtl, 0);
6124 poly_int64 offset;
6125 rtx base = strip_offset_and_salt (addr, &offset);
6126 if (!SYMBOL_REF_P (base))
6127 return decl_rtl;
6128
6129 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
6130 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
6131 addr = gen_rtx_CONST (Pmode, addr);
6132 addr = plus_constant (Pmode, addr, offset);
6133 }
6134 else
6135 {
6136 /* Calculate the address from the system register. */
6137 rtx salt = GEN_INT (salt_type);
6138 addr = gen_reg_rtx (mode);
6139 if (mode == DImode)
6140 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
6141 else
6142 {
6143 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
6144 addr = convert_memory_address (Pmode, addr);
6145 }
6146 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
6147 }
6148 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
6149}
6150
43cacb12
RS
6151/* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
6152 that is known to contain PTRUE. */
6153
6154void
6155aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
6156{
0c63a8ee
TC
6157 expand_operand ops[3];
6158 machine_mode mode = GET_MODE (dest);
6159 create_output_operand (&ops[0], dest, mode);
6160 create_input_operand (&ops[1], pred, GET_MODE(pred));
6161 create_input_operand (&ops[2], src, mode);
f2b29269 6162 temporary_volatile_ok v (true);
0c63a8ee 6163 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
43cacb12
RS
6164}
6165
6166/* Expand a pre-RA SVE data move from SRC to DEST in which at least one
6167 operand is in memory. In this case we need to use the predicated LD1
6168 and ST1 instead of LDR and STR, both for correctness on big-endian
6169 targets and because LD1 and ST1 support a wider range of addressing modes.
6170 PRED_MODE is the mode of the predicate.
6171
6172 See the comment at the head of aarch64-sve.md for details about the
6173 big-endian handling. */
6174
6175void
6176aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
6177{
6178 machine_mode mode = GET_MODE (dest);
16de3637 6179 rtx ptrue = aarch64_ptrue_reg (pred_mode);
43cacb12
RS
6180 if (!register_operand (src, mode)
6181 && !register_operand (dest, mode))
6182 {
6183 rtx tmp = gen_reg_rtx (mode);
6184 if (MEM_P (src))
6185 aarch64_emit_sve_pred_move (tmp, ptrue, src);
6186 else
6187 emit_move_insn (tmp, src);
6188 src = tmp;
6189 }
6190 aarch64_emit_sve_pred_move (dest, ptrue, src);
6191}
6192
002092be
RS
6193/* Called only on big-endian targets. See whether an SVE vector move
6194 from SRC to DEST is effectively a REV[BHW] instruction, because at
6195 least one operand is a subreg of an SVE vector that has wider or
6196 narrower elements. Return true and emit the instruction if so.
6197
6198 For example:
6199
6200 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
6201
6202 represents a VIEW_CONVERT between the following vectors, viewed
6203 in memory order:
6204
6205 R2: { [0].high, [0].low, [1].high, [1].low, ... }
6206 R1: { [0], [1], [2], [3], ... }
6207
6208 The high part of lane X in R2 should therefore correspond to lane X*2
6209 of R1, but the register representations are:
6210
6211 msb lsb
6212 R2: ...... [1].high [1].low [0].high [0].low
6213 R1: ...... [3] [2] [1] [0]
6214
6215 where the low part of lane X in R2 corresponds to lane X*2 in R1.
6216 We therefore need a reverse operation to swap the high and low values
6217 around.
6218
6219 This is purely an optimization. Without it we would spill the
6220 subreg operand to the stack in one mode and reload it in the
6221 other mode, which has the same effect as the REV. */
6222
6223bool
6224aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
6225{
6226 gcc_assert (BYTES_BIG_ENDIAN);
a4d9837e
RS
6227
6228 /* Do not try to optimize subregs that LRA has created for matched
6229 reloads. These subregs only exist as a temporary measure to make
6230 the RTL well-formed, but they are exempt from the usual
6231 TARGET_CAN_CHANGE_MODE_CLASS rules.
6232
6233 For example, if we have:
6234
6235 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
6236
6237 and the constraints require R1 and R2 to be in the same register,
6238 LRA may need to create RTL such as:
6239
6240 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
6241 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
6242 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
6243
6244 which forces both the input and output of the original instruction
6245 to use the same hard register. But for this to work, the normal
6246 rules have to be suppressed on the subreg input, otherwise LRA
6247 would need to reload that input too, meaning that the process
6248 would never terminate. To compensate for this, the normal rules
6249 are also suppressed for the subreg output of the first move.
6250 Ignoring the special case and handling the first move normally
6251 would therefore generate wrong code: we would reverse the elements
6252 for the first subreg but not reverse them back for the second subreg. */
6253 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
002092be 6254 dest = SUBREG_REG (dest);
a4d9837e 6255 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
002092be
RS
6256 src = SUBREG_REG (src);
6257
6258 /* The optimization handles two single SVE REGs with different element
6259 sizes. */
6260 if (!REG_P (dest)
6261 || !REG_P (src)
6262 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
6263 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
6264 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
6265 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
6266 return false;
6267
6268 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
16de3637 6269 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
002092be
RS
6270 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
6271 UNSPEC_REV_SUBREG);
6272 emit_insn (gen_rtx_SET (dest, unspec));
6273 return true;
6274}
6275
6276/* Return a copy of X with mode MODE, without changing its other
6277 attributes. Unlike gen_lowpart, this doesn't care whether the
6278 mode change is valid. */
6279
624d0f07 6280rtx
002092be
RS
6281aarch64_replace_reg_mode (rtx x, machine_mode mode)
6282{
6283 if (GET_MODE (x) == mode)
6284 return x;
6285
6286 x = shallow_copy_rtx (x);
6287 set_mode_and_regno (x, mode, REGNO (x));
6288 return x;
6289}
6290
d7a09c44
RS
6291/* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
6292 stored in wider integer containers. */
6293
6294static unsigned int
6295aarch64_sve_rev_unspec (machine_mode mode)
6296{
6297 switch (GET_MODE_UNIT_SIZE (mode))
6298 {
6299 case 1: return UNSPEC_REVB;
6300 case 2: return UNSPEC_REVH;
6301 case 4: return UNSPEC_REVW;
6302 }
6303 gcc_unreachable ();
6304}
6305
002092be
RS
6306/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
6307 operands. */
6308
6309void
6310aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
6311{
d7a09c44
RS
6312 /* Decide which REV operation we need. The mode with wider elements
6313 determines the mode of the operands and the mode with the narrower
002092be 6314 elements determines the reverse width. */
5c06093c
RS
6315 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
6316 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
002092be
RS
6317 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
6318 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
6319 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
6320
d7a09c44 6321 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
cc68f7c2 6322 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
002092be 6323
d7a09c44 6324 /* Get the operands in the appropriate modes and emit the instruction. */
002092be 6325 ptrue = gen_lowpart (pred_mode, ptrue);
d7a09c44
RS
6326 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
6327 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
6328 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
6329 dest, ptrue, src));
002092be
RS
6330}
6331
43e9d192 6332static bool
c600df9a 6333aarch64_function_ok_for_sibcall (tree, tree exp)
43e9d192 6334{
c600df9a 6335 if (crtl->abi->id () != expr_callee_abi (exp).id ())
a0d0b980
SE
6336 return false;
6337
43e9d192
IB
6338 return true;
6339}
6340
38e62001
RS
6341/* Subroutine of aarch64_pass_by_reference for arguments that are not
6342 passed in SVE registers. */
43e9d192
IB
6343
6344static bool
56fe3ca3
RS
6345aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
6346 const function_arg_info &arg)
43e9d192
IB
6347{
6348 HOST_WIDE_INT size;
ef4bddc2 6349 machine_mode dummymode;
43e9d192
IB
6350 int nregs;
6351
6352 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
52090e4d
RS
6353 if (arg.mode == BLKmode && arg.type)
6354 size = int_size_in_bytes (arg.type);
6a70badb
RS
6355 else
6356 /* No frontends can create types with variable-sized modes, so we
6357 shouldn't be asked to pass or return them. */
52090e4d 6358 size = GET_MODE_SIZE (arg.mode).to_constant ();
43e9d192 6359
aadc1c43 6360 /* Aggregates are passed by reference based on their size. */
52090e4d
RS
6361 if (arg.aggregate_type_p ())
6362 size = int_size_in_bytes (arg.type);
43e9d192
IB
6363
6364 /* Variable sized arguments are always returned by reference. */
6365 if (size < 0)
6366 return true;
6367
6368 /* Can this be a candidate to be passed in fp/simd register(s)? */
52090e4d 6369 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
56fe3ca3
RS
6370 &dummymode, &nregs, NULL,
6371 !pcum || pcum->silent_p))
43e9d192
IB
6372 return false;
6373
6374 /* Arguments which are variable sized or larger than 2 registers are
 6375 passed by reference unless they are a homogeneous floating-point
6376 aggregate. */
6377 return size > 2 * UNITS_PER_WORD;
6378}
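
/* Illustrative stand-alone sketch (editor's addition, not part of aarch64.c)
   of the size rule applied above to arguments that are not HFAs/HVAs and
   not SVE types: with 8-byte general registers, anything larger than two
   registers (16 bytes), or of variable size, is passed by reference.
   The helper name is invented for the example.  */
#include <stdio.h>

static int
passed_by_reference (long size_in_bytes)
{
  if (size_in_bytes < 0)	/* variable-sized argument  */
    return 1;
  return size_in_bytes > 2 * 8;	/* 2 * UNITS_PER_WORD on AArch64  */
}

int
main (void)
{
  printf ("16 bytes: %d\n", passed_by_reference (16));  /* 0: in registers  */
  printf ("24 bytes: %d\n", passed_by_reference (24));  /* 1: by reference  */
  return 0;
}
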
6379
38e62001
RS
6380/* Implement TARGET_PASS_BY_REFERENCE. */
6381
6382static bool
6383aarch64_pass_by_reference (cumulative_args_t pcum_v,
6384 const function_arg_info &arg)
6385{
6386 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6387
6388 if (!arg.type)
56fe3ca3 6389 return aarch64_pass_by_reference_1 (pcum, arg);
38e62001
RS
6390
6391 pure_scalable_type_info pst_info;
6392 switch (pst_info.analyze (arg.type))
6393 {
6394 case pure_scalable_type_info::IS_PST:
6395 if (pcum && !pcum->silent_p && !TARGET_SVE)
6396 /* We can't gracefully recover at this point, so make this a
6397 fatal error. */
6398 fatal_error (input_location, "arguments of type %qT require"
6399 " the SVE ISA extension", arg.type);
6400
6401 /* Variadic SVE types are passed by reference. Normal non-variadic
6402 arguments are too if we've run out of registers. */
6403 return (!arg.named
6404 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
6405 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
6406
6407 case pure_scalable_type_info::DOESNT_MATTER:
56fe3ca3 6408 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
38e62001
RS
6409 return true;
6410
6411 case pure_scalable_type_info::NO_ABI_IDENTITY:
6412 case pure_scalable_type_info::ISNT_PST:
56fe3ca3 6413 return aarch64_pass_by_reference_1 (pcum, arg);
38e62001
RS
6414 }
6415 gcc_unreachable ();
6416}
6417
43e9d192
IB
6418/* Return TRUE if VALTYPE is padded to its least significant bits. */
6419static bool
6420aarch64_return_in_msb (const_tree valtype)
6421{
ef4bddc2 6422 machine_mode dummy_mode;
43e9d192
IB
6423 int dummy_int;
6424
6425 /* Never happens in little-endian mode. */
6426 if (!BYTES_BIG_ENDIAN)
6427 return false;
6428
6429 /* Only composite types smaller than or equal to 16 bytes can
6430 be potentially returned in registers. */
6431 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
6432 || int_size_in_bytes (valtype) <= 0
6433 || int_size_in_bytes (valtype) > 16)
6434 return false;
6435
6436 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
6437 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
6438 is always passed/returned in the least significant bits of fp/simd
6439 register(s). */
6440 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
56fe3ca3
RS
6441 &dummy_mode, &dummy_int, NULL,
6442 false))
43e9d192
IB
6443 return false;
6444
38e62001
RS
6445 /* Likewise pure scalable types for SVE vector and predicate registers. */
6446 pure_scalable_type_info pst_info;
6447 if (pst_info.analyze_registers (valtype))
6448 return false;
6449
43e9d192
IB
6450 return true;
6451}
6452
38e62001
RS
6453/* Implement TARGET_FUNCTION_VALUE.
6454 Define how to find the value returned by a function. */
6455
43e9d192 6456static rtx
38e62001
RS
6457aarch64_function_value (const_tree type, const_tree func,
6458 bool outgoing ATTRIBUTE_UNUSED)
43e9d192 6459{
38e62001
RS
6460 machine_mode mode;
6461 int unsignedp;
c600df9a 6462
38e62001
RS
6463 mode = TYPE_MODE (type);
6464 if (INTEGRAL_TYPE_P (type))
6465 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
c600df9a 6466
38e62001
RS
6467 pure_scalable_type_info pst_info;
6468 if (type && pst_info.analyze_registers (type))
6469 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
c600df9a 6470
38e62001
RS
6471 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6472 are returned in memory, not by value. */
6473 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6474 bool sve_p = (vec_flags & VEC_ANY_SVE);
c600df9a 6475
43e9d192
IB
6476 if (aarch64_return_in_msb (type))
6477 {
6478 HOST_WIDE_INT size = int_size_in_bytes (type);
6479
6480 if (size % UNITS_PER_WORD != 0)
6481 {
6482 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
f4b31647 6483 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
43e9d192
IB
6484 }
6485 }
6486
6aa5370c
RS
6487 int count;
6488 machine_mode ag_mode;
56fe3ca3
RS
6489 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
6490 NULL, false))
43e9d192 6491 {
38e62001 6492 gcc_assert (!sve_p);
43e9d192
IB
6493 if (!aarch64_composite_type_p (type, mode))
6494 {
6495 gcc_assert (count == 1 && mode == ag_mode);
6496 return gen_rtx_REG (mode, V0_REGNUM);
6497 }
eb04ccf4
JW
6498 else if (aarch64_advsimd_full_struct_mode_p (mode)
6499 && known_eq (GET_MODE_SIZE (ag_mode), 16))
6500 return gen_rtx_REG (mode, V0_REGNUM);
6501 else if (aarch64_advsimd_partial_struct_mode_p (mode)
6502 && known_eq (GET_MODE_SIZE (ag_mode), 8))
6503 return gen_rtx_REG (mode, V0_REGNUM);
43e9d192
IB
6504 else
6505 {
6506 int i;
6507 rtx par;
6508
6509 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
6510 for (i = 0; i < count; i++)
6511 {
6512 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
6a70badb
RS
6513 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
6514 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
43e9d192
IB
6515 XVECEXP (par, 0, i) = tmp;
6516 }
6517 return par;
6518 }
6519 }
6520 else
6aa5370c 6521 {
38e62001
RS
6522 if (sve_p)
6523 {
6524 /* Vector types can acquire a partial SVE mode using things like
6525 __attribute__((vector_size(N))), and this is potentially useful.
6526 However, the choice of mode doesn't affect the type's ABI
6527 identity, so we should treat the types as though they had
6528 the associated integer mode, just like they did before SVE
6529 was introduced.
6530
6531 We know that the vector must be 128 bits or smaller,
6532 otherwise we'd have returned it in memory instead. */
6533 gcc_assert (type
6534 && (aarch64_some_values_include_pst_objects_p (type)
6535 || (vec_flags & VEC_PARTIAL)));
6536
6537 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
6538 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
6539 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
6540 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
6541 }
6542 return gen_rtx_REG (mode, R0_REGNUM);
6aa5370c 6543 }
6aa5370c
RS
6544}
6545
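/* Editor's illustration, not part of the original source: example return
   types and the registers aarch64_function_value chooses for them under
   the base AAPCS64.  The type names are hypothetical.  */
typedef __int128 ti_ret;                 /* TImode, returned in X0/X1.  */
struct hfa_ret { double a, b, c, d; };   /* HFA: a PARALLEL of V0-V3, one
                                            DFmode element per register.  */
struct big_ret { char buf[32]; };        /* Larger than 16 bytes, so it is
                                            returned in memory instead (see
                                            aarch64_return_in_memory below).  */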
43e9d192
IB
6546/* Implements TARGET_FUNCTION_VALUE_REGNO_P.
6547 Return true if REGNO is the number of a hard register in which the values
6548 of called function may come back. */
6549
6550static bool
6551aarch64_function_value_regno_p (const unsigned int regno)
6552{
6553 /* Maximum of 16 bytes can be returned in the general registers. Examples
6554 of 16-byte return values are: 128-bit integers and 16-byte small
6555 structures (excluding homogeneous floating-point aggregates). */
6556 if (regno == R0_REGNUM || regno == R1_REGNUM)
6557 return true;
6558
6559 /* Up to four fp/simd registers can return a function value, e.g. a
6560 homogeneous floating-point aggregate having four members. */
6561 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
d5726973 6562 return TARGET_FLOAT;
43e9d192
IB
6563
6564 return false;
6565}
6566
38e62001
RS
6567/* Subroutine for aarch64_return_in_memory for types that are not returned
6568 in SVE registers. */
43e9d192
IB
6569
6570static bool
38e62001 6571aarch64_return_in_memory_1 (const_tree type)
43e9d192
IB
6572{
6573 HOST_WIDE_INT size;
ef4bddc2 6574 machine_mode ag_mode;
43e9d192
IB
6575 int count;
6576
6577 if (!AGGREGATE_TYPE_P (type)
6578 && TREE_CODE (type) != COMPLEX_TYPE
6579 && TREE_CODE (type) != VECTOR_TYPE)
 6580 /* Simple scalar types are always returned in registers. */
6581 return false;
6582
56fe3ca3
RS
6583 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
6584 &ag_mode, &count, NULL, false))
43e9d192
IB
6585 return false;
6586
 6587 /* Types larger than 2 registers are returned in memory. */
6588 size = int_size_in_bytes (type);
6589 return (size < 0 || size > 2 * UNITS_PER_WORD);
6590}
6591
38e62001
RS
6592/* Implement TARGET_RETURN_IN_MEMORY.
6593
6594 If the type T of the result of a function is such that
6595 void func (T arg)
6596 would require that arg be passed as a value in a register (or set of
6597 registers) according to the parameter passing rules, then the result
6598 is returned in the same registers as would be used for such an
6599 argument. */
6600
6601static bool
6602aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
6603{
6604 pure_scalable_type_info pst_info;
6605 switch (pst_info.analyze (type))
6606 {
6607 case pure_scalable_type_info::IS_PST:
6608 return (pst_info.num_zr () > NUM_FP_ARG_REGS
6609 || pst_info.num_pr () > NUM_PR_ARG_REGS);
6610
6611 case pure_scalable_type_info::DOESNT_MATTER:
6612 gcc_assert (aarch64_return_in_memory_1 (type));
6613 return true;
6614
6615 case pure_scalable_type_info::NO_ABI_IDENTITY:
6616 case pure_scalable_type_info::ISNT_PST:
6617 return aarch64_return_in_memory_1 (type);
6618 }
6619 gcc_unreachable ();
6620}
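/* Editor's illustration, not part of the original source: how the
   memory-vs-registers decision above plays out for some simple types.
   The type names are hypothetical.  */
struct ret_in_regs { long x, y; };        /* 16 bytes: returned in X0/X1.  */
struct ret_in_mem  { long x, y, z; };     /* 24 bytes: returned in memory,
                                             with the caller passing the
                                             result address in X8.  */
struct ret_hfa     { float a, b, c, d; }; /* HFA: returned in S0-S3, so the
                                             two-register size limit does
                                             not apply.  */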
6621
43e9d192 6622static bool
ef4bddc2 6623aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
43e9d192
IB
6624 const_tree type, int *nregs)
6625{
6626 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
56fe3ca3 6627 return aarch64_vfp_is_call_or_return_candidate (mode, type,
43e9d192 6628 &pcum->aapcs_vfp_rmode,
56fe3ca3 6629 nregs, NULL, pcum->silent_p);
43e9d192
IB
6630}
6631
985b8393 6632/* Given MODE and TYPE of a function argument, return the alignment in
43e9d192 6633 bits. The idea is to suppress any stronger alignment requested by
c590597c
RE
6634 the user and opt for the natural alignment (specified in AAPCS64 \S
 6635 4.1). ABI_BREAK is set to the alignment that versions of GCC prior to
 6636 GCC 9 incorrectly used for the type, and to zero otherwise. This is a helper
6637 function for local use only. */
43e9d192 6638
985b8393 6639static unsigned int
c590597c 6640aarch64_function_arg_alignment (machine_mode mode, const_tree type,
49813aad 6641 unsigned int *abi_break)
43e9d192 6642{
49813aad 6643 *abi_break = 0;
75d6cc81 6644 if (!type)
985b8393 6645 return GET_MODE_ALIGNMENT (mode);
2ec07fa6 6646
75d6cc81 6647 if (integer_zerop (TYPE_SIZE (type)))
985b8393 6648 return 0;
43e9d192 6649
75d6cc81
AL
6650 gcc_assert (TYPE_MODE (type) == mode);
6651
6652 if (!AGGREGATE_TYPE_P (type))
985b8393 6653 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
75d6cc81
AL
6654
6655 if (TREE_CODE (type) == ARRAY_TYPE)
985b8393 6656 return TYPE_ALIGN (TREE_TYPE (type));
75d6cc81 6657
985b8393 6658 unsigned int alignment = 0;
c590597c 6659 unsigned int bitfield_alignment = 0;
75d6cc81 6660 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
985b8393 6661 if (TREE_CODE (field) == FIELD_DECL)
c590597c 6662 {
56fe3ca3
RS
6663 /* Note that we explicitly consider zero-sized fields here,
6664 even though they don't map to AAPCS64 machine types.
6665 For example, in:
6666
6667 struct __attribute__((aligned(8))) empty {};
6668
6669 struct s {
6670 [[no_unique_address]] empty e;
6671 int x;
6672 };
6673
6674 "s" contains only one Fundamental Data Type (the int field)
6675 but gains 8-byte alignment and size thanks to "e". */
c590597c
RE
6676 alignment = std::max (alignment, DECL_ALIGN (field));
6677 if (DECL_BIT_FIELD_TYPE (field))
6678 bitfield_alignment
6679 = std::max (bitfield_alignment,
6680 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
6681 }
6682
6683 if (bitfield_alignment > alignment)
6684 {
49813aad 6685 *abi_break = alignment;
c590597c
RE
6686 return bitfield_alignment;
6687 }
43e9d192 6688
985b8393 6689 return alignment;
43e9d192
IB
6690}
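/* Editor's illustration, not part of the original source: the kind of type
   for which *ABI_BREAK is set.  The exact struct is an editor's assumption
   rather than a testsuite case: the bit-field's declared type (__int128,
   16-byte aligned) dominates the alignment computed above, while releases
   before GCC 9 only used the fields' own (smaller) alignments, so callers
   that enable -Wpsabi get the "parameter passing ... changed in GCC 9.1"
   note from the code further down.  */
struct abi_break_example
{
  __int128 bf : 64;   /* DECL_BIT_FIELD_TYPE is 16-byte aligned.  */
  long tail;
};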
6691
6692/* Layout a function argument according to the AAPCS64 rules. The rule
6aa5370c
RS
6693 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
6694 mode that was originally given to us by the target hook, whereas the
6695 mode in ARG might be the result of replacing partial SVE modes with
6696 the equivalent integer mode. */
43e9d192
IB
6697
6698static void
38e62001 6699aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
43e9d192
IB
6700{
6701 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
c600df9a
RS
6702 tree type = arg.type;
6703 machine_mode mode = arg.mode;
43e9d192
IB
6704 int ncrn, nvrn, nregs;
6705 bool allocate_ncrn, allocate_nvrn;
3abf17cf 6706 HOST_WIDE_INT size;
49813aad 6707 unsigned int abi_break;
43e9d192
IB
6708
6709 /* We need to do this once per argument. */
6710 if (pcum->aapcs_arg_processed)
6711 return;
6712
6713 pcum->aapcs_arg_processed = true;
6714
38e62001
RS
6715 pure_scalable_type_info pst_info;
6716 if (type && pst_info.analyze_registers (type))
c600df9a
RS
6717 {
6718 /* The PCS says that it is invalid to pass an SVE value to an
6719 unprototyped function. There is no ABI-defined location we
6720 can return in this case, so we have no real choice but to raise
6721 an error immediately, even though this is only a query function. */
6722 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
6723 {
6724 gcc_assert (!pcum->silent_p);
6725 error ("SVE type %qT cannot be passed to an unprototyped function",
6726 arg.type);
6727 /* Avoid repeating the message, and avoid tripping the assert
6728 below. */
6729 pcum->pcs_variant = ARM_PCS_SVE;
6730 }
6731
6732 /* We would have converted the argument into pass-by-reference
6733 form if it didn't fit in registers. */
38e62001
RS
6734 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
6735 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
c600df9a
RS
6736 gcc_assert (arg.named
6737 && pcum->pcs_variant == ARM_PCS_SVE
c600df9a
RS
6738 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
6739 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
38e62001
RS
6740 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
6741 P0_REGNUM + pcum->aapcs_nprn);
c600df9a
RS
6742 return;
6743 }
6744
38e62001
RS
6745 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6746 are passed by reference, not by value. */
6747 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6748 bool sve_p = (vec_flags & VEC_ANY_SVE);
6749 if (sve_p)
6750 /* Vector types can acquire a partial SVE mode using things like
6751 __attribute__((vector_size(N))), and this is potentially useful.
6752 However, the choice of mode doesn't affect the type's ABI
6753 identity, so we should treat the types as though they had
6754 the associated integer mode, just like they did before SVE
6755 was introduced.
6756
6757 We know that the vector must be 128 bits or smaller,
6758 otherwise we'd have passed it in memory instead. */
6759 gcc_assert (type
6760 && (aarch64_some_values_include_pst_objects_p (type)
6761 || (vec_flags & VEC_PARTIAL)));
c600df9a 6762
3abf17cf 6763 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
6a70badb
RS
6764 if (type)
6765 size = int_size_in_bytes (type);
6766 else
6767 /* No frontends can create types with variable-sized modes, so we
6768 shouldn't be asked to pass or return them. */
6769 size = GET_MODE_SIZE (mode).to_constant ();
6770 size = ROUND_UP (size, UNITS_PER_WORD);
3abf17cf 6771
43e9d192
IB
6772 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
6773 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
6774 mode,
6775 type,
6776 &nregs);
38e62001 6777 gcc_assert (!sve_p || !allocate_nvrn);
43e9d192
IB
6778
 6779 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
6780 The following code thus handles passing by SIMD/FP registers first. */
6781
6782 nvrn = pcum->aapcs_nvrn;
6783
 6784 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
 6785 and homogeneous short-vector aggregates (HVA). */
6786 if (allocate_nvrn)
6787 {
c600df9a 6788 if (!pcum->silent_p && !TARGET_FLOAT)
fc29dfc9 6789 aarch64_err_no_fpadvsimd (mode);
261fb553 6790
43e9d192
IB
6791 if (nvrn + nregs <= NUM_FP_ARG_REGS)
6792 {
6793 pcum->aapcs_nextnvrn = nvrn + nregs;
6794 if (!aarch64_composite_type_p (type, mode))
6795 {
6796 gcc_assert (nregs == 1);
6797 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6798 }
eb04ccf4
JW
6799 else if (aarch64_advsimd_full_struct_mode_p (mode)
6800 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
6801 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6802 else if (aarch64_advsimd_partial_struct_mode_p (mode)
6803 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
6804 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
43e9d192
IB
6805 else
6806 {
6807 rtx par;
6808 int i;
6809 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6810 for (i = 0; i < nregs; i++)
6811 {
6812 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
6813 V0_REGNUM + nvrn + i);
6a70badb
RS
6814 rtx offset = gen_int_mode
6815 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
6816 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
43e9d192
IB
6817 XVECEXP (par, 0, i) = tmp;
6818 }
6819 pcum->aapcs_reg = par;
6820 }
6821 return;
6822 }
6823 else
6824 {
6825 /* C.3 NSRN is set to 8. */
6826 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
6827 goto on_stack;
6828 }
6829 }
6830
6831 ncrn = pcum->aapcs_ncrn;
3abf17cf 6832 nregs = size / UNITS_PER_WORD;
43e9d192
IB
6833
 6834 /* C6 - C9, though the sign and zero extension semantics are
 6835 handled elsewhere. This is the case where the argument fits
 6836 entirely in general registers. */
6837 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
6838 {
43e9d192
IB
6839 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
6840
6841 /* C.8 if the argument has an alignment of 16 then the NGRN is
c590597c 6842 rounded up to the next even number. */
985b8393
JJ
6843 if (nregs == 2
6844 && ncrn % 2
2ec07fa6 6845 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
985b8393 6846 comparison is there because for > 16 * BITS_PER_UNIT
2ec07fa6
RR
6847 alignment nregs should be > 2 and therefore it should be
6848 passed by reference rather than value. */
38e62001 6849 && (aarch64_function_arg_alignment (mode, type, &abi_break)
c590597c 6850 == 16 * BITS_PER_UNIT))
985b8393 6851 {
c590597c
RE
6852 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
6853 inform (input_location, "parameter passing for argument of type "
6854 "%qT changed in GCC 9.1", type);
985b8393
JJ
6855 ++ncrn;
6856 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
43e9d192 6857 }
2ec07fa6 6858
38e62001
RS
6859 /* If an argument with an SVE mode needs to be shifted up to the
6860 high part of the register, treat it as though it had an integer mode.
6861 Using the normal (parallel [...]) would suppress the shifting. */
6862 if (sve_p
6863 && BYTES_BIG_ENDIAN
6864 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
6865 && aarch64_pad_reg_upward (mode, type, false))
6866 {
6867 mode = int_mode_for_mode (mode).require ();
6868 sve_p = false;
6869 }
6870
43e9d192 6871 /* NREGS can be 0 when e.g. an empty structure is to be passed.
c590597c 6872 A reg is still generated for it, but the caller should be smart
43e9d192 6873 enough not to use it. */
38e62001
RS
6874 if (nregs == 0
6875 || (nregs == 1 && !sve_p)
6876 || GET_MODE_CLASS (mode) == MODE_INT)
2ec07fa6 6877 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
43e9d192
IB
6878 else
6879 {
6880 rtx par;
6881 int i;
6882
6883 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6884 for (i = 0; i < nregs; i++)
6885 {
38e62001
RS
6886 scalar_int_mode reg_mode = word_mode;
6887 if (nregs == 1)
6888 reg_mode = int_mode_for_mode (mode).require ();
6889 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
43e9d192
IB
6890 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
6891 GEN_INT (i * UNITS_PER_WORD));
6892 XVECEXP (par, 0, i) = tmp;
6893 }
6894 pcum->aapcs_reg = par;
6895 }
6896
6897 pcum->aapcs_nextncrn = ncrn + nregs;
6898 return;
6899 }
6900
6901 /* C.11 */
6902 pcum->aapcs_nextncrn = NUM_ARG_REGS;
6903
 6904 /* The argument is passed on the stack; record the needed number of words for
3abf17cf 6905 this argument and align the total size if necessary. */
43e9d192 6906on_stack:
3abf17cf 6907 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2ec07fa6 6908
38e62001 6909 if (aarch64_function_arg_alignment (mode, type, &abi_break)
c590597c
RE
6910 == 16 * BITS_PER_UNIT)
6911 {
6912 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
6913 if (pcum->aapcs_stack_size != new_size)
6914 {
6915 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
6916 inform (input_location, "parameter passing for argument of type "
6917 "%qT changed in GCC 9.1", type);
6918 pcum->aapcs_stack_size = new_size;
6919 }
6920 }
43e9d192
IB
6921 return;
6922}
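/* Editor's illustration, not part of the original source: rule C.8 as
   handled above, for a hypothetical prototype.  */
extern void c8_example (int a, __int128 b);
/* "a" occupies X0, leaving NGRN = 1.  "b" needs two registers and has
   16-byte alignment, so NGRN is rounded up to the next even number and
   "b" is passed in X2/X3; X1 is left unused.  __int128 was always handled
   this way; the -Wpsabi note above fires only for types whose computed
   alignment changed in GCC 9.1.  */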
6923
6924/* Implement TARGET_FUNCTION_ARG. */
6925
6926static rtx
6783fdb7 6927aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
43e9d192
IB
6928{
6929 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
08cc4d92 6930 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
c600df9a
RS
6931 || pcum->pcs_variant == ARM_PCS_SIMD
6932 || pcum->pcs_variant == ARM_PCS_SVE);
43e9d192 6933
6783fdb7 6934 if (arg.end_marker_p ())
08cc4d92 6935 return gen_int_mode (pcum->pcs_variant, DImode);
43e9d192 6936
38e62001 6937 aarch64_layout_arg (pcum_v, arg);
43e9d192
IB
6938 return pcum->aapcs_reg;
6939}
6940
6941void
6942aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
08cc4d92
RS
6943 const_tree fntype,
6944 rtx libname ATTRIBUTE_UNUSED,
6945 const_tree fndecl ATTRIBUTE_UNUSED,
c600df9a
RS
6946 unsigned n_named ATTRIBUTE_UNUSED,
6947 bool silent_p)
43e9d192
IB
6948{
6949 pcum->aapcs_ncrn = 0;
6950 pcum->aapcs_nvrn = 0;
c600df9a 6951 pcum->aapcs_nprn = 0;
43e9d192
IB
6952 pcum->aapcs_nextncrn = 0;
6953 pcum->aapcs_nextnvrn = 0;
c600df9a 6954 pcum->aapcs_nextnprn = 0;
08cc4d92
RS
6955 if (fntype)
6956 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
6957 else
6958 pcum->pcs_variant = ARM_PCS_AAPCS64;
43e9d192
IB
6959 pcum->aapcs_reg = NULL_RTX;
6960 pcum->aapcs_arg_processed = false;
6961 pcum->aapcs_stack_words = 0;
6962 pcum->aapcs_stack_size = 0;
c600df9a 6963 pcum->silent_p = silent_p;
43e9d192 6964
c600df9a
RS
6965 if (!silent_p
6966 && !TARGET_FLOAT
261fb553
AL
6967 && fntype && fntype != error_mark_node)
6968 {
6969 const_tree type = TREE_TYPE (fntype);
6970 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
6971 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
6972 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
56fe3ca3 6973 &mode, &nregs, NULL, false))
fc29dfc9 6974 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
261fb553 6975 }
c600df9a
RS
6976
6977 if (!silent_p
6978 && !TARGET_SVE
6979 && pcum->pcs_variant == ARM_PCS_SVE)
6980 {
6981 /* We can't gracefully recover at this point, so make this a
6982 fatal error. */
6983 if (fndecl)
6984 fatal_error (input_location, "%qE requires the SVE ISA extension",
6985 fndecl);
6986 else
6987 fatal_error (input_location, "calls to functions of type %qT require"
6988 " the SVE ISA extension", fntype);
6989 }
43e9d192
IB
6990}
6991
6992static void
6993aarch64_function_arg_advance (cumulative_args_t pcum_v,
6930c98c 6994 const function_arg_info &arg)
43e9d192
IB
6995{
6996 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
08cc4d92 6997 if (pcum->pcs_variant == ARM_PCS_AAPCS64
c600df9a
RS
6998 || pcum->pcs_variant == ARM_PCS_SIMD
6999 || pcum->pcs_variant == ARM_PCS_SVE)
43e9d192 7000 {
38e62001 7001 aarch64_layout_arg (pcum_v, arg);
43e9d192
IB
7002 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
7003 != (pcum->aapcs_stack_words != 0));
7004 pcum->aapcs_arg_processed = false;
7005 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
7006 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
c600df9a 7007 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
43e9d192
IB
7008 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
7009 pcum->aapcs_stack_words = 0;
7010 pcum->aapcs_reg = NULL_RTX;
7011 }
7012}
7013
7014bool
7015aarch64_function_arg_regno_p (unsigned regno)
7016{
7017 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
7018 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
7019}
7020
7021/* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
7022 PARM_BOUNDARY bits of alignment, but will be given anything up
7023 to STACK_BOUNDARY bits if the type requires it. This makes sure
7024 that both before and after the layout of each argument, the Next
7025 Stacked Argument Address (NSAA) will have a minimum alignment of
7026 8 bytes. */
7027
7028static unsigned int
ef4bddc2 7029aarch64_function_arg_boundary (machine_mode mode, const_tree type)
43e9d192 7030{
49813aad 7031 unsigned int abi_break;
c590597c
RE
7032 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
7033 &abi_break);
49813aad 7034 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
c590597c 7035 if (abi_break && warn_psabi)
49813aad
JJ
7036 {
7037 abi_break = MIN (MAX (abi_break, PARM_BOUNDARY), STACK_BOUNDARY);
7038 if (alignment != abi_break)
7039 inform (input_location, "parameter passing for argument of type "
7040 "%qT changed in GCC 9.1", type);
7041 }
c590597c 7042
49813aad 7043 return alignment;
43e9d192
IB
7044}
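/* Editor's illustration, not part of the original source: what the boundary
   above means for stack arguments, using a hypothetical prototype.  The
   first eight arguments exhaust X0-X7, so "c" and "s" go on the stack and
   each gets a slot aligned to at least PARM_BOUNDARY (8 bytes), keeping
   the NSAA 8-byte aligned throughout.  */
extern void nsaa_example (long a0, long a1, long a2, long a3,
                          long a4, long a5, long a6, long a7,
                          char c, short s);
/* At the point of the call, "c" is at [sp, 0] and "s" at [sp, 8].  */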
7045
43cacb12
RS
7046/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
7047
7048static fixed_size_mode
7049aarch64_get_reg_raw_mode (int regno)
7050{
7051 if (TARGET_SVE && FP_REGNUM_P (regno))
7052 /* Don't use the SVE part of the register for __builtin_apply and
7053 __builtin_return. The SVE registers aren't used by the normal PCS,
7054 so using them there would be a waste of time. The PCS extensions
7055 for SVE types are fundamentally incompatible with the
7056 __builtin_return/__builtin_apply interface. */
7057 return as_a <fixed_size_mode> (V16QImode);
7058 return default_get_reg_raw_mode (regno);
7059}
7060
76b0cbf8 7061/* Implement TARGET_FUNCTION_ARG_PADDING.
43e9d192
IB
7062
 7063 Small aggregate types are placed at the lowest memory address.
7064
7065 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
7066
76b0cbf8
RS
7067static pad_direction
7068aarch64_function_arg_padding (machine_mode mode, const_tree type)
43e9d192
IB
7069{
7070 /* On little-endian targets, the least significant byte of every stack
7071 argument is passed at the lowest byte address of the stack slot. */
7072 if (!BYTES_BIG_ENDIAN)
76b0cbf8 7073 return PAD_UPWARD;
43e9d192 7074
00edcfbe 7075 /* Otherwise, integral, floating-point and pointer types are padded downward:
43e9d192
IB
7076 the least significant byte of a stack argument is passed at the highest
7077 byte address of the stack slot. */
7078 if (type
00edcfbe
YZ
7079 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
7080 || POINTER_TYPE_P (type))
43e9d192 7081 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
76b0cbf8 7082 return PAD_DOWNWARD;
43e9d192
IB
7083
7084 /* Everything else padded upward, i.e. data in first byte of stack slot. */
76b0cbf8 7085 return PAD_UPWARD;
43e9d192
IB
7086}
7087
7088/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
7089
7090 It specifies padding for the last (may also be the only)
7091 element of a block move between registers and memory. If
 7092 assuming the block is in memory, padding upward means that
 7093 the last element is padded after its most significant byte,
 7094 while in downward padding, the last element is padded at
 7095 its least significant byte side.
7096
7097 Small aggregates and small complex types are always padded
7098 upwards.
7099
7100 We don't need to worry about homogeneous floating-point or
7101 short-vector aggregates; their move is not affected by the
7102 padding direction determined here. Regardless of endianness,
7103 each element of such an aggregate is put in the least
7104 significant bits of a fp/simd register.
7105
7106 Return !BYTES_BIG_ENDIAN if the least significant byte of the
7107 register has useful data, and return the opposite if the most
7108 significant byte does. */
7109
7110bool
ef4bddc2 7111aarch64_pad_reg_upward (machine_mode mode, const_tree type,
43e9d192
IB
7112 bool first ATTRIBUTE_UNUSED)
7113{
7114
38e62001
RS
7115 /* Aside from pure scalable types, small composite types are always
7116 padded upward. */
43e9d192
IB
7117 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
7118 {
6a70badb
RS
7119 HOST_WIDE_INT size;
7120 if (type)
7121 size = int_size_in_bytes (type);
7122 else
7123 /* No frontends can create types with variable-sized modes, so we
7124 shouldn't be asked to pass or return them. */
7125 size = GET_MODE_SIZE (mode).to_constant ();
43e9d192 7126 if (size < 2 * UNITS_PER_WORD)
38e62001
RS
7127 {
7128 pure_scalable_type_info pst_info;
7129 if (pst_info.analyze_registers (type))
7130 return false;
7131 return true;
7132 }
43e9d192
IB
7133 }
7134
7135 /* Otherwise, use the default padding. */
7136 return !BYTES_BIG_ENDIAN;
7137}
7138
095a2d76 7139static scalar_int_mode
43e9d192
IB
7140aarch64_libgcc_cmp_return_mode (void)
7141{
7142 return SImode;
7143}
7144
a3eb8a52
EB
7145#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
7146
7147/* We use the 12-bit shifted immediate arithmetic instructions so values
 7148 must be a multiple of (1 << 12), i.e. 4096. */
7149#define ARITH_FACTOR 4096
7150
7151#if (PROBE_INTERVAL % ARITH_FACTOR) != 0
7152#error Cannot use simple address calculation for stack probing
7153#endif
7154
6a70badb 7155/* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
a3eb8a52
EB
7156 inclusive. These are offsets from the current stack pointer. */
7157
7158static void
6a70badb 7159aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
a3eb8a52 7160{
6a70badb
RS
7161 HOST_WIDE_INT size;
7162 if (!poly_size.is_constant (&size))
7163 {
7164 sorry ("stack probes for SVE frames");
7165 return;
7166 }
7167
5773855c 7168 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
a3eb8a52
EB
7169
7170 /* See the same assertion on PROBE_INTERVAL above. */
7171 gcc_assert ((first % ARITH_FACTOR) == 0);
7172
7173 /* See if we have a constant small number of probes to generate. If so,
7174 that's the easy case. */
7175 if (size <= PROBE_INTERVAL)
7176 {
7177 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
7178
7179 emit_set_insn (reg1,
5f5c5e0f 7180 plus_constant (Pmode,
a3eb8a52 7181 stack_pointer_rtx, -(first + base)));
5f5c5e0f 7182 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
a3eb8a52
EB
7183 }
7184
7185 /* The run-time loop is made up of 8 insns in the generic case while the
7186 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
7187 else if (size <= 4 * PROBE_INTERVAL)
7188 {
7189 HOST_WIDE_INT i, rem;
7190
7191 emit_set_insn (reg1,
5f5c5e0f 7192 plus_constant (Pmode,
a3eb8a52
EB
7193 stack_pointer_rtx,
7194 -(first + PROBE_INTERVAL)));
7195 emit_stack_probe (reg1);
7196
7197 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
7198 it exceeds SIZE. If only two probes are needed, this will not
7199 generate any code. Then probe at FIRST + SIZE. */
7200 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
7201 {
7202 emit_set_insn (reg1,
5f5c5e0f 7203 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
a3eb8a52
EB
7204 emit_stack_probe (reg1);
7205 }
7206
7207 rem = size - (i - PROBE_INTERVAL);
7208 if (rem > 256)
7209 {
7210 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7211
5f5c5e0f
EB
7212 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
7213 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
a3eb8a52
EB
7214 }
7215 else
5f5c5e0f 7216 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
a3eb8a52
EB
7217 }
7218
7219 /* Otherwise, do the same as above, but in a loop. Note that we must be
7220 extra careful with variables wrapping around because we might be at
7221 the very top (or the very bottom) of the address space and we have
7222 to be able to handle this case properly; in particular, we use an
7223 equality test for the loop condition. */
7224 else
7225 {
5773855c 7226 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
a3eb8a52
EB
7227
7228 /* Step 1: round SIZE to the previous multiple of the interval. */
7229
7230 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
7231
7232
7233 /* Step 2: compute initial and final value of the loop counter. */
7234
7235 /* TEST_ADDR = SP + FIRST. */
7236 emit_set_insn (reg1,
5f5c5e0f 7237 plus_constant (Pmode, stack_pointer_rtx, -first));
a3eb8a52
EB
7238
7239 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
13f752b2
JL
7240 HOST_WIDE_INT adjustment = - (first + rounded_size);
7241 if (! aarch64_uimm12_shift (adjustment))
7242 {
7243 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
7244 true, Pmode);
7245 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
7246 }
7247 else
8dd64cdf
EB
7248 emit_set_insn (reg2,
7249 plus_constant (Pmode, stack_pointer_rtx, adjustment));
7250
a3eb8a52
EB
7251 /* Step 3: the loop
7252
7253 do
7254 {
7255 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
7256 probe at TEST_ADDR
7257 }
7258 while (TEST_ADDR != LAST_ADDR)
7259
7260 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
7261 until it is equal to ROUNDED_SIZE. */
7262
5f5c5e0f 7263 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
a3eb8a52
EB
7264
7265
7266 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
7267 that SIZE is equal to ROUNDED_SIZE. */
7268
7269 if (size != rounded_size)
7270 {
7271 HOST_WIDE_INT rem = size - rounded_size;
7272
7273 if (rem > 256)
7274 {
7275 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7276
5f5c5e0f
EB
7277 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
7278 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
a3eb8a52
EB
7279 }
7280 else
5f5c5e0f 7281 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
a3eb8a52
EB
7282 }
7283 }
7284
7285 /* Make sure nothing is scheduled before we are done. */
7286 emit_insn (gen_blockage ());
7287}
7288
7289/* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
7290 absolute addresses. */
7291
7292const char *
7293aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
7294{
7295 static int labelno = 0;
7296 char loop_lab[32];
7297 rtx xops[2];
7298
7299 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
7300
7301 /* Loop. */
7302 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
7303
cd1bef27 7304 HOST_WIDE_INT stack_clash_probe_interval
028d4092 7305 = 1 << param_stack_clash_protection_guard_size;
cd1bef27 7306
a3eb8a52
EB
7307 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
7308 xops[0] = reg1;
cd1bef27
JL
7309 HOST_WIDE_INT interval;
7310 if (flag_stack_clash_protection)
7311 interval = stack_clash_probe_interval;
7312 else
7313 interval = PROBE_INTERVAL;
7314
7315 gcc_assert (aarch64_uimm12_shift (interval));
7316 xops[1] = GEN_INT (interval);
7317
a3eb8a52
EB
7318 output_asm_insn ("sub\t%0, %0, %1", xops);
7319
cd1bef27
JL
7320 /* If doing stack clash protection then we probe up by the ABI specified
7321 amount. We do this because we're dropping full pages at a time in the
7322 loop. But if we're doing non-stack clash probing, probe at SP 0. */
7323 if (flag_stack_clash_protection)
7324 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
7325 else
7326 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
7327
7328 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
7329 by this amount for each iteration. */
7330 output_asm_insn ("str\txzr, [%0, %1]", xops);
a3eb8a52
EB
7331
7332 /* Test if TEST_ADDR == LAST_ADDR. */
7333 xops[1] = reg2;
7334 output_asm_insn ("cmp\t%0, %1", xops);
7335
7336 /* Branch. */
7337 fputs ("\tb.ne\t", asm_out_file);
7338 assemble_name_raw (asm_out_file, loop_lab);
7339 fputc ('\n', asm_out_file);
7340
7341 return "";
7342}
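/* Editor's illustration, not part of the original source: the shape of the
   loop emitted by the function above when stack clash protection is off.
   The register numbers are an assumption (they come from
   PROBE_STACK_FIRST_REGNUM and PROBE_STACK_SECOND_REGNUM); the instruction
   sequence mirrors the output_asm_insn calls above.

	.LPSRL0:
	sub	x9, x9, 4096	// TEST_ADDR = TEST_ADDR - PROBE_INTERVAL
	str	xzr, [x9, 0]	// probe at TEST_ADDR
	cmp	x9, x10		// TEST_ADDR == LAST_ADDR?
	b.ne	.LPSRL0
*/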
7343
eb471ba3
TC
7344/* Emit the probe loop for doing stack clash probes and stack adjustments for
7345 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
7346 of GUARD_SIZE. When a probe is emitted it is done at most
7347 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
7348 at most MIN_PROBE_THRESHOLD. By the end of this function
7349 BASE = BASE - ADJUSTMENT. */
7350
7351const char *
7352aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
7353 rtx min_probe_threshold, rtx guard_size)
7354{
7355 /* This function is not allowed to use any instruction generation function
7356 like gen_ and friends. If you do you'll likely ICE during CFG validation,
7357 so instead emit the code you want using output_asm_insn. */
7358 gcc_assert (flag_stack_clash_protection);
7359 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
7360 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
7361
7362 /* The minimum required allocation before the residual requires probing. */
7363 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
7364
7365 /* Clamp the value down to the nearest value that can be used with a cmp. */
7366 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
7367 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
7368
7369 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
7370 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
7371
7372 static int labelno = 0;
7373 char loop_start_lab[32];
7374 char loop_end_lab[32];
7375 rtx xops[2];
7376
7377 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
7378 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
7379
7380 /* Emit loop start label. */
7381 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
7382
7383 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
7384 xops[0] = adjustment;
7385 xops[1] = probe_offset_value_rtx;
7386 output_asm_insn ("cmp\t%0, %1", xops);
7387
7388 /* Branch to end if not enough adjustment to probe. */
7389 fputs ("\tb.lt\t", asm_out_file);
7390 assemble_name_raw (asm_out_file, loop_end_lab);
7391 fputc ('\n', asm_out_file);
7392
7393 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
7394 xops[0] = base;
7395 xops[1] = probe_offset_value_rtx;
7396 output_asm_insn ("sub\t%0, %0, %1", xops);
7397
7398 /* Probe at BASE. */
7399 xops[1] = const0_rtx;
7400 output_asm_insn ("str\txzr, [%0, %1]", xops);
7401
7402 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
7403 xops[0] = adjustment;
7404 xops[1] = probe_offset_value_rtx;
7405 output_asm_insn ("sub\t%0, %0, %1", xops);
7406
7407 /* Branch to start if still more bytes to allocate. */
7408 fputs ("\tb\t", asm_out_file);
7409 assemble_name_raw (asm_out_file, loop_start_lab);
7410 fputc ('\n', asm_out_file);
7411
 7412 /* No probe needed; leave the loop. */
7413 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
7414
7415 /* BASE = BASE - ADJUSTMENT. */
7416 xops[0] = base;
7417 xops[1] = adjustment;
7418 output_asm_insn ("sub\t%0, %0, %1", xops);
7419 return "";
7420}
7421
d6cb6d6a
WD
7422/* Determine whether a frame chain needs to be generated. */
7423static bool
7424aarch64_needs_frame_chain (void)
7425{
7426 /* Force a frame chain for EH returns so the return address is at FP+8. */
7427 if (frame_pointer_needed || crtl->calls_eh_return)
7428 return true;
7429
7430 /* A leaf function cannot have calls or write LR. */
7431 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
7432
7433 /* Don't use a frame chain in leaf functions if leaf frame pointers
7434 are disabled. */
7435 if (flag_omit_leaf_frame_pointer && is_leaf)
7436 return false;
7437
7438 return aarch64_use_frame_pointer;
7439}
7440
43e9d192
IB
7441/* Mark the registers that need to be saved by the callee and calculate
7442 the size of the callee-saved registers area and frame record (both FP
33a2e348 7443 and LR may be omitted). */
43e9d192
IB
7444static void
7445aarch64_layout_frame (void)
7446{
c600df9a 7447 poly_int64 offset = 0;
4b0685d9 7448 int regno, last_fp_reg = INVALID_REGNUM;
c600df9a
RS
7449 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
7450 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
7451 bool frame_related_fp_reg_p = false;
ab43763e 7452 aarch64_frame &frame = cfun->machine->frame;
43e9d192 7453
ab43763e 7454 frame.emit_frame_chain = aarch64_needs_frame_chain ();
7040939b 7455
8c6e3b23
TC
7456 /* Adjust the outgoing arguments size if required. Keep it in sync with what
7457 the mid-end is doing. */
7458 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
7459
97826595
MS
7460#define SLOT_NOT_REQUIRED (-2)
7461#define SLOT_REQUIRED (-1)
7462
ab43763e
RS
7463 frame.wb_candidate1 = INVALID_REGNUM;
7464 frame.wb_candidate2 = INVALID_REGNUM;
c600df9a 7465 frame.spare_pred_reg = INVALID_REGNUM;
363ffa50 7466
43e9d192 7467 /* First mark all the registers that really need to be saved... */
c600df9a 7468 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
ab43763e 7469 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
43e9d192
IB
7470
7471 /* ... that includes the eh data registers (if needed)... */
7472 if (crtl->calls_eh_return)
7473 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
ab43763e 7474 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
43e9d192
IB
7475
7476 /* ... and any callee saved register that dataflow says is live. */
7477 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
7478 if (df_regs_ever_live_p (regno)
dcdd0f05 7479 && !fixed_regs[regno]
1c923b60 7480 && (regno == R30_REGNUM
dcdd0f05 7481 || !crtl->abi->clobbers_full_reg_p (regno)))
ab43763e 7482 frame.reg_offset[regno] = SLOT_REQUIRED;
43e9d192
IB
7483
7484 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7485 if (df_regs_ever_live_p (regno)
dcdd0f05
RS
7486 && !fixed_regs[regno]
7487 && !crtl->abi->clobbers_full_reg_p (regno))
4b0685d9 7488 {
ab43763e 7489 frame.reg_offset[regno] = SLOT_REQUIRED;
4b0685d9 7490 last_fp_reg = regno;
c600df9a
RS
7491 if (aarch64_emit_cfi_for_reg_p (regno))
7492 frame_related_fp_reg_p = true;
4b0685d9 7493 }
43e9d192 7494
c600df9a
RS
7495 /* Big-endian SVE frames need a spare predicate register in order
7496 to save Z8-Z15. Decide which register they should use. Prefer
7497 an unused argument register if possible, so that we don't force P4
7498 to be saved unnecessarily. */
7499 if (frame_related_fp_reg_p
7500 && crtl->abi->id () == ARM_PCS_SVE
7501 && BYTES_BIG_ENDIAN)
7502 {
7503 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7504 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
7505 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
7506 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
7507 break;
7508 gcc_assert (regno <= P7_REGNUM);
7509 frame.spare_pred_reg = regno;
7510 df_set_regs_ever_live (regno, true);
7511 }
7512
7513 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7514 if (df_regs_ever_live_p (regno)
7515 && !fixed_regs[regno]
7516 && !crtl->abi->clobbers_full_reg_p (regno))
7517 frame.reg_offset[regno] = SLOT_REQUIRED;
7518
d6430e3c
TC
7519 /* With stack-clash, LR must be saved in non-leaf functions. The saving of
7520 LR counts as an implicit probe which allows us to maintain the invariant
7521 described in the comment at expand_prologue. */
c600df9a
RS
7522 gcc_assert (crtl->is_leaf
7523 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
7524
7525 /* Now assign stack slots for the registers. Start with the predicate
7526 registers, since predicate LDR and STR have a relatively small
7527 offset range. These saves happen below the hard frame pointer. */
7528 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7529 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7530 {
7531 frame.reg_offset[regno] = offset;
7532 offset += BYTES_PER_SVE_PRED;
7533 }
7534
c600df9a
RS
7535 if (maybe_ne (offset, 0))
7536 {
cb26919c
RS
7537 /* If we have any vector registers to save above the predicate registers,
7538 the offset of the vector register save slots need to be a multiple
7539 of the vector size. This lets us use the immediate forms of LDR/STR
7540 (or LD1/ST1 for big-endian).
7541
7542 A vector register is 8 times the size of a predicate register,
7543 and we need to save a maximum of 12 predicate registers, so the
7544 first vector register will be at either #1, MUL VL or #2, MUL VL.
7545
7546 If we don't have any vector registers to save, and we know how
7547 big the predicate save area is, we can just round it up to the
7548 next 16-byte boundary. */
7549 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
7550 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7551 else
7552 {
7553 if (known_le (offset, vector_save_size))
7554 offset = vector_save_size;
7555 else if (known_le (offset, vector_save_size * 2))
7556 offset = vector_save_size * 2;
7557 else
7558 gcc_unreachable ();
7559 }
c600df9a
RS
7560 }
7561
7562 /* If we need to save any SVE vector registers, add them next. */
7563 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
7564 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7565 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7566 {
7567 frame.reg_offset[regno] = offset;
7568 offset += vector_save_size;
7569 }
7570
7571 /* OFFSET is now the offset of the hard frame pointer from the bottom
7572 of the callee save area. */
7573 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
7574 frame.below_hard_fp_saved_regs_size = offset;
ab43763e 7575 if (frame.emit_frame_chain)
43e9d192 7576 {
2e1cdae5 7577 /* FP and LR are placed in the linkage record. */
c600df9a 7578 frame.reg_offset[R29_REGNUM] = offset;
ab43763e 7579 frame.wb_candidate1 = R29_REGNUM;
c600df9a 7580 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
ab43763e 7581 frame.wb_candidate2 = R30_REGNUM;
c600df9a 7582 offset += 2 * UNITS_PER_WORD;
1f7bffd0 7583 }
43e9d192 7584
2e1cdae5 7585 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
c600df9a 7586 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
43e9d192 7587 {
ab43763e
RS
7588 frame.reg_offset[regno] = offset;
7589 if (frame.wb_candidate1 == INVALID_REGNUM)
7590 frame.wb_candidate1 = regno;
7591 else if (frame.wb_candidate2 == INVALID_REGNUM)
7592 frame.wb_candidate2 = regno;
43e9d192
IB
7593 offset += UNITS_PER_WORD;
7594 }
7595
c600df9a
RS
7596 poly_int64 max_int_offset = offset;
7597 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7598 bool has_align_gap = maybe_ne (offset, max_int_offset);
4b0685d9 7599
43e9d192 7600 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
c600df9a 7601 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
43e9d192 7602 {
4b0685d9
WD
7603 /* If there is an alignment gap between integer and fp callee-saves,
7604 allocate the last fp register to it if possible. */
a0d0b980
SE
7605 if (regno == last_fp_reg
7606 && has_align_gap
c600df9a
RS
7607 && known_eq (vector_save_size, 8)
7608 && multiple_p (offset, 16))
4b0685d9 7609 {
ab43763e 7610 frame.reg_offset[regno] = max_int_offset;
4b0685d9
WD
7611 break;
7612 }
7613
ab43763e
RS
7614 frame.reg_offset[regno] = offset;
7615 if (frame.wb_candidate1 == INVALID_REGNUM)
7616 frame.wb_candidate1 = regno;
7617 else if (frame.wb_candidate2 == INVALID_REGNUM
7618 && frame.wb_candidate1 >= V0_REGNUM)
7619 frame.wb_candidate2 = regno;
c600df9a 7620 offset += vector_save_size;
43e9d192
IB
7621 }
7622
c600df9a 7623 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
43e9d192 7624
ab43763e 7625 frame.saved_regs_size = offset;
1c960e02 7626
c600df9a 7627 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
71bfb77a 7628
c600df9a 7629 poly_int64 above_outgoing_args
6a70badb
RS
7630 = aligned_upper_bound (varargs_and_saved_regs_size
7631 + get_frame_size (),
7632 STACK_BOUNDARY / BITS_PER_UNIT);
1c960e02 7633
c600df9a
RS
7634 frame.hard_fp_offset
7635 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
7636
6a70badb
RS
7637 /* Both these values are already aligned. */
7638 gcc_assert (multiple_p (crtl->outgoing_args_size,
7639 STACK_BOUNDARY / BITS_PER_UNIT));
c600df9a 7640 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
1c960e02 7641
ab43763e 7642 frame.locals_offset = frame.saved_varargs_size;
71bfb77a 7643
ab43763e
RS
7644 frame.initial_adjust = 0;
7645 frame.final_adjust = 0;
7646 frame.callee_adjust = 0;
c600df9a 7647 frame.sve_callee_adjust = 0;
ab43763e 7648 frame.callee_offset = 0;
71bfb77a
WD
7649
7650 HOST_WIDE_INT max_push_offset = 0;
ab43763e 7651 if (frame.wb_candidate2 != INVALID_REGNUM)
71bfb77a 7652 max_push_offset = 512;
ab43763e 7653 else if (frame.wb_candidate1 != INVALID_REGNUM)
71bfb77a
WD
7654 max_push_offset = 256;
7655
9b17a646 7656 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
c600df9a 7657 HOST_WIDE_INT const_saved_regs_size;
ab43763e 7658 if (frame.frame_size.is_constant (&const_size)
6a70badb 7659 && const_size < max_push_offset
c600df9a 7660 && known_eq (frame.hard_fp_offset, const_size))
71bfb77a
WD
7661 {
7662 /* Simple, small frame with no outgoing arguments:
c600df9a 7663
71bfb77a
WD
7664 stp reg1, reg2, [sp, -frame_size]!
7665 stp reg3, reg4, [sp, 16] */
ab43763e 7666 frame.callee_adjust = const_size;
71bfb77a 7667 }
9b17a646 7668 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
c600df9a
RS
7669 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
7670 && const_outgoing_args_size + const_saved_regs_size < 512
7671 /* We could handle this case even with outgoing args, provided
7672 that the number of args left us with valid offsets for all
7673 predicate and vector save slots. It's such a rare case that
7674 it hardly seems worth the effort though. */
7675 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
71bfb77a 7676 && !(cfun->calls_alloca
9b17a646
RS
7677 && frame.hard_fp_offset.is_constant (&const_fp_offset)
7678 && const_fp_offset < max_push_offset))
71bfb77a
WD
7679 {
7680 /* Frame with small outgoing arguments:
c600df9a 7681
71bfb77a
WD
7682 sub sp, sp, frame_size
7683 stp reg1, reg2, [sp, outgoing_args_size]
7684 stp reg3, reg4, [sp, outgoing_args_size + 16] */
ab43763e 7685 frame.initial_adjust = frame.frame_size;
9b17a646 7686 frame.callee_offset = const_outgoing_args_size;
71bfb77a 7687 }
c600df9a
RS
7688 else if (saves_below_hard_fp_p
7689 && known_eq (frame.saved_regs_size,
7690 frame.below_hard_fp_saved_regs_size))
7691 {
7692 /* Frame in which all saves are SVE saves:
7693
7694 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
7695 save SVE registers relative to SP
7696 sub sp, sp, outgoing_args_size */
7697 frame.initial_adjust = (frame.hard_fp_offset
7698 + frame.below_hard_fp_saved_regs_size);
7699 frame.final_adjust = crtl->outgoing_args_size;
7700 }
ab43763e 7701 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
6a70badb 7702 && const_fp_offset < max_push_offset)
71bfb77a 7703 {
c600df9a
RS
7704 /* Frame with large outgoing arguments or SVE saves, but with
7705 a small local area:
7706
71bfb77a
WD
7707 stp reg1, reg2, [sp, -hard_fp_offset]!
7708 stp reg3, reg4, [sp, 16]
c600df9a
RS
7709 [sub sp, sp, below_hard_fp_saved_regs_size]
7710 [save SVE registers relative to SP]
71bfb77a 7711 sub sp, sp, outgoing_args_size */
ab43763e 7712 frame.callee_adjust = const_fp_offset;
c600df9a 7713 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8e66b377 7714 frame.final_adjust = crtl->outgoing_args_size;
71bfb77a 7715 }
71bfb77a
WD
7716 else
7717 {
c600df9a
RS
7718 /* Frame with large local area and outgoing arguments or SVE saves,
7719 using frame pointer:
7720
71bfb77a
WD
7721 sub sp, sp, hard_fp_offset
7722 stp x29, x30, [sp, 0]
7723 add x29, sp, 0
7724 stp reg3, reg4, [sp, 16]
c600df9a
RS
7725 [sub sp, sp, below_hard_fp_saved_regs_size]
7726 [save SVE registers relative to SP]
71bfb77a 7727 sub sp, sp, outgoing_args_size */
ab43763e 7728 frame.initial_adjust = frame.hard_fp_offset;
c600df9a 7729 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8e66b377 7730 frame.final_adjust = crtl->outgoing_args_size;
71bfb77a
WD
7731 }
7732
8e66b377
RS
7733 /* Make sure the individual adjustments add up to the full frame size. */
7734 gcc_assert (known_eq (frame.initial_adjust
7735 + frame.callee_adjust
c600df9a 7736 + frame.sve_callee_adjust
8e66b377
RS
7737 + frame.final_adjust, frame.frame_size));
7738
59a3d73d
RS
7739 if (!frame.emit_frame_chain && frame.callee_adjust == 0)
7740 {
7741 /* We've decided not to associate any register saves with the initial
7742 stack allocation. */
7743 frame.wb_candidate1 = INVALID_REGNUM;
7744 frame.wb_candidate2 = INVALID_REGNUM;
7745 }
7746
ab43763e 7747 frame.laid_out = true;
43e9d192
IB
7748}
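/* Editor's illustration, not part of the original source: a worked instance
   of the "simple, small frame" case above, assuming a non-SVE function that
   needs a frame chain, additionally saves x19/x20, and has 16 bytes of
   locals with no varargs and no outgoing arguments.  saved_regs_size is 32,
   hard_fp_offset and frame_size are both 48, and 48 < 512, so
   callee_adjust = 48 and the prologue comes out roughly as

	stp	x29, x30, [sp, -48]!
	mov	x29, sp
	stp	x19, x20, [sp, 16]

   with the locals occupying [sp, 32] up to [sp, 47].  */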
7749
04ddfe06
KT
7750/* Return true if the register REGNO is saved on entry to
7751 the current function. */
7752
43e9d192
IB
7753static bool
7754aarch64_register_saved_on_entry (int regno)
7755{
c600df9a 7756 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
43e9d192
IB
7757}
7758
04ddfe06
KT
7759/* Return the next register up from REGNO up to LIMIT for the callee
7760 to save. */
7761
64dedd72
JW
7762static unsigned
7763aarch64_next_callee_save (unsigned regno, unsigned limit)
7764{
7765 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
7766 regno ++;
7767 return regno;
7768}
43e9d192 7769
04ddfe06
KT
7770/* Push the register number REGNO of mode MODE to the stack with write-back
7771 adjusting the stack by ADJUSTMENT. */
7772
c5e1f66e 7773static void
ef4bddc2 7774aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
c5e1f66e
JW
7775 HOST_WIDE_INT adjustment)
7776 {
7777 rtx base_rtx = stack_pointer_rtx;
7778 rtx insn, reg, mem;
7779
7780 reg = gen_rtx_REG (mode, regno);
7781 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
7782 plus_constant (Pmode, base_rtx, -adjustment));
30079dde 7783 mem = gen_frame_mem (mode, mem);
c5e1f66e
JW
7784
7785 insn = emit_move_insn (mem, reg);
7786 RTX_FRAME_RELATED_P (insn) = 1;
7787}
7788
04ddfe06
KT
7789/* Generate and return an instruction to store the pair of registers
7790 REG and REG2 of mode MODE to location BASE with write-back adjusting
7791 the stack location BASE by ADJUSTMENT. */
7792
80c11907 7793static rtx
ef4bddc2 7794aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
80c11907
JW
7795 HOST_WIDE_INT adjustment)
7796{
7797 switch (mode)
7798 {
4e10a5a7 7799 case E_DImode:
80c11907
JW
7800 return gen_storewb_pairdi_di (base, base, reg, reg2,
7801 GEN_INT (-adjustment),
7802 GEN_INT (UNITS_PER_WORD - adjustment));
4e10a5a7 7803 case E_DFmode:
80c11907
JW
7804 return gen_storewb_pairdf_di (base, base, reg, reg2,
7805 GEN_INT (-adjustment),
7806 GEN_INT (UNITS_PER_WORD - adjustment));
a0d0b980
SE
7807 case E_TFmode:
7808 return gen_storewb_pairtf_di (base, base, reg, reg2,
7809 GEN_INT (-adjustment),
7810 GEN_INT (UNITS_PER_VREG - adjustment));
80c11907
JW
7811 default:
7812 gcc_unreachable ();
7813 }
7814}
7815
04ddfe06
KT
7816/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
7817 stack pointer by ADJUSTMENT. */
7818
80c11907 7819static void
89ac681e 7820aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
80c11907 7821{
5d8a22a5 7822 rtx_insn *insn;
c600df9a 7823 machine_mode mode = aarch64_reg_save_mode (regno1);
89ac681e 7824
71bfb77a 7825 if (regno2 == INVALID_REGNUM)
89ac681e
WD
7826 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
7827
80c11907
JW
7828 rtx reg1 = gen_rtx_REG (mode, regno1);
7829 rtx reg2 = gen_rtx_REG (mode, regno2);
7830
7831 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
7832 reg2, adjustment));
7833 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
80c11907
JW
7834 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7835 RTX_FRAME_RELATED_P (insn) = 1;
7836}
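/* Editor's illustration, not part of the original source: for a DImode save
   of x29/x30 with an adjustment of 96 bytes, the pair push above assembles
   to a single pre-indexed store such as "stp x29, x30, [sp, -96]!", which
   allocates the stack and saves both registers in one frame-related
   instruction.  */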
7837
04ddfe06
KT
 7838/* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
7839 adjusting it by ADJUSTMENT afterwards. */
7840
159313d9 7841static rtx
ef4bddc2 7842aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
159313d9
JW
7843 HOST_WIDE_INT adjustment)
7844{
7845 switch (mode)
7846 {
4e10a5a7 7847 case E_DImode:
159313d9 7848 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3e322b3f 7849 GEN_INT (UNITS_PER_WORD));
4e10a5a7 7850 case E_DFmode:
159313d9 7851 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3e322b3f 7852 GEN_INT (UNITS_PER_WORD));
a0d0b980
SE
7853 case E_TFmode:
7854 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
7855 GEN_INT (UNITS_PER_VREG));
159313d9
JW
7856 default:
7857 gcc_unreachable ();
7858 }
7859}
7860
04ddfe06
KT
7861/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
7862 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
7863 into CFI_OPS. */
7864
89ac681e
WD
7865static void
7866aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
7867 rtx *cfi_ops)
7868{
c600df9a 7869 machine_mode mode = aarch64_reg_save_mode (regno1);
89ac681e
WD
7870 rtx reg1 = gen_rtx_REG (mode, regno1);
7871
7872 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
7873
71bfb77a 7874 if (regno2 == INVALID_REGNUM)
89ac681e
WD
7875 {
7876 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
7877 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
30079dde 7878 emit_move_insn (reg1, gen_frame_mem (mode, mem));
89ac681e
WD
7879 }
7880 else
7881 {
7882 rtx reg2 = gen_rtx_REG (mode, regno2);
7883 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
7884 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
7885 reg2, adjustment));
7886 }
7887}
7888
04ddfe06
KT
7889/* Generate and return a store pair instruction of mode MODE to store
7890 register REG1 to MEM1 and register REG2 to MEM2. */
7891
72df5c1f 7892static rtx
ef4bddc2 7893aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
72df5c1f
JW
7894 rtx reg2)
7895{
7896 switch (mode)
7897 {
4e10a5a7 7898 case E_DImode:
dfe1da23 7899 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
72df5c1f 7900
4e10a5a7 7901 case E_DFmode:
dfe1da23 7902 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
72df5c1f 7903
a0d0b980
SE
7904 case E_TFmode:
7905 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
7906
7cda9e08
SD
7907 case E_V4SImode:
7908 return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
7909
54bbde55
SD
7910 case E_V16QImode:
7911 return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
7912
72df5c1f
JW
7913 default:
7914 gcc_unreachable ();
7915 }
7916}
7917
04ddfe06
KT
 7918/* Generate and return a load pair instruction of mode MODE to load register
7919 REG1 from MEM1 and register REG2 from MEM2. */
7920
72df5c1f 7921static rtx
ef4bddc2 7922aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
72df5c1f
JW
7923 rtx mem2)
7924{
7925 switch (mode)
7926 {
4e10a5a7 7927 case E_DImode:
dfe1da23 7928 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
72df5c1f 7929
4e10a5a7 7930 case E_DFmode:
dfe1da23 7931 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
72df5c1f 7932
a0d0b980
SE
7933 case E_TFmode:
7934 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
7935
7cda9e08
SD
7936 case E_V4SImode:
7937 return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
7938
72df5c1f
JW
7939 default:
7940 gcc_unreachable ();
7941 }
7942}
7943
db58fd89
JW
7944/* Return TRUE if return address signing should be enabled for the current
7945 function, otherwise return FALSE. */
7946
7947bool
7948aarch64_return_address_signing_enabled (void)
7949{
7950 /* This function should only be called after frame laid out. */
7951 gcc_assert (cfun->machine->frame.laid_out);
7952
2bc95be3
SN
7953 /* Turn return address signing off in any function that uses
7954 __builtin_eh_return. The address passed to __builtin_eh_return
7955 is not signed so either it has to be signed (with original sp)
7956 or the code path that uses it has to avoid authenticating it.
7957 Currently eh return introduces a return to anywhere gadget, no
7958 matter what we do here since it uses ret with user provided
7959 address. An ideal fix for that is to use indirect branch which
7960 can be protected with BTI j (to some extent). */
7961 if (crtl->calls_eh_return)
7962 return false;
7963
db58fd89 7964 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
8fc16d72 7965 if its LR is pushed onto the stack. */
db58fd89
JW
7966 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
7967 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
c600df9a 7968 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
db58fd89
JW
7969}
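/* Editor's illustration, not part of the original source: the effect of the
   predicate above.  With -mbranch-protection=pac-ret the signing scope is
   AARCH64_FUNCTION_NON_LEAF, so a function that saves LR gets "paciasp" at
   the start of its prologue and "autiasp" before "ret", while a leaf that
   never spills LR stays unsigned; adding +leaf selects
   AARCH64_FUNCTION_ALL and signs every function.  */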
7970
30afdf34
SD
7971/* Return TRUE if Branch Target Identification Mechanism is enabled. */
7972bool
7973aarch64_bti_enabled (void)
7974{
7975 return (aarch64_enable_bti == 1);
7976}
7977
c600df9a
RS
7978/* The caller is going to use ST1D or LD1D to save or restore an SVE
7979 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
7980 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
7981
7982 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
7983 or LD1D address
7984
7985 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
7986 if the variable isn't already nonnull
7987
7988 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
7989 Handle this case using a temporary base register that is suitable for
7990 all offsets in that range. Use ANCHOR_REG as this base register if it
7991 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
7992
7993static inline void
7994aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
7995 rtx &anchor_reg, poly_int64 &offset,
7996 rtx &ptrue)
7997{
7998 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
7999 {
8000 /* This is the maximum valid offset of the anchor from the base.
8001 Lower values would be valid too. */
8002 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
8003 if (!anchor_reg)
8004 {
8005 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8006 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8007 gen_int_mode (anchor_offset, Pmode)));
8008 }
8009 base_rtx = anchor_reg;
8010 offset -= anchor_offset;
8011 }
8012 if (!ptrue)
8013 {
8014 int pred_reg = cfun->machine->frame.spare_pred_reg;
8015 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
8016 CONSTM1_RTX (VNx16BImode));
8017 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
8018 }
8019}
8020
8021/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
8022 is saved at BASE + OFFSET. */
8023
8024static void
8025aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
8026 rtx base, poly_int64 offset)
8027{
8028 rtx mem = gen_frame_mem (GET_MODE (reg),
8029 plus_constant (Pmode, base, offset));
8030 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
8031}
8032
04ddfe06
KT
8033/* Emit code to save the callee-saved registers from register number START
8034 to LIMIT to the stack at the location starting at offset START_OFFSET,
c600df9a
RS
8035 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
8036 is true if the hard frame pointer has been set up. */
43e9d192 8037
43e9d192 8038static void
c600df9a
RS
8039aarch64_save_callee_saves (poly_int64 start_offset,
8040 unsigned start, unsigned limit, bool skip_wb,
8041 bool hard_fp_valid_p)
43e9d192 8042{
5d8a22a5 8043 rtx_insn *insn;
43e9d192
IB
8044 unsigned regno;
8045 unsigned regno2;
c600df9a 8046 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
43e9d192 8047
0ec74a1e 8048 for (regno = aarch64_next_callee_save (start, limit);
64dedd72
JW
8049 regno <= limit;
8050 regno = aarch64_next_callee_save (regno + 1, limit))
43e9d192 8051 {
ae13fce3 8052 rtx reg, mem;
6a70badb 8053 poly_int64 offset;
c600df9a 8054 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
64dedd72 8055
ae13fce3
JW
8056 if (skip_wb
8057 && (regno == cfun->machine->frame.wb_candidate1
8058 || regno == cfun->machine->frame.wb_candidate2))
8059 continue;
8060
827ab47a 8061 if (cfun->machine->reg_is_wrapped_separately[regno])
c600df9a 8062 continue;
827ab47a 8063
c600df9a 8064 machine_mode mode = aarch64_reg_save_mode (regno);
ae13fce3
JW
8065 reg = gen_rtx_REG (mode, regno);
8066 offset = start_offset + cfun->machine->frame.reg_offset[regno];
c600df9a
RS
8067 rtx base_rtx = stack_pointer_rtx;
8068 poly_int64 sp_offset = offset;
64dedd72 8069
c600df9a
RS
8070 HOST_WIDE_INT const_offset;
8071 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8072 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8073 offset, ptrue);
8074 else if (GP_REGNUM_P (regno)
8075 && (!offset.is_constant (&const_offset) || const_offset >= 512))
8076 {
8077 gcc_assert (known_eq (start_offset, 0));
8078 poly_int64 fp_offset
8079 = cfun->machine->frame.below_hard_fp_saved_regs_size;
8080 if (hard_fp_valid_p)
8081 base_rtx = hard_frame_pointer_rtx;
8082 else
8083 {
8084 if (!anchor_reg)
8085 {
8086 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8087 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8088 gen_int_mode (fp_offset, Pmode)));
8089 }
8090 base_rtx = anchor_reg;
8091 }
8092 offset -= fp_offset;
8093 }
8094 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8095 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
64dedd72 8096
c600df9a
RS
8097 if (!aarch64_sve_mode_p (mode)
8098 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
827ab47a 8099 && !cfun->machine->reg_is_wrapped_separately[regno2]
c600df9a
RS
8100 && known_eq (GET_MODE_SIZE (mode),
8101 cfun->machine->frame.reg_offset[regno2]
8102 - cfun->machine->frame.reg_offset[regno]))
43e9d192 8103 {
0ec74a1e 8104 rtx reg2 = gen_rtx_REG (mode, regno2);
64dedd72
JW
8105 rtx mem2;
8106
c600df9a
RS
8107 offset += GET_MODE_SIZE (mode);
8108 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8ed2fc62
JW
8109 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
8110 reg2));
0b4a9743 8111
64dedd72
JW
8112 /* The first part of a frame-related parallel insn is
8113 always assumed to be relevant to the frame
8114 calculations; subsequent parts are only
8115 frame-related if explicitly marked. */
c600df9a
RS
8116 if (aarch64_emit_cfi_for_reg_p (regno2))
8117 {
8118 if (need_cfa_note_p)
8119 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
8120 sp_offset + GET_MODE_SIZE (mode));
8121 else
8122 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8123 }
8124
64dedd72
JW
8125 regno = regno2;
8126 }
c600df9a
RS
8127 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8128 {
8129 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
8130 need_cfa_note_p = true;
8131 }
8132 else if (aarch64_sve_mode_p (mode))
8133 insn = emit_insn (gen_rtx_SET (mem, reg));
64dedd72 8134 else
8ed2fc62
JW
8135 insn = emit_move_insn (mem, reg);
8136
c600df9a
RS
8137 RTX_FRAME_RELATED_P (insn) = frame_related_p;
8138 if (frame_related_p && need_cfa_note_p)
8139 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
8ed2fc62
JW
8140 }
8141}
8142
c600df9a
RS
8143/* Emit code to restore the callee-saved registers from register number START
8144 up to and including LIMIT. Restore from the stack offset START_OFFSET,
8145 skipping any write-back candidates if SKIP_WB is true. Write the
8146 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
04ddfe06 8147
8ed2fc62 8148static void
c600df9a 8149aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
dd991abb 8150 unsigned limit, bool skip_wb, rtx *cfi_ops)
8ed2fc62 8151{
8ed2fc62
JW
8152 unsigned regno;
8153 unsigned regno2;
6a70badb 8154 poly_int64 offset;
c600df9a 8155 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8ed2fc62
JW
8156
8157 for (regno = aarch64_next_callee_save (start, limit);
8158 regno <= limit;
8159 regno = aarch64_next_callee_save (regno + 1, limit))
8160 {
c600df9a 8161 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
827ab47a 8162 if (cfun->machine->reg_is_wrapped_separately[regno])
c600df9a 8163 continue;
827ab47a 8164
ae13fce3 8165 rtx reg, mem;
8ed2fc62 8166
ae13fce3
JW
8167 if (skip_wb
8168 && (regno == cfun->machine->frame.wb_candidate1
8169 || regno == cfun->machine->frame.wb_candidate2))
8170 continue;
8171
c600df9a 8172 machine_mode mode = aarch64_reg_save_mode (regno);
ae13fce3 8173 reg = gen_rtx_REG (mode, regno);
8ed2fc62 8174 offset = start_offset + cfun->machine->frame.reg_offset[regno];
c600df9a
RS
8175 rtx base_rtx = stack_pointer_rtx;
8176 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8177 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8178 offset, ptrue);
30079dde 8179 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8ed2fc62 8180
c600df9a
RS
8181 if (!aarch64_sve_mode_p (mode)
8182 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
827ab47a 8183 && !cfun->machine->reg_is_wrapped_separately[regno2]
c600df9a
RS
8184 && known_eq (GET_MODE_SIZE (mode),
8185 cfun->machine->frame.reg_offset[regno2]
8186 - cfun->machine->frame.reg_offset[regno]))
64dedd72 8187 {
8ed2fc62
JW
8188 rtx reg2 = gen_rtx_REG (mode, regno2);
8189 rtx mem2;
8190
c600df9a 8191 offset += GET_MODE_SIZE (mode);
30079dde 8192 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
dd991abb 8193 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
8ed2fc62 8194
dd991abb 8195 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8ed2fc62 8196 regno = regno2;
43e9d192 8197 }
c600df9a
RS
8198 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8199 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
8200 else if (aarch64_sve_mode_p (mode))
8201 emit_insn (gen_rtx_SET (reg, mem));
8ed2fc62 8202 else
dd991abb 8203 emit_move_insn (reg, mem);
c600df9a
RS
8204 if (frame_related_p)
8205 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
43e9d192 8206 }
43e9d192
IB
8207}
8208
43cacb12
RS
8209/* Return true if OFFSET is a signed 4-bit value multiplied by the size
8210 of MODE. */
8211
8212static inline bool
8213offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8214{
8215 HOST_WIDE_INT multiple;
8216 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8217 && IN_RANGE (multiple, -8, 7));
8218}
8219
ba15b0fa
RS
8220/* Return true if OFFSET is a signed 6-bit value multiplied by the size
8221 of MODE. */
8222
8223static inline bool
8224offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8225{
8226 HOST_WIDE_INT multiple;
8227 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8228 && IN_RANGE (multiple, -32, 31));
8229}
8230
8231/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
43cacb12
RS
8232 of MODE. */
8233
8234static inline bool
8235offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8236{
8237 HOST_WIDE_INT multiple;
8238 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8239 && IN_RANGE (multiple, 0, 63));
8240}
8241
8242/* Return true if OFFSET is a signed 7-bit value multiplied by the size
8243 of MODE. */
8244
8245bool
8246aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8247{
8248 HOST_WIDE_INT multiple;
8249 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8250 && IN_RANGE (multiple, -64, 63));
8251}
8252
8253/* Return true if OFFSET is a signed 9-bit value. */
8254
3c5af608
MM
8255bool
8256aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
8257 poly_int64 offset)
827ab47a 8258{
6a70badb
RS
8259 HOST_WIDE_INT const_offset;
8260 return (offset.is_constant (&const_offset)
8261 && IN_RANGE (const_offset, -256, 255));
827ab47a
KT
8262}
8263
43cacb12
RS
8264/* Return true if OFFSET is a signed 9-bit value multiplied by the size
8265 of MODE. */
8266
827ab47a 8267static inline bool
43cacb12 8268offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
827ab47a 8269{
6a70badb
RS
8270 HOST_WIDE_INT multiple;
8271 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
43cacb12 8272 && IN_RANGE (multiple, -256, 255));
827ab47a
KT
8273}
8274
43cacb12
RS
8275/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
8276 of MODE. */
8277
8278static inline bool
8279offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
827ab47a 8280{
6a70badb
RS
8281 HOST_WIDE_INT multiple;
8282 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
43cacb12 8283 && IN_RANGE (multiple, 0, 4095));
827ab47a
KT
8284}
8285
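/* Illustrative sketch, not part of the GCC sources: for a constant offset
   and a constant-sized mode the checks above reduce to simple alignment
   and range tests.  The standalone model below assumes SIZE is the mode
   size in bytes (e.g. 8 for DImode).  */
#if 0
static bool
offset_signed_scaled_example (long long offset, long long size, int nbits)
{
  long long low = -(1LL << (nbits - 1)) * size;
  long long high = ((1LL << (nbits - 1)) - 1) * size;
  return offset % size == 0 && offset >= low && offset <= high;
}

/* For an 8-byte access:
   - the 7-bit signed scaled check accepts multiples of 8 in [-512, 504]
     (the LDP/STP immediate range);
   - the 9-bit signed unscaled check accepts any byte offset in [-256, 255]
     (the LDUR/STUR range);
   - the 12-bit unsigned scaled check accepts multiples of 8 in [0, 32760]
     (the LDR/STR unsigned-offset range).  */
#endif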
8286/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
8287
8288static sbitmap
8289aarch64_get_separate_components (void)
8290{
827ab47a
KT
8291 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8292 bitmap_clear (components);
8293
8294 /* The registers we need saved to the frame. */
8295 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8296 if (aarch64_register_saved_on_entry (regno))
8297 {
c600df9a
RS
8298 /* Punt on saves and restores that use ST1D and LD1D. We could
8299 try to be smarter, but it would involve making sure that the
8300 spare predicate register itself is safe to use at the save
8301 and restore points. Also, when a frame pointer is being used,
8302 the slots are often out of reach of ST1D and LD1D anyway. */
8303 machine_mode mode = aarch64_reg_save_mode (regno);
8304 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8305 continue;
8306
6a70badb 8307 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
c600df9a
RS
8308
8309 /* If the register is saved in the first SVE save slot, we use
8310 it as a stack probe for -fstack-clash-protection. */
8311 if (flag_stack_clash_protection
8312 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
8313 && known_eq (offset, 0))
8314 continue;
8315
8316 /* Get the offset relative to the register we'll use. */
8317 if (frame_pointer_needed)
8318 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
8319 else
8320 offset += crtl->outgoing_args_size;
8321
827ab47a
KT
8322 /* Check that we can access the stack slot of the register with one
8323 direct load with no adjustments needed. */
c600df9a
RS
8324 if (aarch64_sve_mode_p (mode)
8325 ? offset_9bit_signed_scaled_p (mode, offset)
8326 : offset_12bit_unsigned_scaled_p (mode, offset))
827ab47a
KT
8327 bitmap_set_bit (components, regno);
8328 }
8329
8330 /* Don't mess with the hard frame pointer. */
8331 if (frame_pointer_needed)
8332 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
8333
c600df9a
RS
8334 /* If the spare predicate register used by big-endian SVE code
8335 is call-preserved, it must be saved in the main prologue
8336 before any saves that use it. */
8337 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
8338 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
8339
827ab47a
KT
8340 unsigned reg1 = cfun->machine->frame.wb_candidate1;
8341 unsigned reg2 = cfun->machine->frame.wb_candidate2;
0795f659 8342 /* If registers have been chosen to be stored/restored with
827ab47a
KT
8343 writeback, don't interfere with them to avoid having to output explicit
8344 stack adjustment instructions. */
8345 if (reg2 != INVALID_REGNUM)
8346 bitmap_clear_bit (components, reg2);
8347 if (reg1 != INVALID_REGNUM)
8348 bitmap_clear_bit (components, reg1);
8349
8350 bitmap_clear_bit (components, LR_REGNUM);
8351 bitmap_clear_bit (components, SP_REGNUM);
8352
8353 return components;
8354}
8355
8356/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
8357
8358static sbitmap
8359aarch64_components_for_bb (basic_block bb)
8360{
8361 bitmap in = DF_LIVE_IN (bb);
8362 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
8363 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
8364
8365 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8366 bitmap_clear (components);
8367
ce9d2a37
RS
8368 /* Clobbered registers don't generate values in any meaningful sense,
8369 since nothing after the clobber can rely on their value. And we can't
8370 say that partially-clobbered registers are unconditionally killed,
8371 because whether they're killed or not depends on the mode of the
8372 value they're holding. Thus partially call-clobbered registers
8373 appear in neither the kill set nor the gen set.
8374
8375 Check manually for any calls that clobber more of a register than the
8376 current function can. */
8377 function_abi_aggregator callee_abis;
8378 rtx_insn *insn;
8379 FOR_BB_INSNS (bb, insn)
8380 if (CALL_P (insn))
8381 callee_abis.note_callee_abi (insn_callee_abi (insn));
8382 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
8383
827ab47a
KT
8384 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
8385 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
dcdd0f05
RS
8386 if (!fixed_regs[regno]
8387 && !crtl->abi->clobbers_full_reg_p (regno)
ce9d2a37
RS
8388 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
8389 || bitmap_bit_p (in, regno)
8390 || bitmap_bit_p (gen, regno)
8391 || bitmap_bit_p (kill, regno)))
3f26f054 8392 {
3f26f054
WD
8393 bitmap_set_bit (components, regno);
8394
8395 /* If there is a callee-save at an adjacent offset, add it too
8396 to increase the use of LDP/STP. */
c600df9a
RS
8397 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
8398 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
3f26f054
WD
8399
8400 if (regno2 <= LAST_SAVED_REGNUM)
8401 {
c600df9a
RS
8402 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
8403 if (regno < regno2
8404 ? known_eq (offset + 8, offset2)
8405 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
3f26f054
WD
8406 bitmap_set_bit (components, regno2);
8407 }
8408 }
827ab47a
KT
8409
8410 return components;
8411}
8412
8413/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
8414 Nothing to do for aarch64. */
8415
8416static void
8417aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
8418{
8419}
8420
8421/* Return the next set bit in BMP from START onwards. Return the total number
8422 of bits in BMP if no set bit is found at or after START. */
8423
8424static unsigned int
8425aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
8426{
8427 unsigned int nbits = SBITMAP_SIZE (bmp);
8428 if (start == nbits)
8429 return start;
8430
8431 gcc_assert (start < nbits);
8432 for (unsigned int i = start; i < nbits; i++)
8433 if (bitmap_bit_p (bmp, i))
8434 return i;
8435
8436 return nbits;
8437}
8438
8439/* Do the work for aarch64_emit_prologue_components and
8440 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
8441 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
8442 for these components or the epilogue sequence. That is, it determines
8443 whether we should emit stores or loads and what kind of CFA notes to attach
8444 to the insns. Otherwise the logic for the two sequences is very
8445 similar. */
8446
8447static void
8448aarch64_process_components (sbitmap components, bool prologue_p)
8449{
8450 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
8451 ? HARD_FRAME_POINTER_REGNUM
8452 : STACK_POINTER_REGNUM);
8453
8454 unsigned last_regno = SBITMAP_SIZE (components);
8455 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
8456 rtx_insn *insn = NULL;
8457
8458 while (regno != last_regno)
8459 {
c600df9a
RS
8460 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8461 machine_mode mode = aarch64_reg_save_mode (regno);
a0d0b980 8462
827ab47a 8463 rtx reg = gen_rtx_REG (mode, regno);
6a70badb 8464 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
c600df9a
RS
8465 if (frame_pointer_needed)
8466 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
8467 else
8468 offset += crtl->outgoing_args_size;
8469
827ab47a
KT
8470 rtx addr = plus_constant (Pmode, ptr_reg, offset);
8471 rtx mem = gen_frame_mem (mode, addr);
8472
8473 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
8474 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
8475 /* No more registers to handle after REGNO.
8476 Emit a single save/restore and exit. */
8477 if (regno2 == last_regno)
8478 {
8479 insn = emit_insn (set);
c600df9a
RS
8480 if (frame_related_p)
8481 {
8482 RTX_FRAME_RELATED_P (insn) = 1;
8483 if (prologue_p)
8484 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
8485 else
8486 add_reg_note (insn, REG_CFA_RESTORE, reg);
8487 }
827ab47a
KT
8488 break;
8489 }
8490
6a70badb 8491 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
827ab47a
KT
8492 /* The next register is not of the same class or its offset is not
8493 mergeable with the current one into a pair. */
c600df9a
RS
8494 if (aarch64_sve_mode_p (mode)
8495 || !satisfies_constraint_Ump (mem)
827ab47a 8496 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
dcdd0f05 8497 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
6a70badb
RS
8498 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
8499 GET_MODE_SIZE (mode)))
827ab47a
KT
8500 {
8501 insn = emit_insn (set);
c600df9a
RS
8502 if (frame_related_p)
8503 {
8504 RTX_FRAME_RELATED_P (insn) = 1;
8505 if (prologue_p)
8506 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
8507 else
8508 add_reg_note (insn, REG_CFA_RESTORE, reg);
8509 }
827ab47a
KT
8510
8511 regno = regno2;
8512 continue;
8513 }
8514
c600df9a
RS
8515 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
8516
827ab47a
KT
8517 /* REGNO2 can be saved/restored in a pair with REGNO. */
8518 rtx reg2 = gen_rtx_REG (mode, regno2);
c600df9a
RS
8519 if (frame_pointer_needed)
8520 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
8521 else
8522 offset2 += crtl->outgoing_args_size;
827ab47a
KT
8523 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
8524 rtx mem2 = gen_frame_mem (mode, addr2);
8525 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
8526 : gen_rtx_SET (reg2, mem2);
8527
8528 if (prologue_p)
8529 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
8530 else
8531 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
8532
c600df9a 8533 if (frame_related_p || frame_related2_p)
827ab47a 8534 {
c600df9a
RS
8535 RTX_FRAME_RELATED_P (insn) = 1;
8536 if (prologue_p)
8537 {
8538 if (frame_related_p)
8539 add_reg_note (insn, REG_CFA_OFFSET, set);
8540 if (frame_related2_p)
8541 add_reg_note (insn, REG_CFA_OFFSET, set2);
8542 }
8543 else
8544 {
8545 if (frame_related_p)
8546 add_reg_note (insn, REG_CFA_RESTORE, reg);
8547 if (frame_related2_p)
8548 add_reg_note (insn, REG_CFA_RESTORE, reg2);
8549 }
827ab47a
KT
8550 }
8551
8552 regno = aarch64_get_next_set_bit (components, regno2 + 1);
8553 }
8554}
8555
8556/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
8557
8558static void
8559aarch64_emit_prologue_components (sbitmap components)
8560{
8561 aarch64_process_components (components, true);
8562}
8563
8564/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
8565
8566static void
8567aarch64_emit_epilogue_components (sbitmap components)
8568{
8569 aarch64_process_components (components, false);
8570}
8571
8572/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
8573
8574static void
8575aarch64_set_handled_components (sbitmap components)
8576{
8577 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8578 if (bitmap_bit_p (components, regno))
8579 cfun->machine->reg_is_wrapped_separately[regno] = true;
8580}
8581
8c6e3b23
TC
8582/* On AArch64 we have an ABI-defined safe buffer. This constant is used to
8583 determine the probe offset for alloca. */
8584
8585static HOST_WIDE_INT
8586aarch64_stack_clash_protection_alloca_probe_range (void)
8587{
8588 return STACK_CLASH_CALLER_GUARD;
8589}
8590
8591
cd1bef27
JL
8592/* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
8593 registers. If POLY_SIZE is not large enough to require a probe, this function
8594 will only adjust the stack. When allocating the stack space,
8595 FRAME_RELATED_P is then used to indicate if the allocation is frame-related.
8596 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
8597 arguments. If we are, then we ensure that any allocation larger than the
8598 ABI-defined buffer needs a probe so that the invariant of having a 1KB buffer is
8599 maintained.
8600
8601 We emit barriers after each stack adjustment to prevent optimizations from
8602 breaking the invariant that we never drop the stack more than a page. This
8603 invariant is needed to make it easier to correctly handle asynchronous
8604 events, e.g. if we were to allow the stack to be dropped by more than a page
8605 and then emit multiple probes, and we take a signal somewhere in between,
8606 then the signal handler doesn't know the state of the stack and can make no
8607 assumptions about which pages have been probed. */
8608
8609static void
8610aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
8611 poly_int64 poly_size,
8612 bool frame_related_p,
8613 bool final_adjustment_p)
8614{
8615 HOST_WIDE_INT guard_size
028d4092 8616 = 1 << param_stack_clash_protection_guard_size;
cd1bef27 8617 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
cd1bef27 8618 HOST_WIDE_INT min_probe_threshold
c600df9a
RS
8619 = (final_adjustment_p
8620 ? guard_used_by_caller
8621 : guard_size - guard_used_by_caller);
8622 /* When doing the final adjustment for the outgoing arguments, take into
8623 account any unprobed space there is above the current SP. There are
8624 two cases:
8625
8626 - When saving SVE registers below the hard frame pointer, we force
8627 the lowest save to take place in the prologue before doing the final
8628 adjustment (i.e. we don't allow the save to be shrink-wrapped).
8629 This acts as a probe at SP, so there is no unprobed space.
8630
8631 - When there are no SVE register saves, we use the store of the link
8632 register as a probe. We can't assume that LR was saved at position 0
8633 though, so treat any space below it as unprobed. */
8634 if (final_adjustment_p
8635 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
8636 {
8637 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
8638 if (known_ge (lr_offset, 0))
8639 min_probe_threshold -= lr_offset.to_constant ();
8640 else
8641 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
8642 }
cd1bef27
JL
8643
8644 poly_int64 frame_size = cfun->machine->frame.frame_size;
8645
8646 /* We should always have a positive probe threshold. */
8647 gcc_assert (min_probe_threshold > 0);
8648
8649 if (flag_stack_clash_protection && !final_adjustment_p)
8650 {
8651 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
c600df9a 8652 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
cd1bef27
JL
8653 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8654
8655 if (known_eq (frame_size, 0))
8656 {
8657 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
8658 }
c600df9a
RS
8659 else if (known_lt (initial_adjust + sve_callee_adjust,
8660 guard_size - guard_used_by_caller)
cd1bef27
JL
8661 && known_lt (final_adjust, guard_used_by_caller))
8662 {
8663 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
8664 }
8665 }
8666
cd1bef27
JL
8667 /* If SIZE is not large enough to require probing, just adjust the stack and
8668 exit. */
eb471ba3 8669 if (known_lt (poly_size, min_probe_threshold)
cd1bef27
JL
8670 || !flag_stack_clash_protection)
8671 {
8672 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
8673 return;
8674 }
8675
eb471ba3
TC
8676 HOST_WIDE_INT size;
8677 /* Handle the SVE non-constant case first. */
8678 if (!poly_size.is_constant (&size))
8679 {
8680 if (dump_file)
8681 {
8682 fprintf (dump_file, "Stack clash SVE prologue: ");
8683 print_dec (poly_size, dump_file);
8684 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
8685 }
8686
8687 /* First calculate the amount of bytes we're actually spilling. */
8688 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
8689 poly_size, temp1, temp2, false, true);
8690
8691 rtx_insn *insn = get_last_insn ();
8692
8693 if (frame_related_p)
8694 {
8695 /* This is done to provide unwinding information for the stack
8696 adjustments we're about to do, however to prevent the optimizers
143d3b15 8697 from removing the R11 move and leaving the CFA note (which would be
eb471ba3
TC
8698 very wrong) we tie the old and new stack pointer together.
8699 The tie will expand to nothing but the optimizers will not touch
8700 the instruction. */
143d3b15 8701 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
eb471ba3
TC
8702 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
8703 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
8704
8705 /* We want the CFA independent of the stack pointer for the
8706 duration of the loop. */
8707 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
8708 RTX_FRAME_RELATED_P (insn) = 1;
8709 }
8710
8711 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
8712 rtx guard_const = gen_int_mode (guard_size, Pmode);
8713
8714 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
8715 stack_pointer_rtx, temp1,
8716 probe_const, guard_const));
8717
8718 /* Now reset the CFA register if needed. */
8719 if (frame_related_p)
8720 {
8721 add_reg_note (insn, REG_CFA_DEF_CFA,
8722 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
8723 gen_int_mode (poly_size, Pmode)));
8724 RTX_FRAME_RELATED_P (insn) = 1;
8725 }
8726
8727 return;
8728 }
8729
cd1bef27
JL
8730 if (dump_file)
8731 fprintf (dump_file,
eb471ba3
TC
8732 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
8733 " bytes, probing will be required.\n", size);
cd1bef27
JL
8734
8735 /* Round size to the nearest multiple of guard_size, and calculate the
8736 residual as the difference between the original size and the rounded
8737 size. */
8738 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
8739 HOST_WIDE_INT residual = size - rounded_size;
8740
8741 /* We can handle a small number of allocations/probes inline. Otherwise
8742 punt to a loop. */
8743 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
8744 {
8745 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
8746 {
8747 aarch64_sub_sp (NULL, temp2, guard_size, true);
8748 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
8749 guard_used_by_caller));
8750 emit_insn (gen_blockage ());
8751 }
8752 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
8753 }
8754 else
8755 {
8756 /* Compute the ending address. */
8757 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
8758 temp1, NULL, false, true);
8759 rtx_insn *insn = get_last_insn ();
8760
8761 /* For the initial allocation, we don't have a frame pointer
8762 set up, so we always need CFI notes. If we're doing the
8763 final allocation, then we may have a frame pointer, in which
8764 case it is the CFA, otherwise we need CFI notes.
8765
8766 We can determine which allocation we are doing by looking at
8767 the value of FRAME_RELATED_P since the final allocations are not
8768 frame related. */
8769 if (frame_related_p)
8770 {
8771 /* We want the CFA independent of the stack pointer for the
8772 duration of the loop. */
8773 add_reg_note (insn, REG_CFA_DEF_CFA,
8774 plus_constant (Pmode, temp1, rounded_size));
8775 RTX_FRAME_RELATED_P (insn) = 1;
8776 }
8777
8778 /* This allocates and probes the stack. Note that this re-uses some of
8779 the existing Ada stack protection code. However, we are guaranteed not
8780 to enter the non-loop or residual branches of that code.
8781
8782 The non-loop part won't be entered because if our allocation amount
8783 doesn't require a loop, the case above would handle it.
8784
8785 The residual amount won't be entered because TEMP1 is a multiple of
8786 the allocation size. The residual will always be 0. As such, the only
8787 part we are actually using from that code is the loop setup. The
8788 actual probing is done in aarch64_output_probe_stack_range. */
8789 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
8790 stack_pointer_rtx, temp1));
8791
8792 /* Now reset the CFA register if needed. */
8793 if (frame_related_p)
8794 {
8795 add_reg_note (insn, REG_CFA_DEF_CFA,
8796 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
8797 RTX_FRAME_RELATED_P (insn) = 1;
8798 }
8799
8800 emit_insn (gen_blockage ());
8801 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
8802 }
8803
8804 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
8805 be probed. This maintains the requirement that each page is probed at
8806 least once. For initial probing we probe only if the allocation is
8807 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
8808 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
8809 GUARD_SIZE. This works because, for any allocation that is large enough to
8810 trigger a probe here, we'll have at least one, and if the allocations are not
8811 large enough for this code to emit anything for them, the page would have been
8812 probed by the saving of FP/LR either by this function or any callees. If
8813 we don't have any callees then we won't have more stack adjustments and so
8814 are still safe. */
8815 if (residual)
8816 {
8817 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
8818 /* If we're doing final adjustments, and we've done any full page
8819 allocations then any residual needs to be probed. */
8820 if (final_adjustment_p && rounded_size != 0)
8821 min_probe_threshold = 0;
8822 /* If doing a small final adjustment, we always probe at offset 0.
8823 This is done to avoid issues when LR is not at position 0 or when
8824 the final adjustment is smaller than the probing offset. */
8825 else if (final_adjustment_p && rounded_size == 0)
8826 residual_probe_offset = 0;
8827
8828 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
8829 if (residual >= min_probe_threshold)
8830 {
8831 if (dump_file)
8832 fprintf (dump_file,
8833 "Stack clash AArch64 prologue residuals: "
8834 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
8835 "\n", residual);
8836
8837 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
8838 residual_probe_offset));
8839 emit_insn (gen_blockage ());
8840 }
8841 }
8842}
8843
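/* Worked example, an illustrative sketch rather than part of the GCC
   sources.  Assume the default 64KB guard and the 1KB caller-reserved
   buffer (STACK_CLASH_CALLER_GUARD), so MIN_PROBE_THRESHOLD is
   65536 - 1024 = 64512 for the initial allocation and 1024 for the final
   (outgoing-argument) adjustment.  For a constant 200000-byte initial
   allocation:

     rounded_size = ROUND_DOWN (200000, 65536) = 196608   (three pages)
     residual     = 200000 - 196608            = 3392

   so three 64KB pages are allocated and probed (inline or via the loop,
   depending on STACK_CLASH_MAX_UNROLL_PAGES), and the 3392-byte residual
   is allocated without an explicit probe because it is below
   MIN_PROBE_THRESHOLD; the FP/LR saves that follow act as the probe for
   that final page.  */
#if 0
static void
stack_clash_split_example (long long size, long long guard_size,
                           long long *rounded_size, long long *residual)
{
  /* Mirrors the ROUND_DOWN split performed above for constant sizes.  */
  *rounded_size = (size / guard_size) * guard_size;
  *residual = size - *rounded_size;
}
#endif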
a0d0b980
SE
8844/* Return 1 if the register is used by the epilogue. We need to say the
8845 return register is used, but only after epilogue generation is complete.
8846 Note that in the case of sibcalls, the values "used by the epilogue" are
8847 considered live at the start of the called function.
8848
8849 For SIMD functions we need to return 1 for FP registers that are saved and
8850 restored by a function but are not zero in call_used_regs. If we do not do
8851 this, optimizations may remove the restore of the register. */
8852
8853int
8854aarch64_epilogue_uses (int regno)
8855{
8856 if (epilogue_completed)
8857 {
8858 if (regno == LR_REGNUM)
8859 return 1;
a0d0b980
SE
8860 }
8861 return 0;
8862}
8863
43e9d192
IB
8864/* AArch64 stack frames generated by this compiler look like:
8865
8866 +-------------------------------+
8867 | |
8868 | incoming stack arguments |
8869 | |
34834420
MS
8870 +-------------------------------+
8871 | | <-- incoming stack pointer (aligned)
43e9d192
IB
8872 | callee-allocated save area |
8873 | for register varargs |
8874 | |
34834420
MS
8875 +-------------------------------+
8876 | local variables | <-- frame_pointer_rtx
43e9d192
IB
8877 | |
8878 +-------------------------------+
cd1bef27 8879 | padding | \
454fdba9 8880 +-------------------------------+ |
454fdba9 8881 | callee-saved registers | | frame.saved_regs_size
454fdba9
RL
8882 +-------------------------------+ |
8883 | LR' | |
8884 +-------------------------------+ |
c600df9a
RS
8885 | FP' | |
8886 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
8887 | SVE vector registers | | \
8888 +-------------------------------+ | | below_hard_fp_saved_regs_size
8889 | SVE predicate registers | / /
8890 +-------------------------------+
43e9d192
IB
8891 | dynamic allocation |
8892 +-------------------------------+
34834420
MS
8893 | padding |
8894 +-------------------------------+
8895 | outgoing stack arguments | <-- arg_pointer
8896 | |
8897 +-------------------------------+
8898 | | <-- stack_pointer_rtx (aligned)
43e9d192 8899
34834420
MS
8900 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
8901 but leave frame_pointer_rtx and hard_frame_pointer_rtx
cd1bef27
JL
8902 unchanged.
8903
8904 By default for stack-clash we assume the guard is at least 64KB, but this
8905 value is configurable to either 4KB or 64KB. We also force the guard size to
8906 be the same as the probing interval and both values are kept in sync.
8907
8908 With those assumptions the callee can allocate up to 63KB (or 3KB depending
8909 on the guard size) of stack space without probing.
8910
8911 When probing is needed, we emit a probe at the start of the prologue
8912 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
8913
8914 We have to track how much space has been allocated and the only stores
8915 to the stack we track as implicit probes are the FP/LR stores.
8916
8917 For outgoing arguments we probe if the size is larger than 1KB, such that
143d3b15
TC
8918 the ABI specified buffer is maintained for the next callee.
8919
8920 The following registers are reserved during frame layout and should not be
8921 used for any other purpose:
8922
c600df9a
RS
8923 - r11: Used by stack clash protection when SVE is enabled, and also
8924 as an anchor register when saving and restoring registers
143d3b15
TC
8925 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
8926 - r14 and r15: Used for speculation tracking.
8927 - r16(IP0), r17(IP1): Used by indirect tailcalls.
8928 - r30(LR), r29(FP): Used by standard frame layout.
8929
8930 These registers must be avoided in frame layout related code unless the
8931 explicit intention is to interact with one of the features listed above. */
43e9d192
IB
8932
8933/* Generate the prologue instructions for entry into a function.
8934 Establish the stack frame by decreasing the stack pointer with a
8935 properly calculated size and, if necessary, create a frame record
8936 filled with the values of LR and previous frame pointer. The
6991c977 8937 current FP is also set up if it is in use. */
43e9d192
IB
8938
8939void
8940aarch64_expand_prologue (void)
8941{
6a70badb
RS
8942 poly_int64 frame_size = cfun->machine->frame.frame_size;
8943 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
71bfb77a 8944 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6a70badb
RS
8945 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8946 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
c600df9a
RS
8947 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8948 poly_int64 below_hard_fp_saved_regs_size
8949 = cfun->machine->frame.below_hard_fp_saved_regs_size;
71bfb77a
WD
8950 unsigned reg1 = cfun->machine->frame.wb_candidate1;
8951 unsigned reg2 = cfun->machine->frame.wb_candidate2;
204d2c03 8952 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
71bfb77a 8953 rtx_insn *insn;
43e9d192 8954
c600df9a
RS
8955 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
8956 {
8957 /* Fold the SVE allocation into the initial allocation.
8958 We don't do this in aarch64_layout_frame to avoid pessimizing
8959 the epilogue code. */
8960 initial_adjust += sve_callee_adjust;
8961 sve_callee_adjust = 0;
8962 }
8963
db58fd89
JW
8964 /* Sign return address for functions. */
8965 if (aarch64_return_address_signing_enabled ())
27169e45 8966 {
8fc16d72
ST
8967 switch (aarch64_ra_sign_key)
8968 {
8969 case AARCH64_KEY_A:
8970 insn = emit_insn (gen_paciasp ());
8971 break;
8972 case AARCH64_KEY_B:
8973 insn = emit_insn (gen_pacibsp ());
8974 break;
8975 default:
8976 gcc_unreachable ();
8977 }
27169e45
JW
8978 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8979 RTX_FRAME_RELATED_P (insn) = 1;
8980 }
db58fd89 8981
dd991abb 8982 if (flag_stack_usage_info)
6a70badb 8983 current_function_static_stack_size = constant_lower_bound (frame_size);
43e9d192 8984
a3eb8a52
EB
8985 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
8986 {
8987 if (crtl->is_leaf && !cfun->calls_alloca)
8988 {
6a70badb
RS
8989 if (maybe_gt (frame_size, PROBE_INTERVAL)
8990 && maybe_gt (frame_size, get_stack_check_protect ()))
8c1dd970
JL
8991 aarch64_emit_probe_stack_range (get_stack_check_protect (),
8992 (frame_size
8993 - get_stack_check_protect ()));
a3eb8a52 8994 }
6a70badb 8995 else if (maybe_gt (frame_size, 0))
8c1dd970 8996 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
a3eb8a52
EB
8997 }
8998
901e66e0
SD
8999 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9000 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
f5470a77 9001
cd1bef27
JL
9002 /* In theory we should never have both an initial adjustment
9003 and a callee save adjustment. Verify that is the case since the
9004 code below does not handle it for -fstack-clash-protection. */
9005 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
9006
9007 /* Will only probe if the initial adjustment is larger than the guard
9008 less the amount of the guard reserved for use by the caller's
9009 outgoing args. */
901e66e0 9010 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
cd1bef27 9011 true, false);
43e9d192 9012
71bfb77a
WD
9013 if (callee_adjust != 0)
9014 aarch64_push_regs (reg1, reg2, callee_adjust);
43e9d192 9015
c600df9a
RS
9016 /* The offset of the frame chain record (if any) from the current SP. */
9017 poly_int64 chain_offset = (initial_adjust + callee_adjust
9018 - cfun->machine->frame.hard_fp_offset);
9019 gcc_assert (known_ge (chain_offset, 0));
9020
9021 /* The offset of the bottom of the save area from the current SP. */
9022 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
9023
204d2c03 9024 if (emit_frame_chain)
43e9d192 9025 {
71bfb77a 9026 if (callee_adjust == 0)
43cacb12
RS
9027 {
9028 reg1 = R29_REGNUM;
9029 reg2 = R30_REGNUM;
c600df9a
RS
9030 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
9031 false, false);
43cacb12 9032 }
c600df9a
RS
9033 else
9034 gcc_assert (known_eq (chain_offset, 0));
f5470a77 9035 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
c600df9a 9036 stack_pointer_rtx, chain_offset,
901e66e0 9037 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
43cacb12
RS
9038 if (frame_pointer_needed && !frame_size.is_constant ())
9039 {
9040 /* Variable-sized frames need to describe the save slot
9041 address using DW_CFA_expression rather than DW_CFA_offset.
9042 This means that, without taking further action, the
9043 locations of the registers that we've already saved would
9044 remain based on the stack pointer even after we redefine
9045 the CFA based on the frame pointer. We therefore need new
9046 DW_CFA_expressions to re-express the save slots with addresses
9047 based on the frame pointer. */
9048 rtx_insn *insn = get_last_insn ();
9049 gcc_assert (RTX_FRAME_RELATED_P (insn));
9050
9051 /* Add an explicit CFA definition if this was previously
9052 implicit. */
9053 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
9054 {
9055 rtx src = plus_constant (Pmode, stack_pointer_rtx,
9056 callee_offset);
9057 add_reg_note (insn, REG_CFA_ADJUST_CFA,
9058 gen_rtx_SET (hard_frame_pointer_rtx, src));
9059 }
9060
9061 /* Change the save slot expressions for the registers that
9062 we've already saved. */
c600df9a
RS
9063 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
9064 hard_frame_pointer_rtx, UNITS_PER_WORD);
9065 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
9066 hard_frame_pointer_rtx, 0);
43cacb12 9067 }
71bfb77a 9068 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
43e9d192 9069 }
71bfb77a 9070
c600df9a
RS
9071 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
9072 callee_adjust != 0 || emit_frame_chain,
9073 emit_frame_chain);
9074 if (maybe_ne (sve_callee_adjust, 0))
9075 {
9076 gcc_assert (!flag_stack_clash_protection
9077 || known_eq (initial_adjust, 0));
9078 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
9079 sve_callee_adjust,
9080 !frame_pointer_needed, false);
9081 saved_regs_offset += sve_callee_adjust;
9082 }
9083 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
9084 false, emit_frame_chain);
9085 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
9086 callee_adjust != 0 || emit_frame_chain,
9087 emit_frame_chain);
cd1bef27
JL
9088
9089 /* We may need to probe the final adjustment if it is larger than the guard
9090 that is assumed by the callee. */
901e66e0 9091 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
cd1bef27 9092 !frame_pointer_needed, true);
43e9d192
IB
9093}
9094
4f942779
RL
9095/* Return TRUE if we can use a simple_return insn.
9096
9097 This function checks whether the callee saved stack is empty, which
9098 means no restore actions are need. The pro_and_epilogue will use
9099 this to check whether shrink-wrapping opt is feasible. */
9100
9101bool
9102aarch64_use_return_insn_p (void)
9103{
9104 if (!reload_completed)
9105 return false;
9106
9107 if (crtl->profile)
9108 return false;
9109
6a70badb 9110 return known_eq (cfun->machine->frame.frame_size, 0);
4f942779
RL
9111}
9112
71bfb77a
WD
9113/* Generate the epilogue instructions for returning from a function.
9114 This is almost exactly the reverse of the prolog sequence, except
9115 that we need to insert barriers to avoid scheduling loads that read
9116 from a deallocated stack, and we optimize the unwind records by
9117 emitting them all together if possible. */
43e9d192
IB
9118void
9119aarch64_expand_epilogue (bool for_sibcall)
9120{
6a70badb 9121 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
71bfb77a 9122 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6a70badb
RS
9123 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9124 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
c600df9a
RS
9125 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9126 poly_int64 below_hard_fp_saved_regs_size
9127 = cfun->machine->frame.below_hard_fp_saved_regs_size;
71bfb77a
WD
9128 unsigned reg1 = cfun->machine->frame.wb_candidate1;
9129 unsigned reg2 = cfun->machine->frame.wb_candidate2;
9130 rtx cfi_ops = NULL;
9131 rtx_insn *insn;
901e66e0
SD
9132 /* A stack clash protection prologue may not have left EP0_REGNUM or
9133 EP1_REGNUM in a usable state. The same is true for allocations
43cacb12 9134 with an SVE component, since we then need both temporary registers
cd1bef27
JL
9135 for each allocation. For stack clash we are in a usable state if
9136 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
9137 HOST_WIDE_INT guard_size
028d4092 9138 = 1 << param_stack_clash_protection_guard_size;
cd1bef27
JL
9139 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9140
c600df9a
RS
9141 /* We can re-use the registers when:
9142
9143 (a) the deallocation amount is the same as the corresponding
9144 allocation amount (which is false if we combine the initial
9145 and SVE callee save allocations in the prologue); and
9146
9147 (b) the allocation amount doesn't need a probe (which is false
9148 if the amount is guard_size - guard_used_by_caller or greater).
9149
9150 In such situations the register should remain live with the correct
cd1bef27 9151 value. */
43cacb12 9152 bool can_inherit_p = (initial_adjust.is_constant ()
c600df9a 9153 && final_adjust.is_constant ()
cd1bef27 9154 && (!flag_stack_clash_protection
c600df9a
RS
9155 || (known_lt (initial_adjust,
9156 guard_size - guard_used_by_caller)
9157 && known_eq (sve_callee_adjust, 0))));
44c0e7b9 9158
71bfb77a 9159 /* We need to add memory barrier to prevent read from deallocated stack. */
6a70badb
RS
9160 bool need_barrier_p
9161 = maybe_ne (get_frame_size ()
9162 + cfun->machine->frame.saved_varargs_size, 0);
43e9d192 9163
71bfb77a 9164 /* Emit a barrier to prevent loads from a deallocated stack. */
6a70badb
RS
9165 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
9166 || cfun->calls_alloca
8144a493 9167 || crtl->calls_eh_return)
43e9d192 9168 {
71bfb77a
WD
9169 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
9170 need_barrier_p = false;
9171 }
7e8c2bd5 9172
71bfb77a
WD
9173 /* Restore the stack pointer from the frame pointer if it may not
9174 be the same as the stack pointer. */
901e66e0
SD
9175 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9176 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6a70badb
RS
9177 if (frame_pointer_needed
9178 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
f5470a77
RS
9179 /* If writeback is used when restoring callee-saves, the CFA
9180 is restored on the instruction doing the writeback. */
9181 aarch64_add_offset (Pmode, stack_pointer_rtx,
c600df9a
RS
9182 hard_frame_pointer_rtx,
9183 -callee_offset - below_hard_fp_saved_regs_size,
901e66e0 9184 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
71bfb77a 9185 else
cd1bef27
JL
9186 /* The case where we need to re-use the register here is very rare, so
9187 avoid the complicated condition and just always emit a move if the
9188 immediate doesn't fit. */
901e66e0 9189 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
43e9d192 9190
c600df9a
RS
9191 /* Restore the vector registers before the predicate registers,
9192 so that we can use P4 as a temporary for big-endian SVE frames. */
9193 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
9194 callee_adjust != 0, &cfi_ops);
9195 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
9196 false, &cfi_ops);
9197 if (maybe_ne (sve_callee_adjust, 0))
9198 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
9199 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
9200 R0_REGNUM, R30_REGNUM,
71bfb77a 9201 callee_adjust != 0, &cfi_ops);
43e9d192 9202
71bfb77a
WD
9203 if (need_barrier_p)
9204 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
9205
9206 if (callee_adjust != 0)
9207 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
9208
1ccbfffb
RS
9209 /* If we have no register restore information, the CFA must have been
9210 defined in terms of the stack pointer since the end of the prologue. */
9211 gcc_assert (cfi_ops || !frame_pointer_needed);
9212
9213 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
71bfb77a
WD
9214 {
9215 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
89ac681e 9216 insn = get_last_insn ();
71bfb77a
WD
9217 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
9218 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
43e9d192 9219 RTX_FRAME_RELATED_P (insn) = 1;
71bfb77a 9220 cfi_ops = NULL;
43e9d192
IB
9221 }
9222
901e66e0
SD
9223 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
9224 restrict the emit_move optimization to leaf functions. */
9225 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
9226 (!can_inherit_p || !crtl->is_leaf
9227 || df_regs_ever_live_p (EP0_REGNUM)));
7e8c2bd5 9228
71bfb77a
WD
9229 if (cfi_ops)
9230 {
9231 /* Emit delayed restores and reset the CFA to be SP. */
9232 insn = get_last_insn ();
9233 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
9234 REG_NOTES (insn) = cfi_ops;
9235 RTX_FRAME_RELATED_P (insn) = 1;
dd991abb
RH
9236 }
9237
db58fd89
JW
9238 /* We prefer to emit the combined return/authenticate instruction RETAA,
9239 however there are two cases in which we must instead emit an explicit
9240 authentication instruction.
9241
9242 1) Sibcalls don't return in a normal way, so if we're about to call one
9243 we must authenticate.
9244
9245 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
9246 generating code for !TARGET_ARMV8_3 we can't use it and must
9247 explicitly authenticate.
db58fd89
JW
9248 */
9249 if (aarch64_return_address_signing_enabled ()
14d31404 9250 && (for_sibcall || !TARGET_ARMV8_3))
27169e45 9251 {
8fc16d72
ST
9252 switch (aarch64_ra_sign_key)
9253 {
9254 case AARCH64_KEY_A:
9255 insn = emit_insn (gen_autiasp ());
9256 break;
9257 case AARCH64_KEY_B:
9258 insn = emit_insn (gen_autibsp ());
9259 break;
9260 default:
9261 gcc_unreachable ();
9262 }
27169e45
JW
9263 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
9264 RTX_FRAME_RELATED_P (insn) = 1;
9265 }
db58fd89 9266
dd991abb 9267 /* Stack adjustment for exception handler. */
b5b9147d 9268 if (crtl->calls_eh_return && !for_sibcall)
dd991abb
RH
9269 {
9270 /* We need to unwind the stack by the offset computed by
9271 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
9272 to be SP; letting the CFA move during this adjustment
9273 is just as correct as retaining the CFA from the body
9274 of the function. Therefore, do nothing special. */
9275 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
43e9d192
IB
9276 }
9277
9278 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
9279 if (!for_sibcall)
9280 emit_jump_insn (ret_rtx);
9281}
9282
8144a493
WD
9283/* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
9284 normally or return to a previous frame after unwinding.
1c960e02 9285
8144a493
WD
9286 An EH return uses a single shared return sequence. The epilogue is
9287 exactly like a normal epilogue except that it has an extra input
9288 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
9289 that must be applied after the frame has been destroyed. An extra label
9290 is inserted before the epilogue which initializes this register to zero,
9291 and this is the entry point for a normal return.
43e9d192 9292
8144a493
WD
9293 An actual EH return updates the return address, initializes the stack
9294 adjustment and jumps directly into the epilogue (bypassing the zeroing
9295 of the adjustment). Since the return address is typically saved on the
9296 stack when a function makes a call, the saved LR must be updated outside
9297 the epilogue.
43e9d192 9298
8144a493
WD
9299 This poses problems as the store is generated well before the epilogue,
9300 so the offset of LR is not known yet. Also optimizations will remove the
9301 store as it appears dead, even after the epilogue is generated (as the
9302 base or offset for loading LR is different in many cases).
43e9d192 9303
8144a493
WD
9304 To avoid these problems this implementation forces the frame pointer
9305 in eh_return functions so that the location of LR is fixed and known early.
9306 It also marks the store volatile, so no optimization is permitted to
9307 remove the store. */
9308rtx
9309aarch64_eh_return_handler_rtx (void)
9310{
9311 rtx tmp = gen_frame_mem (Pmode,
9312 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
43e9d192 9313
8144a493
WD
9314 /* Mark the store volatile, so no optimization is permitted to remove it. */
9315 MEM_VOLATILE_P (tmp) = true;
9316 return tmp;
43e9d192
IB
9317}
9318
43e9d192
IB
9319/* Output code to add DELTA to the first argument, and then jump
9320 to FUNCTION. Used for C++ multiple inheritance. */
9321static void
9322aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
9323 HOST_WIDE_INT delta,
9324 HOST_WIDE_INT vcall_offset,
9325 tree function)
9326{
9327 /* The this pointer is always in x0. Note that this differs from
9328 Arm where the this pointer may be bumped to r1 if r0 is required
9329 to return a pointer to an aggregate. On AArch64 a result value
9330 pointer will be in x8. */
9331 int this_regno = R0_REGNUM;
5d8a22a5
DM
9332 rtx this_rtx, temp0, temp1, addr, funexp;
9333 rtx_insn *insn;
6b5777c6 9334 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
43e9d192 9335
c904388d
SD
9336 if (aarch64_bti_enabled ())
9337 emit_insn (gen_bti_c());
9338
75f1d6fc
SN
9339 reload_completed = 1;
9340 emit_note (NOTE_INSN_PROLOGUE_END);
43e9d192 9341
f5470a77 9342 this_rtx = gen_rtx_REG (Pmode, this_regno);
901e66e0
SD
9343 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
9344 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
f5470a77 9345
43e9d192 9346 if (vcall_offset == 0)
43cacb12 9347 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
43e9d192
IB
9348 else
9349 {
28514dda 9350 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
43e9d192 9351
75f1d6fc
SN
9352 addr = this_rtx;
9353 if (delta != 0)
9354 {
9355 if (delta >= -256 && delta < 256)
9356 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
9357 plus_constant (Pmode, this_rtx, delta));
9358 else
43cacb12
RS
9359 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
9360 temp1, temp0, false);
43e9d192
IB
9361 }
9362
28514dda
YZ
9363 if (Pmode == ptr_mode)
9364 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
9365 else
9366 aarch64_emit_move (temp0,
9367 gen_rtx_ZERO_EXTEND (Pmode,
9368 gen_rtx_MEM (ptr_mode, addr)));
75f1d6fc 9369
28514dda 9370 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
75f1d6fc 9371 addr = plus_constant (Pmode, temp0, vcall_offset);
43e9d192
IB
9372 else
9373 {
f43657b4
JW
9374 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
9375 Pmode);
75f1d6fc 9376 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
43e9d192
IB
9377 }
9378
28514dda
YZ
9379 if (Pmode == ptr_mode)
9380 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
9381 else
9382 aarch64_emit_move (temp1,
9383 gen_rtx_SIGN_EXTEND (Pmode,
9384 gen_rtx_MEM (ptr_mode, addr)));
9385
75f1d6fc 9386 emit_insn (gen_add2_insn (this_rtx, temp1));
43e9d192
IB
9387 }
9388
75f1d6fc
SN
9389 /* Generate a tail call to the target function. */
9390 if (!TREE_USED (function))
9391 {
9392 assemble_external (function);
9393 TREE_USED (function) = 1;
9394 }
9395 funexp = XEXP (DECL_RTL (function), 0);
9396 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
08cc4d92
RS
9397 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
9398 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
75f1d6fc
SN
9399 SIBLING_CALL_P (insn) = 1;
9400
9401 insn = get_insns ();
9402 shorten_branches (insn);
6b5777c6
MF
9403
9404 assemble_start_function (thunk, fnname);
75f1d6fc
SN
9405 final_start_function (insn, file, 1);
9406 final (insn, file, 1);
43e9d192 9407 final_end_function ();
6b5777c6 9408 assemble_end_function (thunk, fnname);
75f1d6fc
SN
9409
9410 /* Stop pretending to be a post-reload pass. */
9411 reload_completed = 0;
43e9d192
IB
9412}
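/* Editorial example (not part of the original file): for a thunk with
   delta == 16, vcall_offset == 0 and BTI disabled, the code above emits
   essentially
       add     x0, x0, 16
       b       <target-function>
   where <target-function> is a placeholder for the real callee symbol. */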
9413
43e9d192
IB
9414static bool
9415aarch64_tls_referenced_p (rtx x)
9416{
9417 if (!TARGET_HAVE_TLS)
9418 return false;
e7de8563
RS
9419 subrtx_iterator::array_type array;
9420 FOR_EACH_SUBRTX (iter, array, x, ALL)
9421 {
9422 const_rtx x = *iter;
3793ecc1 9423 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
e7de8563
RS
9424 return true;
9425 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
9426 TLS offsets, not real symbol references. */
9427 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9428 iter.skip_subrtxes ();
9429 }
9430 return false;
43e9d192
IB
9431}
9432
9433
43e9d192
IB
9434/* Return true if val can be encoded as a 12-bit unsigned immediate with
9435 a left shift of 0 or 12 bits. */
9436bool
9437aarch64_uimm12_shift (HOST_WIDE_INT val)
9438{
9439 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
9440 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
9441 );
9442}
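/* Editorial example (not part of the original file): these are the ADD/SUB
   immediates, e.g.
     0xabc     -> true  (12-bit value, LSL #0)
     0xabc000  -> true  (12-bit value, LSL #12)
     0x1abc    -> false (needs more than 12 bits at either shift). */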
9443
eb471ba3
TC
9444/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
9445 that can be created with a left shift of 0 or 12. */
9446static HOST_WIDE_INT
9447aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
9448{
9449 /* Check to see if the value fits in 24 bits, as that is the maximum we can
9450 handle correctly. */
9451 gcc_assert ((val & 0xffffff) == val);
9452
9453 if (((val & 0xfff) << 0) == val)
9454 return val;
9455
9456 return val & (0xfff << 12);
9457}
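/* Editorial example (not part of the original file): 0x123456 is clamped
   to 0x123000, i.e. the low 12 bits are dropped so that the result is a
   single ADD/SUB immediate with LSL #12. */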
43e9d192
IB
9458
9459/* Return true if val is an immediate that can be loaded into a
9460 register by a MOVZ instruction. */
9461static bool
77e994c9 9462aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
43e9d192
IB
9463{
9464 if (GET_MODE_SIZE (mode) > 4)
9465 {
9466 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
9467 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
9468 return 1;
9469 }
9470 else
9471 {
43cacb12
RS
9472 /* Ignore sign extension. */
9473 val &= (HOST_WIDE_INT) 0xffffffff;
9474 }
9475 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
9476 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
9477}
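/* Editorial example (not part of the original file): MOVZ places a single
   16-bit chunk at a 16-bit-aligned position, so for DImode
     0xabcd         -> true   (chunk at bit 0)
     0xabcd0000     -> true   (chunk at bit 16)
     0xabcd00000000 -> true   (chunk at bit 32)
     0x12345678     -> false  (two non-zero 16-bit chunks). */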
9478
bba0c624
RS
9479/* Test whether:
9480
9481 X = (X & AND_VAL) | IOR_VAL;
9482
9483 can be implemented using:
9484
9485 MOVK X, #(IOR_VAL >> shift), LSL #shift
9486
9487 Return the shift if so, otherwise return -1. */
9488int
9489aarch64_movk_shift (const wide_int_ref &and_val,
9490 const wide_int_ref &ior_val)
9491{
9492 unsigned int precision = and_val.get_precision ();
9493 unsigned HOST_WIDE_INT mask = 0xffff;
9494 for (unsigned int shift = 0; shift < precision; shift += 16)
9495 {
9496 if (and_val == ~mask && (ior_val & mask) == ior_val)
9497 return shift;
9498 mask <<= 16;
9499 }
9500 return -1;
9501}
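/* Editorial example (not part of the original file): for
   X = (X & 0xffffffff0000ffff) | 0x12340000 the function returns 16,
   matching the single instruction MOVK X, #0x1234, LSL #16. */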
9502
43cacb12
RS
9503/* VAL is a value with the inner mode of MODE. Replicate it to fill a
9504 64-bit (DImode) integer. */
9505
9506static unsigned HOST_WIDE_INT
9507aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
9508{
9509 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
9510 while (size < 64)
9511 {
9512 val &= (HOST_WIDE_INT_1U << size) - 1;
9513 val |= val << size;
9514 size *= 2;
43e9d192 9515 }
43cacb12 9516 return val;
43e9d192
IB
9517}
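/* Editorial example (not part of the original file): for SImode,
   0x0000ffff becomes 0x0000ffff0000ffff; for HImode, 0x00ff becomes
   0x00ff00ff00ff00ff; DImode values are returned unchanged. */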
9518
a64c73a2
WD
9519/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
9520
9521static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
9522 {
9523 0x0000000100000001ull,
9524 0x0001000100010001ull,
9525 0x0101010101010101ull,
9526 0x1111111111111111ull,
9527 0x5555555555555555ull,
9528 };
9529
43e9d192
IB
9530
9531/* Return true if val is a valid bitmask immediate. */
a64c73a2 9532
43e9d192 9533bool
a64c73a2 9534aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
43e9d192 9535{
a64c73a2
WD
9536 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
9537 int bits;
9538
9539 /* Check for a single sequence of one bits and return quickly if so.
9540 The special cases of all ones and all zeroes return false. */
43cacb12 9541 val = aarch64_replicate_bitmask_imm (val_in, mode);
a64c73a2
WD
9542 tmp = val + (val & -val);
9543
9544 if (tmp == (tmp & -tmp))
9545 return (val + 1) > 1;
9546
9547 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
9548 if (mode == SImode)
9549 val = (val << 32) | (val & 0xffffffff);
9550
9551 /* Invert if the immediate doesn't start with a zero bit - this means we
9552 only need to search for sequences of one bits. */
9553 if (val & 1)
9554 val = ~val;
9555
9556 /* Find the first set bit and set tmp to val with the first sequence of one
9557 bits removed. Return success if there is a single sequence of ones. */
9558 first_one = val & -val;
9559 tmp = val & (val + first_one);
9560
9561 if (tmp == 0)
9562 return true;
9563
9564 /* Find the next set bit and compute the difference in bit position. */
9565 next_one = tmp & -tmp;
9566 bits = clz_hwi (first_one) - clz_hwi (next_one);
9567 mask = val ^ tmp;
9568
9569 /* Check the bit position difference is a power of 2, and that the first
9570 sequence of one bits fits within 'bits' bits. */
9571 if ((mask >> bits) != 0 || bits != (bits & -bits))
9572 return false;
9573
9574 /* Check the sequence of one bits is repeated 64/bits times. */
9575 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
43e9d192
IB
9576}
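/* Editorial example (not part of the original file): a bitmask immediate
   is a contiguous run of ones within a 2/4/8/16/32/64-bit element,
   replicated across the register and possibly rotated. For DImode:
     0x00ff00ff00ff00ff -> true  (8 ones repeated every 16 bits)
     0x0000000000fff000 -> true  (a single run of 12 ones)
     0x000000000000f00f -> false (two separate runs in one 64-bit element). */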
9577
43fd192f
MC
9578/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
9579 Assumed precondition: VAL_IN is not zero. */
9580
9581unsigned HOST_WIDE_INT
9582aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
9583{
9584 int lowest_bit_set = ctz_hwi (val_in);
9585 int highest_bit_set = floor_log2 (val_in);
9586 gcc_assert (val_in != 0);
9587
9588 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
9589 (HOST_WIDE_INT_1U << lowest_bit_set));
9590}
9591
9592/* Create a constant in which every bit outside the range from the lowest
9593 set bit to the highest set bit of VAL_IN is set to 1. */
9594
9595unsigned HOST_WIDE_INT
9596aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
9597{
9598 return val_in | ~aarch64_and_split_imm1 (val_in);
9599}
9600
9601/* Return true if VAL_IN is a valid 'and' bitmask immediate. */
9602
9603bool
9604aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
9605{
77e994c9
RS
9606 scalar_int_mode int_mode;
9607 if (!is_a <scalar_int_mode> (mode, &int_mode))
9608 return false;
9609
9610 if (aarch64_bitmask_imm (val_in, int_mode))
43fd192f
MC
9611 return false;
9612
77e994c9 9613 if (aarch64_move_imm (val_in, int_mode))
43fd192f
MC
9614 return false;
9615
9616 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
9617
77e994c9 9618 return aarch64_bitmask_imm (imm2, int_mode);
43fd192f 9619}
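/* Editorial example (not part of the original file): assuming DImode and
   VAL_IN == 0xff0000f0 (neither a bitmask nor a MOV immediate), the
   helpers above give imm1 == 0xfffffff0 and imm2 == 0xffffffffff0000ff,
   both valid bitmask immediates, and since
   (0xfffffff0 & 0xffffffffff0000ff) == 0xff0000f0 the original AND can be
   done as two AND-immediate instructions. */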
43e9d192
IB
9620
9621/* Return true if val is an immediate that can be loaded into a
9622 register in a single instruction. */
9623bool
ef4bddc2 9624aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
43e9d192 9625{
77e994c9
RS
9626 scalar_int_mode int_mode;
9627 if (!is_a <scalar_int_mode> (mode, &int_mode))
9628 return false;
9629
9630 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
43e9d192 9631 return 1;
77e994c9 9632 return aarch64_bitmask_imm (val, int_mode);
43e9d192
IB
9633}
9634
9635static bool
ef4bddc2 9636aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
43e9d192 9637{
43e9d192
IB
9638 if (GET_CODE (x) == HIGH)
9639 return true;
9640
43cacb12
RS
9641 /* There's no way to calculate VL-based values using relocations. */
9642 subrtx_iterator::array_type array;
9643 FOR_EACH_SUBRTX (iter, array, x, ALL)
9644 if (GET_CODE (*iter) == CONST_POLY_INT)
9645 return true;
9646
74b27d8e
RS
9647 poly_int64 offset;
9648 rtx base = strip_offset_and_salt (x, &offset);
3793ecc1 9649 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
28514dda 9650 {
74b27d8e
RS
9651 /* We checked for POLY_INT_CST offsets above. */
9652 if (aarch64_classify_symbol (base, offset.to_constant ())
28514dda
YZ
9653 != SYMBOL_FORCE_TO_MEM)
9654 return true;
9655 else
9656 /* Avoid generating a 64-bit relocation in ILP32; leave
9657 to aarch64_expand_mov_immediate to handle it properly. */
9658 return mode != ptr_mode;
9659 }
43e9d192
IB
9660
9661 return aarch64_tls_referenced_p (x);
9662}
9663
e79136e4
WD
9664/* Implement TARGET_CASE_VALUES_THRESHOLD.
9665 The expansion for a table switch is quite expensive due to the number
9666 of instructions, the table lookup and the hard-to-predict indirect jump.
9667 When optimizing for speed and -O3 is enabled, use the per-core tuning if
9c751b88
WD
9668 set, otherwise use tables for >= 11 cases as a tradeoff between size and
9669 performance. When optimizing for size, use 8 for the smallest code size. */
50487d79
EM
9670
9671static unsigned int
9672aarch64_case_values_threshold (void)
9673{
9674 /* Use the specified limit for the number of cases before using jump
9675 tables at higher optimization levels. */
9676 if (optimize > 2
9677 && selected_cpu->tune->max_case_values != 0)
9678 return selected_cpu->tune->max_case_values;
9679 else
9c751b88 9680 return optimize_size ? 8 : 11;
50487d79
EM
9681}
9682
43e9d192
IB
9683/* Return true if register REGNO is a valid index register.
9684 STRICT_P is true if REG_OK_STRICT is in effect. */
9685
9686bool
9687aarch64_regno_ok_for_index_p (int regno, bool strict_p)
9688{
9689 if (!HARD_REGISTER_NUM_P (regno))
9690 {
9691 if (!strict_p)
9692 return true;
9693
9694 if (!reg_renumber)
9695 return false;
9696
9697 regno = reg_renumber[regno];
9698 }
9699 return GP_REGNUM_P (regno);
9700}
9701
9702/* Return true if register REGNO is a valid base register for mode MODE.
9703 STRICT_P is true if REG_OK_STRICT is in effect. */
9704
9705bool
9706aarch64_regno_ok_for_base_p (int regno, bool strict_p)
9707{
9708 if (!HARD_REGISTER_NUM_P (regno))
9709 {
9710 if (!strict_p)
9711 return true;
9712
9713 if (!reg_renumber)
9714 return false;
9715
9716 regno = reg_renumber[regno];
9717 }
9718
9719 /* The fake registers will be eliminated to either the stack or
9720 hard frame pointer, both of which are usually valid base registers.
9721 Reload deals with the cases where the eliminated form isn't valid. */
9722 return (GP_REGNUM_P (regno)
9723 || regno == SP_REGNUM
9724 || regno == FRAME_POINTER_REGNUM
9725 || regno == ARG_POINTER_REGNUM);
9726}
9727
9728/* Return true if X is a valid base register for mode MODE.
9729 STRICT_P is true if REG_OK_STRICT is in effect. */
9730
9731static bool
9732aarch64_base_register_rtx_p (rtx x, bool strict_p)
9733{
76160199 9734 if (!strict_p
3793ecc1 9735 && SUBREG_P (x)
76160199 9736 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
43e9d192
IB
9737 x = SUBREG_REG (x);
9738
9739 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
9740}
9741
9742/* Return true if address offset is a valid index. If it is, fill in INFO
9743 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
9744
9745static bool
9746aarch64_classify_index (struct aarch64_address_info *info, rtx x,
ef4bddc2 9747 machine_mode mode, bool strict_p)
43e9d192
IB
9748{
9749 enum aarch64_address_type type;
9750 rtx index;
9751 int shift;
9752
9753 /* (reg:P) */
3793ecc1 9754 if ((REG_P (x) || SUBREG_P (x))
43e9d192
IB
9755 && GET_MODE (x) == Pmode)
9756 {
9757 type = ADDRESS_REG_REG;
9758 index = x;
9759 shift = 0;
9760 }
9761 /* (sign_extend:DI (reg:SI)) */
9762 else if ((GET_CODE (x) == SIGN_EXTEND
9763 || GET_CODE (x) == ZERO_EXTEND)
9764 && GET_MODE (x) == DImode
9765 && GET_MODE (XEXP (x, 0)) == SImode)
9766 {
9767 type = (GET_CODE (x) == SIGN_EXTEND)
9768 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
9769 index = XEXP (x, 0);
9770 shift = 0;
9771 }
9772 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
9773 else if (GET_CODE (x) == MULT
9774 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
9775 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
9776 && GET_MODE (XEXP (x, 0)) == DImode
9777 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
9778 && CONST_INT_P (XEXP (x, 1)))
9779 {
9780 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
9781 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
9782 index = XEXP (XEXP (x, 0), 0);
9783 shift = exact_log2 (INTVAL (XEXP (x, 1)));
9784 }
9785 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
9786 else if (GET_CODE (x) == ASHIFT
9787 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
9788 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
9789 && GET_MODE (XEXP (x, 0)) == DImode
9790 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
9791 && CONST_INT_P (XEXP (x, 1)))
9792 {
9793 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
9794 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
9795 index = XEXP (XEXP (x, 0), 0);
9796 shift = INTVAL (XEXP (x, 1));
9797 }
43e9d192
IB
9798 /* (and:DI (mult:DI (reg:DI) (const_int scale))
9799 (const_int 0xffffffff<<shift)) */
9800 else if (GET_CODE (x) == AND
9801 && GET_MODE (x) == DImode
9802 && GET_CODE (XEXP (x, 0)) == MULT
9803 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
9804 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9805 && CONST_INT_P (XEXP (x, 1)))
9806 {
9807 type = ADDRESS_REG_UXTW;
9808 index = XEXP (XEXP (x, 0), 0);
9809 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
9810 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
9811 shift = -1;
9812 }
43e9d192
IB
9813 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
9814 (const_int 0xffffffff<<shift)) */
9815 else if (GET_CODE (x) == AND
9816 && GET_MODE (x) == DImode
9817 && GET_CODE (XEXP (x, 0)) == ASHIFT
9818 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
9819 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9820 && CONST_INT_P (XEXP (x, 1)))
9821 {
9822 type = ADDRESS_REG_UXTW;
9823 index = XEXP (XEXP (x, 0), 0);
9824 shift = INTVAL (XEXP (XEXP (x, 0), 1));
9825 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
9826 shift = -1;
9827 }
9828 /* (mult:P (reg:P) (const_int scale)) */
9829 else if (GET_CODE (x) == MULT
9830 && GET_MODE (x) == Pmode
9831 && GET_MODE (XEXP (x, 0)) == Pmode
9832 && CONST_INT_P (XEXP (x, 1)))
9833 {
9834 type = ADDRESS_REG_REG;
9835 index = XEXP (x, 0);
9836 shift = exact_log2 (INTVAL (XEXP (x, 1)));
9837 }
9838 /* (ashift:P (reg:P) (const_int shift)) */
9839 else if (GET_CODE (x) == ASHIFT
9840 && GET_MODE (x) == Pmode
9841 && GET_MODE (XEXP (x, 0)) == Pmode
9842 && CONST_INT_P (XEXP (x, 1)))
9843 {
9844 type = ADDRESS_REG_REG;
9845 index = XEXP (x, 0);
9846 shift = INTVAL (XEXP (x, 1));
9847 }
9848 else
9849 return false;
9850
76160199 9851 if (!strict_p
3793ecc1 9852 && SUBREG_P (index)
76160199 9853 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
43e9d192
IB
9854 index = SUBREG_REG (index);
9855
43cacb12
RS
9856 if (aarch64_sve_data_mode_p (mode))
9857 {
9858 if (type != ADDRESS_REG_REG
9859 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
9860 return false;
9861 }
9862 else
9863 {
9864 if (shift != 0
9865 && !(IN_RANGE (shift, 1, 3)
9866 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
9867 return false;
9868 }
9869
9870 if (REG_P (index)
43e9d192
IB
9871 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
9872 {
9873 info->type = type;
9874 info->offset = index;
9875 info->shift = shift;
9876 return true;
9877 }
9878
9879 return false;
9880}
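/* Editorial example (not part of the original file): the index forms
   recognized above correspond to addresses such as
     [x0, x1]              base plus register
     [x0, x1, lsl #3]      base plus register scaled by the access size
     [x0, w1, sxtw #2]     base plus sign-extended 32-bit register, scaled
   where a non-zero shift must equal log2 of the access size. */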
9881
abc52318
KT
9882/* Return true if MODE is one of the modes for which we
9883 support LDP/STP operations. */
9884
9885static bool
9886aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
9887{
9888 return mode == SImode || mode == DImode
9889 || mode == SFmode || mode == DFmode
9890 || (aarch64_vector_mode_supported_p (mode)
9f5361c8
KT
9891 && (known_eq (GET_MODE_SIZE (mode), 8)
9892 || (known_eq (GET_MODE_SIZE (mode), 16)
9893 && (aarch64_tune_params.extra_tuning_flags
9894 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
abc52318
KT
9895}
9896
9e0218fc
RH
9897/* Return true if REGNO is a virtual pointer register, or an eliminable
9898 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
9899 include stack_pointer or hard_frame_pointer. */
9900static bool
9901virt_or_elim_regno_p (unsigned regno)
9902{
9903 return ((regno >= FIRST_VIRTUAL_REGISTER
9904 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
9905 || regno == FRAME_POINTER_REGNUM
9906 || regno == ARG_POINTER_REGNUM);
9907}
9908
a97d8b98
RS
9909/* Return true if X is a valid address of type TYPE for machine mode MODE.
9910 If it is, fill in INFO appropriately. STRICT_P is true if
9911 REG_OK_STRICT is in effect. */
43e9d192 9912
a98824ac 9913bool
43e9d192 9914aarch64_classify_address (struct aarch64_address_info *info,
a97d8b98 9915 rtx x, machine_mode mode, bool strict_p,
a98824ac 9916 aarch64_addr_query_type type)
43e9d192
IB
9917{
9918 enum rtx_code code = GET_CODE (x);
9919 rtx op0, op1;
dc640181
RS
9920 poly_int64 offset;
9921
6a70badb 9922 HOST_WIDE_INT const_size;
2d8c6dc1 9923
550a3380
RS
9924 /* Whether a vector mode is partial doesn't affect address legitimacy.
9925 Partial vectors like VNx8QImode allow the same indexed addressing
9926 mode and MUL VL addressing mode as full vectors like VNx16QImode;
9927 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
9928 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
9929 vec_flags &= ~VEC_PARTIAL;
9930
80d43579
WD
9931 /* On BE, we use load/store pair for all large int mode load/stores.
9932 TI/TFmode may also use a load/store pair. */
43cacb12 9933 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
a97d8b98 9934 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
a25831ac 9935 || type == ADDR_QUERY_LDP_STP_N
80d43579
WD
9936 || mode == TImode
9937 || mode == TFmode
43cacb12 9938 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
a25831ac
AV
9939 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
9940 to the actual size of the memory being loaded/stored and the mode used for
9941 the address check is half of that. */
9942 if (type == ADDR_QUERY_LDP_STP_N
9943 && known_eq (GET_MODE_SIZE (mode), 16))
9944 mode = DFmode;
9945
6a70badb 9946 bool allow_reg_index_p = (!load_store_pair_p
512b3835
KT
9947 && ((vec_flags == 0
9948 && known_lt (GET_MODE_SIZE (mode), 16))
43cacb12 9949 || vec_flags == VEC_ADVSIMD
fa9863e7 9950 || vec_flags & VEC_SVE_DATA));
43cacb12 9951
512b3835
KT
9952 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
9953 The latter is not valid for SVE predicates, and that's rejected through
9954 allow_reg_index_p above. */
43cacb12
RS
9955 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
9956 && (code != REG && code != PLUS))
9957 return false;
2d8c6dc1
AH
9958
9959 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
9960 REG addressing. */
43cacb12
RS
9961 if (advsimd_struct_p
9962 && !BYTES_BIG_ENDIAN
43e9d192
IB
9963 && (code != POST_INC && code != REG))
9964 return false;
9965
43cacb12
RS
9966 gcc_checking_assert (GET_MODE (x) == VOIDmode
9967 || SCALAR_INT_MODE_P (GET_MODE (x)));
9968
43e9d192
IB
9969 switch (code)
9970 {
9971 case REG:
9972 case SUBREG:
9973 info->type = ADDRESS_REG_IMM;
9974 info->base = x;
9975 info->offset = const0_rtx;
dc640181 9976 info->const_offset = 0;
43e9d192
IB
9977 return aarch64_base_register_rtx_p (x, strict_p);
9978
9979 case PLUS:
9980 op0 = XEXP (x, 0);
9981 op1 = XEXP (x, 1);
15c0c5c9
JW
9982
9983 if (! strict_p
4aa81c2e 9984 && REG_P (op0)
9e0218fc 9985 && virt_or_elim_regno_p (REGNO (op0))
dc640181 9986 && poly_int_rtx_p (op1, &offset))
15c0c5c9
JW
9987 {
9988 info->type = ADDRESS_REG_IMM;
9989 info->base = op0;
9990 info->offset = op1;
dc640181 9991 info->const_offset = offset;
15c0c5c9
JW
9992
9993 return true;
9994 }
9995
6a70badb 9996 if (maybe_ne (GET_MODE_SIZE (mode), 0)
dc640181
RS
9997 && aarch64_base_register_rtx_p (op0, strict_p)
9998 && poly_int_rtx_p (op1, &offset))
43e9d192 9999 {
43e9d192
IB
10000 info->type = ADDRESS_REG_IMM;
10001 info->base = op0;
10002 info->offset = op1;
dc640181 10003 info->const_offset = offset;
43e9d192
IB
10004
10005 /* TImode and TFmode values are allowed in both pairs of X
10006 registers and individual Q registers. The available
10007 address modes are:
10008 X,X: 7-bit signed scaled offset
10009 Q: 9-bit signed offset
10010 We conservatively require an offset representable in either mode.
8ed49fab
KT
10011 When performing the check for pairs of X registers i.e. LDP/STP
10012 pass down DImode since that is the natural size of the LDP/STP
10013 instruction memory accesses. */
43e9d192 10014 if (mode == TImode || mode == TFmode)
8ed49fab 10015 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
3c5af608 10016 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8734dfac 10017 || offset_12bit_unsigned_scaled_p (mode, offset)));
43e9d192 10018
fdcddba8
PW
10019 if (mode == V8DImode)
10020 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10021 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10022
2d8c6dc1
AH
10023 /* A 7-bit offset check because OImode will emit an ldp/stp
10024 instruction (only big endian will get here).
10025 For ldp/stp instructions, the offset is scaled for the size of a
10026 single element of the pair. */
66f206b8
JW
10027 if (aarch64_advsimd_partial_struct_mode_p (mode)
10028 && known_eq (GET_MODE_SIZE (mode), 16))
10029 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10030 if (aarch64_advsimd_full_struct_mode_p (mode)
10031 && known_eq (GET_MODE_SIZE (mode), 32))
2d8c6dc1
AH
10032 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10033
10034 /* Three 9/12-bit offset checks because CImode will emit three
10035 ldr/str instructions (only big endian will get here). */
66f206b8
JW
10036 if (aarch64_advsimd_partial_struct_mode_p (mode)
10037 && known_eq (GET_MODE_SIZE (mode), 24))
10038 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10039 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10040 offset + 16)
10041 || offset_12bit_unsigned_scaled_p (DImode,
10042 offset + 16)));
10043 if (aarch64_advsimd_full_struct_mode_p (mode)
10044 && known_eq (GET_MODE_SIZE (mode), 48))
2d8c6dc1 10045 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
66f206b8 10046 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
3c5af608 10047 offset + 32)
66f206b8 10048 || offset_12bit_unsigned_scaled_p (TImode,
2d8c6dc1
AH
10049 offset + 32)));
10050
10051 /* Two 7-bit offset checks because XImode will emit two ldp/stp
10052 instructions (only big endian will get here). */
66f206b8
JW
10053 if (aarch64_advsimd_partial_struct_mode_p (mode)
10054 && known_eq (GET_MODE_SIZE (mode), 32))
10055 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10056 && aarch64_offset_7bit_signed_scaled_p (DImode,
10057 offset + 16));
10058 if (aarch64_advsimd_full_struct_mode_p (mode)
10059 && known_eq (GET_MODE_SIZE (mode), 64))
2d8c6dc1
AH
10060 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10061 && aarch64_offset_7bit_signed_scaled_p (TImode,
10062 offset + 32));
10063
43cacb12
RS
10064 /* Make "m" use the LD1 offset range for SVE data modes, so
10065 that pre-RTL optimizers like ivopts will work to that
10066 instead of the wider LDR/STR range. */
10067 if (vec_flags == VEC_SVE_DATA)
10068 return (type == ADDR_QUERY_M
10069 ? offset_4bit_signed_scaled_p (mode, offset)
10070 : offset_9bit_signed_scaled_p (mode, offset));
10071
9f4cbab8
RS
10072 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10073 {
10074 poly_int64 end_offset = (offset
10075 + GET_MODE_SIZE (mode)
10076 - BYTES_PER_SVE_VECTOR);
10077 return (type == ADDR_QUERY_M
10078 ? offset_4bit_signed_scaled_p (mode, offset)
10079 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10080 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10081 end_offset)));
10082 }
10083
43cacb12
RS
10084 if (vec_flags == VEC_SVE_PRED)
10085 return offset_9bit_signed_scaled_p (mode, offset);
10086
2d8c6dc1 10087 if (load_store_pair_p)
6a70badb 10088 return ((known_eq (GET_MODE_SIZE (mode), 4)
9f5361c8
KT
10089 || known_eq (GET_MODE_SIZE (mode), 8)
10090 || known_eq (GET_MODE_SIZE (mode), 16))
44707478 10091 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
43e9d192 10092 else
3c5af608 10093 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
43e9d192
IB
10094 || offset_12bit_unsigned_scaled_p (mode, offset));
10095 }
10096
10097 if (allow_reg_index_p)
10098 {
10099 /* Look for base + (scaled/extended) index register. */
10100 if (aarch64_base_register_rtx_p (op0, strict_p)
10101 && aarch64_classify_index (info, op1, mode, strict_p))
10102 {
10103 info->base = op0;
10104 return true;
10105 }
10106 if (aarch64_base_register_rtx_p (op1, strict_p)
10107 && aarch64_classify_index (info, op0, mode, strict_p))
10108 {
10109 info->base = op1;
10110 return true;
10111 }
10112 }
10113
10114 return false;
10115
10116 case POST_INC:
10117 case POST_DEC:
10118 case PRE_INC:
10119 case PRE_DEC:
10120 info->type = ADDRESS_REG_WB;
10121 info->base = XEXP (x, 0);
10122 info->offset = NULL_RTX;
10123 return aarch64_base_register_rtx_p (info->base, strict_p);
10124
10125 case POST_MODIFY:
10126 case PRE_MODIFY:
10127 info->type = ADDRESS_REG_WB;
10128 info->base = XEXP (x, 0);
10129 if (GET_CODE (XEXP (x, 1)) == PLUS
dc640181 10130 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
43e9d192
IB
10131 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
10132 && aarch64_base_register_rtx_p (info->base, strict_p))
10133 {
43e9d192 10134 info->offset = XEXP (XEXP (x, 1), 1);
dc640181 10135 info->const_offset = offset;
43e9d192
IB
10136
10137 /* TImode and TFmode values are allowed in both pairs of X
10138 registers and individual Q registers. The available
10139 address modes are:
10140 X,X: 7-bit signed scaled offset
10141 Q: 9-bit signed offset
10142 We conservatively require an offset representable in either mode.
10143 */
10144 if (mode == TImode || mode == TFmode)
44707478 10145 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3c5af608 10146 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
43e9d192 10147
2d8c6dc1 10148 if (load_store_pair_p)
6a70badb 10149 return ((known_eq (GET_MODE_SIZE (mode), 4)
9f5361c8
KT
10150 || known_eq (GET_MODE_SIZE (mode), 8)
10151 || known_eq (GET_MODE_SIZE (mode), 16))
44707478 10152 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
43e9d192 10153 else
3c5af608 10154 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
43e9d192
IB
10155 }
10156 return false;
10157
10158 case CONST:
10159 case SYMBOL_REF:
10160 case LABEL_REF:
79517551
SN
10161 /* load literal: pc-relative constant pool entry. Only supported
10162 for SI mode or larger. */
43e9d192 10163 info->type = ADDRESS_SYMBOLIC;
2d8c6dc1 10164
6a70badb
RS
10165 if (!load_store_pair_p
10166 && GET_MODE_SIZE (mode).is_constant (&const_size)
10167 && const_size >= 4)
43e9d192 10168 {
74b27d8e
RS
10169 poly_int64 offset;
10170 rtx sym = strip_offset_and_salt (x, &offset);
3793ecc1
AC
10171 return ((LABEL_REF_P (sym)
10172 || (SYMBOL_REF_P (sym)
b4f50fd4 10173 && CONSTANT_POOL_ADDRESS_P (sym)
9ee6540a 10174 && aarch64_pcrelative_literal_loads)));
43e9d192
IB
10175 }
10176 return false;
10177
10178 case LO_SUM:
10179 info->type = ADDRESS_LO_SUM;
10180 info->base = XEXP (x, 0);
10181 info->offset = XEXP (x, 1);
10182 if (allow_reg_index_p
10183 && aarch64_base_register_rtx_p (info->base, strict_p))
10184 {
74b27d8e
RS
10185 poly_int64 offset;
10186 HOST_WIDE_INT const_offset;
10187 rtx sym = strip_offset_and_salt (info->offset, &offset);
3793ecc1 10188 if (SYMBOL_REF_P (sym)
74b27d8e
RS
10189 && offset.is_constant (&const_offset)
10190 && (aarch64_classify_symbol (sym, const_offset)
43cacb12 10191 == SYMBOL_SMALL_ABSOLUTE))
43e9d192
IB
10192 {
10193 /* The symbol and offset must be aligned to the access size. */
10194 unsigned int align;
43e9d192
IB
10195
10196 if (CONSTANT_POOL_ADDRESS_P (sym))
10197 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
10198 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
10199 {
10200 tree exp = SYMBOL_REF_DECL (sym);
10201 align = TYPE_ALIGN (TREE_TYPE (exp));
58e17cf8 10202 align = aarch64_constant_alignment (exp, align);
43e9d192
IB
10203 }
10204 else if (SYMBOL_REF_DECL (sym))
10205 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6c031d8d
KV
10206 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
10207 && SYMBOL_REF_BLOCK (sym) != NULL)
10208 align = SYMBOL_REF_BLOCK (sym)->alignment;
43e9d192
IB
10209 else
10210 align = BITS_PER_UNIT;
10211
6a70badb
RS
10212 poly_int64 ref_size = GET_MODE_SIZE (mode);
10213 if (known_eq (ref_size, 0))
43e9d192
IB
10214 ref_size = GET_MODE_SIZE (DImode);
10215
74b27d8e 10216 return (multiple_p (const_offset, ref_size)
6a70badb 10217 && multiple_p (align / BITS_PER_UNIT, ref_size));
43e9d192
IB
10218 }
10219 }
10220 return false;
10221
10222 default:
10223 return false;
10224 }
10225}
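/* Editorial example (not part of the original file): the classifications
   above correspond roughly to the following assembly forms:
     ADDRESS_REG_IMM    [x0], [x0, #16]
     ADDRESS_REG_WB     [x0, #16]!  or  [x0], #16
     ADDRESS_REG_REG    [x0, x1, lsl #3]
     ADDRESS_REG_SXTW   [x0, w1, sxtw #2]
     ADDRESS_LO_SUM     [x0, #:lo12:symbol]
     ADDRESS_SYMBOLIC   a PC-relative literal-pool reference. */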
10226
9bf2f779
KT
10227/* Return true if the address X is valid for a PRFM instruction.
10228 STRICT_P is true if we should do strict checking with
10229 aarch64_classify_address. */
10230
10231bool
10232aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
10233{
10234 struct aarch64_address_info addr;
10235
10236 /* PRFM accepts the same addresses as DImode... */
a97d8b98 10237 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
9bf2f779
KT
10238 if (!res)
10239 return false;
10240
10241 /* ... except writeback forms. */
10242 return addr.type != ADDRESS_REG_WB;
10243}
10244
43e9d192
IB
10245bool
10246aarch64_symbolic_address_p (rtx x)
10247{
74b27d8e
RS
10248 poly_int64 offset;
10249 x = strip_offset_and_salt (x, &offset);
3793ecc1 10250 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
43e9d192
IB
10251}
10252
a6e0bfa7 10253/* Classify the base of symbolic expression X. */
da4f13a4
MS
10254
10255enum aarch64_symbol_type
a6e0bfa7 10256aarch64_classify_symbolic_expression (rtx x)
43e9d192
IB
10257{
10258 rtx offset;
da4f13a4 10259
43e9d192 10260 split_const (x, &x, &offset);
43cacb12 10261 return aarch64_classify_symbol (x, INTVAL (offset));
43e9d192
IB
10262}
10263
10264
10265/* Return TRUE if X is a legitimate address for accessing memory in
10266 mode MODE. */
10267static bool
ef4bddc2 10268aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
43e9d192
IB
10269{
10270 struct aarch64_address_info addr;
10271
a97d8b98 10272 return aarch64_classify_address (&addr, x, mode, strict_p);
43e9d192
IB
10273}
10274
a97d8b98
RS
10275/* Return TRUE if X is a legitimate address of type TYPE for accessing
10276 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
43e9d192 10277bool
a97d8b98
RS
10278aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
10279 aarch64_addr_query_type type)
43e9d192
IB
10280{
10281 struct aarch64_address_info addr;
10282
a97d8b98 10283 return aarch64_classify_address (&addr, x, mode, strict_p, type);
43e9d192
IB
10284}
10285
9005477f
RS
10286/* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
10287
491ec060 10288static bool
9005477f
RS
10289aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
10290 poly_int64 orig_offset,
10291 machine_mode mode)
491ec060 10292{
6a70badb
RS
10293 HOST_WIDE_INT size;
10294 if (GET_MODE_SIZE (mode).is_constant (&size))
10295 {
9005477f
RS
10296 HOST_WIDE_INT const_offset, second_offset;
10297
10298 /* A general SVE offset is A * VQ + B. Remove the A component from
10299 coefficient 0 in order to get the constant B. */
10300 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
10301
10302 /* Split an out-of-range address displacement into a base and
10303 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
10304 range otherwise to increase opportunities for sharing the base
10305 address of different sizes. Unaligned accesses use the signed
10306 9-bit range, TImode/TFmode use the intersection of signed
10307 scaled 7-bit and signed 9-bit offset. */
6a70badb 10308 if (mode == TImode || mode == TFmode)
9005477f
RS
10309 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
10310 else if ((const_offset & (size - 1)) != 0)
10311 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6a70badb 10312 else
9005477f 10313 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
491ec060 10314
9005477f
RS
10315 if (second_offset == 0 || known_eq (orig_offset, second_offset))
10316 return false;
10317
10318 /* Split the offset into second_offset and the rest. */
10319 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10320 *offset2 = gen_int_mode (second_offset, Pmode);
10321 return true;
10322 }
10323 else
10324 {
10325 /* Get the mode we should use as the basis of the range. For structure
10326 modes this is the mode of one vector. */
10327 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10328 machine_mode step_mode
10329 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
10330
10331 /* Get the "mul vl" multiplier we'd like to use. */
10332 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
10333 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
10334 if (vec_flags & VEC_SVE_DATA)
10335 /* LDR supports a 9-bit range, but the move patterns for
10336 structure modes require all vectors to be in range of the
10337 same base. The simplest way of accommodating that while still
10338 promoting reuse of anchor points between different modes is
10339 to use an 8-bit range unconditionally. */
10340 vnum = ((vnum + 128) & 255) - 128;
10341 else
10342 /* Predicates are only handled singly, so we might as well use
10343 the full range. */
10344 vnum = ((vnum + 256) & 511) - 256;
10345 if (vnum == 0)
10346 return false;
10347
10348 /* Convert the "mul vl" multiplier into a byte offset. */
10349 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
10350 if (known_eq (second_offset, orig_offset))
10351 return false;
10352
10353 /* Split the offset into second_offset and the rest. */
10354 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10355 *offset2 = gen_int_mode (second_offset, Pmode);
6a70badb
RS
10356 return true;
10357 }
491ec060
WD
10358}
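/* Editorial example (not part of the original file): for a DImode access
   at the out-of-range constant offset 0x10008, the code above computes
   second_offset == (0x10008 & 0x3ffc) == 0x8 and splits the address into
   0x10000 + 0x8, so the large part is added to the base once and the
   small part stays in the load/store instruction. */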
10359
a2170965
TC
10360/* Return the binary representation of floating point constant VALUE in INTVAL.
10361 If the value cannot be converted, return false without setting INTVAL.
10362 The conversion is done in the given MODE. */
10363bool
10364aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
10365{
10366
10367 /* We make a general exception for 0. */
10368 if (aarch64_float_const_zero_rtx_p (value))
10369 {
10370 *intval = 0;
10371 return true;
10372 }
10373
0d0e0188 10374 scalar_float_mode mode;
3793ecc1 10375 if (!CONST_DOUBLE_P (value)
0d0e0188 10376 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
a2170965
TC
10377 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
10378 /* Only support up to DF mode. */
10379 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
10380 return false;
10381
10382 unsigned HOST_WIDE_INT ival = 0;
10383
10384 long res[2];
10385 real_to_target (res,
10386 CONST_DOUBLE_REAL_VALUE (value),
10387 REAL_MODE_FORMAT (mode));
10388
5c22bb48
TC
10389 if (mode == DFmode)
10390 {
10391 int order = BYTES_BIG_ENDIAN ? 1 : 0;
10392 ival = zext_hwi (res[order], 32);
10393 ival |= (zext_hwi (res[1 - order], 32) << 32);
10394 }
10395 else
10396 ival = zext_hwi (res[0], 32);
a2170965
TC
10397
10398 *intval = ival;
10399 return true;
10400}
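/* Editorial example (not part of the original file): 1.0 yields the raw
   IEEE-754 pattern 0x3ff0000000000000 in DFmode and 0x3f800000 in
   SFmode. */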
10401
10402/* Return TRUE if rtx X is an immediate constant that can be moved using a
10403 single MOV(+MOVK) followed by an FMOV. */
10404bool
10405aarch64_float_const_rtx_p (rtx x)
10406{
10407 machine_mode mode = GET_MODE (x);
10408 if (mode == VOIDmode)
10409 return false;
10410
10411 /* Determine whether it's cheaper to write float constants as
10412 mov/movk pairs rather than as ldr/adrp pairs. */
10413 unsigned HOST_WIDE_INT ival;
10414
3793ecc1 10415 if (CONST_DOUBLE_P (x)
a2170965
TC
10416 && SCALAR_FLOAT_MODE_P (mode)
10417 && aarch64_reinterpret_float_as_int (x, &ival))
10418 {
77e994c9
RS
10419 scalar_int_mode imode = (mode == HFmode
10420 ? SImode
10421 : int_mode_for_mode (mode).require ());
a2170965
TC
10422 int num_instr = aarch64_internal_mov_immediate
10423 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10424 return num_instr < 3;
10425 }
10426
10427 return false;
10428}
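/* Editorial example (not part of the original file): 1.5 in DFmode has the
   bit pattern 0x3ff8000000000000, which needs only a single MOVZ
   (#0x3ff8, LSL #48) followed by an FMOV, so the function returns true. */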
10429
43e9d192
IB
10430/* Return TRUE if rtx X is immediate constant 0.0 */
10431bool
3520f7cc 10432aarch64_float_const_zero_rtx_p (rtx x)
43e9d192 10433{
43e9d192
IB
10434 if (GET_MODE (x) == VOIDmode)
10435 return false;
10436
34a72c33 10437 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
43e9d192 10438 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
34a72c33 10439 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
43e9d192
IB
10440}
10441
a2170965
TC
10442/* Return TRUE if rtx X is immediate constant that fits in a single
10443 MOVI immediate operation. */
10444bool
10445aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
10446{
10447 if (!TARGET_SIMD)
10448 return false;
10449
77e994c9
RS
10450 machine_mode vmode;
10451 scalar_int_mode imode;
a2170965
TC
10452 unsigned HOST_WIDE_INT ival;
10453
3793ecc1 10454 if (CONST_DOUBLE_P (x)
a2170965
TC
10455 && SCALAR_FLOAT_MODE_P (mode))
10456 {
10457 if (!aarch64_reinterpret_float_as_int (x, &ival))
10458 return false;
10459
35c38fa6
TC
10460 /* We make a general exception for 0. */
10461 if (aarch64_float_const_zero_rtx_p (x))
10462 return true;
10463
304b9962 10464 imode = int_mode_for_mode (mode).require ();
a2170965 10465 }
3793ecc1 10466 else if (CONST_INT_P (x)
77e994c9
RS
10467 && is_a <scalar_int_mode> (mode, &imode))
10468 ival = INTVAL (x);
a2170965
TC
10469 else
10470 return false;
10471
10472 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
10473 a 128-bit vector mode. */
77e994c9 10474 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
a2170965
TC
10475
10476 vmode = aarch64_simd_container_mode (imode, width);
10477 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
10478
b187677b 10479 return aarch64_simd_valid_immediate (v_op, NULL);
a2170965
TC
10480}
10481
10482
70f09188
AP
10483/* Return the fixed registers used for condition codes. */
10484
10485static bool
10486aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
10487{
10488 *p1 = CC_REGNUM;
10489 *p2 = INVALID_REGNUM;
10490 return true;
10491}
10492
47210a04
RL
10493/* This function is used by the call expanders of the machine description.
10494 RESULT is the register in which the result is returned. It's NULL for
10495 "call" and "sibcall".
10496 MEM is the location of the function call.
08cc4d92 10497 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
47210a04
RL
10498 SIBCALL indicates whether this function call is normal call or sibling call.
10499 It will generate different pattern accordingly. */
10500
10501void
08cc4d92 10502aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
47210a04
RL
10503{
10504 rtx call, callee, tmp;
10505 rtvec vec;
10506 machine_mode mode;
10507
10508 gcc_assert (MEM_P (mem));
10509 callee = XEXP (mem, 0);
10510 mode = GET_MODE (callee);
10511 gcc_assert (mode == Pmode);
10512
10513 /* Decide if we should generate indirect calls by loading the
10514 address of the callee into a register before performing
10515 the branch-and-link. */
10516 if (SYMBOL_REF_P (callee)
10517 ? (aarch64_is_long_call_p (callee)
10518 || aarch64_is_noplt_call_p (callee))
10519 : !REG_P (callee))
10520 XEXP (mem, 0) = force_reg (mode, callee);
10521
10522 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
10523
10524 if (result != NULL_RTX)
10525 call = gen_rtx_SET (result, call);
10526
10527 if (sibcall)
10528 tmp = ret_rtx;
10529 else
10530 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
10531
08cc4d92
RS
10532 gcc_assert (CONST_INT_P (callee_abi));
10533 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
10534 UNSPEC_CALLEE_ABI);
10535
10536 vec = gen_rtvec (3, call, callee_abi, tmp);
47210a04
RL
10537 call = gen_rtx_PARALLEL (VOIDmode, vec);
10538
10539 aarch64_emit_call_insn (call);
10540}
10541
78607708
TV
10542/* Emit call insn with PAT and do aarch64-specific handling. */
10543
d07a3fed 10544void
78607708
TV
10545aarch64_emit_call_insn (rtx pat)
10546{
10547 rtx insn = emit_call_insn (pat);
10548
10549 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
10550 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
10551 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
10552}
10553
ef4bddc2 10554machine_mode
43e9d192
IB
10555aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
10556{
f7343f20
RE
10557 machine_mode mode_x = GET_MODE (x);
10558 rtx_code code_x = GET_CODE (x);
10559
43e9d192
IB
10560 /* All floating point compares return CCFP if it is an equality
10561 comparison, and CCFPE otherwise. */
f7343f20 10562 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
43e9d192
IB
10563 {
10564 switch (code)
10565 {
10566 case EQ:
10567 case NE:
10568 case UNORDERED:
10569 case ORDERED:
10570 case UNLT:
10571 case UNLE:
10572 case UNGT:
10573 case UNGE:
10574 case UNEQ:
43e9d192
IB
10575 return CCFPmode;
10576
10577 case LT:
10578 case LE:
10579 case GT:
10580 case GE:
8332c5ee 10581 case LTGT:
43e9d192
IB
10582 return CCFPEmode;
10583
10584 default:
10585 gcc_unreachable ();
10586 }
10587 }
10588
2b8568fe
KT
10589 /* Equality comparisons of short modes against zero can be performed
10590 using the TST instruction with the appropriate bitmask. */
f73dc006 10591 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
2b8568fe 10592 && (code == EQ || code == NE)
f7343f20 10593 && (mode_x == HImode || mode_x == QImode))
2b8568fe
KT
10594 return CC_NZmode;
10595
b06335f9
KT
10596 /* Similarly, comparisons of zero_extends from shorter modes can
10597 be performed using an ANDS with an immediate mask. */
f7343f20
RE
10598 if (y == const0_rtx && code_x == ZERO_EXTEND
10599 && (mode_x == SImode || mode_x == DImode)
b06335f9
KT
10600 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
10601 && (code == EQ || code == NE))
10602 return CC_NZmode;
10603
f7343f20 10604 if ((mode_x == SImode || mode_x == DImode)
43e9d192
IB
10605 && y == const0_rtx
10606 && (code == EQ || code == NE || code == LT || code == GE)
f7343f20
RE
10607 && (code_x == PLUS || code_x == MINUS || code_x == AND
10608 || code_x == NEG
10609 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7325d85a 10610 && CONST_INT_P (XEXP (x, 2)))))
43e9d192
IB
10611 return CC_NZmode;
10612
1c992d1e 10613 /* A compare with a shifted operand. Because of canonicalization,
43e9d192
IB
10614 the comparison will have to be swapped when we emit the assembly
10615 code. */
f7343f20 10616 if ((mode_x == SImode || mode_x == DImode)
3793ecc1 10617 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
f7343f20
RE
10618 && (code_x == ASHIFT || code_x == ASHIFTRT
10619 || code_x == LSHIFTRT
10620 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
43e9d192
IB
10621 return CC_SWPmode;
10622
1c992d1e
RE
10623 /* Similarly for a negated operand, but we can only do this for
10624 equalities. */
f7343f20 10625 if ((mode_x == SImode || mode_x == DImode)
3793ecc1 10626 && (REG_P (y) || SUBREG_P (y))
1c992d1e 10627 && (code == EQ || code == NE)
f7343f20 10628 && code_x == NEG)
1c992d1e
RE
10629 return CC_Zmode;
10630
f7343f20
RE
10631 /* A test for unsigned overflow from an addition. */
10632 if ((mode_x == DImode || mode_x == TImode)
10633 && (code == LTU || code == GEU)
10634 && code_x == PLUS
10635 && rtx_equal_p (XEXP (x, 0), y))
ef22810a
RH
10636 return CC_Cmode;
10637
f7343f20
RE
10638 /* A test for unsigned overflow from an add with carry. */
10639 if ((mode_x == DImode || mode_x == TImode)
10640 && (code == LTU || code == GEU)
10641 && code_x == PLUS
10642 && CONST_SCALAR_INT_P (y)
10643 && (rtx_mode_t (y, mode_x)
10644 == (wi::shwi (1, mode_x)
10645 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
10646 return CC_ADCmode;
10647
30c46053 10648 /* A test for signed overflow. */
f7343f20 10649 if ((mode_x == DImode || mode_x == TImode)
30c46053 10650 && code == NE
f7343f20 10651 && code_x == PLUS
30c46053
MC
10652 && GET_CODE (y) == SIGN_EXTEND)
10653 return CC_Vmode;
10654
43e9d192
IB
10655 /* For everything else, return CCmode. */
10656 return CCmode;
10657}
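/* Editorial example (not part of the original file): comparing
   (plus:DI x y) against zero with EQ/NE/LT/GE selects CC_NZmode, which
   lets the comparison fold into an ADDS, whereas an ordered float
   comparison such as LT selects CCFPEmode so that a signalling FCMPE is
   used. */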
10658
3dfa7055 10659static int
b8506a8a 10660aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
3dfa7055 10661
cd5660ab 10662int
43e9d192
IB
10663aarch64_get_condition_code (rtx x)
10664{
ef4bddc2 10665 machine_mode mode = GET_MODE (XEXP (x, 0));
43e9d192
IB
10666 enum rtx_code comp_code = GET_CODE (x);
10667
10668 if (GET_MODE_CLASS (mode) != MODE_CC)
10669 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3dfa7055
ZC
10670 return aarch64_get_condition_code_1 (mode, comp_code);
10671}
43e9d192 10672
3dfa7055 10673static int
b8506a8a 10674aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
3dfa7055 10675{
43e9d192
IB
10676 switch (mode)
10677 {
4e10a5a7
RS
10678 case E_CCFPmode:
10679 case E_CCFPEmode:
43e9d192
IB
10680 switch (comp_code)
10681 {
10682 case GE: return AARCH64_GE;
10683 case GT: return AARCH64_GT;
10684 case LE: return AARCH64_LS;
10685 case LT: return AARCH64_MI;
10686 case NE: return AARCH64_NE;
10687 case EQ: return AARCH64_EQ;
10688 case ORDERED: return AARCH64_VC;
10689 case UNORDERED: return AARCH64_VS;
10690 case UNLT: return AARCH64_LT;
10691 case UNLE: return AARCH64_LE;
10692 case UNGT: return AARCH64_HI;
10693 case UNGE: return AARCH64_PL;
cd5660ab 10694 default: return -1;
43e9d192
IB
10695 }
10696 break;
10697
4e10a5a7 10698 case E_CCmode:
43e9d192
IB
10699 switch (comp_code)
10700 {
10701 case NE: return AARCH64_NE;
10702 case EQ: return AARCH64_EQ;
10703 case GE: return AARCH64_GE;
10704 case GT: return AARCH64_GT;
10705 case LE: return AARCH64_LE;
10706 case LT: return AARCH64_LT;
10707 case GEU: return AARCH64_CS;
10708 case GTU: return AARCH64_HI;
10709 case LEU: return AARCH64_LS;
10710 case LTU: return AARCH64_CC;
cd5660ab 10711 default: return -1;
43e9d192
IB
10712 }
10713 break;
10714
4e10a5a7 10715 case E_CC_SWPmode:
43e9d192
IB
10716 switch (comp_code)
10717 {
10718 case NE: return AARCH64_NE;
10719 case EQ: return AARCH64_EQ;
10720 case GE: return AARCH64_LE;
10721 case GT: return AARCH64_LT;
10722 case LE: return AARCH64_GE;
10723 case LT: return AARCH64_GT;
10724 case GEU: return AARCH64_LS;
10725 case GTU: return AARCH64_CC;
10726 case LEU: return AARCH64_CS;
10727 case LTU: return AARCH64_HI;
cd5660ab 10728 default: return -1;
43e9d192
IB
10729 }
10730 break;
10731
57d6f4d0
RS
10732 case E_CC_NZCmode:
10733 switch (comp_code)
10734 {
10735 case NE: return AARCH64_NE; /* = any */
10736 case EQ: return AARCH64_EQ; /* = none */
10737 case GE: return AARCH64_PL; /* = nfrst */
10738 case LT: return AARCH64_MI; /* = first */
10739 case GEU: return AARCH64_CS; /* = nlast */
10740 case GTU: return AARCH64_HI; /* = pmore */
10741 case LEU: return AARCH64_LS; /* = plast */
10742 case LTU: return AARCH64_CC; /* = last */
10743 default: return -1;
10744 }
10745 break;
10746
4e10a5a7 10747 case E_CC_NZmode:
43e9d192
IB
10748 switch (comp_code)
10749 {
10750 case NE: return AARCH64_NE;
10751 case EQ: return AARCH64_EQ;
10752 case GE: return AARCH64_PL;
10753 case LT: return AARCH64_MI;
cd5660ab 10754 default: return -1;
43e9d192
IB
10755 }
10756 break;
10757
4e10a5a7 10758 case E_CC_Zmode:
1c992d1e
RE
10759 switch (comp_code)
10760 {
10761 case NE: return AARCH64_NE;
10762 case EQ: return AARCH64_EQ;
cd5660ab 10763 default: return -1;
1c992d1e
RE
10764 }
10765 break;
10766
4e10a5a7 10767 case E_CC_Cmode:
ef22810a
RH
10768 switch (comp_code)
10769 {
f7343f20
RE
10770 case LTU: return AARCH64_CS;
10771 case GEU: return AARCH64_CC;
10772 default: return -1;
10773 }
10774 break;
10775
10776 case E_CC_ADCmode:
10777 switch (comp_code)
10778 {
10779 case GEU: return AARCH64_CS;
10780 case LTU: return AARCH64_CC;
ef22810a
RH
10781 default: return -1;
10782 }
10783 break;
10784
30c46053
MC
10785 case E_CC_Vmode:
10786 switch (comp_code)
10787 {
10788 case NE: return AARCH64_VS;
10789 case EQ: return AARCH64_VC;
10790 default: return -1;
10791 }
10792 break;
10793
43e9d192 10794 default:
cd5660ab 10795 return -1;
43e9d192 10796 }
3dfa7055 10797
3dfa7055 10798 return -1;
43e9d192
IB
10799}
10800
ddeabd3e
AL
10801bool
10802aarch64_const_vec_all_same_in_range_p (rtx x,
6a70badb
RS
10803 HOST_WIDE_INT minval,
10804 HOST_WIDE_INT maxval)
ddeabd3e 10805{
6a70badb
RS
10806 rtx elt;
10807 return (const_vec_duplicate_p (x, &elt)
10808 && CONST_INT_P (elt)
10809 && IN_RANGE (INTVAL (elt), minval, maxval));
ddeabd3e
AL
10810}
10811
10812bool
10813aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
10814{
10815 return aarch64_const_vec_all_same_in_range_p (x, val, val);
10816}
10817
43cacb12
RS
10818/* Return true if VEC is a constant in which every element is in the range
10819 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
10820
10821static bool
10822aarch64_const_vec_all_in_range_p (rtx vec,
10823 HOST_WIDE_INT minval,
10824 HOST_WIDE_INT maxval)
10825{
568b9c0e 10826 if (!CONST_VECTOR_P (vec)
43cacb12
RS
10827 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
10828 return false;
10829
10830 int nunits;
10831 if (!CONST_VECTOR_STEPPED_P (vec))
10832 nunits = const_vector_encoded_nelts (vec);
10833 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
10834 return false;
10835
10836 for (int i = 0; i < nunits; i++)
10837 {
10838 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
10839 if (!CONST_INT_P (vec_elem)
10840 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
10841 return false;
10842 }
10843 return true;
10844}
43e9d192 10845
cf670503
ZC
10846/* N Z C V. */
10847#define AARCH64_CC_V 1
10848#define AARCH64_CC_C (1 << 1)
10849#define AARCH64_CC_Z (1 << 2)
10850#define AARCH64_CC_N (1 << 3)
10851
c8012fbc
WD
10852/* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
10853static const int aarch64_nzcv_codes[] =
10854{
10855 0, /* EQ, Z == 1. */
10856 AARCH64_CC_Z, /* NE, Z == 0. */
10857 0, /* CS, C == 1. */
10858 AARCH64_CC_C, /* CC, C == 0. */
10859 0, /* MI, N == 1. */
10860 AARCH64_CC_N, /* PL, N == 0. */
10861 0, /* VS, V == 1. */
10862 AARCH64_CC_V, /* VC, V == 0. */
10863 0, /* HI, C == 1 && Z == 0. */
10864 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
10865 AARCH64_CC_V, /* GE, N == V. */
10866 0, /* LT, N != V. */
10867 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
10868 0, /* LE, !(Z == 0 && N == V). */
10869 0, /* AL, Any. */
10870 0 /* NV, Any. */
cf670503
ZC
10871};
10872
43cacb12
RS
10873/* Print floating-point vector immediate operand X to F, negating it
10874 first if NEGATE is true. Return true on success, false if it isn't
10875 a constant we can handle. */
10876
10877static bool
10878aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
10879{
10880 rtx elt;
10881
10882 if (!const_vec_duplicate_p (x, &elt))
10883 return false;
10884
10885 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
10886 if (negate)
10887 r = real_value_negate (&r);
10888
d29f7dd5
RS
10889 /* Handle the SVE single-bit immediates specially, since they have a
10890 fixed form in the assembly syntax. */
43cacb12
RS
10891 if (real_equal (&r, &dconst0))
10892 asm_fprintf (f, "0.0");
a19ba9e1
RS
10893 else if (real_equal (&r, &dconst2))
10894 asm_fprintf (f, "2.0");
43cacb12
RS
10895 else if (real_equal (&r, &dconst1))
10896 asm_fprintf (f, "1.0");
10897 else if (real_equal (&r, &dconsthalf))
10898 asm_fprintf (f, "0.5");
10899 else
d29f7dd5
RS
10900 {
10901 const int buf_size = 20;
10902 char float_buf[buf_size] = {'\0'};
10903 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
10904 1, GET_MODE (elt));
10905 asm_fprintf (f, "%s", float_buf);
10906 }
43cacb12
RS
10907
10908 return true;
10909}
10910
9f4cbab8
RS
10911/* Return the equivalent letter for size. */
10912static char
10913sizetochar (int size)
10914{
10915 switch (size)
10916 {
10917 case 64: return 'd';
10918 case 32: return 's';
10919 case 16: return 'h';
10920 case 8 : return 'b';
10921 default: gcc_unreachable ();
10922 }
10923}
10924
bcf19844
JW
10925/* Print operand X to file F in a target specific manner according to CODE.
10926 The acceptable formatting commands given by CODE are:
10927 'c': An integer or symbol address without a preceding #
10928 sign.
43cacb12
RS
10929 'C': Take the duplicated element in a vector constant
10930 and print it in hex.
10931 'D': Take the duplicated element in a vector constant
10932 and print it as an unsigned integer, in decimal.
bcf19844 10933 'e': Print the sign/zero-extend size as a character 8->b,
d113ece6
RS
10934 16->h, 32->w. Can also be used for masks:
10935 0xff->b, 0xffff->h, 0xffffffff->w.
d29f7dd5
RS
10936 'I': If the operand is a duplicated vector constant,
10937 replace it with the duplicated scalar. If the
10938 operand is then a floating-point constant, replace
10939 it with the integer bit representation. Print the
10940 transformed constant as a signed decimal number.
bcf19844
JW
10941 'p': Prints N such that 2^N == X (X must be power of 2 and
10942 const int).
10943 'P': Print the number of non-zero bits in X (a const_int).
10944 'H': Print the higher numbered register of a pair (TImode)
10945 of regs.
10946 'm': Print a condition (eq, ne, etc).
10947 'M': Same as 'm', but invert condition.
43cacb12
RS
10948 'N': Take the duplicated element in a vector constant
10949 and print the negative of it in decimal.
bcf19844
JW
10950 'b/h/s/d/q': Print a scalar FP/SIMD register name.
10951 'S/T/U/V': Print a FP/SIMD register name for a register list.
10952 The register printed is the FP/SIMD register name
10953 of X + 0/1/2/3 for S/T/U/V.
e3f15286 10954 'R': Print a scalar Integer/FP/SIMD register name + 1.
bcf19844
JW
10955 'X': Print bottom 16 bits of integer constant in hex.
10956 'w/x': Print a general register name or the zero register
10957 (32-bit or 64-bit).
10958 '0': Print a normal operand, if it's a general register,
10959 then we assume DImode.
10960 'k': Print NZCV for conditional compare instructions.
10961 'A': Output address constant representing the first
10962 argument of X, specifying a relocation offset
10963 if appropriate.
10964 'L': Output constant address specified by X
10965 with a relocation offset if appropriate.
10966 'G': Prints address of X, specifying a PC relative
e69a816d
WD
10967 relocation mode if appropriate.
10968 'y': Output address of LDP or STP - this is used for
10969 some LDP/STPs which don't use a PARALLEL in their
10970 pattern (so the mode needs to be adjusted).
10971 'z': Output address of a typical LDP or STP. */
bcf19844 10972
cc8ca59e
JB
10973static void
10974aarch64_print_operand (FILE *f, rtx x, int code)
43e9d192 10975{
43cacb12 10976 rtx elt;
43e9d192
IB
10977 switch (code)
10978 {
f541a481 10979 case 'c':
74b27d8e
RS
10980 if (CONST_INT_P (x))
10981 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
10982 else
f541a481 10983 {
74b27d8e
RS
10984 poly_int64 offset;
10985 rtx base = strip_offset_and_salt (x, &offset);
10986 if (SYMBOL_REF_P (base))
10987 output_addr_const (f, x);
10988 else
10989 output_operand_lossage ("unsupported operand for code '%c'", code);
f541a481
KT
10990 }
10991 break;
10992
43e9d192 10993 case 'e':
43e9d192 10994 {
d113ece6
RS
10995 x = unwrap_const_vec_duplicate (x);
10996 if (!CONST_INT_P (x))
43e9d192
IB
10997 {
10998 output_operand_lossage ("invalid operand for '%%%c'", code);
10999 return;
11000 }
11001
d113ece6
RS
11002 HOST_WIDE_INT val = INTVAL (x);
11003 if ((val & ~7) == 8 || val == 0xff)
11004 fputc ('b', f);
11005 else if ((val & ~7) == 16 || val == 0xffff)
11006 fputc ('h', f);
11007 else if ((val & ~7) == 32 || val == 0xffffffff)
11008 fputc ('w', f);
11009 else
43e9d192 11010 {
43e9d192
IB
11011 output_operand_lossage ("invalid operand for '%%%c'", code);
11012 return;
11013 }
11014 }
11015 break;
11016
11017 case 'p':
11018 {
11019 int n;
11020
4aa81c2e 11021 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
43e9d192
IB
11022 {
11023 output_operand_lossage ("invalid operand for '%%%c'", code);
11024 return;
11025 }
11026
11027 asm_fprintf (f, "%d", n);
11028 }
11029 break;
11030
11031 case 'P':
4aa81c2e 11032 if (!CONST_INT_P (x))
43e9d192
IB
11033 {
11034 output_operand_lossage ("invalid operand for '%%%c'", code);
11035 return;
11036 }
11037
8d55c61b 11038 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
43e9d192
IB
11039 break;
11040
11041 case 'H':
c0111dc4
RE
11042 if (x == const0_rtx)
11043 {
11044 asm_fprintf (f, "xzr");
11045 break;
11046 }
11047
4aa81c2e 11048 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
43e9d192
IB
11049 {
11050 output_operand_lossage ("invalid operand for '%%%c'", code);
11051 return;
11052 }
11053
01a3a324 11054 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
43e9d192
IB
11055 break;
11056
d29f7dd5
RS
11057 case 'I':
11058 {
11059 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
11060 if (CONST_INT_P (x))
11061 asm_fprintf (f, "%wd", INTVAL (x));
11062 else
11063 {
11064 output_operand_lossage ("invalid operand for '%%%c'", code);
11065 return;
11066 }
11067 break;
11068 }
11069
43e9d192 11070 case 'M':
c8012fbc 11071 case 'm':
cd5660ab
KT
11072 {
11073 int cond_code;
c8012fbc
WD
11074 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
11075 if (x == const_true_rtx)
cd5660ab 11076 {
c8012fbc
WD
11077 if (code == 'M')
11078 fputs ("nv", f);
cd5660ab
KT
11079 return;
11080 }
43e9d192 11081
cd5660ab
KT
11082 if (!COMPARISON_P (x))
11083 {
11084 output_operand_lossage ("invalid operand for '%%%c'", code);
11085 return;
11086 }
c8012fbc 11087
cd5660ab
KT
11088 cond_code = aarch64_get_condition_code (x);
11089 gcc_assert (cond_code >= 0);
c8012fbc
WD
11090 if (code == 'M')
11091 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
57d6f4d0
RS
11092 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
11093 fputs (aarch64_sve_condition_codes[cond_code], f);
11094 else
11095 fputs (aarch64_condition_codes[cond_code], f);
cd5660ab 11096 }
43e9d192
IB
11097 break;
11098
43cacb12
RS
11099 case 'N':
11100 if (!const_vec_duplicate_p (x, &elt))
11101 {
11102 output_operand_lossage ("invalid vector constant");
11103 return;
11104 }
11105
11106 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
1c0c371d 11107 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
43cacb12
RS
11108 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11109 && aarch64_print_vector_float_operand (f, x, true))
11110 ;
11111 else
11112 {
11113 output_operand_lossage ("invalid vector constant");
11114 return;
11115 }
11116 break;
11117
43e9d192
IB
11118 case 'b':
11119 case 'h':
11120 case 's':
11121 case 'd':
11122 case 'q':
43e9d192
IB
11123 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
11124 {
11125 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
11126 return;
11127 }
50ce6f88 11128 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
43e9d192
IB
11129 break;
11130
11131 case 'S':
11132 case 'T':
11133 case 'U':
11134 case 'V':
43e9d192
IB
11135 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
11136 {
11137 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
11138 return;
11139 }
43cacb12
RS
11140 asm_fprintf (f, "%c%d",
11141 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
11142 REGNO (x) - V0_REGNUM + (code - 'S'));
43e9d192
IB
11143 break;
11144
2d8c6dc1 11145 case 'R':
66f206b8
JW
11146 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
11147 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
11148 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
11149 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
e3f15286
RH
11150 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
11151 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
11152 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
11153 else
11154 output_operand_lossage ("incompatible register operand for '%%%c'",
11155 code);
2d8c6dc1
AH
11156 break;
11157
a05c0ddf 11158 case 'X':
4aa81c2e 11159 if (!CONST_INT_P (x))
a05c0ddf
IB
11160 {
11161 output_operand_lossage ("invalid operand for '%%%c'", code);
11162 return;
11163 }
50d38551 11164 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
a05c0ddf
IB
11165 break;
11166
43cacb12
RS
11167 case 'C':
11168 {
11169 /* Print a replicated constant in hex. */
11170 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
11171 {
11172 output_operand_lossage ("invalid operand for '%%%c'", code);
11173 return;
11174 }
11175 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
11176 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
11177 }
11178 break;
11179
11180 case 'D':
11181 {
11182 /* Print a replicated constant in decimal, treating it as
11183 unsigned. */
11184 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
11185 {
11186 output_operand_lossage ("invalid operand for '%%%c'", code);
11187 return;
11188 }
11189 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
11190 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
11191 }
11192 break;
11193
43e9d192
IB
11194 case 'w':
11195 case 'x':
3520f7cc
JG
11196 if (x == const0_rtx
11197 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
43e9d192 11198 {
50ce6f88 11199 asm_fprintf (f, "%czr", code);
43e9d192
IB
11200 break;
11201 }
11202
11203 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
11204 {
50ce6f88 11205 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
43e9d192
IB
11206 break;
11207 }
11208
11209 if (REG_P (x) && REGNO (x) == SP_REGNUM)
11210 {
50ce6f88 11211 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
43e9d192
IB
11212 break;
11213 }
11214
11215 /* Fall through */
11216
11217 case 0:
43e9d192
IB
11218 if (x == NULL)
11219 {
11220 output_operand_lossage ("missing operand");
11221 return;
11222 }
11223
11224 switch (GET_CODE (x))
11225 {
11226 case REG:
43cacb12 11227 if (aarch64_sve_data_mode_p (GET_MODE (x)))
9f4cbab8
RS
11228 {
11229 if (REG_NREGS (x) == 1)
11230 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
11231 else
11232 {
11233 char suffix
11234 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
11235 asm_fprintf (f, "{z%d.%c - z%d.%c}",
11236 REGNO (x) - V0_REGNUM, suffix,
11237 END_REGNO (x) - V0_REGNUM - 1, suffix);
11238 }
11239 }
43cacb12
RS
11240 else
11241 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
43e9d192
IB
11242 break;
11243
11244 case MEM:
cc8ca59e 11245 output_address (GET_MODE (x), XEXP (x, 0));
43e9d192
IB
11246 break;
11247
11248 case LABEL_REF:
11249 case SYMBOL_REF:
11250 output_addr_const (asm_out_file, x);
11251 break;
11252
11253 case CONST_INT:
11254 asm_fprintf (f, "%wd", INTVAL (x));
11255 break;
11256
43cacb12
RS
11257 case CONST:
11258 if (!VECTOR_MODE_P (GET_MODE (x)))
3520f7cc 11259 {
43cacb12
RS
11260 output_addr_const (asm_out_file, x);
11261 break;
3520f7cc 11262 }
43cacb12
RS
11263 /* fall through */
11264
11265 case CONST_VECTOR:
11266 if (!const_vec_duplicate_p (x, &elt))
3520f7cc 11267 {
43cacb12
RS
11268 output_operand_lossage ("invalid vector constant");
11269 return;
3520f7cc 11270 }
43cacb12
RS
11271
11272 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
11273 asm_fprintf (f, "%wd", INTVAL (elt));
11274 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11275 && aarch64_print_vector_float_operand (f, x, false))
11276 ;
3520f7cc 11277 else
43cacb12
RS
11278 {
11279 output_operand_lossage ("invalid vector constant");
11280 return;
11281 }
43e9d192
IB
11282 break;
11283
3520f7cc 11284 case CONST_DOUBLE:
2ca5b430
KT
11285 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
11286 be getting CONST_DOUBLEs holding integers. */
11287 gcc_assert (GET_MODE (x) != VOIDmode);
11288 if (aarch64_float_const_zero_rtx_p (x))
3520f7cc
JG
11289 {
11290 fputc ('0', f);
11291 break;
11292 }
11293 else if (aarch64_float_const_representable_p (x))
11294 {
11295#define buf_size 20
11296 char float_buf[buf_size] = {'\0'};
34a72c33
RS
11297 real_to_decimal_for_mode (float_buf,
11298 CONST_DOUBLE_REAL_VALUE (x),
3520f7cc
JG
11299 buf_size, buf_size,
11300 1, GET_MODE (x));
11301 asm_fprintf (asm_out_file, "%s", float_buf);
11302 break;
11303#undef buf_size
11304 }
11305 output_operand_lossage ("invalid constant");
11306 return;
43e9d192
IB
11307 default:
11308 output_operand_lossage ("invalid operand");
11309 return;
11310 }
11311 break;
11312
11313 case 'A':
11314 if (GET_CODE (x) == HIGH)
11315 x = XEXP (x, 0);
11316
a6e0bfa7 11317 switch (aarch64_classify_symbolic_expression (x))
43e9d192 11318 {
6642bdb4 11319 case SYMBOL_SMALL_GOT_4G:
43e9d192
IB
11320 asm_fprintf (asm_out_file, ":got:");
11321 break;
11322
11323 case SYMBOL_SMALL_TLSGD:
11324 asm_fprintf (asm_out_file, ":tlsgd:");
11325 break;
11326
11327 case SYMBOL_SMALL_TLSDESC:
11328 asm_fprintf (asm_out_file, ":tlsdesc:");
11329 break;
11330
79496620 11331 case SYMBOL_SMALL_TLSIE:
43e9d192
IB
11332 asm_fprintf (asm_out_file, ":gottprel:");
11333 break;
11334
d18ba284 11335 case SYMBOL_TLSLE24:
43e9d192
IB
11336 asm_fprintf (asm_out_file, ":tprel:");
11337 break;
11338
87dd8ab0
MS
11339 case SYMBOL_TINY_GOT:
11340 gcc_unreachable ();
11341 break;
11342
43e9d192
IB
11343 default:
11344 break;
11345 }
11346 output_addr_const (asm_out_file, x);
11347 break;
11348
11349 case 'L':
a6e0bfa7 11350 switch (aarch64_classify_symbolic_expression (x))
43e9d192 11351 {
6642bdb4 11352 case SYMBOL_SMALL_GOT_4G:
a195c727 11353 asm_fprintf (asm_out_file, ":got_lo12:");
43e9d192
IB
11354 break;
11355
11356 case SYMBOL_SMALL_TLSGD:
11357 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
11358 break;
11359
11360 case SYMBOL_SMALL_TLSDESC:
11361 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
11362 break;
11363
79496620 11364 case SYMBOL_SMALL_TLSIE:
43e9d192
IB
11365 asm_fprintf (asm_out_file, ":gottprel_lo12:");
11366 break;
11367
cbf5629e
JW
11368 case SYMBOL_TLSLE12:
11369 asm_fprintf (asm_out_file, ":tprel_lo12:");
11370 break;
11371
d18ba284 11372 case SYMBOL_TLSLE24:
43e9d192
IB
11373 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
11374 break;
11375
87dd8ab0
MS
11376 case SYMBOL_TINY_GOT:
11377 asm_fprintf (asm_out_file, ":got:");
11378 break;
11379
5ae7caad
JW
11380 case SYMBOL_TINY_TLSIE:
11381 asm_fprintf (asm_out_file, ":gottprel:");
11382 break;
11383
43e9d192
IB
11384 default:
11385 break;
11386 }
11387 output_addr_const (asm_out_file, x);
11388 break;
11389
11390 case 'G':
a6e0bfa7 11391 switch (aarch64_classify_symbolic_expression (x))
43e9d192 11392 {
d18ba284 11393 case SYMBOL_TLSLE24:
43e9d192
IB
11394 asm_fprintf (asm_out_file, ":tprel_hi12:");
11395 break;
11396 default:
11397 break;
11398 }
11399 output_addr_const (asm_out_file, x);
11400 break;
11401
cf670503
ZC
11402 case 'k':
11403 {
c8012fbc 11404 HOST_WIDE_INT cond_code;
cf670503 11405
c8012fbc 11406 if (!CONST_INT_P (x))
cf670503
ZC
11407 {
11408 output_operand_lossage ("invalid operand for '%%%c'", code);
11409 return;
11410 }
11411
c8012fbc
WD
11412 cond_code = INTVAL (x);
11413 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
11414 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
cf670503
ZC
11415 }
11416 break;
11417
e69a816d
WD
11418 case 'y':
11419 case 'z':
11420 {
11421 machine_mode mode = GET_MODE (x);
11422
3793ecc1 11423 if (!MEM_P (x)
6a70badb 11424 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
e69a816d
WD
11425 {
11426 output_operand_lossage ("invalid operand for '%%%c'", code);
11427 return;
11428 }
11429
a25831ac
AV
11430 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
11431 code == 'y'
11432 ? ADDR_QUERY_LDP_STP_N
11433 : ADDR_QUERY_LDP_STP))
c348cab0 11434 output_operand_lossage ("invalid operand prefix '%%%c'", code);
e69a816d
WD
11435 }
11436 break;
11437
43e9d192
IB
11438 default:
11439 output_operand_lossage ("invalid operand prefix '%%%c'", code);
11440 return;
11441 }
11442}
11443
e69a816d
WD
11444/* Print address 'x' of a memory access with mode 'mode'.
11445 'op' is the context required by aarch64_classify_address. It can either be
11446 MEM for a normal memory access or PARALLEL for LDP/STP. */
c348cab0 11447static bool
a97d8b98
RS
11448aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
11449 aarch64_addr_query_type type)
43e9d192
IB
11450{
11451 struct aarch64_address_info addr;
550a3380 11452 unsigned int size, vec_flags;
43e9d192 11453
e69a816d 11454 /* Check all addresses are Pmode - including ILP32. */
31460ed2
JJ
11455 if (GET_MODE (x) != Pmode
11456 && (!CONST_INT_P (x)
11457 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
11458 {
11459 output_operand_lossage ("invalid address mode");
11460 return false;
11461 }
e69a816d 11462
a97d8b98 11463 if (aarch64_classify_address (&addr, x, mode, true, type))
43e9d192
IB
11464 switch (addr.type)
11465 {
11466 case ADDRESS_REG_IMM:
dc640181 11467 if (known_eq (addr.const_offset, 0))
43cacb12 11468 {
550a3380
RS
11469 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
11470 return true;
43cacb12 11471 }
550a3380
RS
11472
11473 vec_flags = aarch64_classify_vector_mode (mode);
11474 if (vec_flags & VEC_ANY_SVE)
43cacb12
RS
11475 {
11476 HOST_WIDE_INT vnum
11477 = exact_div (addr.const_offset,
550a3380 11478 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
43cacb12
RS
11479 asm_fprintf (f, "[%s, #%wd, mul vl]",
11480 reg_names[REGNO (addr.base)], vnum);
550a3380 11481 return true;
43cacb12 11482 }
550a3380
RS
11483
11484 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
11485 INTVAL (addr.offset));
c348cab0 11486 return true;
43e9d192
IB
11487
11488 case ADDRESS_REG_REG:
11489 if (addr.shift == 0)
16a3246f 11490 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
01a3a324 11491 reg_names [REGNO (addr.offset)]);
43e9d192 11492 else
16a3246f 11493 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
01a3a324 11494 reg_names [REGNO (addr.offset)], addr.shift);
c348cab0 11495 return true;
43e9d192
IB
11496
11497 case ADDRESS_REG_UXTW:
11498 if (addr.shift == 0)
16a3246f 11499 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
43e9d192
IB
11500 REGNO (addr.offset) - R0_REGNUM);
11501 else
16a3246f 11502 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
43e9d192 11503 REGNO (addr.offset) - R0_REGNUM, addr.shift);
c348cab0 11504 return true;
43e9d192
IB
11505
11506 case ADDRESS_REG_SXTW:
11507 if (addr.shift == 0)
16a3246f 11508 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
43e9d192
IB
11509 REGNO (addr.offset) - R0_REGNUM);
11510 else
16a3246f 11511 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
43e9d192 11512 REGNO (addr.offset) - R0_REGNUM, addr.shift);
c348cab0 11513 return true;
43e9d192
IB
11514
11515 case ADDRESS_REG_WB:
6a70badb
RS
11516 /* Writeback is only supported for fixed-width modes. */
11517 size = GET_MODE_SIZE (mode).to_constant ();
43e9d192
IB
11518 switch (GET_CODE (x))
11519 {
11520 case PRE_INC:
6a70badb 11521 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
c348cab0 11522 return true;
43e9d192 11523 case POST_INC:
6a70badb 11524 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
c348cab0 11525 return true;
43e9d192 11526 case PRE_DEC:
6a70badb 11527 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
c348cab0 11528 return true;
43e9d192 11529 case POST_DEC:
6a70badb 11530 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
c348cab0 11531 return true;
43e9d192 11532 case PRE_MODIFY:
6a70badb 11533 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
43e9d192 11534 INTVAL (addr.offset));
c348cab0 11535 return true;
43e9d192 11536 case POST_MODIFY:
6a70badb 11537 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
43e9d192 11538 INTVAL (addr.offset));
c348cab0 11539 return true;
43e9d192
IB
11540 default:
11541 break;
11542 }
11543 break;
11544
11545 case ADDRESS_LO_SUM:
16a3246f 11546 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
43e9d192
IB
11547 output_addr_const (f, addr.offset);
11548 asm_fprintf (f, "]");
c348cab0 11549 return true;
43e9d192
IB
11550
11551 case ADDRESS_SYMBOLIC:
d6591257 11552 output_addr_const (f, x);
c348cab0 11553 return true;
43e9d192
IB
11554 }
11555
c348cab0 11556 return false;
43e9d192
IB
11557}
11558
e69a816d
WD
11559/* Print address 'x' of a memory access with mode 'mode'. */
11560static void
11561aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
11562{
43cacb12 11563 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
c348cab0 11564 output_addr_const (f, x);
e69a816d
WD
11565}
11566
74b27d8e
RS
11567/* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
11568
11569static bool
11570aarch64_output_addr_const_extra (FILE *file, rtx x)
11571{
11572 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
11573 {
11574 output_addr_const (file, XVECEXP (x, 0, 0));
11575 return true;
11576 }
11577 return false;
11578}
11579
43e9d192
IB
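/* Return true if X mentions a label (a LABEL_REF), ignoring
   LABEL_REFs that appear inside UNSPEC_TLS operands. */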
11580bool
11581aarch64_label_mentioned_p (rtx x)
11582{
11583 const char *fmt;
11584 int i;
11585
3793ecc1 11586 if (LABEL_REF_P (x))
43e9d192
IB
11587 return true;
11588
11589 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
11590 referencing instruction, but they are constant offsets, not
11591 symbols. */
11592 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
11593 return false;
11594
11595 fmt = GET_RTX_FORMAT (GET_CODE (x));
11596 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
11597 {
11598 if (fmt[i] == 'E')
11599 {
11600 int j;
11601
11602 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
11603 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
11604 return true;
11605 }
11606 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
11607 return true;
11608 }
11609
11610 return false;
11611}
11612
11613/* Implement REGNO_REG_CLASS. */
11614
11615enum reg_class
11616aarch64_regno_regclass (unsigned regno)
11617{
96b7f495
MM
11618 if (STUB_REGNUM_P (regno))
11619 return STUB_REGS;
11620
43e9d192 11621 if (GP_REGNUM_P (regno))
a4a182c6 11622 return GENERAL_REGS;
43e9d192
IB
11623
11624 if (regno == SP_REGNUM)
11625 return STACK_REG;
11626
11627 if (regno == FRAME_POINTER_REGNUM
11628 || regno == ARG_POINTER_REGNUM)
f24bb080 11629 return POINTER_REGS;
43e9d192
IB
11630
11631 if (FP_REGNUM_P (regno))
163b1f6a
RS
11632 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
11633 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
43e9d192 11634
43cacb12
RS
11635 if (PR_REGNUM_P (regno))
11636 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
11637
183bfdaf
RS
11638 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
11639 return FFR_REGS;
11640
43e9d192
IB
11641 return NO_REGS;
11642}
11643
6a70badb
RS
11644/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
11645 If OFFSET is out of range, return an offset of an anchor point
11646 that is in range. Return 0 otherwise. */
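/* For example, a misaligned 4-byte access at offset 0x1003 is out of
   range for the unscaled form, so the anchor (0x1003 + 0x100) & ~0x1ff
   == 0x1000 is returned, leaving an in-range residual offset of 3. */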
11647
11648static HOST_WIDE_INT
11649aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
11650 machine_mode mode)
11651{
11652 /* Does it look like we'll need a 16-byte load/store-pair operation? */
11653 if (size > 16)
11654 return (offset + 0x400) & ~0x7f0;
11655
11656 /* For offsets that aren't a multiple of the access size, the limit is
11657 -256...255. */
11658 if (offset & (size - 1))
11659 {
11660 /* BLKmode typically uses LDP of X-registers. */
11661 if (mode == BLKmode)
11662 return (offset + 512) & ~0x3ff;
11663 return (offset + 0x100) & ~0x1ff;
11664 }
11665
11666 /* Small negative offsets are supported. */
11667 if (IN_RANGE (offset, -256, 0))
11668 return 0;
11669
11670 if (mode == TImode || mode == TFmode)
11671 return (offset + 0x100) & ~0x1ff;
11672
11673 /* Use a 12-bit offset scaled by the access size. */
11674 return offset & (~0xfff * size);
11675}
11676
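/* Implement TARGET_LEGITIMIZE_ADDRESS. Return a legitimized form of
   address X for MODE, or X itself if no improvement is possible. */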
0c4ec427 11677static rtx
ef4bddc2 11678aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
0c4ec427
RE
11679{
11680 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
11681 where mask is selected by alignment and size of the offset.
11682 We try to pick as large a range for the offset as possible to
11683 maximize the chance of a CSE. However, for aligned addresses
11684 we limit the range to 4k so that structures with different sized
e8426e0a
BC
11685 elements are likely to use the same base. We need to be careful
11686 not to split a CONST for some forms of address expression, otherwise
11687 it will generate sub-optimal code. */
0c4ec427
RE
11688
11689 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
11690 {
9e0218fc 11691 rtx base = XEXP (x, 0);
17d7bdd8 11692 rtx offset_rtx = XEXP (x, 1);
9e0218fc 11693 HOST_WIDE_INT offset = INTVAL (offset_rtx);
0c4ec427 11694
9e0218fc 11695 if (GET_CODE (base) == PLUS)
e8426e0a 11696 {
9e0218fc
RH
11697 rtx op0 = XEXP (base, 0);
11698 rtx op1 = XEXP (base, 1);
11699
11700 /* Force any scaling into a temp for CSE. */
11701 op0 = force_reg (Pmode, op0);
11702 op1 = force_reg (Pmode, op1);
11703
11704 /* Let the pointer register be in op0. */
11705 if (REG_POINTER (op1))
11706 std::swap (op0, op1);
11707
11708 /* If the pointer is virtual or frame related, then we know that
11709 virtual register instantiation or register elimination is going
11710 to apply a second constant. We want the two constants folded
11711 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
11712 if (virt_or_elim_regno_p (REGNO (op0)))
e8426e0a 11713 {
9e0218fc
RH
11714 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
11715 NULL_RTX, true, OPTAB_DIRECT);
11716 return gen_rtx_PLUS (Pmode, base, op1);
e8426e0a 11717 }
e8426e0a 11718
9e0218fc
RH
11719 /* Otherwise, in order to encourage CSE (and thence loop strength
11720 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
11721 base = expand_binop (Pmode, add_optab, op0, op1,
11722 NULL_RTX, true, OPTAB_DIRECT);
11723 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
e8426e0a
BC
11724 }
11725
6a70badb
RS
11726 HOST_WIDE_INT size;
11727 if (GET_MODE_SIZE (mode).is_constant (&size))
ff0f3f1c 11728 {
6a70badb
RS
11729 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
11730 mode);
11731 if (base_offset != 0)
11732 {
11733 base = plus_constant (Pmode, base, base_offset);
11734 base = force_operand (base, NULL_RTX);
11735 return plus_constant (Pmode, base, offset - base_offset);
11736 }
9e0218fc 11737 }
0c4ec427
RE
11738 }
11739
11740 return x;
11741}
11742
43e9d192
IB
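/* Implement TARGET_SECONDARY_RELOAD. Return the class of scratch
   register needed to copy X into a register of class RCLASS in MODE,
   or NO_REGS if no scratch is needed (in some cases after pointing
   SRI->icode at a reload pattern that handles the copy). */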
11743static reg_class_t
11744aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
11745 reg_class_t rclass,
ef4bddc2 11746 machine_mode mode,
43e9d192
IB
11747 secondary_reload_info *sri)
11748{
cc68f7c2
RS
11749 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
11750 LDR and STR. See the comment at the head of aarch64-sve.md for
11751 more details about the big-endian handling. */
11752 if (reg_class_subset_p (rclass, FP_REGS)
9a1b9cb4
RS
11753 && !((REG_P (x) && HARD_REGISTER_P (x))
11754 || aarch64_simd_valid_immediate (x, NULL))
cc68f7c2 11755 && mode != VNx16QImode)
43cacb12 11756 {
cc68f7c2
RS
11757 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11758 if ((vec_flags & VEC_SVE_DATA)
11759 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
11760 {
11761 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
11762 return NO_REGS;
11763 }
43cacb12 11764 }
b4f50fd4
RR
11765
11766 /* If we have to disable direct literal pool loads and stores because the
11767 function is too big, then we need a scratch register. */
3793ecc1 11768 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
b4f50fd4
RR
11769 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
11770 || targetm.vector_mode_supported_p (GET_MODE (x)))
9ee6540a 11771 && !aarch64_pcrelative_literal_loads)
b4f50fd4 11772 {
0016d8d9 11773 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
b4f50fd4
RR
11774 return NO_REGS;
11775 }
11776
43e9d192
IB
11777 /* Without the TARGET_SIMD instructions we cannot move a Q register
11778 to a Q register directly. We need a scratch. */
11779 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
11780 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
11781 && reg_class_subset_p (rclass, FP_REGS))
11782 {
0016d8d9 11783 sri->icode = code_for_aarch64_reload_mov (mode);
43e9d192
IB
11784 return NO_REGS;
11785 }
11786
11787 /* A TFmode or TImode memory access should be handled via FP_REGS
11788 because AArch64 has richer addressing modes for LDR/STR instructions
11789 than LDP/STP instructions. */
d5726973 11790 if (TARGET_FLOAT && rclass == GENERAL_REGS
6a70badb 11791 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
43e9d192
IB
11792 return FP_REGS;
11793
11794 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
a4a182c6 11795 return GENERAL_REGS;
43e9d192
IB
11796
11797 return NO_REGS;
11798}
11799
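/* Implement TARGET_CAN_ELIMINATE. Return true if register FROM can
   be eliminated in favour of register TO. */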
11800static bool
6216fd90 11801aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
43e9d192 11802{
6216fd90 11803 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
43e9d192 11804
6216fd90
WD
11805 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
11806 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
43e9d192 11807 if (frame_pointer_needed)
6216fd90 11808 return to == HARD_FRAME_POINTER_REGNUM;
43e9d192
IB
11809 return true;
11810}
11811
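/* Implement INITIAL_ELIMINATION_OFFSET. Return the initial offset
   between elimination registers FROM and TO for the current frame. */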
6a70badb 11812poly_int64
43e9d192
IB
11813aarch64_initial_elimination_offset (unsigned from, unsigned to)
11814{
78c29983
MS
11815 if (to == HARD_FRAME_POINTER_REGNUM)
11816 {
11817 if (from == ARG_POINTER_REGNUM)
71bfb77a 11818 return cfun->machine->frame.hard_fp_offset;
78c29983
MS
11819
11820 if (from == FRAME_POINTER_REGNUM)
71bfb77a
WD
11821 return cfun->machine->frame.hard_fp_offset
11822 - cfun->machine->frame.locals_offset;
78c29983
MS
11823 }
11824
11825 if (to == STACK_POINTER_REGNUM)
11826 {
11827 if (from == FRAME_POINTER_REGNUM)
71bfb77a
WD
11828 return cfun->machine->frame.frame_size
11829 - cfun->machine->frame.locals_offset;
78c29983
MS
11830 }
11831
1c960e02 11832 return cfun->machine->frame.frame_size;
43e9d192
IB
11833}
11834
463a54e5
SN
11835
11836/* Get return address without mangling. */
11837
11838rtx
11839aarch64_return_addr_rtx (void)
11840{
11841 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
11842 /* Note: aarch64_return_address_signing_enabled only
11843 works after cfun->machine->frame.laid_out is set,
11844 so here we don't know if the return address will
11845 be signed or not. */
11846 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
11847 emit_move_insn (lr, val);
11848 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
11849 return lr;
11850}
11851
11852
43e9d192
IB
11853/* Implement RETURN_ADDR_RTX. We do not support moving back to a
11854 previous frame. */
11855
11856rtx
11857aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
11858{
11859 if (count != 0)
11860 return const0_rtx;
463a54e5 11861 return aarch64_return_addr_rtx ();
43e9d192
IB
11862}
11863
43e9d192
IB
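/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE. Output to F the template
   code for a nested-function trampoline; the target address and static
   chain slots are filled in later by aarch64_trampoline_init. */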
11864static void
11865aarch64_asm_trampoline_template (FILE *f)
11866{
be7c41a5
OT
11867 /* Even if the current function doesn't have branch protection, some
11868 later function might, so since this template is only generated once
11869 we have to add a BTI just in case. */
11870 asm_fprintf (f, "\thint\t34 // bti c\n");
b5f794b4 11871
28514dda
YZ
11872 if (TARGET_ILP32)
11873 {
be178ecd
MM
11874 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
11875 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
28514dda
YZ
11876 }
11877 else
11878 {
be178ecd
MM
11879 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
11880 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
28514dda 11881 }
01a3a324 11882 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
b5f794b4 11883
be178ecd
MM
11884 /* We always emit a speculation barrier.
11885 This is because the same trampoline template is used for every nested
11886 function. Since nested functions are not particularly common or
11887 performant we don't worry too much about the extra instructions to copy
11888 around.
11889 This is not yet a problem, since we have not yet implemented function
11890 specific attributes to choose between hardening against straight line
11891 speculation or not, but such function specific attributes are likely to
11892 happen in the future. */
11893 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
11894
28514dda
YZ
11895 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
11896 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
43e9d192
IB
11897}
11898
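/* Implement TARGET_TRAMPOLINE_INIT. Copy the trampoline template into
   M_TRAMP and store the address of FNDECL and the static chain
   CHAIN_VALUE after the code. */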
11899static void
11900aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
11901{
11902 rtx fnaddr, mem, a_tramp;
be178ecd 11903 const int tramp_code_sz = 24;
43e9d192
IB
11904
11905 /* Don't need to copy the trailing D-words, we fill those in below. */
be178ecd
MM
11906 /* We create our own memory address in Pmode so that `emit_block_move` can
11907 use parts of the backend which expect Pmode addresses. */
11908 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
11909 emit_block_move (gen_rtx_MEM (BLKmode, temp),
11910 assemble_trampoline_template (),
28514dda
YZ
11911 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
11912 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
43e9d192 11913 fnaddr = XEXP (DECL_RTL (fndecl), 0);
28514dda
YZ
11914 if (GET_MODE (fnaddr) != ptr_mode)
11915 fnaddr = convert_memory_address (ptr_mode, fnaddr);
43e9d192
IB
11916 emit_move_insn (mem, fnaddr);
11917
28514dda 11918 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
43e9d192
IB
11919 emit_move_insn (mem, chain_value);
11920
11921 /* XXX We should really define a "clear_cache" pattern and use
11922 gen_clear_cache(). */
11923 a_tramp = XEXP (m_tramp, 0);
c05ece92
AO
11924 maybe_emit_call_builtin___clear_cache (a_tramp,
11925 plus_constant (ptr_mode,
11926 a_tramp,
11927 TRAMPOLINE_SIZE));
43e9d192
IB
11928}
11929
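/* Implement TARGET_CLASS_MAX_NREGS. Return the maximum number of
   consecutive registers of class REGCLASS needed to hold a value of
   mode MODE. */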
11930static unsigned char
ef4bddc2 11931aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
43e9d192 11932{
6a70badb
RS
11933 /* ??? Logically we should only need to provide a value when
11934 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
11935 can hold MODE, but at the moment we need to handle all modes.
11936 Just ignore any runtime parts for registers that can't store them. */
11937 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
550a3380 11938 unsigned int nregs, vec_flags;
43e9d192
IB
11939 switch (regclass)
11940 {
96b7f495 11941 case STUB_REGS:
d677263e 11942 case TAILCALL_ADDR_REGS:
43e9d192
IB
11943 case POINTER_REGS:
11944 case GENERAL_REGS:
11945 case ALL_REGS:
f25a140b 11946 case POINTER_AND_FP_REGS:
43e9d192
IB
11947 case FP_REGS:
11948 case FP_LO_REGS:
163b1f6a 11949 case FP_LO8_REGS:
550a3380
RS
11950 vec_flags = aarch64_classify_vector_mode (mode);
11951 if ((vec_flags & VEC_SVE_DATA)
43cacb12 11952 && constant_multiple_p (GET_MODE_SIZE (mode),
550a3380 11953 aarch64_vl_bytes (mode, vec_flags), &nregs))
43cacb12 11954 return nregs;
550a3380 11955 return (vec_flags & VEC_ADVSIMD
6a70badb
RS
11956 ? CEIL (lowest_size, UNITS_PER_VREG)
11957 : CEIL (lowest_size, UNITS_PER_WORD));
43e9d192 11958 case STACK_REG:
43cacb12
RS
11959 case PR_REGS:
11960 case PR_LO_REGS:
11961 case PR_HI_REGS:
183bfdaf
RS
11962 case FFR_REGS:
11963 case PR_AND_FFR_REGS:
43e9d192
IB
11964 return 1;
11965
11966 case NO_REGS:
11967 return 0;
11968
11969 default:
11970 break;
11971 }
11972 gcc_unreachable ();
11973}
11974
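/* Implement TARGET_PREFERRED_RELOAD_CLASS. */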
11975static reg_class_t
78d8b9f0 11976aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
43e9d192 11977{
51bb310d 11978 if (regclass == POINTER_REGS)
78d8b9f0
IB
11979 return GENERAL_REGS;
11980
51bb310d
MS
11981 if (regclass == STACK_REG)
11982 {
11983 if (REG_P (x)
11984 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
11985 return regclass;
11986
11987 return NO_REGS;
11988 }
11989
27bd251b
IB
11990 /* Register elimination can result in a request for
11991 SP+constant->FP_REGS. We cannot support such operations which
11992 use SP as source and an FP_REG as destination, so reject them
11993 outright. */
11994 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
11995 {
11996 rtx lhs = XEXP (x, 0);
11997
11998 /* Look through a possible SUBREG introduced by ILP32. */
3793ecc1 11999 if (SUBREG_P (lhs))
27bd251b
IB
12000 lhs = SUBREG_REG (lhs);
12001
12002 gcc_assert (REG_P (lhs));
12003 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
12004 POINTER_REGS));
12005 return NO_REGS;
12006 }
12007
78d8b9f0 12008 return regclass;
43e9d192
IB
12009}
12010
12011void
12012aarch64_asm_output_labelref (FILE* f, const char *name)
12013{
12014 asm_fprintf (f, "%U%s", name);
12015}
12016
12017static void
12018aarch64_elf_asm_constructor (rtx symbol, int priority)
12019{
12020 if (priority == DEFAULT_INIT_PRIORITY)
12021 default_ctor_section_asm_out_constructor (symbol, priority);
12022 else
12023 {
12024 section *s;
53d190c1
AT
12025 /* Although priority is known to be in range [0, 65535], so that 18
12026 bytes would be enough, the compiler might not know that. To avoid
12027 a -Wformat-truncation false positive, use a larger size. */
12028 char buf[23];
43e9d192 12029 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
fcef3abd 12030 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
43e9d192
IB
12031 switch_to_section (s);
12032 assemble_align (POINTER_SIZE);
28514dda 12033 assemble_aligned_integer (POINTER_BYTES, symbol);
43e9d192
IB
12034 }
12035}
12036
12037static void
12038aarch64_elf_asm_destructor (rtx symbol, int priority)
12039{
12040 if (priority == DEFAULT_INIT_PRIORITY)
12041 default_dtor_section_asm_out_destructor (symbol, priority);
12042 else
12043 {
12044 section *s;
53d190c1
AT
12045 /* Although priority is known to be in range [0, 65535], so that 18
12046 bytes would be enough, the compiler might not know that. To avoid
12047 a -Wformat-truncation false positive, use a larger size. */
12048 char buf[23];
43e9d192 12049 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
fcef3abd 12050 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
43e9d192
IB
12051 switch_to_section (s);
12052 assemble_align (POINTER_SIZE);
28514dda 12053 assemble_aligned_integer (POINTER_BYTES, symbol);
43e9d192
IB
12054 }
12055}
12056
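/* Output the assembly for a jump-table dispatch: load the table entry
   indexed by operand 1 from the table at operand 0, add the
   sign-extended, scaled entry to the table's label address (operand 4)
   and branch to the result in operand 3. */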
12057const char*
12058aarch64_output_casesi (rtx *operands)
12059{
12060 char buf[100];
12061 char label[100];
b32d5189 12062 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
43e9d192
IB
12063 int index;
12064 static const char *const patterns[4][2] =
12065 {
12066 {
12067 "ldrb\t%w3, [%0,%w1,uxtw]",
12068 "add\t%3, %4, %w3, sxtb #2"
12069 },
12070 {
12071 "ldrh\t%w3, [%0,%w1,uxtw #1]",
12072 "add\t%3, %4, %w3, sxth #2"
12073 },
12074 {
12075 "ldr\t%w3, [%0,%w1,uxtw #2]",
12076 "add\t%3, %4, %w3, sxtw #2"
12077 },
12078 /* We assume that DImode is only generated when not optimizing and
12079 that we don't really need 64-bit address offsets. That would
12080 imply an object file with 8GB of code in a single function! */
12081 {
12082 "ldr\t%w3, [%0,%w1,uxtw #2]",
12083 "add\t%3, %4, %w3, sxtw #2"
12084 }
12085 };
12086
12087 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
12088
77e994c9
RS
12089 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
12090 index = exact_log2 (GET_MODE_SIZE (mode));
43e9d192
IB
12091
12092 gcc_assert (index >= 0 && index <= 3);
12093
12094 /* Need to implement table size reduction, by changing the code below. */
12095 output_asm_insn (patterns[index][0], operands);
12096 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
12097 snprintf (buf, sizeof (buf),
12098 "adr\t%%4, %s", targetm.strip_name_encoding (label));
12099 output_asm_insn (buf, operands);
12100 output_asm_insn (patterns[index][1], operands);
12101 output_asm_insn ("br\t%3", operands);
be178ecd
MM
12102 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
12103 operands);
43e9d192
IB
12104 assemble_label (asm_out_file, label);
12105 return "";
12106}
12107
12108
12109/* Return size in bits of an arithmetic operand which is shifted/scaled and
12110 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
12111 operator. */
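/* For example, aarch64_uxt_size (1, 0x1fe) returns 8, since
   0x1fe == 0xff << 1, i.e. a byte mask scaled by a shift of 1. */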
12112
12113int
12114aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
12115{
12116 if (shift >= 0 && shift <= 3)
12117 {
12118 int size;
12119 for (size = 8; size <= 32; size *= 2)
12120 {
12121 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
12122 if (mask == bits << shift)
12123 return size;
12124 }
12125 }
12126 return 0;
12127}
12128
e78d485e
RR
12129/* Constant pools are per-function only when PC-relative
12130 literal loads are enabled or we are using the large memory
12131 model. */
12132
12133static inline bool
12134aarch64_can_use_per_function_literal_pools_p (void)
12135{
9ee6540a 12136 return (aarch64_pcrelative_literal_loads
e78d485e
RR
12137 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
12138}
12139
43e9d192 12140static bool
e78d485e 12141aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
43e9d192 12142{
74a9301d
VM
12143 /* We can't use blocks for constants when we're using a per-function
12144 constant pool. */
12145 return !aarch64_can_use_per_function_literal_pools_p ();
43e9d192
IB
12146}
12147
e78d485e
RR
12148/* Select appropriate section for constants depending
12149 on where we place literal pools. */
12150
43e9d192 12151static section *
e78d485e
RR
12152aarch64_select_rtx_section (machine_mode mode,
12153 rtx x,
12154 unsigned HOST_WIDE_INT align)
43e9d192 12155{
e78d485e
RR
12156 if (aarch64_can_use_per_function_literal_pools_p ())
12157 return function_section (current_function_decl);
43e9d192 12158
e78d485e
RR
12159 return default_elf_select_rtx_section (mode, x, align);
12160}
43e9d192 12161
5fca7b66
RH
12162/* Implement ASM_OUTPUT_POOL_EPILOGUE. */
12163void
12164aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
12165 HOST_WIDE_INT offset)
12166{
12167 /* When using per-function literal pools, we must ensure that any code
12168 section is aligned to the minimal instruction length, lest we get
12169 errors from the assembler re "unaligned instructions". */
12170 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
12171 ASM_OUTPUT_ALIGN (f, 2);
12172}
12173
43e9d192
IB
12174/* Costs. */
12175
12176/* Helper function for rtx cost calculation. Strip a shift expression
12177 from X. Returns the inner operand if successful, or the original
12178 expression on failure. */
12179static rtx
12180aarch64_strip_shift (rtx x)
12181{
12182 rtx op = x;
12183
57b77d46
RE
12184 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
12185 we can convert both to ROR during final output. */
43e9d192
IB
12186 if ((GET_CODE (op) == ASHIFT
12187 || GET_CODE (op) == ASHIFTRT
57b77d46
RE
12188 || GET_CODE (op) == LSHIFTRT
12189 || GET_CODE (op) == ROTATERT
12190 || GET_CODE (op) == ROTATE)
43e9d192
IB
12191 && CONST_INT_P (XEXP (op, 1)))
12192 return XEXP (op, 0);
12193
12194 if (GET_CODE (op) == MULT
12195 && CONST_INT_P (XEXP (op, 1))
12196 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
12197 return XEXP (op, 0);
12198
12199 return x;
12200}
12201
4745e701 12202/* Helper function for rtx cost calculation. Strip an extend
43e9d192
IB
12203 expression from X. Returns the inner operand if successful, or the
12204 original expression on failure. We deal with a number of possible
b10f1009
AP
12205 canonicalization variations here. If STRIP_SHIFT is true, then
12206 we can strip off a shift also. */
43e9d192 12207static rtx
b10f1009 12208aarch64_strip_extend (rtx x, bool strip_shift)
43e9d192 12209{
77e994c9 12210 scalar_int_mode mode;
43e9d192
IB
12211 rtx op = x;
12212
77e994c9
RS
12213 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
12214 return op;
12215
43e9d192
IB
12216 if (GET_CODE (op) == AND
12217 && GET_CODE (XEXP (op, 0)) == MULT
12218 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
12219 && CONST_INT_P (XEXP (op, 1))
12220 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
12221 INTVAL (XEXP (op, 1))) != 0)
12222 return XEXP (XEXP (op, 0), 0);
12223
12224 /* Now handle extended register, as this may also have an optional
12225 left shift by 1..4. */
b10f1009
AP
12226 if (strip_shift
12227 && GET_CODE (op) == ASHIFT
43e9d192
IB
12228 && CONST_INT_P (XEXP (op, 1))
12229 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
12230 op = XEXP (op, 0);
12231
12232 if (GET_CODE (op) == ZERO_EXTEND
12233 || GET_CODE (op) == SIGN_EXTEND)
12234 op = XEXP (op, 0);
12235
12236 if (op != x)
12237 return op;
12238
4745e701
JG
12239 return x;
12240}
12241
63834c84
JW
12242/* Helper function for rtx cost calculation. Strip extension as well as any
12243 inner VEC_SELECT high-half from X. Returns the inner vector operand if
12244 successful, or the original expression on failure. */
12245static rtx
12246aarch64_strip_extend_vec_half (rtx x)
12247{
12248 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
12249 {
12250 x = XEXP (x, 0);
12251 if (GET_CODE (x) == VEC_SELECT
12252 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
12253 XEXP (x, 1)))
12254 x = XEXP (x, 0);
12255 }
12256 return x;
12257}
1d65c9d2
JW
12258
12259/* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
12260 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
12261 operand if successful, or the original expression on failure. */
12262static rtx
12263aarch64_strip_duplicate_vec_elt (rtx x)
12264{
12265 if (GET_CODE (x) == VEC_DUPLICATE
12266 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
12267 {
12268 x = XEXP (x, 0);
12269 if (GET_CODE (x) == VEC_SELECT)
12270 x = XEXP (x, 0);
12271 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
12272 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
12273 x = XEXP (XEXP (x, 0), 0);
12274 }
12275 return x;
12276}
12277
0a78ebe4
KT
12278/* Return true iff CODE is a shift supported in combination
12279 with arithmetic instructions. */
4d1919ed 12280
0a78ebe4
KT
12281static bool
12282aarch64_shift_p (enum rtx_code code)
12283{
12284 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
12285}
12286
b10f1009
AP
12287
12288/* Return true iff X is a cheap shift without a sign extend. */
12289
12290static bool
12291aarch64_cheap_mult_shift_p (rtx x)
12292{
12293 rtx op0, op1;
12294
12295 op0 = XEXP (x, 0);
12296 op1 = XEXP (x, 1);
12297
12298 if (!(aarch64_tune_params.extra_tuning_flags
12299 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
12300 return false;
12301
12302 if (GET_CODE (op0) == SIGN_EXTEND)
12303 return false;
12304
12305 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
12306 && UINTVAL (op1) <= 4)
12307 return true;
12308
12309 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
12310 return false;
12311
12312 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
12313
12314 if (l2 > 0 && l2 <= 4)
12315 return true;
12316
12317 return false;
12318}
12319
4745e701 12320/* Helper function for rtx cost calculation. Calculate the cost of
0a78ebe4
KT
12321 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
12322 Return the calculated cost of the expression, recursing manually in to
4745e701
JG
12323 operands where needed. */
12324
12325static int
e548c9df 12326aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
4745e701
JG
12327{
12328 rtx op0, op1;
12329 const struct cpu_cost_table *extra_cost
b175b679 12330 = aarch64_tune_params.insn_extra_cost;
4745e701 12331 int cost = 0;
0a78ebe4 12332 bool compound_p = (outer == PLUS || outer == MINUS);
ef4bddc2 12333 machine_mode mode = GET_MODE (x);
4745e701
JG
12334
12335 gcc_checking_assert (code == MULT);
12336
12337 op0 = XEXP (x, 0);
12338 op1 = XEXP (x, 1);
12339
12340 if (VECTOR_MODE_P (mode))
df81764b
TC
12341 {
12342 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
df81764b
TC
12343 if (vec_flags & VEC_ADVSIMD)
12344 {
63834c84
JW
12345 /* The select-operand-high-half versions of the instruction have the
12346 same cost as the three-vector version, so don't add the costs of the
12347 extension or selection into the costs of the multiply. */
12348 op0 = aarch64_strip_extend_vec_half (op0);
12349 op1 = aarch64_strip_extend_vec_half (op1);
df81764b 12350 /* The by-element versions of the instruction have the same costs as
1d65c9d2
JW
12351 the normal 3-vector version. We make an assumption that the input
12352 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
12353 costing of a MUL by element pre RA is a bit optimistic. */
12354 op0 = aarch64_strip_duplicate_vec_elt (op0);
12355 op1 = aarch64_strip_duplicate_vec_elt (op1);
df81764b 12356 }
a11ef532
AV
12357 cost += rtx_cost (op0, mode, MULT, 0, speed);
12358 cost += rtx_cost (op1, mode, MULT, 1, speed);
12359 if (speed)
12360 {
12361 if (GET_CODE (x) == MULT)
12362 cost += extra_cost->vect.mult;
12363 /* This is to catch the SSRA costing currently flowing here. */
12364 else
12365 cost += extra_cost->vect.alu;
12366 }
12367 return cost;
df81764b 12368 }
4745e701
JG
12369
12370 /* Integer multiply/fma. */
12371 if (GET_MODE_CLASS (mode) == MODE_INT)
12372 {
12373 /* The multiply will be canonicalized as a shift, cost it as such. */
0a78ebe4
KT
12374 if (aarch64_shift_p (GET_CODE (x))
12375 || (CONST_INT_P (op1)
12376 && exact_log2 (INTVAL (op1)) > 0))
4745e701 12377 {
0a78ebe4
KT
12378 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
12379 || GET_CODE (op0) == SIGN_EXTEND;
4745e701
JG
12380 if (speed)
12381 {
0a78ebe4
KT
12382 if (compound_p)
12383 {
b10f1009
AP
12384 /* If the shift is considered cheap,
12385 then don't add any cost. */
12386 if (aarch64_cheap_mult_shift_p (x))
12387 ;
12388 else if (REG_P (op1))
0a78ebe4
KT
12389 /* ARITH + shift-by-register. */
12390 cost += extra_cost->alu.arith_shift_reg;
12391 else if (is_extend)
12392 /* ARITH + extended register. We don't have a cost field
12393 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
12394 cost += extra_cost->alu.extend_arith;
12395 else
12396 /* ARITH + shift-by-immediate. */
12397 cost += extra_cost->alu.arith_shift;
12398 }
4745e701
JG
12399 else
12400 /* LSL (immediate). */
0a78ebe4
KT
12401 cost += extra_cost->alu.shift;
12402
4745e701 12403 }
0a78ebe4
KT
12404 /* Strip extends as we will have costed them in the case above. */
12405 if (is_extend)
b10f1009 12406 op0 = aarch64_strip_extend (op0, true);
4745e701 12407
e548c9df 12408 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
4745e701
JG
12409
12410 return cost;
12411 }
12412
d2ac256b
KT
12413 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
12414 compound and let the below cases handle it. After all, MNEG is a
12415 special-case alias of MSUB. */
12416 if (GET_CODE (op0) == NEG)
12417 {
12418 op0 = XEXP (op0, 0);
12419 compound_p = true;
12420 }
12421
4745e701
JG
12422 /* Integer multiplies or FMAs have zero/sign extending variants. */
12423 if ((GET_CODE (op0) == ZERO_EXTEND
12424 && GET_CODE (op1) == ZERO_EXTEND)
12425 || (GET_CODE (op0) == SIGN_EXTEND
12426 && GET_CODE (op1) == SIGN_EXTEND))
12427 {
e548c9df
AM
12428 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
12429 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
4745e701
JG
12430
12431 if (speed)
12432 {
0a78ebe4 12433 if (compound_p)
d2ac256b 12434 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
4745e701
JG
12435 cost += extra_cost->mult[0].extend_add;
12436 else
12437 /* MUL/SMULL/UMULL. */
12438 cost += extra_cost->mult[0].extend;
12439 }
12440
12441 return cost;
12442 }
12443
d2ac256b 12444 /* This is either an integer multiply or a MADD. In both cases
4745e701 12445 we want to recurse and cost the operands. */
e548c9df
AM
12446 cost += rtx_cost (op0, mode, MULT, 0, speed);
12447 cost += rtx_cost (op1, mode, MULT, 1, speed);
4745e701
JG
12448
12449 if (speed)
12450 {
0a78ebe4 12451 if (compound_p)
d2ac256b 12452 /* MADD/MSUB. */
4745e701
JG
12453 cost += extra_cost->mult[mode == DImode].add;
12454 else
12455 /* MUL. */
12456 cost += extra_cost->mult[mode == DImode].simple;
12457 }
12458
12459 return cost;
12460 }
12461 else
12462 {
12463 if (speed)
12464 {
3d840f7d 12465 /* Floating-point FMA/FMUL can also support negations of the
d318517d
SN
12466 operands, unless the rounding mode is upward or downward in
12467 which case FNMUL is different than FMUL with operand negation. */
12468 bool neg0 = GET_CODE (op0) == NEG;
12469 bool neg1 = GET_CODE (op1) == NEG;
12470 if (compound_p || !flag_rounding_math || (neg0 && neg1))
12471 {
12472 if (neg0)
12473 op0 = XEXP (op0, 0);
12474 if (neg1)
12475 op1 = XEXP (op1, 0);
12476 }
4745e701 12477
0a78ebe4 12478 if (compound_p)
4745e701
JG
12479 /* FMADD/FNMADD/FNMSUB/FMSUB. */
12480 cost += extra_cost->fp[mode == DFmode].fma;
12481 else
3d840f7d 12482 /* FMUL/FNMUL. */
4745e701
JG
12483 cost += extra_cost->fp[mode == DFmode].mult;
12484 }
12485
e548c9df
AM
12486 cost += rtx_cost (op0, mode, MULT, 0, speed);
12487 cost += rtx_cost (op1, mode, MULT, 1, speed);
4745e701
JG
12488 return cost;
12489 }
43e9d192
IB
12490}
12491
67747367
JG
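/* Implement TARGET_ADDRESS_COST. Return the cost of memory address X
   when used to access MODE, optimizing for speed if SPEED is true. */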
12492static int
12493aarch64_address_cost (rtx x,
ef4bddc2 12494 machine_mode mode,
67747367
JG
12495 addr_space_t as ATTRIBUTE_UNUSED,
12496 bool speed)
12497{
12498 enum rtx_code c = GET_CODE (x);
b175b679 12499 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
67747367
JG
12500 struct aarch64_address_info info;
12501 int cost = 0;
12502 info.shift = 0;
12503
a97d8b98 12504 if (!aarch64_classify_address (&info, x, mode, false))
67747367 12505 {
3793ecc1 12506 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
67747367
JG
12507 {
12508 /* This is a CONST or SYMBOL ref which will be split
12509 in a different way depending on the code model in use.
12510 Cost it through the generic infrastructure. */
e548c9df 12511 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
67747367
JG
12512 /* Divide through by the cost of one instruction to
12513 bring it to the same units as the address costs. */
12514 cost_symbol_ref /= COSTS_N_INSNS (1);
12515 /* The cost is then the cost of preparing the address,
12516 followed by an immediate (possibly 0) offset. */
12517 return cost_symbol_ref + addr_cost->imm_offset;
12518 }
12519 else
12520 {
12521 /* This is most likely a jump table from a case
12522 statement. */
12523 return addr_cost->register_offset;
12524 }
12525 }
12526
12527 switch (info.type)
12528 {
12529 case ADDRESS_LO_SUM:
12530 case ADDRESS_SYMBOLIC:
12531 case ADDRESS_REG_IMM:
12532 cost += addr_cost->imm_offset;
12533 break;
12534
12535 case ADDRESS_REG_WB:
12536 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
12537 cost += addr_cost->pre_modify;
12538 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6b8b0c8e 12539 {
05783fe6
RS
12540 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
12541 if (nvectors == 3)
6b8b0c8e 12542 cost += addr_cost->post_modify_ld3_st3;
05783fe6 12543 else if (nvectors == 4)
6b8b0c8e
RS
12544 cost += addr_cost->post_modify_ld4_st4;
12545 else
12546 cost += addr_cost->post_modify;
12547 }
67747367
JG
12548 else
12549 gcc_unreachable ();
12550
12551 break;
12552
12553 case ADDRESS_REG_REG:
12554 cost += addr_cost->register_offset;
12555 break;
12556
67747367 12557 case ADDRESS_REG_SXTW:
783879e6
EM
12558 cost += addr_cost->register_sextend;
12559 break;
12560
12561 case ADDRESS_REG_UXTW:
12562 cost += addr_cost->register_zextend;
67747367
JG
12563 break;
12564
12565 default:
12566 gcc_unreachable ();
12567 }
12568
12569
12570 if (info.shift > 0)
12571 {
12572 /* For the sake of calculating the cost of the shifted register
12573 component, we can treat same sized modes in the same way. */
6a70badb
RS
12574 if (known_eq (GET_MODE_BITSIZE (mode), 16))
12575 cost += addr_cost->addr_scale_costs.hi;
12576 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
12577 cost += addr_cost->addr_scale_costs.si;
12578 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
12579 cost += addr_cost->addr_scale_costs.di;
12580 else
12581 /* We can't tell, or this is a 128-bit vector. */
12582 cost += addr_cost->addr_scale_costs.ti;
67747367
JG
12583 }
12584
12585 return cost;
12586}
12587
b9066f5a
MW
12588/* Return the cost of a branch. If SPEED_P is true then the compiler is
12589 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
12590 to be taken. */
12591
12592int
12593aarch64_branch_cost (bool speed_p, bool predictable_p)
12594{
12595 /* When optimizing for speed, use the cost of unpredictable branches. */
12596 const struct cpu_branch_cost *branch_costs =
b175b679 12597 aarch64_tune_params.branch_costs;
b9066f5a
MW
12598
12599 if (!speed_p || predictable_p)
12600 return branch_costs->predictable;
12601 else
12602 return branch_costs->unpredictable;
12603}
12604
7de23b8c 12605/* Return true if X is a zero or sign extract
7cc2145f
JG
12606 usable in an ADD or SUB (extended register) instruction. */
12607static bool
7de23b8c 12608aarch64_rtx_arith_op_extract_p (rtx x)
7cc2145f 12609{
e47c4031
KT
12610 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
12611 No shift. */
7de23b8c
AC
12612 if (GET_CODE (x) == SIGN_EXTEND
12613 || GET_CODE (x) == ZERO_EXTEND)
e47c4031 12614 return REG_P (XEXP (x, 0));
7cc2145f
JG
12615
12616 return false;
12617}
12618
61263118
KT
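/* Return true if U is the UNSPEC number of one of the FRINT
   floating-point rounding operations. */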
12619static bool
12620aarch64_frint_unspec_p (unsigned int u)
12621{
12622 switch (u)
12623 {
12624 case UNSPEC_FRINTZ:
12625 case UNSPEC_FRINTP:
12626 case UNSPEC_FRINTM:
12627 case UNSPEC_FRINTA:
12628 case UNSPEC_FRINTN:
12629 case UNSPEC_FRINTX:
12630 case UNSPEC_FRINTI:
12631 return true;
12632
12633 default:
12634 return false;
12635 }
12636}
12637
fb0cb7fa
KT
12638/* Return true iff X is an rtx that will match an extr instruction
12639 i.e. as described in the *extr<mode>5_insn family of patterns.
12640 OP0 and OP1 will be set to the operands of the shifts involved
12641 on success and will be NULL_RTX otherwise. */
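/* For example, in DImode, (ior (ashift X 48) (lshiftrt Y 16)) matches
   since the shift amounts sum to 64; *RES_OP0 is set to X and
   *RES_OP1 to Y. */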
12642
12643static bool
12644aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
12645{
12646 rtx op0, op1;
77e994c9
RS
12647 scalar_int_mode mode;
12648 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
12649 return false;
fb0cb7fa
KT
12650
12651 *res_op0 = NULL_RTX;
12652 *res_op1 = NULL_RTX;
12653
12654 if (GET_CODE (x) != IOR)
12655 return false;
12656
12657 op0 = XEXP (x, 0);
12658 op1 = XEXP (x, 1);
12659
12660 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
12661 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
12662 {
12663 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
12664 if (GET_CODE (op1) == ASHIFT)
12665 std::swap (op0, op1);
12666
12667 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
12668 return false;
12669
12670 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
12671 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
12672
12673 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
12674 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
12675 {
12676 *res_op0 = XEXP (op0, 0);
12677 *res_op1 = XEXP (op1, 0);
12678 return true;
12679 }
12680 }
12681
12682 return false;
12683}
12684
2d5ffe46
AP
12685/* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
12686 storing it in *COST. Result is true if the total cost of the operation
12687 has now been calculated. */
12688static bool
12689aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
12690{
b9e3afe9
AP
12691 rtx inner;
12692 rtx comparator;
12693 enum rtx_code cmpcode;
e2a14bec
RS
12694 const struct cpu_cost_table *extra_cost
12695 = aarch64_tune_params.insn_extra_cost;
b9e3afe9
AP
12696
12697 if (COMPARISON_P (op0))
12698 {
12699 inner = XEXP (op0, 0);
12700 comparator = XEXP (op0, 1);
12701 cmpcode = GET_CODE (op0);
12702 }
12703 else
12704 {
12705 inner = op0;
12706 comparator = const0_rtx;
12707 cmpcode = NE;
12708 }
12709
2d5ffe46
AP
12710 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
12711 {
12712 /* Conditional branch. */
b9e3afe9 12713 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
2d5ffe46
AP
12714 return true;
12715 else
12716 {
b9e3afe9 12717 if (cmpcode == NE || cmpcode == EQ)
2d5ffe46 12718 {
2d5ffe46
AP
12719 if (comparator == const0_rtx)
12720 {
12721 /* TBZ/TBNZ/CBZ/CBNZ. */
12722 if (GET_CODE (inner) == ZERO_EXTRACT)
12723 /* TBZ/TBNZ. */
e548c9df
AM
12724 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
12725 ZERO_EXTRACT, 0, speed);
12726 else
12727 /* CBZ/CBNZ. */
12728 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
2d5ffe46 12729
e2a14bec
RS
12730 return true;
12731 }
12732 if (register_operand (inner, VOIDmode)
12733 && aarch64_imm24 (comparator, VOIDmode))
12734 {
12735 /* SUB and SUBS. */
12736 *cost += COSTS_N_INSNS (2);
12737 if (speed)
12738 *cost += extra_cost->alu.arith * 2;
12739 return true;
12740 }
2d5ffe46 12741 }
b9e3afe9 12742 else if (cmpcode == LT || cmpcode == GE)
2d5ffe46 12743 {
2d5ffe46
AP
12744 /* TBZ/TBNZ. */
12745 if (comparator == const0_rtx)
12746 return true;
12747 }
12748 }
12749 }
b9e3afe9 12750 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
2d5ffe46 12751 {
786298dc 12752 /* CCMP. */
6dfeb7ce 12753 if (GET_CODE (op1) == COMPARE)
786298dc
WD
12754 {
12755 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
12756 if (XEXP (op1, 1) == const0_rtx)
12757 *cost += 1;
12758 if (speed)
12759 {
12760 machine_mode mode = GET_MODE (XEXP (op1, 0));
786298dc
WD
12761
12762 if (GET_MODE_CLASS (mode) == MODE_INT)
12763 *cost += extra_cost->alu.arith;
12764 else
12765 *cost += extra_cost->fp[mode == DFmode].compare;
12766 }
12767 return true;
12768 }
12769
2d5ffe46
AP
12770 /* It's a conditional operation based on the status flags,
12771 so it must be some flavor of CSEL. */
12772
12773 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
12774 if (GET_CODE (op1) == NEG
12775 || GET_CODE (op1) == NOT
12776 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
12777 op1 = XEXP (op1, 0);
bad00732
KT
12778 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
12779 {
12780 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
12781 op1 = XEXP (op1, 0);
12782 op2 = XEXP (op2, 0);
12783 }
d572ad49
AC
12784 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
12785 {
12786 inner = XEXP (op1, 0);
12787 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
12788 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
12789 op1 = XEXP (inner, 0);
12790 }
2d5ffe46 12791
e548c9df
AM
12792 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
12793 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
2d5ffe46
AP
12794 return true;
12795 }
12796
12797 /* We don't know what this is, cost all operands. */
12798 return false;
12799}
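/* For example, (if_then_else (eq (reg:CC CC_REGNUM) (const_int 0))
     (neg:DI (reg a)) (reg b))
   is treated as a CSNEG above: the NEG is stripped before costing the
   operands, reflecting that CSNEG is no more expensive than CSEL.  */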
12800
283b6c85
KT
12801/* Check whether X is a bitfield operation of the form shift + extend that
12802 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
12803 operand to which the bitfield operation is applied. Otherwise return
12804 NULL_RTX. */
12805
12806static rtx
12807aarch64_extend_bitfield_pattern_p (rtx x)
12808{
12809 rtx_code outer_code = GET_CODE (x);
12810 machine_mode outer_mode = GET_MODE (x);
12811
12812 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
12813 && outer_mode != SImode && outer_mode != DImode)
12814 return NULL_RTX;
12815
12816 rtx inner = XEXP (x, 0);
12817 rtx_code inner_code = GET_CODE (inner);
12818 machine_mode inner_mode = GET_MODE (inner);
12819 rtx op = NULL_RTX;
12820
12821 switch (inner_code)
12822 {
12823 case ASHIFT:
12824 if (CONST_INT_P (XEXP (inner, 1))
12825 && (inner_mode == QImode || inner_mode == HImode))
12826 op = XEXP (inner, 0);
12827 break;
12828 case LSHIFTRT:
12829 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
12830 && (inner_mode == QImode || inner_mode == HImode))
12831 op = XEXP (inner, 0);
12832 break;
12833 case ASHIFTRT:
12834 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
12835 && (inner_mode == QImode || inner_mode == HImode))
12836 op = XEXP (inner, 0);
12837 break;
12838 default:
12839 break;
12840 }
12841
12842 return op;
12843}
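/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3)))
   matches the LSHIFTRT arm above and corresponds to a UBFX, so x is
   returned; a SIGN_EXTEND wrapped around an ASHIFTRT maps to SBFX in the
   same way.  */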
12844
8c83f71d
KT
12845/* Return true if the mask and a shift amount from an RTX of the form
12846 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
12847 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
12848
12849bool
77e994c9
RS
12850aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
12851 rtx shft_amnt)
8c83f71d
KT
12852{
12853 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
17ad8cde
JJ
12854 && INTVAL (mask) > 0
12855 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
12856 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
12857 && (UINTVAL (mask)
12858 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
8c83f71d
KT
12859}
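/* Worked example: with MODE == SImode, MASK == 0xff00 and SHFT_AMNT == 8,
   all of the checks hold: 0xff00 > 0, 8 < 32, (0xff00 >> 8) + 1 == 0x100
   is a power of two, and no bit of 0xff00 lies below bit 8, so the
   AND-of-ASHIFT can be emitted as a single UBFIZ.  */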
12860
6a0d3939
SE
12861/* Return true if the masks and a shift amount from an RTX of the form
12862 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
12863 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
12864
12865bool
12866aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
12867 unsigned HOST_WIDE_INT mask1,
12868 unsigned HOST_WIDE_INT shft_amnt,
12869 unsigned HOST_WIDE_INT mask2)
12870{
12871 unsigned HOST_WIDE_INT t;
12872
12873 /* Verify that there is no overlap in what bits are set in the two masks. */
12874 if (mask1 != ~mask2)
12875 return false;
12876
12877 /* Verify that mask2 is not all zeros or ones. */
12878 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
12879 return false;
12880
12881 /* The shift amount should always be less than the mode size. */
12882 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
12883
12884 /* Verify that the mask being shifted is contiguous and would be in the
12885 least significant bits after shifting by shft_amnt. */
12886 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
12887 return (t == (t & -t));
12888}
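/* Worked example: in DImode, MASK1 == 0xffffffffffff00ff,
   MASK2 == 0xff00 and SHFT_AMNT == 8 pass all of the checks above:
   MASK1 == ~MASK2, MASK2 is neither 0 nor all ones, and
   0xff00 + (1 << 8) == 0x10000 is a power of two, so the IOR can be
   emitted as a single BFI inserting an 8-bit field at bit 8.  */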
12889
43e9d192
IB
12890/* Calculate the cost of calculating X, storing it in *COST. Result
12891 is true if the total cost of the operation has now been calculated. */
12892static bool
e548c9df 12893aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
43e9d192
IB
12894 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
12895{
a8eecd00 12896 rtx op0, op1, op2;
73250c4c 12897 const struct cpu_cost_table *extra_cost
b175b679 12898 = aarch64_tune_params.insn_extra_cost;
1d5c43db 12899 rtx_code code = GET_CODE (x);
b4206259 12900 scalar_int_mode int_mode;
43e9d192 12901
7fc5ef02
JG
12902 /* By default, assume that everything has equivalent cost to the
12903 cheapest instruction. Any additional costs are applied as a delta
12904 above this default. */
12905 *cost = COSTS_N_INSNS (1);
12906
43e9d192
IB
12907 switch (code)
12908 {
12909 case SET:
ba123b0d
JG
12910 /* The cost depends entirely on the operands to SET. */
12911 *cost = 0;
43e9d192
IB
12912 op0 = SET_DEST (x);
12913 op1 = SET_SRC (x);
12914
12915 switch (GET_CODE (op0))
12916 {
12917 case MEM:
12918 if (speed)
2961177e
JG
12919 {
12920 rtx address = XEXP (op0, 0);
b6875aac
KV
12921 if (VECTOR_MODE_P (mode))
12922 *cost += extra_cost->ldst.storev;
12923 else if (GET_MODE_CLASS (mode) == MODE_INT)
2961177e
JG
12924 *cost += extra_cost->ldst.store;
12925 else if (mode == SFmode)
12926 *cost += extra_cost->ldst.storef;
12927 else if (mode == DFmode)
12928 *cost += extra_cost->ldst.stored;
12929
12930 *cost +=
12931 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12932 0, speed));
12933 }
43e9d192 12934
e548c9df 12935 *cost += rtx_cost (op1, mode, SET, 1, speed);
43e9d192
IB
12936 return true;
12937
12938 case SUBREG:
12939 if (! REG_P (SUBREG_REG (op0)))
e548c9df 12940 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
ba123b0d 12941
43e9d192
IB
12942 /* Fall through. */
12943 case REG:
b6875aac
KV
12944 /* The cost is one per vector-register copied. */
12945 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
12946 {
fe1447a1
RS
12947 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
12948 *cost = COSTS_N_INSNS (nregs);
b6875aac 12949 }
ba123b0d
JG
12950 /* const0_rtx is in general free, but we will use an
12951 instruction to set a register to 0. */
b6875aac
KV
12952 else if (REG_P (op1) || op1 == const0_rtx)
12953 {
12954 /* The cost is 1 per register copied. */
fe1447a1
RS
12955 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
12956 *cost = COSTS_N_INSNS (nregs);
b6875aac 12957 }
ba123b0d
JG
12958 else
12959 /* Cost is just the cost of the RHS of the set. */
e548c9df 12960 *cost += rtx_cost (op1, mode, SET, 1, speed);
43e9d192
IB
12961 return true;
12962
ba123b0d 12963 case ZERO_EXTRACT:
43e9d192 12964 case SIGN_EXTRACT:
ba123b0d
JG
12965 /* Bit-field insertion. Strip any redundant widening of
12966 the RHS to meet the width of the target. */
568b9c0e 12967 if (SUBREG_P (op1))
43e9d192
IB
12968 op1 = SUBREG_REG (op1);
12969 if ((GET_CODE (op1) == ZERO_EXTEND
12970 || GET_CODE (op1) == SIGN_EXTEND)
4aa81c2e 12971 && CONST_INT_P (XEXP (op0, 1))
77e994c9
RS
12972 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
12973 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
43e9d192 12974 op1 = XEXP (op1, 0);
ba123b0d
JG
12975
12976 if (CONST_INT_P (op1))
12977 {
12978 /* MOV immediate is assumed to always be cheap. */
12979 *cost = COSTS_N_INSNS (1);
12980 }
12981 else
12982 {
12983 /* BFM. */
12984 if (speed)
12985 *cost += extra_cost->alu.bfi;
e548c9df 12986 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
ba123b0d
JG
12987 }
12988
43e9d192
IB
12989 return true;
12990
12991 default:
ba123b0d
JG
12992 /* We can't make sense of this, assume default cost. */
12993 *cost = COSTS_N_INSNS (1);
61263118 12994 return false;
43e9d192
IB
12995 }
12996 return false;
12997
9dfc162c
JG
12998 case CONST_INT:
12999 /* If an instruction can incorporate a constant within the
13000 instruction, the instruction's expression avoids calling
13001 rtx_cost() on the constant. If rtx_cost() is called on a
13002 constant, then it is usually because the constant must be
13003 moved into a register by one or more instructions.
13004
13005 The exception is constant 0, which can be expressed
13006 as XZR/WZR and is therefore free. The one caveat is
13007 (set (reg) (const0_rtx)), in which case we must cost
13008 the move. However, we can catch that when we cost the SET, so
13009 we don't need to consider that here. */
13010 if (x == const0_rtx)
13011 *cost = 0;
13012 else
13013 {
13014 /* To an approximation, building any other constant is
13015 proportionally expensive to the number of instructions
13016 required to build that constant. This is true whether we
13017 are compiling for SPEED or otherwise. */
77e994c9
RS
13018 if (!is_a <scalar_int_mode> (mode, &int_mode))
13019 int_mode = word_mode;
82614948 13020 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
77e994c9 13021 (NULL_RTX, x, false, int_mode));
9dfc162c
JG
13022 }
13023 return true;
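/* For instance, a constant such as 0x12345 cannot be encoded in a single
   MOV/MOVN or as a bitmask immediate, so it is built with MOV plus MOVK
   and is costed above as two instructions.  */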
13024
13025 case CONST_DOUBLE:
a2170965
TC
13026
13027 /* First determine number of instructions to do the move
13028 as an integer constant. */
13029 if (!aarch64_float_const_representable_p (x)
13030 && !aarch64_can_const_movi_rtx_p (x, mode)
13031 && aarch64_float_const_rtx_p (x))
13032 {
13033 unsigned HOST_WIDE_INT ival;
13034 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
13035 gcc_assert (succeed);
13036
77e994c9
RS
13037 scalar_int_mode imode = (mode == HFmode
13038 ? SImode
13039 : int_mode_for_mode (mode).require ());
a2170965
TC
13040 int ncost = aarch64_internal_mov_immediate
13041 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
13042 *cost += COSTS_N_INSNS (ncost);
13043 return true;
13044 }
13045
9dfc162c
JG
13046 if (speed)
13047 {
13048 /* mov[df,sf]_aarch64. */
13049 if (aarch64_float_const_representable_p (x))
13050 /* FMOV (scalar immediate). */
13051 *cost += extra_cost->fp[mode == DFmode].fpconst;
13052 else if (!aarch64_float_const_zero_rtx_p (x))
13053 {
13054 /* This will be a load from memory. */
13055 if (mode == DFmode)
13056 *cost += extra_cost->ldst.loadd;
13057 else
13058 *cost += extra_cost->ldst.loadf;
13059 }
13060 else
13061 /* Otherwise this is +0.0. We get this using MOVI d0, #0
13062 or MOV v0.s[0], wzr - neither of which are modeled by the
13063 cost tables. Just use the default cost. */
13064 {
13065 }
13066 }
13067
13068 return true;
13069
43e9d192
IB
13070 case MEM:
13071 if (speed)
2961177e
JG
13072 {
13073 /* For loads we want the base cost of a load, plus an
13074 approximation for the additional cost of the addressing
13075 mode. */
13076 rtx address = XEXP (x, 0);
b6875aac
KV
13077 if (VECTOR_MODE_P (mode))
13078 *cost += extra_cost->ldst.loadv;
13079 else if (GET_MODE_CLASS (mode) == MODE_INT)
2961177e
JG
13080 *cost += extra_cost->ldst.load;
13081 else if (mode == SFmode)
13082 *cost += extra_cost->ldst.loadf;
13083 else if (mode == DFmode)
13084 *cost += extra_cost->ldst.loadd;
13085
13086 *cost +=
13087 COSTS_N_INSNS (aarch64_address_cost (address, mode,
13088 0, speed));
13089 }
43e9d192
IB
13090
13091 return true;
13092
13093 case NEG:
4745e701
JG
13094 op0 = XEXP (x, 0);
13095
b6875aac
KV
13096 if (VECTOR_MODE_P (mode))
13097 {
13098 if (speed)
13099 {
13100 /* FNEG. */
13101 *cost += extra_cost->vect.alu;
13102 }
13103 return false;
13104 }
13105
e548c9df
AM
13106 if (GET_MODE_CLASS (mode) == MODE_INT)
13107 {
4745e701
JG
13108 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
13109 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
13110 {
13111 /* CSETM. */
e548c9df 13112 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
4745e701
JG
13113 return true;
13114 }
13115
13116 /* Cost this as SUB wzr, X. */
e548c9df 13117 op0 = CONST0_RTX (mode);
4745e701
JG
13118 op1 = XEXP (x, 0);
13119 goto cost_minus;
13120 }
13121
e548c9df 13122 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
4745e701
JG
13123 {
13124 /* Support (neg(fma...)) as a single instruction only if
13125 sign of zeros is unimportant. This matches the decision
13126 making in aarch64.md. */
13127 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
13128 {
13129 /* FNMADD. */
e548c9df 13130 *cost = rtx_cost (op0, mode, NEG, 0, speed);
4745e701
JG
13131 return true;
13132 }
d318517d
SN
13133 if (GET_CODE (op0) == MULT)
13134 {
13135 /* FNMUL. */
13136 *cost = rtx_cost (op0, mode, NEG, 0, speed);
13137 return true;
13138 }
4745e701
JG
13139 if (speed)
13140 /* FNEG. */
13141 *cost += extra_cost->fp[mode == DFmode].neg;
13142 return false;
13143 }
13144
13145 return false;
43e9d192 13146
781aeb73
KT
13147 case CLRSB:
13148 case CLZ:
13149 if (speed)
b6875aac
KV
13150 {
13151 if (VECTOR_MODE_P (mode))
13152 *cost += extra_cost->vect.alu;
13153 else
13154 *cost += extra_cost->alu.clz;
13155 }
781aeb73
KT
13156
13157 return false;
13158
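/* CTZ has no direct AArch64 instruction; it is implemented as RBIT
   followed by CLZ, hence the two-instruction baseline below, with the
   RBIT costed via the rev entry of the cost table.  */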
5bfc8303
WD
13159 case CTZ:
13160 *cost = COSTS_N_INSNS (2);
13161
13162 if (speed)
13163 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
13164 return false;
13165
43e9d192
IB
13166 case COMPARE:
13167 op0 = XEXP (x, 0);
13168 op1 = XEXP (x, 1);
13169
13170 if (op1 == const0_rtx
13171 && GET_CODE (op0) == AND)
13172 {
13173 x = op0;
e548c9df 13174 mode = GET_MODE (op0);
43e9d192
IB
13175 goto cost_logic;
13176 }
13177
a8eecd00
JG
13178 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
13179 {
13180 /* TODO: A write to the CC flags possibly costs extra, this
13181 needs encoding in the cost tables. */
13182
e548c9df 13183 mode = GET_MODE (op0);
a8eecd00
JG
13184 /* ANDS. */
13185 if (GET_CODE (op0) == AND)
13186 {
13187 x = op0;
13188 goto cost_logic;
13189 }
13190
13191 if (GET_CODE (op0) == PLUS)
13192 {
13193 /* ADDS (and CMN alias). */
13194 x = op0;
13195 goto cost_plus;
13196 }
13197
13198 if (GET_CODE (op0) == MINUS)
13199 {
13200 /* SUBS. */
13201 x = op0;
13202 goto cost_minus;
13203 }
13204
345854d8
KT
13205 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
13206 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
13207 && CONST_INT_P (XEXP (op0, 2)))
13208 {
13209 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
13210 Handle it here directly rather than going to cost_logic
13211 since we know the immediate generated for the TST is valid
13212 so we can avoid creating an intermediate rtx for it only
13213 for costing purposes. */
13214 if (speed)
13215 *cost += extra_cost->alu.logical;
13216
13217 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
13218 ZERO_EXTRACT, 0, speed);
13219 return true;
13220 }
13221
a8eecd00
JG
13222 if (GET_CODE (op1) == NEG)
13223 {
13224 /* CMN. */
13225 if (speed)
13226 *cost += extra_cost->alu.arith;
13227
e548c9df
AM
13228 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
13229 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
a8eecd00
JG
13230 return true;
13231 }
13232
13233 /* CMP.
13234
13235 Compare can freely swap the order of operands, and
13236 canonicalization puts the more complex operation first.
13237 But the integer MINUS logic expects the shift/extend
13238 operation in op1. */
13239 if (! (REG_P (op0)
568b9c0e 13240 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
a8eecd00
JG
13241 {
13242 op0 = XEXP (x, 1);
13243 op1 = XEXP (x, 0);
13244 }
13245 goto cost_minus;
13246 }
13247
13248 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
13249 {
13250 /* FCMP. */
13251 if (speed)
13252 *cost += extra_cost->fp[mode == DFmode].compare;
13253
13254 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
13255 {
e548c9df 13256 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
a8eecd00
JG
13257 /* FCMP supports constant 0.0 for no extra cost. */
13258 return true;
13259 }
13260 return false;
13261 }
13262
b6875aac
KV
13263 if (VECTOR_MODE_P (mode))
13264 {
13265 /* Vector compare. */
13266 if (speed)
13267 *cost += extra_cost->vect.alu;
13268
13269 if (aarch64_float_const_zero_rtx_p (op1))
13270 {
13271 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
13272 cost. */
13273 return true;
13274 }
13275 return false;
13276 }
a8eecd00 13277 return false;
43e9d192
IB
13278
13279 case MINUS:
4745e701
JG
13280 {
13281 op0 = XEXP (x, 0);
13282 op1 = XEXP (x, 1);
13283
13284cost_minus:
0c3aab7f
JW
13285 if (VECTOR_MODE_P (mode))
13286 {
13287 /* SUBL2 and SUBW2. */
13288 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13289 if (vec_flags & VEC_ADVSIMD)
13290 {
13291 /* The select-operand-high-half versions of the sub instruction
13292 have the same cost as the regular three-operand vector version;
13293 don't add the costs of the select into the costs of the sub.
13294 */
13295 op0 = aarch64_strip_extend_vec_half (op0);
13296 op1 = aarch64_strip_extend_vec_half (op1);
13297 }
13298 }
13299
e548c9df 13300 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
23cb6618 13301
4745e701
JG
13302 /* Detect valid immediates. */
13303 if ((GET_MODE_CLASS (mode) == MODE_INT
13304 || (GET_MODE_CLASS (mode) == MODE_CC
13305 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
13306 && CONST_INT_P (op1)
13307 && aarch64_uimm12_shift (INTVAL (op1)))
13308 {
4745e701
JG
13309 if (speed)
13310 /* SUB(S) (immediate). */
13311 *cost += extra_cost->alu.arith;
13312 return true;
4745e701
JG
13313 }
13314
7cc2145f 13315 /* Look for SUB (extended register). */
7de23b8c
AC
13316 if (is_a <scalar_int_mode> (mode)
13317 && aarch64_rtx_arith_op_extract_p (op1))
7cc2145f
JG
13318 {
13319 if (speed)
2533c820 13320 *cost += extra_cost->alu.extend_arith;
7cc2145f 13321
b10f1009 13322 op1 = aarch64_strip_extend (op1, true);
e47c4031 13323 *cost += rtx_cost (op1, VOIDmode,
e548c9df 13324 (enum rtx_code) GET_CODE (op1), 0, speed);
7cc2145f
JG
13325 return true;
13326 }
13327
b10f1009 13328 rtx new_op1 = aarch64_strip_extend (op1, false);
4745e701
JG
13329
13330 /* Cost this as an FMA-alike operation. */
13331 if ((GET_CODE (new_op1) == MULT
0a78ebe4 13332 || aarch64_shift_p (GET_CODE (new_op1)))
4745e701
JG
13333 && code != COMPARE)
13334 {
13335 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
13336 (enum rtx_code) code,
13337 speed);
4745e701
JG
13338 return true;
13339 }
43e9d192 13340
e548c9df 13341 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
43e9d192 13342
4745e701
JG
13343 if (speed)
13344 {
b6875aac
KV
13345 if (VECTOR_MODE_P (mode))
13346 {
13347 /* Vector SUB. */
13348 *cost += extra_cost->vect.alu;
13349 }
13350 else if (GET_MODE_CLASS (mode) == MODE_INT)
13351 {
13352 /* SUB(S). */
13353 *cost += extra_cost->alu.arith;
13354 }
4745e701 13355 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
b6875aac
KV
13356 {
13357 /* FSUB. */
13358 *cost += extra_cost->fp[mode == DFmode].addsub;
13359 }
4745e701
JG
13360 }
13361 return true;
13362 }
43e9d192
IB
13363
13364 case PLUS:
4745e701
JG
13365 {
13366 rtx new_op0;
43e9d192 13367
4745e701
JG
13368 op0 = XEXP (x, 0);
13369 op1 = XEXP (x, 1);
43e9d192 13370
a8eecd00 13371cost_plus:
8cd27a3b
JW
13372 if (VECTOR_MODE_P (mode))
13373 {
13374 /* ADDL2 and ADDW2. */
13375 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13376 if (vec_flags & VEC_ADVSIMD)
13377 {
13378 /* The select-operand-high-half versions of the add instruction
13379 have the same cost as the regular three-operand vector version;
13380 don't add the costs of the select into the costs of the add.
13381 */
13382 op0 = aarch64_strip_extend_vec_half (op0);
13383 op1 = aarch64_strip_extend_vec_half (op1);
13384 }
13385 }
13386
4745e701
JG
13387 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
13388 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
13389 {
13390 /* CSINC. */
e548c9df
AM
13391 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
13392 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
4745e701
JG
13393 return true;
13394 }
43e9d192 13395
4745e701 13396 if (GET_MODE_CLASS (mode) == MODE_INT
835d50c6 13397 && (aarch64_plus_immediate (op1, mode)
43cacb12 13398 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
4745e701 13399 {
e548c9df 13400 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
43e9d192 13401
4745e701 13402 if (speed)
a65b9ad8
KT
13403 {
13404 /* ADD (immediate). */
13405 *cost += extra_cost->alu.arith;
13406
13407 /* Some tunings prefer to not use the VL-based scalar ops.
13408 Increase the cost of the poly immediate to prevent their
13409 formation. */
13410 if (GET_CODE (op1) == CONST_POLY_INT
13411 && (aarch64_tune_params.extra_tuning_flags
13412 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
13413 *cost += COSTS_N_INSNS (1);
13414 }
4745e701
JG
13415 return true;
13416 }
13417
e548c9df 13418 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
23cb6618 13419
7cc2145f 13420 /* Look for ADD (extended register). */
7de23b8c
AC
13421 if (is_a <scalar_int_mode> (mode)
13422 && aarch64_rtx_arith_op_extract_p (op0))
7cc2145f
JG
13423 {
13424 if (speed)
2533c820 13425 *cost += extra_cost->alu.extend_arith;
7cc2145f 13426
b10f1009 13427 op0 = aarch64_strip_extend (op0, true);
e47c4031 13428 *cost += rtx_cost (op0, VOIDmode,
e548c9df 13429 (enum rtx_code) GET_CODE (op0), 0, speed);
7cc2145f
JG
13430 return true;
13431 }
13432
4745e701
JG
13433 /* Strip any extend, leave shifts behind as we will
13434 cost them through mult_cost. */
b10f1009 13435 new_op0 = aarch64_strip_extend (op0, false);
4745e701
JG
13436
13437 if (GET_CODE (new_op0) == MULT
0a78ebe4 13438 || aarch64_shift_p (GET_CODE (new_op0)))
4745e701
JG
13439 {
13440 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
13441 speed);
4745e701
JG
13442 return true;
13443 }
13444
e548c9df 13445 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
4745e701
JG
13446
13447 if (speed)
13448 {
b6875aac
KV
13449 if (VECTOR_MODE_P (mode))
13450 {
13451 /* Vector ADD. */
13452 *cost += extra_cost->vect.alu;
13453 }
13454 else if (GET_MODE_CLASS (mode) == MODE_INT)
13455 {
13456 /* ADD. */
13457 *cost += extra_cost->alu.arith;
13458 }
4745e701 13459 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
b6875aac
KV
13460 {
13461 /* FADD. */
13462 *cost += extra_cost->fp[mode == DFmode].addsub;
13463 }
4745e701
JG
13464 }
13465 return true;
13466 }
43e9d192 13467
18b42b2a
KT
13468 case BSWAP:
13469 *cost = COSTS_N_INSNS (1);
13470
13471 if (speed)
b6875aac
KV
13472 {
13473 if (VECTOR_MODE_P (mode))
13474 *cost += extra_cost->vect.alu;
13475 else
13476 *cost += extra_cost->alu.rev;
13477 }
18b42b2a
KT
13478 return false;
13479
43e9d192 13480 case IOR:
f7d5cf8d
KT
13481 if (aarch_rev16_p (x))
13482 {
13483 *cost = COSTS_N_INSNS (1);
13484
b6875aac
KV
13485 if (speed)
13486 {
13487 if (VECTOR_MODE_P (mode))
13488 *cost += extra_cost->vect.alu;
13489 else
13490 *cost += extra_cost->alu.rev;
13491 }
13492 return true;
f7d5cf8d 13493 }
fb0cb7fa
KT
13494
13495 if (aarch64_extr_rtx_p (x, &op0, &op1))
13496 {
e548c9df
AM
13497 *cost += rtx_cost (op0, mode, IOR, 0, speed);
13498 *cost += rtx_cost (op1, mode, IOR, 1, speed);
fb0cb7fa
KT
13499 if (speed)
13500 *cost += extra_cost->alu.shift;
13501
13502 return true;
13503 }
f7d5cf8d 13504 /* Fall through. */
43e9d192
IB
13505 case XOR:
13506 case AND:
13507 cost_logic:
13508 op0 = XEXP (x, 0);
13509 op1 = XEXP (x, 1);
13510
b6875aac
KV
13511 if (VECTOR_MODE_P (mode))
13512 {
13513 if (speed)
13514 *cost += extra_cost->vect.alu;
13515 return true;
13516 }
13517
268c3b47
JG
13518 if (code == AND
13519 && GET_CODE (op0) == MULT
13520 && CONST_INT_P (XEXP (op0, 1))
13521 && CONST_INT_P (op1)
13522 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
13523 INTVAL (op1)) != 0)
13524 {
13525 /* This is a UBFM/SBFM. */
e548c9df 13526 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
268c3b47
JG
13527 if (speed)
13528 *cost += extra_cost->alu.bfx;
13529 return true;
13530 }
13531
b4206259 13532 if (is_int_mode (mode, &int_mode))
43e9d192 13533 {
8c83f71d 13534 if (CONST_INT_P (op1))
43e9d192 13535 {
8c83f71d
KT
13536 /* We have a mask + shift version of a UBFIZ
13537 i.e. the *andim_ashift<mode>_bfiz pattern. */
13538 if (GET_CODE (op0) == ASHIFT
b4206259
RS
13539 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
13540 XEXP (op0, 1)))
8c83f71d 13541 {
b4206259 13542 *cost += rtx_cost (XEXP (op0, 0), int_mode,
8c83f71d
KT
13543 (enum rtx_code) code, 0, speed);
13544 if (speed)
13545 *cost += extra_cost->alu.bfx;
268c3b47 13546
8c83f71d
KT
13547 return true;
13548 }
b4206259 13549 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
8c83f71d
KT
13550 {
13551 /* We possibly get the immediate for free, this is not
13552 modelled. */
b4206259
RS
13553 *cost += rtx_cost (op0, int_mode,
13554 (enum rtx_code) code, 0, speed);
8c83f71d
KT
13555 if (speed)
13556 *cost += extra_cost->alu.logical;
268c3b47 13557
8c83f71d
KT
13558 return true;
13559 }
43e9d192
IB
13560 }
13561 else
13562 {
268c3b47
JG
13563 rtx new_op0 = op0;
13564
13565 /* Handle ORN, EON, or BIC. */
43e9d192
IB
13566 if (GET_CODE (op0) == NOT)
13567 op0 = XEXP (op0, 0);
268c3b47
JG
13568
13569 new_op0 = aarch64_strip_shift (op0);
13570
13571 /* If we had a shift on op0 then this is a logical-shift-
13572 by-register/immediate operation. Otherwise, this is just
13573 a logical operation. */
13574 if (speed)
13575 {
13576 if (new_op0 != op0)
13577 {
13578 /* Shift by immediate. */
13579 if (CONST_INT_P (XEXP (op0, 1)))
13580 *cost += extra_cost->alu.log_shift;
13581 else
13582 *cost += extra_cost->alu.log_shift_reg;
13583 }
13584 else
13585 *cost += extra_cost->alu.logical;
13586 }
13587
13588 /* In both cases we want to cost both operands. */
b4206259
RS
13589 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
13590 0, speed);
13591 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
13592 1, speed);
268c3b47
JG
13593
13594 return true;
43e9d192 13595 }
43e9d192
IB
13596 }
13597 return false;
13598
268c3b47 13599 case NOT:
6365da9e
KT
13600 x = XEXP (x, 0);
13601 op0 = aarch64_strip_shift (x);
13602
b6875aac
KV
13603 if (VECTOR_MODE_P (mode))
13604 {
13605 /* Vector NOT. */
13606 *cost += extra_cost->vect.alu;
13607 return false;
13608 }
13609
6365da9e
KT
13610 /* MVN-shifted-reg. */
13611 if (op0 != x)
13612 {
e548c9df 13613 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6365da9e
KT
13614
13615 if (speed)
13616 *cost += extra_cost->alu.log_shift;
13617
13618 return true;
13619 }
13620 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
13621 Handle the second form here taking care that 'a' in the above can
13622 be a shift. */
13623 else if (GET_CODE (op0) == XOR)
13624 {
13625 rtx newop0 = XEXP (op0, 0);
13626 rtx newop1 = XEXP (op0, 1);
13627 rtx op0_stripped = aarch64_strip_shift (newop0);
13628
e548c9df
AM
13629 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
13630 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6365da9e
KT
13631
13632 if (speed)
13633 {
13634 if (op0_stripped != newop0)
13635 *cost += extra_cost->alu.log_shift;
13636 else
13637 *cost += extra_cost->alu.logical;
13638 }
13639
13640 return true;
13641 }
268c3b47
JG
13642 /* MVN. */
13643 if (speed)
13644 *cost += extra_cost->alu.logical;
13645
268c3b47
JG
13646 return false;
13647
43e9d192 13648 case ZERO_EXTEND:
b1685e62
JG
13649
13650 op0 = XEXP (x, 0);
13651 /* If a value is written in SI mode, then zero extended to DI
13652 mode, the operation will in general be free as a write to
13653 a 'w' register implicitly zeroes the upper bits of an 'x'
13654 register. However, if this is
13655
13656 (set (reg) (zero_extend (reg)))
13657
13658 we must cost the explicit register move. */
13659 if (mode == DImode
1d5c43db 13660 && GET_MODE (op0) == SImode)
b1685e62 13661 {
e548c9df 13662 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
b1685e62 13663
dde23f43
KM
13664 /* If OP_COST is non-zero, then the cost of the zero extend
13665 is effectively the cost of the inner operation. Otherwise
13666 we have a MOV instruction and we take the cost from the MOV
13667 itself. This is true independently of whether we are
13668 optimizing for space or time. */
13669 if (op_cost)
b1685e62
JG
13670 *cost = op_cost;
13671
13672 return true;
13673 }
e548c9df 13674 else if (MEM_P (op0))
43e9d192 13675 {
b1685e62 13676 /* All loads can zero extend to any size for free. */
e548c9df 13677 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
43e9d192
IB
13678 return true;
13679 }
b1685e62 13680
283b6c85
KT
13681 op0 = aarch64_extend_bitfield_pattern_p (x);
13682 if (op0)
13683 {
13684 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
13685 if (speed)
13686 *cost += extra_cost->alu.bfx;
13687 return true;
13688 }
13689
b1685e62 13690 if (speed)
b6875aac
KV
13691 {
13692 if (VECTOR_MODE_P (mode))
13693 {
13694 /* UMOV. */
13695 *cost += extra_cost->vect.alu;
13696 }
13697 else
13698 {
63715e5e
WD
13699 /* We generate an AND instead of UXTB/UXTH. */
13700 *cost += extra_cost->alu.logical;
b6875aac
KV
13701 }
13702 }
43e9d192
IB
13703 return false;
13704
13705 case SIGN_EXTEND:
b1685e62 13706 if (MEM_P (XEXP (x, 0)))
43e9d192 13707 {
b1685e62
JG
13708 /* LDRSH. */
13709 if (speed)
13710 {
13711 rtx address = XEXP (XEXP (x, 0), 0);
13712 *cost += extra_cost->ldst.load_sign_extend;
13713
13714 *cost +=
13715 COSTS_N_INSNS (aarch64_address_cost (address, mode,
13716 0, speed));
13717 }
43e9d192
IB
13718 return true;
13719 }
b1685e62 13720
283b6c85
KT
13721 op0 = aarch64_extend_bitfield_pattern_p (x);
13722 if (op0)
13723 {
13724 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
13725 if (speed)
13726 *cost += extra_cost->alu.bfx;
13727 return true;
13728 }
13729
b1685e62 13730 if (speed)
b6875aac
KV
13731 {
13732 if (VECTOR_MODE_P (mode))
13733 *cost += extra_cost->vect.alu;
13734 else
13735 *cost += extra_cost->alu.extend;
13736 }
43e9d192
IB
13737 return false;
13738
ba0cfa17
JG
13739 case ASHIFT:
13740 op0 = XEXP (x, 0);
13741 op1 = XEXP (x, 1);
13742
13743 if (CONST_INT_P (op1))
13744 {
ba0cfa17 13745 if (speed)
b6875aac
KV
13746 {
13747 if (VECTOR_MODE_P (mode))
13748 {
13749 /* Vector shift (immediate). */
13750 *cost += extra_cost->vect.alu;
13751 }
13752 else
13753 {
13754 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
13755 aliases. */
13756 *cost += extra_cost->alu.shift;
13757 }
13758 }
ba0cfa17
JG
13759
13760 /* We can incorporate zero/sign extend for free. */
13761 if (GET_CODE (op0) == ZERO_EXTEND
13762 || GET_CODE (op0) == SIGN_EXTEND)
13763 op0 = XEXP (op0, 0);
13764
e548c9df 13765 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
ba0cfa17
JG
13766 return true;
13767 }
13768 else
13769 {
7813b280 13770 if (VECTOR_MODE_P (mode))
b6875aac 13771 {
7813b280
KT
13772 if (speed)
13773 /* Vector shift (register). */
13774 *cost += extra_cost->vect.alu;
13775 }
13776 else
13777 {
13778 if (speed)
13779 /* LSLV. */
13780 *cost += extra_cost->alu.shift_reg;
13781
13782 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
13783 && CONST_INT_P (XEXP (op1, 1))
6a70badb
RS
13784 && known_eq (INTVAL (XEXP (op1, 1)),
13785 GET_MODE_BITSIZE (mode) - 1))
b6875aac 13786 {
7813b280
KT
13787 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
13788 /* We already demanded XEXP (op1, 0) to be REG_P, so
13789 don't recurse into it. */
13790 return true;
b6875aac
KV
13791 }
13792 }
ba0cfa17
JG
13793 return false; /* All arguments need to be in registers. */
13794 }
13795
43e9d192 13796 case ROTATE:
43e9d192
IB
13797 case ROTATERT:
13798 case LSHIFTRT:
43e9d192 13799 case ASHIFTRT:
ba0cfa17
JG
13800 op0 = XEXP (x, 0);
13801 op1 = XEXP (x, 1);
43e9d192 13802
ba0cfa17
JG
13803 if (CONST_INT_P (op1))
13804 {
13805 /* ASR (immediate) and friends. */
13806 if (speed)
b6875aac
KV
13807 {
13808 if (VECTOR_MODE_P (mode))
13809 *cost += extra_cost->vect.alu;
13810 else
13811 *cost += extra_cost->alu.shift;
13812 }
43e9d192 13813
e548c9df 13814 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
ba0cfa17
JG
13815 return true;
13816 }
13817 else
13818 {
7813b280 13819 if (VECTOR_MODE_P (mode))
b6875aac 13820 {
7813b280
KT
13821 if (speed)
13822 /* Vector shift (register). */
b6875aac 13823 *cost += extra_cost->vect.alu;
7813b280
KT
13824 }
13825 else
13826 {
13827 if (speed)
13828 /* ASR (register) and friends. */
b6875aac 13829 *cost += extra_cost->alu.shift_reg;
7813b280
KT
13830
13831 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
13832 && CONST_INT_P (XEXP (op1, 1))
6a70badb
RS
13833 && known_eq (INTVAL (XEXP (op1, 1)),
13834 GET_MODE_BITSIZE (mode) - 1))
7813b280
KT
13835 {
13836 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
13837 /* We already demanded XEXP (op1, 0) to be REG_P, so
13838 don't recurse into it. */
13839 return true;
13840 }
b6875aac 13841 }
ba0cfa17
JG
13842 return false; /* All arguments need to be in registers. */
13843 }
43e9d192 13844
909734be
JG
13845 case SYMBOL_REF:
13846
1b1e81f8
JW
13847 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
13848 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
909734be
JG
13849 {
13850 /* LDR. */
13851 if (speed)
13852 *cost += extra_cost->ldst.load;
13853 }
13854 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
13855 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
13856 {
13857 /* ADRP, followed by ADD. */
13858 *cost += COSTS_N_INSNS (1);
13859 if (speed)
13860 *cost += 2 * extra_cost->alu.arith;
13861 }
13862 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
13863 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
13864 {
13865 /* ADR. */
13866 if (speed)
13867 *cost += extra_cost->alu.arith;
13868 }
13869
13870 if (flag_pic)
13871 {
13872 /* One extra load instruction, after accessing the GOT. */
13873 *cost += COSTS_N_INSNS (1);
13874 if (speed)
13875 *cost += extra_cost->ldst.load;
13876 }
43e9d192
IB
13877 return true;
13878
909734be 13879 case HIGH:
43e9d192 13880 case LO_SUM:
909734be
JG
13881 /* ADRP/ADD (immediate). */
13882 if (speed)
13883 *cost += extra_cost->alu.arith;
43e9d192
IB
13884 return true;
13885
13886 case ZERO_EXTRACT:
13887 case SIGN_EXTRACT:
7cc2145f
JG
13888 /* UBFX/SBFX. */
13889 if (speed)
b6875aac
KV
13890 {
13891 if (VECTOR_MODE_P (mode))
13892 *cost += extra_cost->vect.alu;
13893 else
13894 *cost += extra_cost->alu.bfx;
13895 }
7cc2145f
JG
13896
13897 /* We can trust that the immediates used will be correct (there
13898 are no by-register forms), so we need only cost op0. */
e548c9df 13899 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
43e9d192
IB
13900 return true;
13901
13902 case MULT:
4745e701
JG
13903 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
13904 /* aarch64_rtx_mult_cost always handles recursion to its
13905 operands. */
13906 return true;
43e9d192
IB
13907
13908 case MOD:
4f58fe36
KT
13909 /* We can expand signed mod by power of 2 using a NEGS, two parallel
13910 ANDs and a CSNEG. Assume here that CSNEG costs the same as
13911 an unconditional negate. This case should only ever be reached through
13912 the set_smod_pow2_cheap check in expmed.c. */
13913 if (CONST_INT_P (XEXP (x, 1))
13914 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
13915 && (mode == SImode || mode == DImode))
13916 {
13917 /* We expand to 4 instructions. Reset the baseline. */
13918 *cost = COSTS_N_INSNS (4);
13919
13920 if (speed)
13921 *cost += 2 * extra_cost->alu.logical
13922 + 2 * extra_cost->alu.arith;
13923
13924 return true;
13925 }
13926
13927 /* Fall through. */
43e9d192 13928 case UMOD:
43e9d192
IB
13929 if (speed)
13930 {
cb9ac430 13931 /* Slightly prefer UMOD over SMOD. */
b6875aac
KV
13932 if (VECTOR_MODE_P (mode))
13933 *cost += extra_cost->vect.alu;
e548c9df
AM
13934 else if (GET_MODE_CLASS (mode) == MODE_INT)
13935 *cost += (extra_cost->mult[mode == DImode].add
cb9ac430
TC
13936 + extra_cost->mult[mode == DImode].idiv
13937 + (code == MOD ? 1 : 0));
43e9d192
IB
13938 }
13939 return false; /* All arguments need to be in registers. */
13940
13941 case DIV:
13942 case UDIV:
4105fe38 13943 case SQRT:
43e9d192
IB
13944 if (speed)
13945 {
b6875aac
KV
13946 if (VECTOR_MODE_P (mode))
13947 *cost += extra_cost->vect.alu;
13948 else if (GET_MODE_CLASS (mode) == MODE_INT)
4105fe38
JG
13949 /* There is no integer SQRT, so only DIV and UDIV can get
13950 here. */
cb9ac430
TC
13951 *cost += (extra_cost->mult[mode == DImode].idiv
13952 /* Slightly prefer UDIV over SDIV. */
13953 + (code == DIV ? 1 : 0));
4105fe38
JG
13954 else
13955 *cost += extra_cost->fp[mode == DFmode].div;
43e9d192
IB
13956 }
13957 return false; /* All arguments need to be in registers. */
13958
a8eecd00 13959 case IF_THEN_ELSE:
2d5ffe46
AP
13960 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
13961 XEXP (x, 2), cost, speed);
a8eecd00
JG
13962
13963 case EQ:
13964 case NE:
13965 case GT:
13966 case GTU:
13967 case LT:
13968 case LTU:
13969 case GE:
13970 case GEU:
13971 case LE:
13972 case LEU:
13973
13974 return false; /* All arguments must be in registers. */
13975
b292109f
JG
13976 case FMA:
13977 op0 = XEXP (x, 0);
13978 op1 = XEXP (x, 1);
13979 op2 = XEXP (x, 2);
13980
13981 if (speed)
b6875aac
KV
13982 {
13983 if (VECTOR_MODE_P (mode))
13984 *cost += extra_cost->vect.alu;
13985 else
13986 *cost += extra_cost->fp[mode == DFmode].fma;
13987 }
b292109f
JG
13988
13989 /* FMSUB, FNMADD, and FNMSUB are free. */
13990 if (GET_CODE (op0) == NEG)
13991 op0 = XEXP (op0, 0);
13992
13993 if (GET_CODE (op2) == NEG)
13994 op2 = XEXP (op2, 0);
13995
13996 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
13997 and the by-element operand as operand 0. */
13998 if (GET_CODE (op1) == NEG)
13999 op1 = XEXP (op1, 0);
14000
14001 /* Catch vector-by-element operations. The by-element operand can
14002 either be (vec_duplicate (vec_select (x))) or just
14003 (vec_select (x)), depending on whether we are multiplying by
14004 a vector or a scalar.
14005
14006 Canonicalization is not very good in these cases: FMA4 will put the
14007 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
14008 if (GET_CODE (op0) == VEC_DUPLICATE)
14009 op0 = XEXP (op0, 0);
14010 else if (GET_CODE (op1) == VEC_DUPLICATE)
14011 op1 = XEXP (op1, 0);
14012
14013 if (GET_CODE (op0) == VEC_SELECT)
14014 op0 = XEXP (op0, 0);
14015 else if (GET_CODE (op1) == VEC_SELECT)
14016 op1 = XEXP (op1, 0);
14017
14018 /* If the remaining parameters are not registers,
14019 get the cost to put them into registers. */
e548c9df
AM
14020 *cost += rtx_cost (op0, mode, FMA, 0, speed);
14021 *cost += rtx_cost (op1, mode, FMA, 1, speed);
14022 *cost += rtx_cost (op2, mode, FMA, 2, speed);
b292109f
JG
14023 return true;
14024
5e2a765b
KT
14025 case FLOAT:
14026 case UNSIGNED_FLOAT:
14027 if (speed)
14028 *cost += extra_cost->fp[mode == DFmode].fromint;
14029 return false;
14030
b292109f
JG
14031 case FLOAT_EXTEND:
14032 if (speed)
b6875aac
KV
14033 {
14034 if (VECTOR_MODE_P (mode))
14035 {
14036 /* Vector widening conversion. */
14037 *cost += extra_cost->vect.alu;
14038 }
14039 else
14040 *cost += extra_cost->fp[mode == DFmode].widen;
14041 }
b292109f
JG
14042 return false;
14043
14044 case FLOAT_TRUNCATE:
14045 if (speed)
b6875aac
KV
14046 {
14047 if (VECTOR_MODE_P (mode))
14048 {
14049 /* Vector narrowing conversion. */
14050 *cost += extra_cost->vect.alu;
14051 }
14052 else
14053 *cost += extra_cost->fp[mode == DFmode].narrow;
14054 }
b292109f
JG
14055 return false;
14056
61263118
KT
14057 case FIX:
14058 case UNSIGNED_FIX:
14059 x = XEXP (x, 0);
14060 /* Strip the rounding part. They will all be implemented
14061 by the fcvt* family of instructions anyway. */
14062 if (GET_CODE (x) == UNSPEC)
14063 {
14064 unsigned int uns_code = XINT (x, 1);
14065
14066 if (uns_code == UNSPEC_FRINTA
14067 || uns_code == UNSPEC_FRINTM
14068 || uns_code == UNSPEC_FRINTN
14069 || uns_code == UNSPEC_FRINTP
14070 || uns_code == UNSPEC_FRINTZ)
14071 x = XVECEXP (x, 0, 0);
14072 }
14073
14074 if (speed)
b6875aac
KV
14075 {
14076 if (VECTOR_MODE_P (mode))
14077 *cost += extra_cost->vect.alu;
14078 else
14079 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
14080 }
39252973
KT
14081
14082 /* We can combine fmul by a power of 2 followed by a fcvt into a single
14083 fixed-point fcvt. */
14084 if (GET_CODE (x) == MULT
14085 && ((VECTOR_MODE_P (mode)
14086 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
14087 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
14088 {
14089 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
14090 0, speed);
14091 return true;
14092 }
14093
e548c9df 14094 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
61263118
KT
14095 return true;
14096
b292109f 14097 case ABS:
b6875aac
KV
14098 if (VECTOR_MODE_P (mode))
14099 {
14100 /* ABS (vector). */
14101 if (speed)
14102 *cost += extra_cost->vect.alu;
14103 }
14104 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
b292109f 14105 {
19261b99
KT
14106 op0 = XEXP (x, 0);
14107
14108 /* FABD, which is analogous to FADD. */
14109 if (GET_CODE (op0) == MINUS)
14110 {
e548c9df
AM
14111 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
14112 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
19261b99
KT
14113 if (speed)
14114 *cost += extra_cost->fp[mode == DFmode].addsub;
14115
14116 return true;
14117 }
14118 /* Simple FABS is analogous to FNEG. */
b292109f
JG
14119 if (speed)
14120 *cost += extra_cost->fp[mode == DFmode].neg;
14121 }
14122 else
14123 {
14124 /* Integer ABS will either be split into
14125 two arithmetic instructions, or will be an ABS
14126 (scalar), which we don't model. */
14127 *cost = COSTS_N_INSNS (2);
14128 if (speed)
14129 *cost += 2 * extra_cost->alu.arith;
14130 }
14131 return false;
14132
14133 case SMAX:
14134 case SMIN:
14135 if (speed)
14136 {
b6875aac
KV
14137 if (VECTOR_MODE_P (mode))
14138 *cost += extra_cost->vect.alu;
14139 else
14140 {
14141 /* FMAXNM/FMINNM/FMAX/FMIN.
14142 TODO: This may not be accurate for all implementations, but
14143 we do not model this in the cost tables. */
14144 *cost += extra_cost->fp[mode == DFmode].addsub;
14145 }
b292109f
JG
14146 }
14147 return false;
14148
61263118
KT
14149 case UNSPEC:
14150 /* The floating point round to integer frint* instructions. */
14151 if (aarch64_frint_unspec_p (XINT (x, 1)))
14152 {
14153 if (speed)
14154 *cost += extra_cost->fp[mode == DFmode].roundint;
14155
14156 return false;
14157 }
781aeb73
KT
14158
14159 if (XINT (x, 1) == UNSPEC_RBIT)
14160 {
14161 if (speed)
14162 *cost += extra_cost->alu.rev;
14163
14164 return false;
14165 }
61263118
KT
14166 break;
14167
fb620c4a
JG
14168 case TRUNCATE:
14169
14170 /* Decompose <su>muldi3_highpart. */
14171 if (/* (truncate:DI */
14172 mode == DImode
14173 /* (lshiftrt:TI */
14174 && GET_MODE (XEXP (x, 0)) == TImode
14175 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
14176 /* (mult:TI */
14177 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
14178 /* (ANY_EXTEND:TI (reg:DI))
14179 (ANY_EXTEND:TI (reg:DI))) */
14180 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
14181 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
14182 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
14183 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
14184 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
14185 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
14186 /* (const_int 64) */
14187 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
14188 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
14189 {
14190 /* UMULH/SMULH. */
14191 if (speed)
14192 *cost += extra_cost->mult[mode == DImode].extend;
e548c9df
AM
14193 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
14194 mode, MULT, 0, speed);
14195 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
14196 mode, MULT, 1, speed);
fb620c4a
JG
14197 return true;
14198 }
1d5c43db
TC
14199 break;
14200 case CONST_VECTOR:
14201 {
14202 /* Load using MOVI/MVNI. */
14203 if (aarch64_simd_valid_immediate (x, NULL))
14204 *cost = extra_cost->vect.movi;
14205 else /* Load using constant pool. */
14206 *cost = extra_cost->ldst.load;
14207 break;
14208 }
14209 case VEC_CONCAT:
14210 /* Depending on the operation, this is either a DUP or an INS.
14211 For now, keep the default costing. */
14212 break;
14213 case VEC_DUPLICATE:
14214 /* Load using a DUP. */
14215 *cost = extra_cost->vect.dup;
14216 return false;
14217 case VEC_SELECT:
14218 {
14219 rtx op0 = XEXP (x, 0);
14220 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
fb620c4a 14221
1d5c43db
TC
14222 /* Selecting the low part is free; the high part costs as a DUP, anything else as an extract. */
14223 rtx op1 = XEXP (x, 1);
14224 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
14225 ;
14226 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
14227 *cost = extra_cost->vect.dup;
14228 else
14229 *cost = extra_cost->vect.extract;
14230 return true;
14231 }
43e9d192 14232 default:
61263118 14233 break;
43e9d192 14234 }
61263118 14235
c10e3d7f
AP
14236 if (dump_file
14237 && flag_aarch64_verbose_cost)
61263118
KT
14238 fprintf (dump_file,
14239 "\nFailed to cost RTX. Assuming default cost.\n");
14240
14241 return true;
43e9d192
IB
14242}
14243
0ee859b5
JG
14244/* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
14245 calculated for X. This cost is stored in *COST. Returns true
14246 if the total cost of X was calculated. */
14247static bool
e548c9df 14248aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
0ee859b5
JG
14249 int param, int *cost, bool speed)
14250{
e548c9df 14251 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
0ee859b5 14252
c10e3d7f
AP
14253 if (dump_file
14254 && flag_aarch64_verbose_cost)
0ee859b5
JG
14255 {
14256 print_rtl_single (dump_file, x);
14257 fprintf (dump_file, "\n%s cost: %d (%s)\n",
14258 speed ? "Hot" : "Cold",
14259 *cost, result ? "final" : "partial");
14260 }
14261
14262 return result;
14263}
14264
43e9d192 14265static int
ef4bddc2 14266aarch64_register_move_cost (machine_mode mode,
8a3a7e67 14267 reg_class_t from_i, reg_class_t to_i)
43e9d192 14268{
8a3a7e67
RH
14269 enum reg_class from = (enum reg_class) from_i;
14270 enum reg_class to = (enum reg_class) to_i;
43e9d192 14271 const struct cpu_regmove_cost *regmove_cost
b175b679 14272 = aarch64_tune_params.regmove_cost;
43e9d192 14273
3be07662 14274 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
96b7f495
MM
14275 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
14276 || to == STUB_REGS)
3be07662
WD
14277 to = GENERAL_REGS;
14278
96b7f495
MM
14279 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
14280 || from == STUB_REGS)
3be07662
WD
14281 from = GENERAL_REGS;
14282
183bfdaf
RS
14283 /* Make RDFFR very expensive. In particular, if we know that the FFR
14284 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
14285 as a way of obtaining a PTRUE. */
14286 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
14287 && hard_reg_set_subset_p (reg_class_contents[from_i],
14288 reg_class_contents[FFR_REGS]))
14289 return 80;
14290
6ee70f81
AP
14291 /* Moving between GPR and stack cost is the same as GP2GP. */
14292 if ((from == GENERAL_REGS && to == STACK_REG)
14293 || (to == GENERAL_REGS && from == STACK_REG))
14294 return regmove_cost->GP2GP;
14295
14296 /* To/From the stack register, we move via the gprs. */
14297 if (to == STACK_REG || from == STACK_REG)
14298 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
14299 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
14300
6a70badb 14301 if (known_eq (GET_MODE_SIZE (mode), 16))
8919453c
WD
14302 {
14303 /* 128-bit operations on general registers require 2 instructions. */
14304 if (from == GENERAL_REGS && to == GENERAL_REGS)
14305 return regmove_cost->GP2GP * 2;
14306 else if (from == GENERAL_REGS)
14307 return regmove_cost->GP2FP * 2;
14308 else if (to == GENERAL_REGS)
14309 return regmove_cost->FP2GP * 2;
14310
14311 /* When AdvSIMD instructions are disabled it is not possible to move
14312 a 128-bit value directly between Q registers. This is handled in
14313 secondary reload. A general register is used as a scratch to move
14314 the upper DI value and the lower DI value is moved directly,
14315 hence the cost is the sum of three moves. */
14316 if (! TARGET_SIMD)
14317 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
14318
14319 return regmove_cost->FP2FP;
14320 }
14321
43e9d192
IB
14322 if (from == GENERAL_REGS && to == GENERAL_REGS)
14323 return regmove_cost->GP2GP;
14324 else if (from == GENERAL_REGS)
14325 return regmove_cost->GP2FP;
14326 else if (to == GENERAL_REGS)
14327 return regmove_cost->FP2GP;
14328
43e9d192
IB
14329 return regmove_cost->FP2FP;
14330}
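/* For example, a TImode copy between two general registers is costed as
   2 * GP2GP because it needs a pair of 64-bit moves, and without
   TARGET_SIMD a 128-bit FP-to-FP copy is costed as GP2FP + FP2GP + FP2FP
   to account for bouncing the upper half through a general register.  */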
14331
14332static int
ef4bddc2 14333aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
43e9d192
IB
14334 reg_class_t rclass ATTRIBUTE_UNUSED,
14335 bool in ATTRIBUTE_UNUSED)
14336{
b175b679 14337 return aarch64_tune_params.memmov_cost;
43e9d192
IB
14338}
14339
6d4d616a
RS
14340/* Implement TARGET_INIT_BUILTINS. */
14341static void
14342aarch64_init_builtins ()
14343{
14344 aarch64_general_init_builtins ();
624d0f07 14345 aarch64_sve::init_builtins ();
f9d4544d
MR
14346#ifdef SUBTARGET_INIT_BUILTINS
14347 SUBTARGET_INIT_BUILTINS;
14348#endif
6d4d616a
RS
14349}
14350
14351/* Implement TARGET_FOLD_BUILTIN. */
14352static tree
14353aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
14354{
14355 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
14356 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
14357 tree type = TREE_TYPE (TREE_TYPE (fndecl));
14358 switch (code & AARCH64_BUILTIN_CLASS)
14359 {
14360 case AARCH64_BUILTIN_GENERAL:
14361 return aarch64_general_fold_builtin (subcode, type, nargs, args);
624d0f07
RS
14362
14363 case AARCH64_BUILTIN_SVE:
14364 return NULL_TREE;
6d4d616a
RS
14365 }
14366 gcc_unreachable ();
14367}
14368
14369/* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
14370static bool
14371aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
14372{
14373 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
14374 tree fndecl = gimple_call_fndecl (stmt);
14375 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
14376 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
14377 gimple *new_stmt = NULL;
14378 switch (code & AARCH64_BUILTIN_CLASS)
14379 {
14380 case AARCH64_BUILTIN_GENERAL:
ad44c6a5 14381 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
6d4d616a 14382 break;
624d0f07
RS
14383
14384 case AARCH64_BUILTIN_SVE:
14385 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
14386 break;
6d4d616a
RS
14387 }
14388
14389 if (!new_stmt)
14390 return false;
14391
14392 gsi_replace (gsi, new_stmt, true);
14393 return true;
14394}
14395
14396/* Implement TARGET_EXPAND_BUILTIN. */
14397static rtx
c5dc215d 14398aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
6d4d616a
RS
14399{
14400 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
14401 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
14402 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
14403 switch (code & AARCH64_BUILTIN_CLASS)
14404 {
14405 case AARCH64_BUILTIN_GENERAL:
c5dc215d 14406 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
624d0f07
RS
14407
14408 case AARCH64_BUILTIN_SVE:
14409 return aarch64_sve::expand_builtin (subcode, exp, target);
6d4d616a
RS
14410 }
14411 gcc_unreachable ();
14412}
14413
14414/* Implement TARGET_BUILTIN_DECL. */
14415static tree
14416aarch64_builtin_decl (unsigned int code, bool initialize_p)
14417{
14418 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
14419 switch (code & AARCH64_BUILTIN_CLASS)
14420 {
14421 case AARCH64_BUILTIN_GENERAL:
14422 return aarch64_general_builtin_decl (subcode, initialize_p);
624d0f07
RS
14423
14424 case AARCH64_BUILTIN_SVE:
14425 return aarch64_sve::builtin_decl (subcode, initialize_p);
6d4d616a
RS
14426 }
14427 gcc_unreachable ();
14428}
14429
0c30e0f3
EM
14430/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
14431 to optimize 1.0/sqrt. */
ee62a5a6
RS
14432
14433static bool
9acc9cbe 14434use_rsqrt_p (machine_mode mode)
ee62a5a6
RS
14435{
14436 return (!flag_trapping_math
14437 && flag_unsafe_math_optimizations
9acc9cbe
EM
14438 && ((aarch64_tune_params.approx_modes->recip_sqrt
14439 & AARCH64_APPROX_MODE (mode))
1a33079e 14440 || flag_mrecip_low_precision_sqrt));
ee62a5a6
RS
14441}
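/* In practice this is enabled either by tunings that set the relevant
   AARCH64_APPROX_MODE bit under fast-math style options, or explicitly
   via -mlow-precision-recip-sqrt.  */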
14442
0c30e0f3
EM
14443/* Function to decide when to use the approximate reciprocal square root
14444 builtin. */
a6fc00da
BH
14445
14446static tree
ee62a5a6 14447aarch64_builtin_reciprocal (tree fndecl)
a6fc00da 14448{
9acc9cbe
EM
14449 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
14450
14451 if (!use_rsqrt_p (mode))
a6fc00da 14452 return NULL_TREE;
6d4d616a
RS
14453 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
14454 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
14455 switch (code & AARCH64_BUILTIN_CLASS)
14456 {
14457 case AARCH64_BUILTIN_GENERAL:
14458 return aarch64_general_builtin_rsqrt (subcode);
624d0f07
RS
14459
14460 case AARCH64_BUILTIN_SVE:
14461 return NULL_TREE;
6d4d616a
RS
14462 }
14463 gcc_unreachable ();
a6fc00da
BH
14464}
14465
04f307cb
RS
14466/* Emit code to perform the floating-point operation:
14467
14468 DST = SRC1 * SRC2
14469
14470 where all three operands are already known to be registers.
14471 If the operation is an SVE one, PTRUE is a suitable all-true
14472 predicate. */
14473
14474static void
14475aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
14476{
14477 if (ptrue)
14478 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
14479 dst, ptrue, src1, src2,
14480 gen_int_mode (SVE_RELAXED_GP, SImode)));
14481 else
14482 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
14483}
14484
98daafa0
EM
14485/* Emit instruction sequence to compute either the approximate square root
14486 or its approximate reciprocal, depending on the flag RECP, and return
14487 whether the sequence was emitted or not. */
a6fc00da 14488
98daafa0
EM
14489bool
14490aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
a6fc00da 14491{
98daafa0 14492 machine_mode mode = GET_MODE (dst);
daef0a8c
JW
14493
14494 if (GET_MODE_INNER (mode) == HFmode)
2e19adc8
RE
14495 {
14496 gcc_assert (!recp);
14497 return false;
14498 }
14499
2e19adc8
RE
14500 if (!recp)
14501 {
14502 if (!(flag_mlow_precision_sqrt
14503 || (aarch64_tune_params.approx_modes->sqrt
14504 & AARCH64_APPROX_MODE (mode))))
14505 return false;
14506
902d28bd 14507 if (!flag_finite_math_only
2e19adc8
RE
14508 || flag_trapping_math
14509 || !flag_unsafe_math_optimizations
14510 || optimize_function_for_size_p (cfun))
14511 return false;
14512 }
14513 else
14514 /* Caller assumes we cannot fail. */
14515 gcc_assert (use_rsqrt_p (mode));
daef0a8c 14516
a0ee8352
RS
14517 rtx pg = NULL_RTX;
14518 if (aarch64_sve_mode_p (mode))
14519 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
d7814449 14520 machine_mode mmsk = (VECTOR_MODE_P (mode)
d083ee47 14521 ? related_int_vector_mode (mode).require ()
d7814449 14522 : int_mode_for_mode (mode).require ());
0df28e68 14523 rtx xmsk = NULL_RTX;
98daafa0 14524 if (!recp)
0df28e68
RS
14525 {
14526 /* When calculating the approximate square root, compare the
14527 argument with 0.0 and create a mask. */
a0ee8352
RS
14528 rtx zero = CONST0_RTX (mode);
14529 if (pg)
14530 {
14531 xmsk = gen_reg_rtx (GET_MODE (pg));
14532 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
14533 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
14534 xmsk, pg, hint, src, zero));
14535 }
14536 else
14537 {
14538 xmsk = gen_reg_rtx (mmsk);
14539 emit_insn (gen_rtx_SET (xmsk,
14540 gen_rtx_NEG (mmsk,
14541 gen_rtx_EQ (mmsk, src, zero))));
14542 }
0df28e68 14543 }
a6fc00da 14544
98daafa0
EM
14545 /* Estimate the approximate reciprocal square root. */
14546 rtx xdst = gen_reg_rtx (mode);
0016d8d9 14547 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
a6fc00da 14548
98daafa0
EM
14549 /* Iterate over the series twice for SF and thrice for DF. */
14550 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
a6fc00da 14551
98daafa0
EM
14552 /* Optionally do one iteration fewer of the series, trading a little
14553 accuracy for faster performance. */
14554 if ((recp && flag_mrecip_low_precision_sqrt)
14555 || (!recp && flag_mlow_precision_sqrt))
a6fc00da
BH
14556 iterations--;
14557
98daafa0
EM
14558 /* Iterate over the series to calculate the approximate reciprocal square
14559 root. */
14560 rtx x1 = gen_reg_rtx (mode);
14561 while (iterations--)
a6fc00da 14562 {
a6fc00da 14563 rtx x2 = gen_reg_rtx (mode);
a0ee8352 14564 aarch64_emit_mult (x2, pg, xdst, xdst);
98daafa0 14565
0016d8d9 14566 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
a6fc00da 14567
98daafa0 14568 if (iterations > 0)
a0ee8352 14569 aarch64_emit_mult (xdst, pg, xdst, x1);
98daafa0
EM
14570 }
14571
14572 if (!recp)
14573 {
a0ee8352
RS
14574 if (pg)
14575 /* Multiply nonzero source values by the corresponding intermediate
14576 result elements, so that the final calculation is the approximate
14577 square root rather than its reciprocal. Select a zero result for
14578 zero source values, to avoid the Inf * 0 -> NaN that we'd get
14579 otherwise. */
14580 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
14581 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
14582 else
14583 {
14584 /* Qualify the approximate reciprocal square root when the
14585 argument is 0.0 by squashing the intermediary result to 0.0. */
14586 rtx xtmp = gen_reg_rtx (mmsk);
14587 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
14588 gen_rtx_SUBREG (mmsk, xdst, 0)));
14589 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
a6fc00da 14590
a0ee8352
RS
14591 /* Calculate the approximate square root. */
14592 aarch64_emit_mult (xdst, pg, xdst, src);
14593 }
a6fc00da
BH
14594 }
14595
98daafa0 14596 /* Finalize the approximation. */
a0ee8352 14597 aarch64_emit_mult (dst, pg, xdst, x1);
98daafa0
EM
14598
14599 return true;
a6fc00da
BH
14600}
14601
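/* Editor's note: an illustrative, self-contained sketch (not part of GCC)
   of the arithmetic performed by the FRSQRTE/FRSQRTS sequence emitted
   above.  FRSQRTE supplies a rough estimate x0 ~= 1/sqrt(a) and each
   FRSQRTS result (3 - a*x*x)/2, multiplied into x, is one Newton-Raphson
   refinement of the reciprocal square root; two refinements are used for
   SFmode and three for DFmode, matching ITERATIONS above.  The
   initial-estimate helper is a hypothetical stand-in for the hardware
   instruction.  */

#include <math.h>

static double
frsqrte_stand_in (double a)
{
  /* Any estimate with a few correct leading bits will do here.  */
  return (1.0 / sqrt (a)) * (1.0 + 0x1p-9);
}

static double
approx_rsqrt_sketch (double a, int iterations)
{
  double x = frsqrte_stand_in (a);
  while (iterations--)
    {
      double step = (3.0 - a * x * x) / 2.0;	/* FRSQRTS (a, x*x).  */
      x = x * step;				/* Refine the estimate.  */
    }
  return x;
}

/* The square root itself is then a * approx_rsqrt_sketch (a, n), with the
   result forced to zero when a == 0.0 to avoid Inf * 0 -> NaN, as the
   masking code above arranges.  */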
79a2bc2d
EM
14602/* Emit the instruction sequence to compute the approximation for the division
14603 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
14604
14605bool
14606aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
14607{
14608 machine_mode mode = GET_MODE (quo);
33d72b63
JW
14609
14610 if (GET_MODE_INNER (mode) == HFmode)
14611 return false;
14612
79a2bc2d
EM
14613 bool use_approx_division_p = (flag_mlow_precision_div
14614 || (aarch64_tune_params.approx_modes->division
14615 & AARCH64_APPROX_MODE (mode)));
14616
14617 if (!flag_finite_math_only
14618 || flag_trapping_math
14619 || !flag_unsafe_math_optimizations
14620 || optimize_function_for_size_p (cfun)
14621 || !use_approx_division_p)
14622 return false;
14623
1be49a38
RR
14624 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
14625 return false;
14626
04f307cb
RS
14627 rtx pg = NULL_RTX;
14628 if (aarch64_sve_mode_p (mode))
14629 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
14630
79a2bc2d
EM
14631 /* Estimate the approximate reciprocal. */
14632 rtx xrcp = gen_reg_rtx (mode);
0016d8d9 14633 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
79a2bc2d
EM
14634
14635 /* Iterate over the series twice for SF and thrice for DF. */
14636 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
14637
dbf3dc75
BL
14638 /* Optionally do fewer iterations of the series, trading a little accuracy
14639 for faster performance. The reduced count defaults to 2 for DF and 1 for SF. */
79a2bc2d 14640 if (flag_mlow_precision_div)
dbf3dc75
BL
14641 iterations = (GET_MODE_INNER (mode) == DFmode
14642 ? aarch64_double_recp_precision
14643 : aarch64_float_recp_precision);
79a2bc2d
EM
14644
14645 /* Iterate over the series to calculate the approximate reciprocal. */
14646 rtx xtmp = gen_reg_rtx (mode);
14647 while (iterations--)
14648 {
0016d8d9 14649 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
79a2bc2d
EM
14650
14651 if (iterations > 0)
04f307cb 14652 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
79a2bc2d
EM
14653 }
14654
14655 if (num != CONST1_RTX (mode))
14656 {
14657 /* As the approximate reciprocal of DEN is already calculated, only
14658 calculate the approximate division when NUM is not 1.0. */
14659 rtx xnum = force_reg (mode, num);
04f307cb 14660 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
79a2bc2d
EM
14661 }
14662
14663 /* Finalize the approximation. */
04f307cb 14664 aarch64_emit_mult (quo, pg, xrcp, xtmp);
79a2bc2d
EM
14665 return true;
14666}
14667
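/* Editor's note: an illustrative, self-contained sketch (not part of GCC)
   of the refinement implemented by the FRECPE/FRECPS sequence emitted
   above.  FRECPE gives a rough estimate x0 ~= 1/den and each FRECPS result
   2 - den*x, multiplied into x, is one Newton-Raphson refinement of the
   reciprocal; the quotient is then num * (1/den).  The initial-estimate
   helper is a hypothetical stand-in for the hardware instruction, and at
   least one iteration is assumed, as in the code above.  */

static double
frecpe_stand_in (double den)
{
  /* A rough reciprocal estimate; a few correct leading bits suffice.  */
  return (1.0 / den) * (1.0 - 0x1p-9);
}

static double
approx_div_sketch (double num, double den, int iterations)
{
  double x = frecpe_stand_in (den);
  double step = 1.0;
  while (iterations--)
    {
      step = 2.0 - den * x;		/* FRECPS (x, den).  */
      if (iterations > 0)
	x = x * step;			/* Refine the reciprocal.  */
    }
  /* Fold the final refinement into the multiplication by NUM, as the RTL
     sequence above does (skipping the extra multiply when NUM is 1.0).  */
  if (num != 1.0)
    x = x * num;
  return x * step;
}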
d126a4ae
AP
14668/* Return the number of instructions that can be issued per cycle. */
14669static int
14670aarch64_sched_issue_rate (void)
14671{
b175b679 14672 return aarch64_tune_params.issue_rate;
d126a4ae
AP
14673}
14674
d0bc0cb6
RS
14675/* Implement TARGET_SCHED_VARIABLE_ISSUE. */
14676static int
14677aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
14678{
14679 if (DEBUG_INSN_P (insn))
14680 return more;
14681
14682 rtx_code code = GET_CODE (PATTERN (insn));
14683 if (code == USE || code == CLOBBER)
14684 return more;
14685
14686 if (get_attr_type (insn) == TYPE_NO_INSN)
14687 return more;
14688
14689 return more - 1;
14690}
14691
d03f7e44
MK
14692static int
14693aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
14694{
14695 int issue_rate = aarch64_sched_issue_rate ();
14696
14697 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
14698}
14699
2d6bc7fa
KT
14700
14701/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
14702 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
14703 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
14704
14705static int
14706aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
14707 int ready_index)
14708{
14709 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
14710}
14711
14712
8990e73a
TB
14713/* Vectorizer cost model target hooks. */
14714
1205a8ca
RS
14715/* Information about how the CPU would issue the scalar, Advanced SIMD
14716 or SVE version of a vector loop, using the scheme defined by the
14717 aarch64_base_vec_issue_info hierarchy of structures. */
15aba5a6 14718class aarch64_vec_op_count
1205a8ca 14719{
15aba5a6 14720public:
1a5288fe 14721 aarch64_vec_op_count () = default;
2e1886ea
RS
14722 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
14723 unsigned int = 1);
15aba5a6
RS
14724
14725 unsigned int vec_flags () const { return m_vec_flags; }
2e1886ea
RS
14726 unsigned int vf_factor () const { return m_vf_factor; }
14727
15aba5a6
RS
14728 const aarch64_base_vec_issue_info *base_issue_info () const;
14729 const aarch64_simd_vec_issue_info *simd_issue_info () const;
14730 const aarch64_sve_vec_issue_info *sve_issue_info () const;
14731
a82ffd43
RS
14732 fractional_cost rename_cycles_per_iter () const;
14733 fractional_cost min_nonpred_cycles_per_iter () const;
14734 fractional_cost min_pred_cycles_per_iter () const;
14735 fractional_cost min_cycles_per_iter () const;
14736
1205a8ca
RS
14737 void dump () const;
14738
14739 /* The number of individual "general" operations. See the comments
14740 in aarch64_base_vec_issue_info for details. */
14741 unsigned int general_ops = 0;
14742
14743 /* The number of load and store operations, under the same scheme
14744 as above. */
14745 unsigned int loads = 0;
14746 unsigned int stores = 0;
14747
14748 /* The minimum number of cycles needed to execute all loop-carried
14749 operations, which in the vector code become associated with
14750 reductions. */
14751 unsigned int reduction_latency = 0;
1205a8ca
RS
14752
14753 /* The number of individual predicate operations. See the comments
14754 in aarch64_sve_vec_issue_info for details. */
14755 unsigned int pred_ops = 0;
15aba5a6
RS
14756
14757private:
14758 /* The issue information for the core. */
1a5288fe 14759 const aarch64_vec_issue_info *m_issue_info = nullptr;
15aba5a6
RS
14760
14761 /* - If M_VEC_FLAGS is zero then this structure describes scalar code
14762 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
14763 Advanced SIMD code.
14764 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
14765 SVE code. */
1a5288fe 14766 unsigned int m_vec_flags = 0;
2e1886ea
RS
14767
14768 /* Assume that, when the code is executing on the core described
14769 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
14770 times more data than the vectorizer anticipates.
14771
14772 This is only ever different from 1 for SVE. It allows us to consider
14773 what would happen on a 256-bit SVE target even when the -mtune
14774 parameters say that the “likely” SVE length is 128 bits. */
14775 unsigned int m_vf_factor = 1;
1205a8ca
RS
14776};
14777
15aba5a6
RS
14778aarch64_vec_op_count::
14779aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
2e1886ea 14780 unsigned int vec_flags, unsigned int vf_factor)
15aba5a6 14781 : m_issue_info (issue_info),
2e1886ea
RS
14782 m_vec_flags (vec_flags),
14783 m_vf_factor (vf_factor)
15aba5a6
RS
14784{
14785}
14786
14787/* Return the base issue information (i.e. the parts that make sense
14788 for both scalar and vector code). Return null if we have no issue
14789 information. */
14790const aarch64_base_vec_issue_info *
14791aarch64_vec_op_count::base_issue_info () const
14792{
14793 if (auto *ret = simd_issue_info ())
14794 return ret;
1a5288fe 14795 return m_issue_info->scalar;
15aba5a6
RS
14796}
14797
14798/* If the structure describes vector code and we have associated issue
14799 information, return that issue information, otherwise return null. */
14800const aarch64_simd_vec_issue_info *
14801aarch64_vec_op_count::simd_issue_info () const
14802{
14803 if (auto *ret = sve_issue_info ())
14804 return ret;
1a5288fe 14805 if (m_vec_flags)
15aba5a6
RS
14806 return m_issue_info->advsimd;
14807 return nullptr;
14808}
14809
14810/* If the structure describes SVE code and we have associated issue
14811 information, return that issue information, otherwise return null. */
14812const aarch64_sve_vec_issue_info *
14813aarch64_vec_op_count::sve_issue_info () const
14814{
1a5288fe 14815 if (m_vec_flags & VEC_ANY_SVE)
15aba5a6
RS
14816 return m_issue_info->sve;
14817 return nullptr;
14818}
14819
a82ffd43
RS
14820/* Estimate the minimum number of cycles per iteration needed to rename
14821 the instructions.
14822
14823 ??? For now this is done inline rather than via cost tables, since it
14824 isn't clear how it should be parameterized for the general case. */
14825fractional_cost
14826aarch64_vec_op_count::rename_cycles_per_iter () const
14827{
14828 if (sve_issue_info () == &neoverse512tvb_sve_issue_info)
14829 /* + 1 for an addition. We've already counted a general op for each
14830 store, so we don't need to account for stores separately. The branch
14831 reads no registers and so does not need to be counted either.
14832
14833 ??? This value is very much on the pessimistic side, but seems to work
14834 pretty well in practice. */
14835 return { general_ops + loads + pred_ops + 1, 5 };
14836
14837 return 0;
14838}
14839
14840/* Like min_cycles_per_iter, but excluding predicate operations. */
14841fractional_cost
14842aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
14843{
14844 auto *issue_info = base_issue_info ();
14845
14846 fractional_cost cycles = MAX (reduction_latency, 1);
14847 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
14848 cycles = std::max (cycles, { loads + stores,
14849 issue_info->loads_stores_per_cycle });
14850 cycles = std::max (cycles, { general_ops,
14851 issue_info->general_ops_per_cycle });
14852 cycles = std::max (cycles, rename_cycles_per_iter ());
14853 return cycles;
14854}
14855
14856/* Like min_cycles_per_iter, but including only the predicate operations. */
14857fractional_cost
14858aarch64_vec_op_count::min_pred_cycles_per_iter () const
14859{
14860 if (auto *issue_info = sve_issue_info ())
14861 return { pred_ops, issue_info->pred_ops_per_cycle };
14862 return 0;
14863}
14864
14865/* Estimate the minimum number of cycles needed to issue the operations.
14866 This is a very simplistic model! */
14867fractional_cost
14868aarch64_vec_op_count::min_cycles_per_iter () const
14869{
14870 return std::max (min_nonpred_cycles_per_iter (),
14871 min_pred_cycles_per_iter ());
14872}
14873
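/* Editor's note: an illustrative sketch (not part of GCC) of the throughput
   bound that min_nonpred_cycles_per_iter models.  Each resource imposes a
   lower bound of ops / ops-per-cycle cycles and the loop cannot issue
   faster than the slowest of them; the real code keeps these values as
   exact fractions (fractional_cost) rather than doubles, and additionally
   folds in the rename and predicate limits.  The structure below is made
   up for the example.  */

struct issue_limits_sketch
{
  double stores_per_cycle;
  double loads_stores_per_cycle;
  double general_ops_per_cycle;
};

static double
min_cycles_per_iter_sketch (unsigned int loads, unsigned int stores,
			    unsigned int general_ops,
			    double reduction_latency,
			    const struct issue_limits_sketch *limits)
{
  double cycles = reduction_latency > 1.0 ? reduction_latency : 1.0;
  double bound;

  bound = stores / limits->stores_per_cycle;
  cycles = bound > cycles ? bound : cycles;
  bound = (loads + stores) / limits->loads_stores_per_cycle;
  cycles = bound > cycles ? bound : cycles;
  bound = general_ops / limits->general_ops_per_cycle;
  cycles = bound > cycles ? bound : cycles;
  return cycles;
}

/* For example, 2 loads, 1 store and 4 general ops on a core that can issue
   2 load/stores and 2 general ops per cycle give
   max (1, 1/2, 3/2, 4/2) = 2 cycles per iteration.  */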
14874/* Dump information about the structure. */
14875void
14876aarch64_vec_op_count::dump () const
14877{
14878 dump_printf_loc (MSG_NOTE, vect_location,
14879 " load operations = %d\n", loads);
14880 dump_printf_loc (MSG_NOTE, vect_location,
14881 " store operations = %d\n", stores);
14882 dump_printf_loc (MSG_NOTE, vect_location,
14883 " general operations = %d\n", general_ops);
14884 if (sve_issue_info ())
14885 dump_printf_loc (MSG_NOTE, vect_location,
14886 " predicate operations = %d\n", pred_ops);
14887 dump_printf_loc (MSG_NOTE, vect_location,
14888 " reduction latency = %d\n", reduction_latency);
14889 if (auto rcpi = rename_cycles_per_iter ())
14890 dump_printf_loc (MSG_NOTE, vect_location,
14891 " estimated cycles per iteration to rename = %f\n",
14892 rcpi.as_double ());
14893 if (auto pred_cpi = min_pred_cycles_per_iter ())
14894 {
14895 dump_printf_loc (MSG_NOTE, vect_location,
14896 " estimated min cycles per iteration"
14897 " without predication = %f\n",
14898 min_nonpred_cycles_per_iter ().as_double ());
14899 dump_printf_loc (MSG_NOTE, vect_location,
14900 " estimated min cycles per iteration"
14901 " for predication = %f\n", pred_cpi.as_double ());
14902 }
14903 if (auto cpi = min_cycles_per_iter ())
14904 dump_printf_loc (MSG_NOTE, vect_location,
14905 " estimated min cycles per iteration = %f\n",
14906 cpi.as_double ());
14907}
14908
50a525b5 14909/* Information about vector code that we're in the process of costing. */
d43fc1df 14910class aarch64_vector_costs : public vector_costs
50a525b5 14911{
d43fc1df 14912public:
15aba5a6 14913 aarch64_vector_costs (vec_info *, bool);
6239dd05
RS
14914
14915 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
14916 stmt_vec_info stmt_info, tree vectype,
14917 int misalign,
14918 vect_cost_model_location where) override;
0612883d 14919 void finish_cost (const vector_costs *) override;
c6c5c5eb 14920 bool better_main_loop_than_p (const vector_costs *other) const override;
3b924b0d 14921
d43fc1df
RS
14922private:
14923 void record_potential_advsimd_unrolling (loop_vec_info);
14924 void analyze_loop_vinfo (loop_vec_info);
87fcff96
RS
14925 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
14926 aarch64_vec_op_count *);
1a5288fe 14927 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
c6c5c5eb
RS
14928 fractional_cost, unsigned int,
14929 unsigned int *, bool *);
6756706e
RS
14930 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
14931 unsigned int);
c6c5c5eb 14932 bool prefer_unrolled_loop () const;
d43fc1df
RS
14933
14934 /* True if we have performed one-time initialization based on the
14935 vec_info. */
14936 bool m_analyzed_vinfo = false;
3b924b0d 14937
d43fc1df
RS
14938 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
14939 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
3b924b0d 14940 SIMD code.
d43fc1df
RS
14941 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
14942 unsigned int m_vec_flags = 0;
3b924b0d
RS
14943
14944 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
14945 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
14946 situations, we try to predict whether an Advanced SIMD implementation
14947 of the loop could be completely unrolled and become straight-line code.
14948 If so, it is generally better to use the Advanced SIMD version rather
14949 than length-agnostic SVE, since the SVE loop would execute an unknown
14950 number of times and so could not be completely unrolled in the same way.
14951
d43fc1df 14952 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
3b924b0d 14953 number of Advanced SIMD loop iterations that would be unrolled and
d43fc1df 14954 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
3b924b0d
RS
14955 in the unrolled loop. Both values are zero if we're not applying
14956 the heuristic. */
d43fc1df
RS
14957 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
14958 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
1205a8ca
RS
14959
14960 /* If we're vectorizing a loop that executes a constant number of times,
14961 this variable gives the number of times that the vector loop would
14962 iterate, otherwise it is zero. */
d43fc1df 14963 uint64_t m_num_vector_iterations = 0;
1205a8ca 14964
6756706e
RS
14965 /* Used only when vectorizing loops. Estimates the number and kind of
14966 operations that would be needed by one iteration of the scalar
1a5288fe
RS
14967 or vector loop. There is one entry for each tuning option of
14968 interest. */
14969 auto_vec<aarch64_vec_op_count, 2> m_ops;
50a525b5
RS
14970};
14971
15aba5a6
RS
14972aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
14973 bool costing_for_scalar)
14974 : vector_costs (vinfo, costing_for_scalar),
6756706e 14975 m_vec_flags (costing_for_scalar ? 0
1a5288fe 14976 : aarch64_classify_vector_mode (vinfo->vector_mode))
15aba5a6 14977{
1a5288fe
RS
14978 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
14979 {
14980 m_ops.quick_push ({ issue_info, m_vec_flags });
1a5288fe 14981 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
2e1886ea
RS
14982 {
14983 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
14984 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
14985 vf_factor });
14986 }
1a5288fe 14987 }
15aba5a6
RS
14988}
14989
6239dd05
RS
14990/* Implement TARGET_VECTORIZE_CREATE_COSTS. */
14991vector_costs *
14992aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
50a525b5 14993{
6239dd05 14994 return new aarch64_vector_costs (vinfo, costing_for_scalar);
50a525b5
RS
14995}
14996
e253bb8b
RS
14997/* Return true if the current CPU should use the new costs defined
14998 in GCC 11. This should be removed for GCC 12 and above, with the
14999 costs applying to all CPUs instead. */
15000static bool
15001aarch64_use_new_vector_costs_p ()
15002{
15003 return (aarch64_tune_params.extra_tuning_flags
15004 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
15005}
15006
15007/* Return the appropriate SIMD costs for vectors of type VECTYPE. */
15008static const simd_vec_cost *
15009aarch64_simd_vec_costs (tree vectype)
15010{
15011 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
15012 if (vectype != NULL
15013 && aarch64_sve_mode_p (TYPE_MODE (vectype))
15014 && costs->sve != NULL)
15015 return costs->sve;
15016 return costs->advsimd;
15017}
15018
1205a8ca
RS
15019/* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
15020static const simd_vec_cost *
15021aarch64_simd_vec_costs_for_flags (unsigned int flags)
15022{
15023 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
15024 if ((flags & VEC_ANY_SVE) && costs->sve)
15025 return costs->sve;
15026 return costs->advsimd;
15027}
15028
902b7c9e
RS
15029/* If STMT_INFO is a memory reference, return the scalar memory type,
15030 otherwise return null. */
15031static tree
15032aarch64_dr_type (stmt_vec_info stmt_info)
15033{
15034 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
15035 return TREE_TYPE (DR_REF (dr));
15036 return NULL_TREE;
15037}
15038
3b924b0d 15039/* Decide whether to use the unrolling heuristic described above
d43fc1df
RS
15040 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
15041 describes the loop that we're vectorizing. */
15042void
15043aarch64_vector_costs::
15044record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
3b924b0d
RS
15045{
15046 /* The heuristic only makes sense on targets that have the same
15047 vector throughput for SVE and Advanced SIMD. */
15048 if (!(aarch64_tune_params.extra_tuning_flags
15049 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
15050 return;
15051
15052 /* We only want to apply the heuristic if LOOP_VINFO is being
15053 vectorized for SVE. */
d43fc1df 15054 if (!(m_vec_flags & VEC_ANY_SVE))
3b924b0d
RS
15055 return;
15056
15057 /* Check whether it is possible in principle to use Advanced SIMD
15058 instead. */
15059 if (aarch64_autovec_preference == 2)
15060 return;
15061
15062 /* We don't want to apply the heuristic to outer loops, since it's
15063 harder to track two levels of unrolling. */
15064 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
15065 return;
15066
15067 /* Only handle cases in which the number of Advanced SIMD iterations
15068 would be known at compile time but the number of SVE iterations
15069 would not. */
15070 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
15071 || aarch64_sve_vg.is_constant ())
15072 return;
15073
15074 /* Guess how many times the Advanced SIMD loop would iterate and make
15075 sure that it is within the complete unrolling limit. Even if the
15076 number of iterations is small enough, the number of statements might
15077 not be, which is why we need to estimate the number of statements too. */
15078 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
15079 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
15080 unsigned HOST_WIDE_INT unrolled_advsimd_niters
15081 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
15082 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
15083 return;
15084
15085 /* Record that we're applying the heuristic and should try to estimate
15086 the number of statements in the Advanced SIMD loop. */
d43fc1df 15087 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
3b924b0d
RS
15088}
15089
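/* Editor's note: an illustrative sketch (not part of GCC) of the arithmetic
   behind the check above; the parameter names are made up.  */

#include <stdint.h>

static uint64_t
unrolled_advsimd_niters_sketch (uint64_t scalar_niters, unsigned int sve_vf,
				unsigned int estimated_sve_quadwords)
{
  /* CEIL (sve_vf, estimated_sve_quadwords): the Advanced SIMD VF that
     corresponds to the SVE VF used for costing.  */
  unsigned int advsimd_vf = (sve_vf + estimated_sve_quadwords - 1)
			    / estimated_sve_quadwords;
  return scalar_niters / advsimd_vf;
}

/* E.g. 64 scalar iterations, an SVE VF of 8 and an estimated SVE length of
   2 quadwords give an Advanced SIMD VF of CEIL (8, 2) = 4 and therefore
   64 / 4 = 16 fully unrolled iterations, which the code above then compares
   against --param max-completely-peel-times.  */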
d43fc1df
RS
15090/* Do one-time initialization of the aarch64_vector_costs given that we're
15091 costing the loop vectorization described by LOOP_VINFO. */
15092void
15093aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
3b924b0d 15094{
1205a8ca
RS
15095 /* Record the number of times that the vector loop would execute,
15096 if known. */
15097 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
15098 auto scalar_niters = max_stmt_executions_int (loop);
15099 if (scalar_niters >= 0)
15100 {
15101 unsigned int vf = vect_vf_for_cost (loop_vinfo);
15102 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
d43fc1df 15103 m_num_vector_iterations = scalar_niters / vf;
1205a8ca 15104 else
d43fc1df 15105 m_num_vector_iterations = CEIL (scalar_niters, vf);
1205a8ca
RS
15106 }
15107
d43fc1df
RS
15108 /* Detect whether we're vectorizing for SVE and should apply the unrolling
15109 heuristic described above m_unrolled_advsimd_niters. */
15110 record_potential_advsimd_unrolling (loop_vinfo);
1205a8ca
RS
15111
15112 /* Record the issue information for any SVE WHILE instructions that the
15113 loop needs. */
1a5288fe 15114 if (!m_ops.is_empty () && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1205a8ca
RS
15115 {
15116 unsigned int num_masks = 0;
15117 rgroup_controls *rgm;
15118 unsigned int num_vectors_m1;
15119 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
15120 if (rgm->type)
15121 num_masks += num_vectors_m1 + 1;
1a5288fe
RS
15122 for (auto &ops : m_ops)
15123 if (auto *issue = ops.sve_issue_info ())
15124 ops.pred_ops += num_masks * issue->while_pred_ops;
1205a8ca 15125 }
3b924b0d
RS
15126}
15127
8990e73a
TB
15128/* Implement targetm.vectorize.builtin_vectorization_cost. */
15129static int
15130aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
15131 tree vectype,
15132 int misalign ATTRIBUTE_UNUSED)
15133{
15134 unsigned elements;
cd8ae5ed
AP
15135 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
15136 bool fp = false;
15137
15138 if (vectype != NULL)
15139 fp = FLOAT_TYPE_P (vectype);
8990e73a 15140
e253bb8b 15141 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
76e4f444 15142
8990e73a
TB
15143 switch (type_of_cost)
15144 {
15145 case scalar_stmt:
cd8ae5ed 15146 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8990e73a
TB
15147
15148 case scalar_load:
cd8ae5ed 15149 return costs->scalar_load_cost;
8990e73a
TB
15150
15151 case scalar_store:
cd8ae5ed 15152 return costs->scalar_store_cost;
8990e73a
TB
15153
15154 case vector_stmt:
76e4f444
KT
15155 return fp ? simd_costs->fp_stmt_cost
15156 : simd_costs->int_stmt_cost;
8990e73a
TB
15157
15158 case vector_load:
76e4f444 15159 return simd_costs->align_load_cost;
8990e73a
TB
15160
15161 case vector_store:
76e4f444 15162 return simd_costs->store_cost;
8990e73a
TB
15163
15164 case vec_to_scalar:
76e4f444 15165 return simd_costs->vec_to_scalar_cost;
8990e73a
TB
15166
15167 case scalar_to_vec:
76e4f444 15168 return simd_costs->scalar_to_vec_cost;
8990e73a
TB
15169
15170 case unaligned_load:
cc9fe6bb 15171 case vector_gather_load:
76e4f444 15172 return simd_costs->unalign_load_cost;
8990e73a
TB
15173
15174 case unaligned_store:
cc9fe6bb 15175 case vector_scatter_store:
76e4f444 15176 return simd_costs->unalign_store_cost;
8990e73a
TB
15177
15178 case cond_branch_taken:
cd8ae5ed 15179 return costs->cond_taken_branch_cost;
8990e73a
TB
15180
15181 case cond_branch_not_taken:
cd8ae5ed 15182 return costs->cond_not_taken_branch_cost;
8990e73a
TB
15183
15184 case vec_perm:
76e4f444 15185 return simd_costs->permute_cost;
c428f91c 15186
8990e73a 15187 case vec_promote_demote:
76e4f444
KT
15188 return fp ? simd_costs->fp_stmt_cost
15189 : simd_costs->int_stmt_cost;
8990e73a
TB
15190
15191 case vec_construct:
6a70badb 15192 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
8990e73a
TB
15193 return elements / 2 + 1;
15194
15195 default:
15196 gcc_unreachable ();
15197 }
15198}
15199
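/* Editor's note: as a worked example of the vec_construct case above, a
   4-element vector built from individual scalars is costed as
   4 / 2 + 1 = 3 and an 8-element vector as 8 / 2 + 1 = 5, i.e. roughly one
   operation per pair of elements plus one; the other cases simply read the
   per-CPU cost tables.  */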
b1a831f0
RS
15200/* If an access of kind KIND for STMT_INFO represents one vector of an
15201 LD[234] or ST[234] operation, return the total number of vectors
15202 involved (2, 3 or 4); otherwise return a value outside that range. */
15203static int
15204aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
15205{
15206 if ((kind == vector_load
15207 || kind == unaligned_load
15208 || kind == vector_store
15209 || kind == unaligned_store)
15210 && STMT_VINFO_DATA_REF (stmt_info))
15211 {
15212 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
15213 if (stmt_info
15214 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
15215 return DR_GROUP_SIZE (stmt_info);
15216 }
15217 return 0;
15218}
15219
8b50d7a4
RS
15220/* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
15221 vectors would produce a series of LDP or STP operations. KIND is the
15222 kind of statement that STMT_INFO represents. */
15223static bool
15224aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
15225 stmt_vec_info stmt_info)
15226{
15227 switch (kind)
15228 {
15229 case vector_load:
15230 case vector_store:
15231 case unaligned_load:
15232 case unaligned_store:
15233 break;
15234
15235 default:
15236 return false;
15237 }
15238
15239 if (aarch64_tune_params.extra_tuning_flags
15240 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
15241 return false;
15242
15243 return is_gimple_assign (stmt_info->stmt);
15244}
15245
1205a8ca
RS
15246/* Return true if STMT_INFO is the second part of a two-statement multiply-add
15247 or multiply-subtract sequence that might be suitable for fusing into a
028059b4
RS
15248 single instruction. If VEC_FLAGS is zero, analyze the operation as
15249 a scalar one, otherwise analyze it as an operation on vectors with those
15250 VEC_* flags. */
1205a8ca 15251static bool
028059b4
RS
15252aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
15253 unsigned int vec_flags)
1205a8ca
RS
15254{
15255 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
15256 if (!assign)
15257 return false;
15258 tree_code code = gimple_assign_rhs_code (assign);
15259 if (code != PLUS_EXPR && code != MINUS_EXPR)
15260 return false;
15261
15262 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
15263 || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
15264 return false;
15265
15266 for (int i = 1; i < 3; ++i)
15267 {
15268 tree rhs = gimple_op (assign, i);
15269 /* ??? Should we try to check for a single use as well? */
15270 if (TREE_CODE (rhs) != SSA_NAME)
15271 continue;
15272
15273 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
15274 if (!def_stmt_info
15275 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
15276 continue;
15277 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
15278 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
15279 continue;
15280
028059b4
RS
15281 if (vec_flags & VEC_ADVSIMD)
15282 {
15283 /* Scalar and SVE code can tie the result to any FMLA input (or none,
15284 although that requires a MOVPRFX for SVE). However, Advanced SIMD
15285 only supports MLA forms, so it will require a move if the result
15286 cannot be tied to the accumulator. The most important case in
15287 which this is true is when the accumulator input is invariant. */
15288 rhs = gimple_op (assign, 3 - i);
15289 if (TREE_CODE (rhs) != SSA_NAME)
15290 return false;
15291 def_stmt_info = vinfo->lookup_def (rhs);
15292 if (!def_stmt_info
15293 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
15294 return false;
15295 }
15296
1205a8ca
RS
15297 return true;
15298 }
15299 return false;
15300}
15301
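/* Editor's note: an illustrative example (not part of GCC) of the
   two-statement pattern the function above looks for.  The multiplication
   feeding the addition can normally be fused into a single FMLA (or an
   FMLA/MOVPRFX pair for SVE), so the two statements should not be costed
   as two separate vector operations.  */

void
fused_multiply_add_example (float *restrict acc, const float *restrict y,
			    float a, int n)
{
  for (int i = 0; i < n; ++i)
    {
      float prod = a * y[i];	/* MULT_EXPR defining an SSA name...  */
      acc[i] = acc[i] + prod;	/* ...used by a PLUS_EXPR: fuses to FMLA.  */
    }
}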
26122469
RS
15302/* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
15303 in-loop reduction that SVE supports directly, return its latency in cycles,
15304 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
15305 instructions. */
1282988b
RS
15306static unsigned int
15307aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
15308 stmt_vec_info stmt_info,
1282988b
RS
15309 const sve_vec_cost *sve_costs)
15310{
783d809f 15311 switch (vect_reduc_type (vinfo, stmt_info))
1282988b
RS
15312 {
15313 case EXTRACT_LAST_REDUCTION:
15314 return sve_costs->clast_cost;
15315
15316 case FOLD_LEFT_REDUCTION:
26122469 15317 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
1282988b
RS
15318 {
15319 case E_HFmode:
15320 case E_BFmode:
15321 return sve_costs->fadda_f16_cost;
15322
15323 case E_SFmode:
15324 return sve_costs->fadda_f32_cost;
15325
15326 case E_DFmode:
15327 return sve_costs->fadda_f64_cost;
15328
15329 default:
15330 break;
15331 }
15332 break;
15333 }
15334
15335 return 0;
15336}
15337
1205a8ca
RS
15338/* STMT_INFO describes a loop-carried operation in the original scalar code
15339 that we are considering implementing as a reduction. Return one of the
15340 following values, depending on VEC_FLAGS:
15341
15342 - If VEC_FLAGS is zero, return the loop carry latency of the original
15343 scalar operation.
15344
15345 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
eb55b5b0 15346 Advanced SIMD implementation.
1205a8ca
RS
15347
15348 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
26122469 15349 SVE implementation. */
1205a8ca
RS
15350static unsigned int
15351aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
26122469 15352 unsigned int vec_flags)
1205a8ca
RS
15353{
15354 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
15355 const sve_vec_cost *sve_costs = nullptr;
15356 if (vec_flags & VEC_ANY_SVE)
15357 sve_costs = aarch64_tune_params.vec_costs->sve;
15358
15359 /* If the caller is asking for the SVE latency, check for forms of reduction
15360 that only SVE can handle directly. */
15361 if (sve_costs)
15362 {
15363 unsigned int latency
26122469 15364 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
1205a8ca
RS
15365 if (latency)
15366 return latency;
15367 }
15368
15369 /* Handle scalar costs. */
26122469 15370 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
1205a8ca
RS
15371 if (vec_flags == 0)
15372 {
26122469 15373 if (is_float)
1205a8ca
RS
15374 return vec_costs->scalar_fp_stmt_cost;
15375 return vec_costs->scalar_int_stmt_cost;
15376 }
15377
15378 /* Otherwise, the loop body just contains normal integer or FP operations,
15379 with a vector reduction outside the loop. */
15380 const simd_vec_cost *simd_costs
15381 = aarch64_simd_vec_costs_for_flags (vec_flags);
26122469 15382 if (is_float)
1205a8ca
RS
15383 return simd_costs->fp_stmt_cost;
15384 return simd_costs->int_stmt_cost;
15385}
15386
ed17ad5e
RS
15387/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
15388 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
15389 try to subdivide the target-independent categorization provided by KIND
15390 to get a more accurate cost. */
83d796d3 15391static fractional_cost
ed17ad5e
RS
15392aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
15393 stmt_vec_info stmt_info,
83d796d3 15394 fractional_cost stmt_cost)
ed17ad5e
RS
15395{
15396 /* Detect an extension of a loaded value. In general, we'll be able to fuse
15397 the extension with the load. */
783d809f 15398 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
ed17ad5e
RS
15399 return 0;
15400
15401 return stmt_cost;
15402}
15403
e253bb8b
RS
15404/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
15405 for the vectorized form of STMT_INFO, which has cost kind KIND and which
15406 when vectorized would operate on vector type VECTYPE. Try to subdivide
15407 the target-independent categorization provided by KIND to get a more
15408 accurate cost. WHERE specifies where the cost associated with KIND
15409 occurs. */
83d796d3 15410static fractional_cost
1282988b 15411aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
e253bb8b
RS
15412 stmt_vec_info stmt_info, tree vectype,
15413 enum vect_cost_model_location where,
83d796d3 15414 fractional_cost stmt_cost)
e253bb8b
RS
15415{
15416 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
1282988b
RS
15417 const sve_vec_cost *sve_costs = nullptr;
15418 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
15419 sve_costs = aarch64_tune_params.vec_costs->sve;
15420
e4180ab2
RS
15421 /* It's generally better to avoid costing inductions, since the induction
15422 will usually be hidden by other operations. This is particularly true
15423 for things like COND_REDUCTIONS. */
15424 if (is_a<gphi *> (stmt_info->stmt))
15425 return 0;
15426
d1ff0847
RS
15427 /* Detect cases in which vec_to_scalar is describing the extraction of a
15428 vector element in preparation for a scalar store. The store itself is
15429 costed separately. */
783d809f 15430 if (vect_is_store_elt_extraction (kind, stmt_info))
d1ff0847
RS
15431 return simd_costs->store_elt_extra_cost;
15432
78770e0e
RS
15433 /* Detect SVE gather loads, which are costed as a single scalar_load
15434 for each element. We therefore need to divide the full-instruction
15435 cost by the number of elements in the vector. */
15436 if (kind == scalar_load
15437 && sve_costs
15438 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
15439 {
15440 unsigned int nunits = vect_nunits_for_cost (vectype);
15441 if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
15442 return { sve_costs->gather_load_x64_cost, nunits };
15443 return { sve_costs->gather_load_x32_cost, nunits };
15444 }
15445
7c679969
RS
15446 /* Detect cases in which a scalar_store is really storing one element
15447 in a scatter operation. */
15448 if (kind == scalar_store
15449 && sve_costs
15450 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
15451 return sve_costs->scatter_store_elt_cost;
15452
1282988b
RS
15453 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
15454 if (kind == vec_to_scalar
15455 && where == vect_body
15456 && sve_costs)
15457 {
15458 unsigned int latency
26122469 15459 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
1282988b
RS
15460 if (latency)
15461 return latency;
15462 }
e253bb8b
RS
15463
15464 /* Detect cases in which vec_to_scalar represents a single reduction
15465 instruction like FADDP or MAXV. */
15466 if (kind == vec_to_scalar
15467 && where == vect_epilogue
783d809f 15468 && vect_is_reduction (stmt_info))
e253bb8b
RS
15469 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
15470 {
15471 case E_QImode:
15472 return simd_costs->reduc_i8_cost;
15473
15474 case E_HImode:
15475 return simd_costs->reduc_i16_cost;
15476
15477 case E_SImode:
15478 return simd_costs->reduc_i32_cost;
15479
15480 case E_DImode:
15481 return simd_costs->reduc_i64_cost;
15482
15483 case E_HFmode:
15484 case E_BFmode:
15485 return simd_costs->reduc_f16_cost;
15486
15487 case E_SFmode:
15488 return simd_costs->reduc_f32_cost;
15489
15490 case E_DFmode:
15491 return simd_costs->reduc_f64_cost;
15492
15493 default:
15494 break;
15495 }
15496
15497 /* Otherwise stick with the original categorization. */
15498 return stmt_cost;
15499}
15500
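/* Editor's note: a worked example of the gather-load case above, with
   made-up cost-table numbers.  A gather of four 32-bit elements is costed
   by the vectorizer as four scalar_loads; if gather_load_x32_cost were 12,
   each of those scalar_loads would be given the fractional cost 12/4 = 3,
   so the four together add up to the cost of one gather instruction.  */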
217ccab8 15501/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
8b50d7a4
RS
15502 for STMT_INFO, which has cost kind KIND and which when vectorized would
15503 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
15504 targets. */
83d796d3 15505static fractional_cost
308bc496 15506aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
8b50d7a4 15507 stmt_vec_info stmt_info, tree vectype,
83d796d3 15508 fractional_cost stmt_cost)
217ccab8
RS
15509{
15510 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
15511 vector register size or number of units. Integer promotions of this
15512 type therefore map to SXT[BHW] or UXT[BHW].
15513
15514 Most loads have extending forms that can do the sign or zero extension
15515 on the fly. Optimistically assume that a load followed by an extension
15516 will fold to this form during combine, and that the extension therefore
15517 comes for free. */
783d809f 15518 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
217ccab8
RS
15519 stmt_cost = 0;
15520
2d56600c
RS
15521 /* For similar reasons, vector_stmt integer truncations are a no-op,
15522 because we can just ignore the unused upper bits of the source. */
783d809f 15523 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
2d56600c
RS
15524 stmt_cost = 0;
15525
8b50d7a4
RS
15526 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
15527 but there are no equivalent instructions for SVE. This means that
15528 (all other things being equal) 128-bit SVE needs twice as many load
15529 and store instructions as Advanced SIMD in order to process vector pairs.
15530
15531 Also, scalar code can often use LDP and STP to access pairs of values,
15532 so it is too simplistic to say that one SVE load or store replaces
15533 VF scalar loads and stores.
15534
15535 Ideally we would account for this in the scalar and Advanced SIMD
15536 costs by making suitable load/store pairs as cheap as a single
15537 load/store. However, that would be a very invasive change and in
15538 practice it tends to stress other parts of the cost model too much.
15539 E.g. stores of scalar constants currently count just a store,
15540 whereas stores of vector constants count a store and a vec_init.
15541 This is an artificial distinction for AArch64, where stores of
15542 nonzero scalar constants need the same kind of register invariant
15543 as vector stores.
15544
15545 An alternative would be to double the cost of any SVE loads and stores
15546 that could be paired in Advanced SIMD (and possibly also paired in
15547 scalar code). But this tends to stress other parts of the cost model
15548 in the same way. It also means that we can fall back to Advanced SIMD
15549 even if full-loop predication would have been useful.
15550
15551 Here we go for a more conservative version: double the costs of SVE
15552 loads and stores if one iteration of the scalar loop processes enough
15553 elements for it to use a whole number of Advanced SIMD LDP or STP
15554 instructions. This makes it very likely that the VF would be 1 for
15555 Advanced SIMD, and so no epilogue should be needed. */
15556 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
15557 {
15558 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
15559 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
15560 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
15561 if (multiple_p (count * elt_bits, 256)
15562 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
15563 stmt_cost *= 2;
15564 }
15565
217ccab8
RS
15566 return stmt_cost;
15567}
15568
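/* Editor's note: an illustrative sketch (not part of GCC) of the final
   check above.  One scalar iteration accesses COUNT group elements of
   ELT_BITS bits each; if that is a whole number of 256-bit LDP/STP pairs,
   Advanced SIMD (and scalar) code can use half as many memory instructions
   as 128-bit SVE, so the SVE load/store cost is doubled.  */

static int
sve_ldst_cost_doubles_sketch (unsigned int count, unsigned int elt_bits)
{
  return (count * elt_bits) % 256 == 0;
}

/* E.g. a group of four 64-bit elements (4 * 64 = 256 bits) triggers the
   doubling, whereas a group of three does not.  */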
b1a831f0
RS
15569/* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
15570 and which when vectorized would operate on vector type VECTYPE. Add the
15571 cost of any embedded operations. */
83d796d3 15572static fractional_cost
b1a831f0 15573aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
83d796d3 15574 tree vectype, fractional_cost stmt_cost)
b1a831f0
RS
15575{
15576 if (vectype)
15577 {
15578 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
15579
15580 /* Detect cases in which a vector load or store represents an
15581 LD[234] or ST[234] instruction. */
15582 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
15583 {
15584 case 2:
15585 stmt_cost += simd_costs->ld2_st2_permute_cost;
15586 break;
15587
15588 case 3:
15589 stmt_cost += simd_costs->ld3_st3_permute_cost;
15590 break;
15591
15592 case 4:
15593 stmt_cost += simd_costs->ld4_st4_permute_cost;
15594 break;
15595 }
99f94ae5
RS
15596
15597 if (kind == vector_stmt || kind == vec_to_scalar)
783d809f 15598 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
99f94ae5
RS
15599 {
15600 if (FLOAT_TYPE_P (cmp_type))
15601 stmt_cost += simd_costs->fp_stmt_cost;
15602 else
15603 stmt_cost += simd_costs->int_stmt_cost;
15604 }
b1a831f0
RS
15605 }
15606
99f94ae5 15607 if (kind == scalar_stmt)
783d809f 15608 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
99f94ae5
RS
15609 {
15610 if (FLOAT_TYPE_P (cmp_type))
15611 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
15612 else
15613 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
15614 }
15615
b1a831f0
RS
15616 return stmt_cost;
15617}
15618
87fcff96
RS
15619/* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
15620 and they describe an operation in the body of a vector loop. Record issue
15621 information relating to the vector operation in OPS. */
d43fc1df
RS
15622void
15623aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
87fcff96
RS
15624 stmt_vec_info stmt_info,
15625 aarch64_vec_op_count *ops)
1205a8ca 15626{
15aba5a6
RS
15627 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
15628 if (!base_issue)
1205a8ca 15629 return;
15aba5a6
RS
15630 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
15631 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
1205a8ca
RS
15632
15633 /* Calculate the minimum cycles per iteration imposed by a reduction
15634 operation. */
6756706e 15635 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
783d809f 15636 && vect_is_reduction (stmt_info))
1205a8ca
RS
15637 {
15638 unsigned int base
87fcff96 15639 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
1205a8ca 15640
87fcff96
RS
15641 /* ??? Ideally we'd do COUNT reductions in parallel, but unfortunately
15642 that's not yet the case. */
15643 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
1205a8ca
RS
15644 }
15645
15646 /* Assume that multiply-adds will become a single operation. */
87fcff96 15647 if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
1205a8ca
RS
15648 return;
15649
1205a8ca
RS
15650 /* Count the basic operation cost associated with KIND. */
15651 switch (kind)
15652 {
15653 case cond_branch_taken:
15654 case cond_branch_not_taken:
15655 case vector_gather_load:
15656 case vector_scatter_store:
15657 /* We currently don't expect these to be used in a loop body. */
15658 break;
15659
15660 case vec_perm:
15661 case vec_promote_demote:
15662 case vec_construct:
15663 case vec_to_scalar:
15664 case scalar_to_vec:
1205a8ca
RS
15665 case vector_stmt:
15666 case scalar_stmt:
87fcff96 15667 ops->general_ops += count;
1205a8ca
RS
15668 break;
15669
15670 case scalar_load:
15671 case vector_load:
15672 case unaligned_load:
87fcff96
RS
15673 ops->loads += count;
15674 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
15675 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
1205a8ca
RS
15676 break;
15677
15678 case vector_store:
15679 case unaligned_store:
15680 case scalar_store:
87fcff96
RS
15681 ops->stores += count;
15682 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
15683 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
1205a8ca
RS
15684 break;
15685 }
15686
15687 /* Add any embedded comparison operations. */
15688 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
783d809f 15689 && vect_embedded_comparison_type (stmt_info))
87fcff96 15690 ops->general_ops += count;
1205a8ca 15691
87fcff96 15692 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
1205a8ca 15693 have only accounted for one. */
87fcff96
RS
15694 if ((kind == vector_stmt || kind == vec_to_scalar)
15695 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
15696 ops->general_ops += count;
1205a8ca
RS
15697
15698 /* Count the predicate operations needed by an SVE comparison. */
15699 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
783d809f 15700 if (tree type = vect_comparison_type (stmt_info))
1205a8ca
RS
15701 {
15702 unsigned int base = (FLOAT_TYPE_P (type)
15703 ? sve_issue->fp_cmp_pred_ops
15704 : sve_issue->int_cmp_pred_ops);
87fcff96 15705 ops->pred_ops += base * count;
1205a8ca
RS
15706 }
15707
15708 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
15709 if (simd_issue)
15710 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
15711 {
15712 case 2:
87fcff96 15713 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
1205a8ca
RS
15714 break;
15715
15716 case 3:
87fcff96 15717 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
1205a8ca
RS
15718 break;
15719
15720 case 4:
87fcff96 15721 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
1205a8ca
RS
15722 break;
15723 }
15724
15725 /* Add any overhead associated with gather loads and scatter stores. */
15726 if (sve_issue
15727 && (kind == scalar_load || kind == scalar_store)
15728 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
15729 {
15730 unsigned int pairs = CEIL (count, 2);
15aba5a6 15731 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
1205a8ca
RS
15732 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
15733 }
15734}
15735
6239dd05
RS
15736unsigned
15737aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
15738 stmt_vec_info stmt_info, tree vectype,
15739 int misalign,
15740 vect_cost_model_location where)
8990e73a 15741{
f837785c
RS
15742 fractional_cost stmt_cost
15743 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
3b924b0d 15744
f837785c
RS
15745 bool in_inner_loop_p = (where == vect_body
15746 && stmt_info
6239dd05 15747 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
9690309b 15748
f837785c 15749 /* Do one-time initialization based on the vinfo. */
6239dd05 15750 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
d43fc1df 15751 if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
f837785c
RS
15752 {
15753 if (loop_vinfo)
d43fc1df
RS
15754 analyze_loop_vinfo (loop_vinfo);
15755
15756 m_analyzed_vinfo = true;
f837785c
RS
15757 }
15758
15759 /* Try to get a more accurate cost by looking at STMT_INFO instead
15760 of just looking at KIND. */
15761 if (stmt_info && aarch64_use_new_vector_costs_p ())
15762 {
f837785c
RS
15763 /* If we scalarize a strided store, the vectorizer costs one
15764 vec_to_scalar for each element. However, we can store the first
15765 element using an FP store without a separate extract step. */
15766 if (vect_is_store_elt_extraction (kind, stmt_info))
15767 count -= 1;
15768
d43fc1df
RS
15769 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
15770 stmt_info, stmt_cost);
f837785c 15771
d43fc1df 15772 if (vectype && m_vec_flags)
6239dd05 15773 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
f837785c
RS
15774 stmt_info, vectype,
15775 where, stmt_cost);
15776 }
15777
15778 /* Do any SVE-specific adjustments to the cost. */
15779 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
6239dd05 15780 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
f837785c
RS
15781 vectype, stmt_cost);
15782
15783 if (stmt_info && aarch64_use_new_vector_costs_p ())
15784 {
15785 /* Account for any extra "embedded" costs that apply additively
15786 to the base cost calculated above. */
15787 stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
15788 stmt_cost);
15789
15790 /* If we're recording a nonzero vector loop body cost for the
15791 innermost loop, also estimate the operations that would need
15792 to be issued by all relevant implementations of the loop. */
f837785c 15793 if (loop_vinfo
6756706e 15794 && (m_costing_for_scalar || where == vect_body)
f837785c 15795 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
f837785c 15796 && stmt_cost != 0)
87fcff96
RS
15797 for (auto &ops : m_ops)
15798 count_ops (count, kind, stmt_info, &ops);
b1a831f0 15799
f837785c
RS
15800 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
15801 estimate the number of statements in the unrolled Advanced SIMD
15802 loop. For simplicity, we assume that one iteration of the
15803 Advanced SIMD loop would need the same number of statements
15804 as one iteration of the SVE loop. */
d43fc1df
RS
15805 if (where == vect_body && m_unrolled_advsimd_niters)
15806 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
f837785c 15807 }
6239dd05 15808 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
8990e73a
TB
15809}
15810
c6c5c5eb
RS
15811/* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
15812 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
15813 says that we should prefer the Advanced SIMD loop. */
15814bool
15815aarch64_vector_costs::prefer_unrolled_loop () const
15816{
15817 if (!m_unrolled_advsimd_stmts)
15818 return false;
15819
15820 if (dump_enabled_p ())
15821 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
15822 " unrolled Advanced SIMD loop = %d\n",
15823 m_unrolled_advsimd_stmts);
15824
15825 /* The balance here is tricky. On the one hand, we can't be sure whether
15826 the code is vectorizable with Advanced SIMD or not. However, even if
15827 it isn't vectorizable with Advanced SIMD, there's a possibility that
15828 the scalar code could also be unrolled. Some of the code might then
15829 benefit from SLP, or from using LDP and STP. We therefore apply
15830 the heuristic regardless of can_use_advsimd_p. */
15831 return (m_unrolled_advsimd_stmts
15832 && (m_unrolled_advsimd_stmts
15833 <= (unsigned int) param_max_completely_peeled_insns));
15834}
15835
d43fc1df
RS
15836/* Subroutine of adjust_body_cost for handling SVE. Use the issue
15837 information in OPS to work out how fast the SVE code can be issued and
15838 compare it to the equivalent value for scalar code
15839 (SCALAR_CYCLES_PER_ITER).
b585f011 15841
d43fc1df
RS
15842 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
15843 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
15844 is true if we think the loop body is too expensive. */
b585f011 15845
d43fc1df
RS
15846fractional_cost
15847aarch64_vector_costs::
1a5288fe 15848adjust_body_cost_sve (const aarch64_vec_op_count *ops,
d43fc1df 15849 fractional_cost scalar_cycles_per_iter,
c6c5c5eb
RS
15850 unsigned int orig_body_cost, unsigned int *body_cost,
15851 bool *should_disparage)
b585f011 15852{
a82ffd43
RS
15853 if (dump_enabled_p ())
15854 ops->dump ();
048039c4 15855
a82ffd43
RS
15856 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
15857 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
b585f011
RS
15858
15859 /* If the scalar version of the loop could issue at least as
15860 quickly as the predicate parts of the SVE loop, make the SVE loop
15861 prohibitively expensive. In this case vectorization is adding an
15862 overhead that the original scalar code didn't have.
15863
15864 This is mostly intended to detect cases in which WHILELOs dominate
15865 for very tight loops, which is something that normal latency-based
15866 costs would not model. Adding this kind of cliffedge would be
15867 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
15868 code in the caller handles that case in a more conservative way. */
a82ffd43 15869 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
b585f011
RS
15870 if (scalar_cycles_per_iter < sve_estimate)
15871 {
15872 unsigned int min_cost
15873 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
15874 if (*body_cost < min_cost)
15875 {
15876 if (dump_enabled_p ())
15877 dump_printf_loc (MSG_NOTE, vect_location,
15878 "Increasing body cost to %d because the"
15879 " scalar code could issue within the limit"
15880 " imposed by predicate operations\n",
15881 min_cost);
15882 *body_cost = min_cost;
15883 *should_disparage = true;
15884 }
15885 }
15886
b585f011
RS
15887 return sve_cycles_per_iter;
15888}
15889
d43fc1df
RS
15890/* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
15891 and return the new cost. */
15892unsigned int
6756706e
RS
15893aarch64_vector_costs::
15894adjust_body_cost (loop_vec_info loop_vinfo,
15895 const aarch64_vector_costs *scalar_costs,
15896 unsigned int body_cost)
3b924b0d 15897{
1a5288fe
RS
15898 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
15899 return body_cost;
15900
15901 const auto &scalar_ops = scalar_costs->m_ops[0];
15902 const auto &vector_ops = m_ops[0];
6756706e 15903 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
3b924b0d 15904 unsigned int orig_body_cost = body_cost;
1205a8ca
RS
15905 bool should_disparage = false;
15906
15907 if (dump_enabled_p ())
15908 dump_printf_loc (MSG_NOTE, vect_location,
15909 "Original vector body cost = %d\n", body_cost);
3b924b0d 15910
83d796d3 15911 fractional_cost scalar_cycles_per_iter
a82ffd43 15912 = scalar_ops.min_cycles_per_iter () * estimated_vf;
83d796d3 15913
a82ffd43 15914 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
1205a8ca
RS
15915
15916 if (dump_enabled_p ())
15917 {
d43fc1df 15918 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
1205a8ca
RS
15919 dump_printf_loc (MSG_NOTE, vect_location,
15920 "Vector loop iterates at most %wd times\n",
d43fc1df 15921 m_num_vector_iterations);
1205a8ca 15922 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
6756706e 15923 scalar_ops.dump ();
1205a8ca 15924 dump_printf_loc (MSG_NOTE, vect_location,
6756706e
RS
15925 " estimated cycles per vector iteration"
15926 " (for VF %d) = %f\n",
15927 estimated_vf, scalar_cycles_per_iter.as_double ());
1205a8ca
RS
15928 }
15929
1a5288fe 15930 if (vector_ops.sve_issue_info ())
1205a8ca 15931 {
1205a8ca 15932 if (dump_enabled_p ())
c6c5c5eb 15933 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
b585f011 15934 vector_cycles_per_iter
1a5288fe 15935 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
d43fc1df 15936 orig_body_cost, &body_cost, &should_disparage);
048039c4
RS
15937
15938 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
15939 {
15940 /* Also take Neoverse V1 tuning into account, doubling the
15941 scalar and Advanced SIMD estimates to account for the
15942 doubling in SVE vector length. */
15943 if (dump_enabled_p ())
15944 dump_printf_loc (MSG_NOTE, vect_location,
15945 "Neoverse V1 estimate:\n");
2e1886ea
RS
15946 auto vf_factor = m_ops[1].vf_factor ();
15947 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
c6c5c5eb 15948 orig_body_cost, &body_cost, &should_disparage);
048039c4 15949 }
1205a8ca 15950 }
6756706e
RS
15951 else
15952 {
15953 if (dump_enabled_p ())
15954 {
15955 dump_printf_loc (MSG_NOTE, vect_location,
15956 "Vector issue estimate:\n");
1a5288fe 15957 vector_ops.dump ();
6756706e
RS
15958 }
15959 }
1205a8ca
RS
15960
15961 /* Decide whether to stick to latency-based costs or whether to try to
15962 take issue rates into account. */
15963 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
d43fc1df 15964 if (m_vec_flags & VEC_ANY_SVE)
1205a8ca
RS
15965 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
15966
d43fc1df
RS
15967 if (m_num_vector_iterations >= 1
15968 && m_num_vector_iterations < threshold)
1205a8ca
RS
15969 {
15970 if (dump_enabled_p ())
15971 dump_printf_loc (MSG_NOTE, vect_location,
15972 "Low iteration count, so using pure latency"
15973 " costs\n");
15974 }
15975 /* Increase the cost of the vector code if it looks like the scalar code
15976 could issue more quickly. These values are only rough estimates,
15977 so minor differences should only result in minor changes. */
15978 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
15979 {
83d796d3
RS
15980 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
15981 scalar_cycles_per_iter);
1205a8ca
RS
15982 if (dump_enabled_p ())
15983 dump_printf_loc (MSG_NOTE, vect_location,
15984 "Increasing body cost to %d because scalar code"
15985 " would issue more quickly\n", body_cost);
15986 }
15987 /* In general, it's expected that the proposed vector code would be able
15988 to issue more quickly than the original scalar code. This should
15989 already be reflected to some extent in the latency-based costs.
15990
15991 However, the latency-based costs effectively assume that the scalar
15992 code and the vector code execute serially, which tends to underplay
15993 one important case: if the real (non-serialized) execution time of
15994 a scalar iteration is dominated by loop-carried dependencies,
15995 and if the vector code is able to reduce both the length of
15996 the loop-carried dependencies *and* the number of cycles needed
15997 to issue the code in general, we can be more confident that the
15998 vector code is an improvement, even if adding the other (non-loop-carried)
15999 latencies tends to hide this saving. We therefore reduce the cost of the
16000 vector loop body in proportion to the saving. */
1a5288fe 16001 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
6756706e 16002 && scalar_ops.reduction_latency == scalar_cycles_per_iter
1205a8ca
RS
16003 && scalar_cycles_per_iter > vector_cycles_per_iter
16004 && !should_disparage)
16005 {
83d796d3
RS
16006 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
16007 scalar_cycles_per_iter);
1205a8ca
RS
16008 if (dump_enabled_p ())
16009 dump_printf_loc (MSG_NOTE, vect_location,
16010 "Decreasing body cost to %d account for smaller"
16011 " reduction latency\n", body_cost);
16012 }
16013
3b924b0d
RS
16014 return body_cost;
16015}
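/* Editor's note: an illustrative pass through the two scaling branches above,
   using made-up cycle counts rather than figures from any real tuning model.
   If the scalar code is estimated at 6 cycles per vector iteration and the
   vector code at 8, the first branch scales the body cost up by roughly 8/6,
   so a body cost of 30 becomes 40.  Conversely, if a scalar estimate of 12
   cycles is entirely due to a loop-carried reduction whose latency the vector
   code shortens, and the vector code needs only 4 cycles per iteration, the
   second branch scales a body cost of 30 down to 10.  */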
16016
6239dd05 16017void
6756706e 16018aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
50a525b5 16019{
6756706e
RS
16020 auto *scalar_costs
16021 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
d43fc1df
RS
16022 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
16023 if (loop_vinfo
16024 && m_vec_flags
3b924b0d 16025 && aarch64_use_new_vector_costs_p ())
6756706e
RS
16026 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
16027 m_costs[vect_body]);
50a525b5 16028
0612883d 16029 vector_costs::finish_cost (scalar_costs);
50a525b5
RS
16030}
16031
c6c5c5eb
RS
16032bool
16033aarch64_vector_costs::
16034better_main_loop_than_p (const vector_costs *uncast_other) const
16035{
16036 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
16037
16038 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
16039 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
16040
16041 if (dump_enabled_p ())
16042 dump_printf_loc (MSG_NOTE, vect_location,
16043 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
16044 GET_MODE_NAME (this_loop_vinfo->vector_mode),
16045 vect_vf_for_cost (this_loop_vinfo),
16046 GET_MODE_NAME (other_loop_vinfo->vector_mode),
16047 vect_vf_for_cost (other_loop_vinfo));
16048
16049 /* Apply the unrolling heuristic described above
16050 m_unrolled_advsimd_niters. */
16051 if (bool (m_unrolled_advsimd_stmts)
16052 != bool (other->m_unrolled_advsimd_stmts))
16053 {
16054 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
16055 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
16056 if (this_prefer_unrolled != other_prefer_unrolled)
16057 {
16058 if (dump_enabled_p ())
16059 dump_printf_loc (MSG_NOTE, vect_location,
16060 "Preferring Advanced SIMD loop because"
16061 " it can be unrolled\n");
16062 return other_prefer_unrolled;
16063 }
16064 }
16065
16066 for (unsigned int i = 0; i < m_ops.length (); ++i)
16067 {
16068 if (dump_enabled_p ())
16069 {
16070 if (i)
16071 dump_printf_loc (MSG_NOTE, vect_location,
16072 "Reconsidering with subtuning %d\n", i);
16073 dump_printf_loc (MSG_NOTE, vect_location,
16074 "Issue info for %s loop:\n",
16075 GET_MODE_NAME (this_loop_vinfo->vector_mode));
16076 this->m_ops[i].dump ();
16077 dump_printf_loc (MSG_NOTE, vect_location,
16078 "Issue info for %s loop:\n",
16079 GET_MODE_NAME (other_loop_vinfo->vector_mode));
16080 other->m_ops[i].dump ();
16081 }
16082
16083 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
16084 * this->m_ops[i].vf_factor ());
16085 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
16086 * other->m_ops[i].vf_factor ());
16087
16088 /* If it appears that one loop could process the same amount of data
16089 in fewer cycles, prefer that loop over the other one. */
16090 fractional_cost this_cost
16091 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
16092 fractional_cost other_cost
16093 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
16094 if (dump_enabled_p ())
16095 {
16096 dump_printf_loc (MSG_NOTE, vect_location,
16097 "Weighted cycles per iteration of %s loop ~= %f\n",
16098 GET_MODE_NAME (this_loop_vinfo->vector_mode),
16099 this_cost.as_double ());
16100 dump_printf_loc (MSG_NOTE, vect_location,
16101 "Weighted cycles per iteration of %s loop ~= %f\n",
16102 GET_MODE_NAME (other_loop_vinfo->vector_mode),
16103 other_cost.as_double ());
16104 }
16105 if (this_cost != other_cost)
16106 {
16107 if (dump_enabled_p ())
16108 dump_printf_loc (MSG_NOTE, vect_location,
16109 "Preferring loop with lower cycles"
16110 " per iteration\n");
16111 return this_cost < other_cost;
16112 }
16113
16114 /* If the issue rate of SVE code is limited by predicate operations
16115 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
16116 and if Advanced SIMD code could issue within the limit imposed
16117 by the predicate operations, the predicate operations are adding an
16118 overhead that the original code didn't have and so we should prefer
16119 the Advanced SIMD version. */
16120 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
16121 const aarch64_vec_op_count &b) -> bool
16122 {
16123 if (a.pred_ops == 0
16124 && (b.min_pred_cycles_per_iter ()
16125 > b.min_nonpred_cycles_per_iter ()))
16126 {
16127 if (dump_enabled_p ())
16128 dump_printf_loc (MSG_NOTE, vect_location,
16129 "Preferring Advanced SIMD loop since"
16130 " SVE loop is predicate-limited\n");
16131 return true;
16132 }
16133 return false;
16134 };
16135 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
16136 return true;
16137 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
16138 return false;
16139 }
16140
16141 return vector_costs::better_main_loop_than_p (other);
16142}
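/* Editor's note: a hypothetical example of the weighted comparison above.
   Suppose "this" is an Advanced SIMD loop with VF 4, vf_factor 1 and
   3 cycles per iteration, and "other" is an SVE loop with VF 4, vf_factor 2
   and 5 cycles per iteration.  Then this_cost = 3 * (4 * 2) = 24 and
   other_cost = 5 * (4 * 1) = 20; since 24 is not less than 20, the SVE loop
   is judged to process the same amount of data in fewer weighted cycles and
   wins.  */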
16143
0cfff2a1 16144static void initialize_aarch64_code_model (struct gcc_options *);
43e9d192 16145
0cfff2a1
KT
16146/* Parse the TO_PARSE string and put the architecture struct that it
16147 selects into RES and the architectural features into ISA_FLAGS.
16148 Return an aarch64_parse_opt_result describing the parse result.
c7887347
ML
16149 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
16150 When the TO_PARSE string contains an invalid extension,
16151 a copy of the string is created and stored to INVALID_EXTENSION. */
43e9d192 16152
0cfff2a1
KT
16153static enum aarch64_parse_opt_result
16154aarch64_parse_arch (const char *to_parse, const struct processor **res,
28108a53 16155 uint64_t *isa_flags, std::string *invalid_extension)
43e9d192 16156{
ff150bc4 16157 const char *ext;
43e9d192 16158 const struct processor *arch;
43e9d192
IB
16159 size_t len;
16160
ff150bc4 16161 ext = strchr (to_parse, '+');
43e9d192
IB
16162
16163 if (ext != NULL)
ff150bc4 16164 len = ext - to_parse;
43e9d192 16165 else
ff150bc4 16166 len = strlen (to_parse);
43e9d192
IB
16167
16168 if (len == 0)
0cfff2a1
KT
16169 return AARCH64_PARSE_MISSING_ARG;
16170
43e9d192 16171
0cfff2a1 16172 /* Loop through the list of supported ARCHes to find a match. */
43e9d192
IB
16173 for (arch = all_architectures; arch->name != NULL; arch++)
16174 {
ff150bc4
ML
16175 if (strlen (arch->name) == len
16176 && strncmp (arch->name, to_parse, len) == 0)
43e9d192 16177 {
28108a53 16178 uint64_t isa_temp = arch->flags;
43e9d192
IB
16179
16180 if (ext != NULL)
16181 {
0cfff2a1
KT
16182 /* TO_PARSE string contains at least one extension. */
16183 enum aarch64_parse_opt_result ext_res
c7887347 16184 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
43e9d192 16185
0cfff2a1
KT
16186 if (ext_res != AARCH64_PARSE_OK)
16187 return ext_res;
ffee7aa9 16188 }
0cfff2a1
KT
16189 /* Extension parsing was successful. Confirm the result
16190 arch and ISA flags. */
16191 *res = arch;
16192 *isa_flags = isa_temp;
16193 return AARCH64_PARSE_OK;
43e9d192
IB
16194 }
16195 }
16196
16197 /* ARCH name not found in list. */
0cfff2a1 16198 return AARCH64_PARSE_INVALID_ARG;
43e9d192
IB
16199}
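/* Editor's note: a worked example of the parse above, using a hypothetical
   command line.  For -march=armv8.2-a+sve, TO_PARSE is "armv8.2-a+sve";
   strchr finds the '+', so LEN covers "armv8.2-a", which is matched against
   all_architectures, and the remaining "+sve" is handed to
   aarch64_parse_extension to add the extra ISA bits on top of the
   architecture's baseline flags.  */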
16200
0cfff2a1
KT
16201/* Parse the TO_PARSE string and put the CPU that it selects into RES and the
16202 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
16203 describing the parse result. If there is an error parsing, RES and
c7887347
ML
16204 ISA_FLAGS are left unchanged.
16205 When the TO_PARSE string contains an invalid extension,
16206 a copy of the string is created and stored to INVALID_EXTENSION. */
43e9d192 16207
0cfff2a1
KT
16208static enum aarch64_parse_opt_result
16209aarch64_parse_cpu (const char *to_parse, const struct processor **res,
28108a53 16210 uint64_t *isa_flags, std::string *invalid_extension)
43e9d192 16211{
ff150bc4 16212 const char *ext;
43e9d192 16213 const struct processor *cpu;
43e9d192
IB
16214 size_t len;
16215
ff150bc4 16216 ext = strchr (to_parse, '+');
43e9d192
IB
16217
16218 if (ext != NULL)
ff150bc4 16219 len = ext - to_parse;
43e9d192 16220 else
ff150bc4 16221 len = strlen (to_parse);
43e9d192
IB
16222
16223 if (len == 0)
0cfff2a1
KT
16224 return AARCH64_PARSE_MISSING_ARG;
16225
43e9d192
IB
16226
16227 /* Loop through the list of supported CPUs to find a match. */
16228 for (cpu = all_cores; cpu->name != NULL; cpu++)
16229 {
ff150bc4 16230 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
43e9d192 16231 {
28108a53 16232 uint64_t isa_temp = cpu->flags;
0cfff2a1 16233
43e9d192
IB
16234
16235 if (ext != NULL)
16236 {
0cfff2a1
KT
16237 /* TO_PARSE string contains at least one extension. */
16238 enum aarch64_parse_opt_result ext_res
c7887347 16239 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
43e9d192 16240
0cfff2a1
KT
16241 if (ext_res != AARCH64_PARSE_OK)
16242 return ext_res;
16243 }
16244 /* Extension parsing was successful. Confirm the result
16245 cpu and ISA flags. */
16246 *res = cpu;
16247 *isa_flags = isa_temp;
16248 return AARCH64_PARSE_OK;
43e9d192
IB
16249 }
16250 }
16251
16252 /* CPU name not found in list. */
0cfff2a1 16253 return AARCH64_PARSE_INVALID_ARG;
43e9d192
IB
16254}
16255
0cfff2a1
KT
16256/* Parse the TO_PARSE string and put the cpu it selects into RES.
16257 Return an aarch64_parse_opt_result describing the parse result.
16258 If the parsing fails, RES does not change. */
43e9d192 16259
0cfff2a1
KT
16260static enum aarch64_parse_opt_result
16261aarch64_parse_tune (const char *to_parse, const struct processor **res)
43e9d192
IB
16262{
16263 const struct processor *cpu;
43e9d192
IB
16264
16265 /* Loop through the list of supported CPUs to find a match. */
16266 for (cpu = all_cores; cpu->name != NULL; cpu++)
16267 {
ff150bc4 16268 if (strcmp (cpu->name, to_parse) == 0)
43e9d192 16269 {
0cfff2a1
KT
16270 *res = cpu;
16271 return AARCH64_PARSE_OK;
43e9d192
IB
16272 }
16273 }
16274
16275 /* CPU name not found in list. */
0cfff2a1 16276 return AARCH64_PARSE_INVALID_ARG;
43e9d192
IB
16277}
16278
8dec06f2
JG
16279/* Parse TOKEN, which has length LENGTH to see if it is an option
16280 described in FLAG. If it is, return the index bit for that fusion type.
16281 If not, error (printing OPTION_NAME) and return zero. */
16282
16283static unsigned int
16284aarch64_parse_one_option_token (const char *token,
16285 size_t length,
16286 const struct aarch64_flag_desc *flag,
16287 const char *option_name)
16288{
16289 for (; flag->name != NULL; flag++)
16290 {
16291 if (length == strlen (flag->name)
16292 && !strncmp (flag->name, token, length))
16293 return flag->flag;
16294 }
16295
a3f9f006 16296 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
8dec06f2
JG
16297 return 0;
16298}
16299
16300/* Parse OPTION which is a comma-separated list of flags to enable.
16301 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
16302 default state we inherit from the CPU tuning structures. OPTION_NAME
16303 gives the top-level option we are parsing in the -moverride string,
16304 for use in error messages. */
16305
16306static unsigned int
16307aarch64_parse_boolean_options (const char *option,
16308 const struct aarch64_flag_desc *flags,
16309 unsigned int initial_state,
16310 const char *option_name)
16311{
16312 const char separator = '.';
16313 const char* specs = option;
16314 const char* ntoken = option;
16315 unsigned int found_flags = initial_state;
16316
16317 while ((ntoken = strchr (specs, separator)))
16318 {
16319 size_t token_length = ntoken - specs;
16320 unsigned token_ops = aarch64_parse_one_option_token (specs,
16321 token_length,
16322 flags,
16323 option_name);
16324 /* If we find "none" (or, for simplicity's sake, an error) anywhere
16325 in the token stream, reset the supported operations. So:
16326
16327 adrp+add.cmp+branch.none.adrp+add
16328
16329 would have the result of turning on only adrp+add fusion. */
16330 if (!token_ops)
16331 found_flags = 0;
16332
16333 found_flags |= token_ops;
16334 specs = ++ntoken;
16335 }
16336
16337 /* The string ended with a trailing separator; report it as ill-formed. */
16338 if (!(*specs))
16339 {
16340 error ("%s string ill-formed\n", option_name);
16341 return 0;
16342 }
16343
16344 /* We still have one more token to parse. */
16345 size_t token_length = strlen (specs);
16346 unsigned token_ops = aarch64_parse_one_option_token (specs,
16347 token_length,
16348 flags,
16349 option_name);
16350 if (!token_ops)
16351 found_flags = 0;
16352
16353 found_flags |= token_ops;
16354 return found_flags;
16355}
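/* Editor's sketch, not part of GCC: a minimal, self-contained illustration of
   the '.'-separated scan implemented above.  The flag-table lookup is replaced
   by a printf so that only the strchr-based tokenisation is shown; the function
   name here is invented for the example.  */

#include <stdio.h>
#include <string.h>

static void
example_walk_boolean_options (const char *option)
{
  const char *specs = option;
  const char *ntoken;

  /* Each '.' terminates one token; print it and move past the dot.  */
  while ((ntoken = strchr (specs, '.')))
    {
      printf ("token: %.*s\n", (int) (ntoken - specs), specs);
      specs = ntoken + 1;
    }

  /* A trailing '.' leaves an empty final token, which the real parser
     diagnoses as an ill-formed string.  */
  if (!*specs)
    printf ("ill-formed: trailing separator\n");
  else
    printf ("token: %s\n", specs);
}

/* example_walk_boolean_options ("adrp+add.cmp+branch.none.adrp+add") prints
   the four fusion tokens in order; as the comment above notes, the "none"
   token resets the accumulated set, so only adrp+add fusion survives.  */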
16356
16357/* Support for overriding instruction fusion. */
16358
16359static void
16360aarch64_parse_fuse_string (const char *fuse_string,
16361 struct tune_params *tune)
16362{
16363 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
16364 aarch64_fusible_pairs,
16365 tune->fusible_ops,
16366 "fuse=");
16367}
16368
16369/* Support for overriding other tuning flags. */
16370
16371static void
16372aarch64_parse_tune_string (const char *tune_string,
16373 struct tune_params *tune)
16374{
16375 tune->extra_tuning_flags
16376 = aarch64_parse_boolean_options (tune_string,
16377 aarch64_tuning_flags,
16378 tune->extra_tuning_flags,
16379 "tune=");
16380}
16381
886f092f
KT
16382/* Parse the sve_width tuning moverride string in TUNE_STRING.
16383 Accept the valid SVE vector widths allowed by
16384 aarch64_sve_vector_bits_enum and use it to override sve_width
16385 in TUNE. */
16386
16387static void
16388aarch64_parse_sve_width_string (const char *tune_string,
16389 struct tune_params *tune)
16390{
16391 int width = -1;
16392
16393 int n = sscanf (tune_string, "%d", &width);
16394 if (n == EOF)
16395 {
16396 error ("invalid format for sve_width");
16397 return;
16398 }
16399 switch (width)
16400 {
16401 case SVE_128:
16402 case SVE_256:
16403 case SVE_512:
16404 case SVE_1024:
16405 case SVE_2048:
16406 break;
16407 default:
16408 error ("invalid sve_width value: %d", width);
16409 }
16410 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
16411}
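/* Editor's note: for example, -moverride=sve_width=256 reaches this function
   with TUNE_STRING "256"; sscanf reads 256, the switch accepts it as SVE_256
   and tune->sve_width is overridden.  A value such as 384 falls into the
   default case and is rejected as an invalid sve_width value.  */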
16412
8dec06f2
JG
16413/* Parse TOKEN, which has length LENGTH to see if it is a tuning option
16414 we understand. If it is, extract the option string and handoff to
16415 the appropriate function. */
16416
16417void
16418aarch64_parse_one_override_token (const char* token,
16419 size_t length,
16420 struct tune_params *tune)
16421{
16422 const struct aarch64_tuning_override_function *fn
16423 = aarch64_tuning_override_functions;
16424
16425 const char *option_part = strchr (token, '=');
16426 if (!option_part)
16427 {
16428 error ("tuning string missing in option (%s)", token);
16429 return;
16430 }
16431
16432 /* Get the length of the option name. */
16433 length = option_part - token;
16434 /* Skip the '=' to get to the option string. */
16435 option_part++;
16436
16437 for (; fn->name != NULL; fn++)
16438 {
16439 if (!strncmp (fn->name, token, length))
16440 {
16441 fn->parse_override (option_part, tune);
16442 return;
16443 }
16444 }
16445
16446 error ("unknown tuning option (%s)",token);
16447 return;
16448}
16449
5eee3c34
JW
16450/* Validate and clamp the TLS size (-mtls-size) against the selected code model. */
16451
16452static void
16453initialize_aarch64_tls_size (struct gcc_options *opts)
16454{
16455 if (aarch64_tls_size == 0)
16456 aarch64_tls_size = 24;
16457
16458 switch (opts->x_aarch64_cmodel_var)
16459 {
16460 case AARCH64_CMODEL_TINY:
16461 /* Both the default and maximum TLS size allowed under tiny are 1M, which
16462 needs two instructions to address, so we clamp the size to 24. */
16463 if (aarch64_tls_size > 24)
16464 aarch64_tls_size = 24;
16465 break;
16466 case AARCH64_CMODEL_SMALL:
16467 /* The maximum TLS size allowed under small is 4G. */
16468 if (aarch64_tls_size > 32)
16469 aarch64_tls_size = 32;
16470 break;
16471 case AARCH64_CMODEL_LARGE:
16472 /* The maximum TLS size allowed under large is 16E.
16473 FIXME: 16E should be 64bit, we only support 48bit offset now. */
16474 if (aarch64_tls_size > 48)
16475 aarch64_tls_size = 48;
16476 break;
16477 default:
16478 gcc_unreachable ();
16479 }
16480
16481 return;
16482}
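/* Editor's note: as a concrete example, -mcmodel=small -mtls-size=48 is
   clamped here to 32, since the small code model can only address a 4G TLS
   area, while the same -mtls-size=48 is accepted unchanged under
   -mcmodel=large.  */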
16483
8dec06f2
JG
16484/* Parse STRING looking for options in the format:
16485 string :: option:string
16486 option :: name=substring
16487 name :: {a-z}
16488 substring :: defined by option. */
16489
16490static void
16491aarch64_parse_override_string (const char* input_string,
16492 struct tune_params* tune)
16493{
16494 const char separator = ':';
16495 size_t string_length = strlen (input_string) + 1;
16496 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
16497 char *string = string_root;
16498 strncpy (string, input_string, string_length);
16499 string[string_length - 1] = '\0';
16500
16501 char* ntoken = string;
16502
16503 while ((ntoken = strchr (string, separator)))
16504 {
16505 size_t token_length = ntoken - string;
16506 /* Make this substring look like a string. */
16507 *ntoken = '\0';
16508 aarch64_parse_one_override_token (string, token_length, tune);
16509 string = ++ntoken;
16510 }
16511
16512 /* One last option to parse. */
16513 aarch64_parse_one_override_token (string, strlen (string), tune);
16514 free (string_root);
16515}
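/* Editor's note: a hypothetical override string for the parser above (the
   accepted option names live in aarch64_tuning_override_functions, defined
   elsewhere in this file).  With
   -moverride=fuse=adrp+add.cmp+branch:sve_width=512 the string is split at
   ':' into "fuse=adrp+add.cmp+branch" and "sve_width=512";
   aarch64_parse_one_override_token then splits each token at '=' and
   dispatches to the matching parser, here aarch64_parse_fuse_string and
   aarch64_parse_sve_width_string.  */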
43e9d192 16516
8f0c9d53
KT
16517/* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
16518 are best for a generic target with the currently-enabled architecture
16519 extensions. */
16520static void
16521aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
16522{
16523 /* Neoverse V1 is the only core that is known to benefit from
16524 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
16525 point enabling it for SVE2 and above. */
16526 if (TARGET_SVE2)
16527 current_tune.extra_tuning_flags
16528 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
16529}
43e9d192
IB
16530
16531static void
0cfff2a1 16532aarch64_override_options_after_change_1 (struct gcc_options *opts)
43e9d192 16533{
efac62a3
ST
16534 if (accepted_branch_protection_string)
16535 {
16536 opts->x_aarch64_branch_protection_string
16537 = xstrdup (accepted_branch_protection_string);
16538 }
16539
acea40ac
WD
16540 /* PR 70044: We have to be careful about being called multiple times for the
16541 same function. This means all changes should be repeatable. */
16542
d6cb6d6a
WD
16543 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
16544 Disable the frame pointer flag so the mid-end will not use a frame
16545 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
16546 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
16547 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
16548 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
acea40ac 16549 if (opts->x_flag_omit_frame_pointer == 0)
a3dc8760 16550 opts->x_flag_omit_frame_pointer = 2;
43e9d192 16551
1be34295 16552 /* If not optimizing for size, set the default
0cfff2a1
KT
16553 alignment to what the target wants. */
16554 if (!opts->x_optimize_size)
43e9d192 16555 {
c518c102
ML
16556 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
16557 opts->x_str_align_loops = aarch64_tune_params.loop_align;
16558 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
16559 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
16560 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
16561 opts->x_str_align_functions = aarch64_tune_params.function_align;
43e9d192 16562 }
b4f50fd4 16563
9ee6540a
WD
16564 /* We default to no pc-relative literal loads. */
16565
16566 aarch64_pcrelative_literal_loads = false;
16567
16568 /* If -mpc-relative-literal-loads is set on the command line, this
b4f50fd4 16569 implies that the user asked for PC relative literal loads. */
9ee6540a
WD
16570 if (opts->x_pcrelative_literal_loads == 1)
16571 aarch64_pcrelative_literal_loads = true;
b4f50fd4 16572
9ee6540a
WD
16573 /* In the tiny memory model it makes no sense to disallow PC relative
16574 literal pool loads. */
16575 if (aarch64_cmodel == AARCH64_CMODEL_TINY
16576 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
16577 aarch64_pcrelative_literal_loads = true;
98daafa0
EM
16578
16579 /* When enabling the lower precision Newton series for the square root, also
16580 enable it for the reciprocal square root, since the latter is an
16581 intermediary step for the former. */
16582 if (flag_mlow_precision_sqrt)
16583 flag_mrecip_low_precision_sqrt = true;
0cfff2a1 16584}
43e9d192 16585
0cfff2a1
KT
16586/* 'Unpack' up the internal tuning structs and update the options
16587 in OPTS. The caller must have set up selected_tune and selected_arch
16588 as all the other target-specific codegen decisions are
16589 derived from them. */
16590
e4ea20c8 16591void
0cfff2a1
KT
16592aarch64_override_options_internal (struct gcc_options *opts)
16593{
16594 aarch64_tune_flags = selected_tune->flags;
16595 aarch64_tune = selected_tune->sched_core;
16596 /* Make a copy of the tuning parameters attached to the core, which
16597 we may later overwrite. */
16598 aarch64_tune_params = *(selected_tune->tune);
16599 aarch64_architecture_version = selected_arch->architecture_version;
8f0c9d53
KT
16600 if (selected_tune->tune == &generic_tunings)
16601 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
0cfff2a1
KT
16602
16603 if (opts->x_aarch64_override_tune_string)
16604 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
16605 &aarch64_tune_params);
16606
16607 /* This target defaults to strict volatile bitfields. */
16608 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
16609 opts->x_flag_strict_volatile_bitfields = 1;
16610
cd0b2d36
RR
16611 if (aarch64_stack_protector_guard == SSP_GLOBAL
16612 && opts->x_aarch64_stack_protector_guard_offset_str)
16613 {
41804907 16614 error ("incompatible options %<-mstack-protector-guard=global%> and "
63d42e89 16615 "%<-mstack-protector-guard-offset=%s%>",
cd0b2d36
RR
16616 aarch64_stack_protector_guard_offset_str);
16617 }
16618
16619 if (aarch64_stack_protector_guard == SSP_SYSREG
16620 && !(opts->x_aarch64_stack_protector_guard_offset_str
16621 && opts->x_aarch64_stack_protector_guard_reg_str))
16622 {
a3f9f006
ML
16623 error ("both %<-mstack-protector-guard-offset%> and "
16624 "%<-mstack-protector-guard-reg%> must be used "
16625 "with %<-mstack-protector-guard=sysreg%>");
cd0b2d36
RR
16626 }
16627
16628 if (opts->x_aarch64_stack_protector_guard_reg_str)
16629 {
16630 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
16631 error ("specify a system register with a small string length.");
16632 }
16633
16634 if (opts->x_aarch64_stack_protector_guard_offset_str)
16635 {
16636 char *end;
16637 const char *str = aarch64_stack_protector_guard_offset_str;
16638 errno = 0;
16639 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
16640 if (!*str || *end || errno)
16641 error ("%qs is not a valid offset in %qs", str,
63d42e89 16642 "-mstack-protector-guard-offset=");
cd0b2d36
RR
16643 aarch64_stack_protector_guard_offset = offs;
16644 }
16645
0cfff2a1 16646 initialize_aarch64_code_model (opts);
5eee3c34 16647 initialize_aarch64_tls_size (opts);
63892fa2 16648
2d6bc7fa
KT
16649 int queue_depth = 0;
16650 switch (aarch64_tune_params.autoprefetcher_model)
16651 {
16652 case tune_params::AUTOPREFETCHER_OFF:
16653 queue_depth = -1;
16654 break;
16655 case tune_params::AUTOPREFETCHER_WEAK:
16656 queue_depth = 0;
16657 break;
16658 case tune_params::AUTOPREFETCHER_STRONG:
16659 queue_depth = max_insn_queue_index + 1;
16660 break;
16661 default:
16662 gcc_unreachable ();
16663 }
16664
16665 /* We don't mind passing in global_options_set here as we don't use
16666 the *options_set structs anyway. */
028d4092
ML
16667 SET_OPTION_IF_UNSET (opts, &global_options_set,
16668 param_sched_autopref_queue_depth, queue_depth);
2d6bc7fa 16669
5f29f3d5
KT
16670 /* If using Advanced SIMD only for autovectorization disable SVE vector costs
16671 comparison. */
16672 if (aarch64_autovec_preference == 1)
16673 SET_OPTION_IF_UNSET (opts, &global_options_set,
16674 aarch64_sve_compare_costs, 0);
16675
9d2c6e2e
MK
16676 /* Set up parameters to be used in prefetching algorithm. Do not
16677 override the defaults unless we are tuning for a core we have
16678 researched values for. */
16679 if (aarch64_tune_params.prefetch->num_slots > 0)
028d4092
ML
16680 SET_OPTION_IF_UNSET (opts, &global_options_set,
16681 param_simultaneous_prefetches,
16682 aarch64_tune_params.prefetch->num_slots);
9d2c6e2e 16683 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
028d4092
ML
16684 SET_OPTION_IF_UNSET (opts, &global_options_set,
16685 param_l1_cache_size,
16686 aarch64_tune_params.prefetch->l1_cache_size);
9d2c6e2e 16687 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
028d4092
ML
16688 SET_OPTION_IF_UNSET (opts, &global_options_set,
16689 param_l1_cache_line_size,
16690 aarch64_tune_params.prefetch->l1_cache_line_size);
76b75018
JM
16691
16692 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
16693 {
16694 SET_OPTION_IF_UNSET (opts, &global_options_set,
16695 param_destruct_interfere_size,
16696 aarch64_tune_params.prefetch->l1_cache_line_size);
16697 SET_OPTION_IF_UNSET (opts, &global_options_set,
16698 param_construct_interfere_size,
16699 aarch64_tune_params.prefetch->l1_cache_line_size);
16700 }
16701 else
16702 {
16703 /* For a generic AArch64 target, cover the current range of cache line
16704 sizes. */
16705 SET_OPTION_IF_UNSET (opts, &global_options_set,
16706 param_destruct_interfere_size,
16707 256);
16708 SET_OPTION_IF_UNSET (opts, &global_options_set,
16709 param_construct_interfere_size,
16710 64);
16711 }
16712
9d2c6e2e 16713 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
028d4092
ML
16714 SET_OPTION_IF_UNSET (opts, &global_options_set,
16715 param_l2_cache_size,
16716 aarch64_tune_params.prefetch->l2_cache_size);
d2ff35c0 16717 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
028d4092
ML
16718 SET_OPTION_IF_UNSET (opts, &global_options_set,
16719 param_prefetch_dynamic_strides, 0);
59100dfc 16720 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
028d4092
ML
16721 SET_OPTION_IF_UNSET (opts, &global_options_set,
16722 param_prefetch_minimum_stride,
16723 aarch64_tune_params.prefetch->minimum_stride);
50487d79 16724
13494fcb 16725 /* Use the alternative scheduling-pressure algorithm by default. */
028d4092
ML
16726 SET_OPTION_IF_UNSET (opts, &global_options_set,
16727 param_sched_pressure_algorithm,
16728 SCHED_PRESSURE_MODEL);
13494fcb 16729
fbe9af50 16730 /* Validate the guard size. */
028d4092 16731 int guard_size = param_stack_clash_protection_guard_size;
fbe9af50 16732
8100e93b
ML
16733 if (guard_size != 12 && guard_size != 16)
16734 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
16735 "size. Given value %d (%llu KB) is out of range",
16736 guard_size, (1ULL << guard_size) / 1024ULL);
16737
fbe9af50
TC
16738 /* Enforce that the probing interval is the same as the guard size so the
16739 mid-end does the right thing. */
028d4092
ML
16740 SET_OPTION_IF_UNSET (opts, &global_options_set,
16741 param_stack_clash_protection_probe_interval,
16742 guard_size);
fbe9af50
TC
16743
16744 /* The maybe_set calls won't update the value if the user has explicitly set
16745 one. Which means we need to validate that probing interval and guard size
16746 are equal. */
16747 int probe_interval
028d4092 16748 = param_stack_clash_protection_probe_interval;
fbe9af50 16749 if (guard_size != probe_interval)
904f3daa
ML
16750 error ("stack clash guard size %<%d%> must be equal to probing interval "
16751 "%<%d%>", guard_size, probe_interval);
fbe9af50 16752
16b2cafd
MK
16753 /* Enable sw prefetching at specified optimization level for
16754 CPUS that have prefetch. Lower optimization level threshold by 1
16755 when profiling is enabled. */
16756 if (opts->x_flag_prefetch_loop_arrays < 0
16757 && !opts->x_optimize_size
16758 && aarch64_tune_params.prefetch->default_opt_level >= 0
16759 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
16760 opts->x_flag_prefetch_loop_arrays = 1;
16761
266c2b54
ML
16762 if (opts->x_aarch64_arch_string == NULL)
16763 opts->x_aarch64_arch_string = selected_arch->name;
16764 if (opts->x_aarch64_cpu_string == NULL)
16765 opts->x_aarch64_cpu_string = selected_cpu->name;
16766 if (opts->x_aarch64_tune_string == NULL)
16767 opts->x_aarch64_tune_string = selected_tune->name;
16768
0cfff2a1
KT
16769 aarch64_override_options_after_change_1 (opts);
16770}
43e9d192 16771
01f44038
KT
16772/* Print a hint with a suggestion for a core or architecture name that
16773 most closely resembles what the user passed in STR. ARCH is true if
16774 the user is asking for an architecture name. ARCH is false if the user
16775 is asking for a core name. */
16776
16777static void
16778aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
16779{
16780 auto_vec<const char *> candidates;
16781 const struct processor *entry = arch ? all_architectures : all_cores;
16782 for (; entry->name != NULL; entry++)
16783 candidates.safe_push (entry->name);
a08b5429
ML
16784
16785#ifdef HAVE_LOCAL_CPU_DETECT
16786 /* Add also "native" as possible value. */
16787 if (arch)
16788 candidates.safe_push ("native");
16789#endif
16790
01f44038
KT
16791 char *s;
16792 const char *hint = candidates_list_and_hint (str, s, candidates);
16793 if (hint)
16794 inform (input_location, "valid arguments are: %s;"
16795 " did you mean %qs?", s, hint);
6285e915
ML
16796 else
16797 inform (input_location, "valid arguments are: %s", s);
16798
01f44038
KT
16799 XDELETEVEC (s);
16800}
16801
16802/* Print a hint with a suggestion for a core name that most closely resembles
16803 what the user passed in STR. */
16804
16805inline static void
16806aarch64_print_hint_for_core (const char *str)
16807{
16808 aarch64_print_hint_for_core_or_arch (str, false);
16809}
16810
16811/* Print a hint with a suggestion for an architecture name that most closely
16812 resembles what the user passed in STR. */
16813
16814inline static void
16815aarch64_print_hint_for_arch (const char *str)
16816{
16817 aarch64_print_hint_for_core_or_arch (str, true);
16818}
16819
c7887347
ML
16820
16821/* Print a hint with a suggestion for an extension name
16822 that most closely resembles what the user passed in STR. */
16823
16824void
16825aarch64_print_hint_for_extensions (const std::string &str)
16826{
16827 auto_vec<const char *> candidates;
16828 aarch64_get_all_extension_candidates (&candidates);
16829 char *s;
16830 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
16831 if (hint)
16832 inform (input_location, "valid arguments are: %s;"
16833 " did you mean %qs?", s, hint);
16834 else
16835 inform (input_location, "valid arguments are: %s;", s);
16836
16837 XDELETEVEC (s);
16838}
16839
0cfff2a1
KT
16840/* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
16841 specified in STR and throw errors if appropriate. Put the results if
361fb3ee
KT
16842 they are valid in RES and ISA_FLAGS. Return whether the option is
16843 valid. */
43e9d192 16844
361fb3ee 16845static bool
0cfff2a1 16846aarch64_validate_mcpu (const char *str, const struct processor **res,
28108a53 16847 uint64_t *isa_flags)
0cfff2a1 16848{
c7887347 16849 std::string invalid_extension;
0cfff2a1 16850 enum aarch64_parse_opt_result parse_res
c7887347 16851 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
0cfff2a1
KT
16852
16853 if (parse_res == AARCH64_PARSE_OK)
361fb3ee 16854 return true;
0cfff2a1
KT
16855
16856 switch (parse_res)
16857 {
16858 case AARCH64_PARSE_MISSING_ARG:
fb241da2 16859 error ("missing cpu name in %<-mcpu=%s%>", str);
0cfff2a1
KT
16860 break;
16861 case AARCH64_PARSE_INVALID_ARG:
a3f9f006 16862 error ("unknown value %qs for %<-mcpu%>", str);
01f44038 16863 aarch64_print_hint_for_core (str);
0cfff2a1
KT
16864 break;
16865 case AARCH64_PARSE_INVALID_FEATURE:
c7887347
ML
16866 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
16867 invalid_extension.c_str (), str);
16868 aarch64_print_hint_for_extensions (invalid_extension);
0cfff2a1
KT
16869 break;
16870 default:
16871 gcc_unreachable ();
16872 }
361fb3ee
KT
16873
16874 return false;
0cfff2a1
KT
16875}
16876
a9ba2a9b
MM
16877/* Straight line speculation indicators. */
16878enum aarch64_sls_hardening_type
16879{
16880 SLS_NONE = 0,
16881 SLS_RETBR = 1,
16882 SLS_BLR = 2,
16883 SLS_ALL = 3,
16884};
16885static enum aarch64_sls_hardening_type aarch64_sls_hardening;
16886
16887/* Return whether we should mitigate Straight Line Speculation for the RET
16888 and BR instructions. */
16889bool
16890aarch64_harden_sls_retbr_p (void)
16891{
16892 return aarch64_sls_hardening & SLS_RETBR;
16893}
16894
16895/* Return whether we should mitigate Straight Line Speculation for the BLR
16896 instruction. */
16897bool
16898aarch64_harden_sls_blr_p (void)
16899{
16900 return aarch64_sls_hardening & SLS_BLR;
16901}
16902
16903/* As of yet we only allow setting these options globally, in the future we may
16904 allow setting them per function. */
16905static void
16906aarch64_validate_sls_mitigation (const char *const_str)
16907{
16908 char *token_save = NULL;
16909 char *str = NULL;
16910
16911 if (strcmp (const_str, "none") == 0)
16912 {
16913 aarch64_sls_hardening = SLS_NONE;
16914 return;
16915 }
16916 if (strcmp (const_str, "all") == 0)
16917 {
16918 aarch64_sls_hardening = SLS_ALL;
16919 return;
16920 }
16921
16922 char *str_root = xstrdup (const_str);
16923 str = strtok_r (str_root, ",", &token_save);
16924 if (!str)
16925 error ("invalid argument given to %<-mharden-sls=%>");
16926
16927 int temp = SLS_NONE;
16928 while (str)
16929 {
16930 if (strcmp (str, "blr") == 0)
16931 temp |= SLS_BLR;
16932 else if (strcmp (str, "retbr") == 0)
16933 temp |= SLS_RETBR;
16934 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
16935 {
16936 error ("%<%s%> must be by itself for %<-mharden-sls=%>", str);
16937 break;
16938 }
16939 else
16940 {
16941 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
16942 break;
16943 }
16944 str = strtok_r (NULL, ",", &token_save);
16945 }
16946 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
16947 free (str_root);
16948}
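/* Editor's note: for example, -mharden-sls=retbr,blr sets both SLS_RETBR and
   SLS_BLR, which is equivalent to -mharden-sls=all, whereas
   -mharden-sls=none,blr is rejected because "none" (like "all") must be
   given on its own.  */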
16949
efac62a3
ST
16950/* Parses CONST_STR for branch protection features specified in
16951 aarch64_branch_protect_types, and set any global variables required. Returns
16952 the parsing result and assigns LAST_STR to the last processed token from
16953 CONST_STR so that it can be used for error reporting. */
16954
16955static enum
16956aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
16957 char** last_str)
16958{
16959 char *str_root = xstrdup (const_str);
16960 char* token_save = NULL;
16961 char *str = strtok_r (str_root, "+", &token_save);
16962 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
16963 if (!str)
16964 res = AARCH64_PARSE_MISSING_ARG;
16965 else
16966 {
16967 char *next_str = strtok_r (NULL, "+", &token_save);
16968 /* Reset the branch protection features to their defaults. */
16969 aarch64_handle_no_branch_protection (NULL, NULL);
16970
16971 while (str && res == AARCH64_PARSE_OK)
16972 {
16973 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
16974 bool found = false;
16975 /* Search for this type. */
16976 while (type && type->name && !found && res == AARCH64_PARSE_OK)
16977 {
16978 if (strcmp (str, type->name) == 0)
16979 {
16980 found = true;
16981 res = type->handler (str, next_str);
16982 str = next_str;
16983 next_str = strtok_r (NULL, "+", &token_save);
16984 }
16985 else
16986 type++;
16987 }
16988 if (found && res == AARCH64_PARSE_OK)
16989 {
16990 bool found_subtype = true;
16991 /* Loop through each token until we find one that isn't a
16992 subtype. */
16993 while (found_subtype)
16994 {
16995 found_subtype = false;
16996 const aarch64_branch_protect_type *subtype = type->subtypes;
16997 /* Search for the subtype. */
16998 while (str && subtype && subtype->name && !found_subtype
16999 && res == AARCH64_PARSE_OK)
17000 {
17001 if (strcmp (str, subtype->name) == 0)
17002 {
17003 found_subtype = true;
17004 res = subtype->handler (str, next_str);
17005 str = next_str;
17006 next_str = strtok_r (NULL, "+", &token_save);
17007 }
17008 else
17009 subtype++;
17010 }
17011 }
17012 }
17013 else if (!found)
17014 res = AARCH64_PARSE_INVALID_ARG;
17015 }
17016 }
17017 /* Copy the last processed token into the argument to pass it back.
17018 Used by option and attribute validation to print the offending token. */
17019 if (last_str)
17020 {
17021 if (str) strcpy (*last_str, str);
17022 else *last_str = NULL;
17023 }
17024 if (res == AARCH64_PARSE_OK)
17025 {
17026 /* If needed, alloc the accepted string then copy in const_str.
17027 Used by override_option_after_change_1. */
17028 if (!accepted_branch_protection_string)
17029 accepted_branch_protection_string = (char *) xmalloc (
17030 BRANCH_PROTECT_STR_MAX
17031 + 1);
17032 strncpy (accepted_branch_protection_string, const_str,
17033 BRANCH_PROTECT_STR_MAX + 1);
17034 /* Forcibly null-terminate. */
17035 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
17036 }
17037 return res;
17038}
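/* Editor's note: for a typical string such as
   -mbranch-protection=pac-ret+leaf+bti, the loops above match "pac-ret" as a
   top-level type, consume "leaf" as one of its subtypes, and then match
   "bti" as another top-level type; an unrecognised token is instead copied
   into LAST_STR so that aarch64_validate_mbranch_protection below can name
   it in the error message.  */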
17039
17040static bool
17041aarch64_validate_mbranch_protection (const char *const_str)
17042{
17043 char *str = (char *) xmalloc (strlen (const_str) + 1);
17044 enum aarch64_parse_opt_result res =
17045 aarch64_parse_branch_protection (const_str, &str);
17046 if (res == AARCH64_PARSE_INVALID_ARG)
a9c697b8 17047 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
efac62a3 17048 else if (res == AARCH64_PARSE_MISSING_ARG)
a9c697b8 17049 error ("missing argument for %<-mbranch-protection=%>");
efac62a3
ST
17050 free (str);
17051 return res == AARCH64_PARSE_OK;
17052}
17053
0cfff2a1
KT
17054/* Validate a command-line -march option. Parse the arch and extensions
17055 (if any) specified in STR and throw errors if appropriate. Put the
361fb3ee
KT
17056 results, if they are valid, in RES and ISA_FLAGS. Return whether the
17057 option is valid. */
0cfff2a1 17058
361fb3ee 17059static bool
0cfff2a1 17060aarch64_validate_march (const char *str, const struct processor **res,
28108a53 17061 uint64_t *isa_flags)
0cfff2a1 17062{
c7887347 17063 std::string invalid_extension;
0cfff2a1 17064 enum aarch64_parse_opt_result parse_res
c7887347 17065 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
0cfff2a1
KT
17066
17067 if (parse_res == AARCH64_PARSE_OK)
361fb3ee 17068 return true;
0cfff2a1
KT
17069
17070 switch (parse_res)
17071 {
17072 case AARCH64_PARSE_MISSING_ARG:
fb241da2 17073 error ("missing arch name in %<-march=%s%>", str);
0cfff2a1
KT
17074 break;
17075 case AARCH64_PARSE_INVALID_ARG:
a3f9f006 17076 error ("unknown value %qs for %<-march%>", str);
01f44038 17077 aarch64_print_hint_for_arch (str);
0cfff2a1
KT
17078 break;
17079 case AARCH64_PARSE_INVALID_FEATURE:
c7887347
ML
17080 error ("invalid feature modifier %qs in %<-march=%s%>",
17081 invalid_extension.c_str (), str);
17082 aarch64_print_hint_for_extensions (invalid_extension);
0cfff2a1
KT
17083 break;
17084 default:
17085 gcc_unreachable ();
17086 }
361fb3ee
KT
17087
17088 return false;
0cfff2a1
KT
17089}
17090
17091/* Validate a command-line -mtune option. Parse the cpu
17092 specified in STR and throw errors if appropriate. Put the
361fb3ee
KT
17093 result, if it is valid, in RES. Return whether the option is
17094 valid. */
0cfff2a1 17095
361fb3ee 17096static bool
0cfff2a1
KT
17097aarch64_validate_mtune (const char *str, const struct processor **res)
17098{
17099 enum aarch64_parse_opt_result parse_res
17100 = aarch64_parse_tune (str, res);
17101
17102 if (parse_res == AARCH64_PARSE_OK)
361fb3ee 17103 return true;
0cfff2a1
KT
17104
17105 switch (parse_res)
17106 {
17107 case AARCH64_PARSE_MISSING_ARG:
fb241da2 17108 error ("missing cpu name in %<-mtune=%s%>", str);
0cfff2a1
KT
17109 break;
17110 case AARCH64_PARSE_INVALID_ARG:
a3f9f006 17111 error ("unknown value %qs for %<-mtune%>", str);
01f44038 17112 aarch64_print_hint_for_core (str);
0cfff2a1
KT
17113 break;
17114 default:
17115 gcc_unreachable ();
17116 }
361fb3ee
KT
17117 return false;
17118}
17119
17120/* Return the CPU corresponding to the enum CPU.
17121 If it doesn't specify a cpu, return the default. */
17122
17123static const struct processor *
17124aarch64_get_tune_cpu (enum aarch64_processor cpu)
17125{
17126 if (cpu != aarch64_none)
17127 return &all_cores[cpu];
17128
17129 /* The & 0x3f is to extract the bottom 6 bits that encode the
17130 default cpu as selected by the --with-cpu GCC configure option
17131 in config.gcc.
17132 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
17133 flags mechanism should be reworked to make it more sane. */
17134 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
17135}
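/* Editor's note: TARGET_CPU_DEFAULT packs the configure-time default CPU
   into its low 6 bits and the corresponding ISA flags into the remaining
   bits (see the TARGET_CPU_DEFAULT >> 6 use in aarch64_override_options
   below).  If, hypothetically, TARGET_CPU_DEFAULT were (5 | (0x180 << 6)),
   the "& 0x3f" above would select all_cores[5] and the shift would recover
   0x180 as the default aarch64_isa_flags.  */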
17136
17137/* Return the architecture corresponding to the enum ARCH.
17138 If it doesn't specify a valid architecture, return the default. */
17139
17140static const struct processor *
17141aarch64_get_arch (enum aarch64_arch arch)
17142{
17143 if (arch != aarch64_no_arch)
17144 return &all_architectures[arch];
17145
17146 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
17147
17148 return &all_architectures[cpu->arch];
0cfff2a1
KT
17149}
17150
43cacb12
RS
17151/* Return the VG value associated with -msve-vector-bits= value VALUE. */
17152
17153static poly_uint16
17154aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
17155{
9b070057
RS
17156 /* 128-bit SVE and Advanced SIMD modes use different register layouts
17157 on big-endian targets, so we would need to forbid subregs that convert
17158 from one to the other. By default a reinterpret sequence would then
17159 involve a store to memory in one mode and a load back in the other.
17160 Even if we optimize that sequence using reverse instructions,
17161 it would still be a significant potential overhead.
17162
17163 For now, it seems better to generate length-agnostic code for that
17164 case instead. */
17165 if (value == SVE_SCALABLE
17166 || (value == SVE_128 && BYTES_BIG_ENDIAN))
43cacb12
RS
17167 return poly_uint16 (2, 2);
17168 else
17169 return (int) value / 64;
17170}
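/* Editor's note: worked values for the conversion above.
   -msve-vector-bits=256 gives SVE_256, i.e. a fixed VG of 256 / 64 = 4
   64-bit granules per vector, whereas -msve-vector-bits=scalable (and
   128-bit SVE on big-endian targets) yields poly_uint16 (2, 2), keeping the
   vector length unknown at compile time.  */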
17171
0cfff2a1
KT
17172/* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
17173 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
17174 tuning structs. In particular it must set selected_tune and
17175 aarch64_isa_flags that define the available ISA features and tuning
17176 decisions. It must also set selected_arch as this will be used to
17177 output the .arch asm tags for each function. */
17178
17179static void
17180aarch64_override_options (void)
17181{
28108a53
MM
17182 uint64_t cpu_isa = 0;
17183 uint64_t arch_isa = 0;
0cfff2a1
KT
17184 aarch64_isa_flags = 0;
17185
361fb3ee
KT
17186 bool valid_cpu = true;
17187 bool valid_tune = true;
17188 bool valid_arch = true;
17189
0cfff2a1
KT
17190 selected_cpu = NULL;
17191 selected_arch = NULL;
17192 selected_tune = NULL;
17193
a9ba2a9b
MM
17194 if (aarch64_harden_sls_string)
17195 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
17196
efac62a3
ST
17197 if (aarch64_branch_protection_string)
17198 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
17199
0cfff2a1
KT
17200 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
17201 If either of -march or -mtune is given, they override their
17202 respective component of -mcpu. */
17203 if (aarch64_cpu_string)
361fb3ee
KT
17204 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
17205 &cpu_isa);
0cfff2a1
KT
17206
17207 if (aarch64_arch_string)
361fb3ee
KT
17208 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
17209 &arch_isa);
0cfff2a1
KT
17210
17211 if (aarch64_tune_string)
361fb3ee 17212 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
43e9d192 17213
6881e3c1
OH
17214#ifdef SUBTARGET_OVERRIDE_OPTIONS
17215 SUBTARGET_OVERRIDE_OPTIONS;
17216#endif
17217
43e9d192
IB
17218 /* If the user did not specify a processor, choose the default
17219 one for them. This will be the CPU set during configuration using
a3cd0246 17220 --with-cpu, otherwise it is "generic". */
43e9d192
IB
17221 if (!selected_cpu)
17222 {
0cfff2a1
KT
17223 if (selected_arch)
17224 {
17225 selected_cpu = &all_cores[selected_arch->ident];
17226 aarch64_isa_flags = arch_isa;
361fb3ee 17227 explicit_arch = selected_arch->arch;
0cfff2a1
KT
17228 }
17229 else
17230 {
361fb3ee
KT
17231 /* Get default configure-time CPU. */
17232 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
0cfff2a1
KT
17233 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
17234 }
361fb3ee
KT
17235
17236 if (selected_tune)
17237 explicit_tune_core = selected_tune->ident;
0cfff2a1
KT
17238 }
17239 /* If both -mcpu and -march are specified check that they are architecturally
17240 compatible, warn if they're not and prefer the -march ISA flags. */
17241 else if (selected_arch)
17242 {
17243 if (selected_arch->arch != selected_cpu->arch)
17244 {
a3f9f006 17245 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
349297b6
JH
17246 aarch64_cpu_string,
17247 aarch64_arch_string);
0cfff2a1
KT
17248 }
17249 aarch64_isa_flags = arch_isa;
361fb3ee
KT
17250 explicit_arch = selected_arch->arch;
17251 explicit_tune_core = selected_tune ? selected_tune->ident
17252 : selected_cpu->ident;
0cfff2a1
KT
17253 }
17254 else
17255 {
17256 /* -mcpu but no -march. */
17257 aarch64_isa_flags = cpu_isa;
361fb3ee
KT
17258 explicit_tune_core = selected_tune ? selected_tune->ident
17259 : selected_cpu->ident;
17260 gcc_assert (selected_cpu);
17261 selected_arch = &all_architectures[selected_cpu->arch];
17262 explicit_arch = selected_arch->arch;
43e9d192
IB
17263 }
17264
0cfff2a1
KT
17265 /* Set the arch as well, as we will need it when outputting
17266 the .arch directive in assembly. */
17267 if (!selected_arch)
17268 {
17269 gcc_assert (selected_cpu);
17270 selected_arch = &all_architectures[selected_cpu->arch];
17271 }
43e9d192 17272
43e9d192 17273 if (!selected_tune)
3edaf26d 17274 selected_tune = selected_cpu;
43e9d192 17275
c7ff4f0f
SD
17276 if (aarch64_enable_bti == 2)
17277 {
17278#ifdef TARGET_ENABLE_BTI
17279 aarch64_enable_bti = 1;
17280#else
17281 aarch64_enable_bti = 0;
17282#endif
17283 }
17284
17285 /* Return address signing is currently not supported for ILP32 targets. For
17286 LP64 targets use the configured option in the absence of a command-line
17287 option for -mbranch-protection. */
17288 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
17289 {
17290#ifdef TARGET_ENABLE_PAC_RET
17291 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
c7ff4f0f
SD
17292#else
17293 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
17294#endif
17295 }
17296
0cfff2a1
KT
17297#ifndef HAVE_AS_MABI_OPTION
17298 /* The compiler may have been configured with 2.23.* binutils, which does
17299 not have support for ILP32. */
17300 if (TARGET_ILP32)
a3f9f006 17301 error ("assembler does not support %<-mabi=ilp32%>");
0cfff2a1 17302#endif
43e9d192 17303
43cacb12
RS
17304 /* Convert -msve-vector-bits to a VG count. */
17305 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
17306
db58fd89 17307 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
a3f9f006 17308 sorry ("return address signing is only supported for %<-mabi=lp64%>");
db58fd89 17309
361fb3ee
KT
17310 /* Make sure we properly set up the explicit options. */
17311 if ((aarch64_cpu_string && valid_cpu)
17312 || (aarch64_tune_string && valid_tune))
17313 gcc_assert (explicit_tune_core != aarch64_none);
17314
17315 if ((aarch64_cpu_string && valid_cpu)
17316 || (aarch64_arch_string && valid_arch))
17317 gcc_assert (explicit_arch != aarch64_no_arch);
17318
5f7dbaa0
RE
17319 /* The pass to insert speculation tracking runs before
17320 shrink-wrapping and the latter does not know how to update the
17321 tracking status. So disable it in this case. */
17322 if (aarch64_track_speculation)
17323 flag_shrink_wrap = 0;
17324
0cfff2a1
KT
17325 aarch64_override_options_internal (&global_options);
17326
17327 /* Save these options as the default ones in case we push and pop them later
17328 while processing functions with potential target attributes. */
17329 target_option_default_node = target_option_current_node
ba948b37 17330 = build_target_option_node (&global_options, &global_options_set);
43e9d192
IB
17331}
17332
17333/* Implement targetm.override_options_after_change. */
17334
17335static void
17336aarch64_override_options_after_change (void)
17337{
0cfff2a1 17338 aarch64_override_options_after_change_1 (&global_options);
43e9d192
IB
17339}
17340
29a14a1a
MK
17341/* Implement the TARGET_OFFLOAD_OPTIONS hook. */
17342static char *
17343aarch64_offload_options (void)
17344{
17345 if (TARGET_ILP32)
17346 return xstrdup ("-foffload-abi=ilp32");
17347 else
17348 return xstrdup ("-foffload-abi=lp64");
17349}
17350
43e9d192
IB
17351static struct machine_function *
17352aarch64_init_machine_status (void)
17353{
17354 struct machine_function *machine;
766090c2 17355 machine = ggc_cleared_alloc<machine_function> ();
43e9d192
IB
17356 return machine;
17357}
17358
17359void
17360aarch64_init_expanders (void)
17361{
17362 init_machine_status = aarch64_init_machine_status;
17363}
17364
17365/* Validate and canonicalize the code model in OPTS, taking PIC options into account. */
17366static void
0cfff2a1 17367initialize_aarch64_code_model (struct gcc_options *opts)
43e9d192 17368{
6c0ab626
X
17369 aarch64_cmodel = opts->x_aarch64_cmodel_var;
17370 switch (opts->x_aarch64_cmodel_var)
17371 {
17372 case AARCH64_CMODEL_TINY:
17373 if (opts->x_flag_pic)
17374 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
17375 break;
17376 case AARCH64_CMODEL_SMALL:
17377 if (opts->x_flag_pic)
17378 {
34ecdb0f 17379#ifdef HAVE_AS_SMALL_PIC_RELOCS
6c0ab626
X
17380 aarch64_cmodel = (flag_pic == 2
17381 ? AARCH64_CMODEL_SMALL_PIC
17382 : AARCH64_CMODEL_SMALL_SPIC);
34ecdb0f 17383#else
6c0ab626 17384 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
34ecdb0f 17385#endif
6c0ab626
X
17386 }
17387 break;
17388 case AARCH64_CMODEL_LARGE:
17389 if (opts->x_flag_pic)
17390 sorry ("code model %qs with %<-f%s%>", "large",
17391 opts->x_flag_pic > 1 ? "PIC" : "pic");
17392 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
17393 sorry ("code model %qs not supported in ilp32 mode", "large");
17394 break;
17395 case AARCH64_CMODEL_TINY_PIC:
17396 case AARCH64_CMODEL_SMALL_PIC:
17397 case AARCH64_CMODEL_SMALL_SPIC:
17398 gcc_unreachable ();
17399 }
43e9d192
IB
17400}
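/* Editor's note: illustrative outcomes of the selection above.  With
   -mcmodel=small, -fpic selects AARCH64_CMODEL_SMALL_SPIC when the assembler
   has the small PIC relocations (HAVE_AS_SMALL_PIC_RELOCS) and
   AARCH64_CMODEL_SMALL_PIC otherwise, while -fPIC always selects
   AARCH64_CMODEL_SMALL_PIC; with -mcmodel=large, any PIC option draws a
   "sorry", as does ILP32.  */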
17401
361fb3ee
KT
17402/* Implement TARGET_OPTION_SAVE. */
17403
17404static void
ba948b37
JJ
17405aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts,
17406 struct gcc_options */* opts_set */)
361fb3ee
KT
17407{
17408 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
efac62a3
ST
17409 ptr->x_aarch64_branch_protection_string
17410 = opts->x_aarch64_branch_protection_string;
361fb3ee
KT
17411}
17412
17413/* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
17414 using the information saved in PTR. */
17415
17416static void
ba948b37
JJ
17417aarch64_option_restore (struct gcc_options *opts,
17418 struct gcc_options */* opts_set */,
17419 struct cl_target_option *ptr)
361fb3ee 17420{
361fb3ee
KT
17421 opts->x_explicit_arch = ptr->x_explicit_arch;
17422 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
1a5c8291
RS
17423 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
17424 if (opts->x_explicit_tune_core == aarch64_none
17425 && opts->x_explicit_arch != aarch64_no_arch)
17426 selected_tune = &all_cores[selected_arch->ident];
17427 else
17428 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
361fb3ee 17429 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
efac62a3
ST
17430 opts->x_aarch64_branch_protection_string
17431 = ptr->x_aarch64_branch_protection_string;
17432 if (opts->x_aarch64_branch_protection_string)
17433 {
17434 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
17435 NULL);
17436 }
361fb3ee
KT
17437
17438 aarch64_override_options_internal (opts);
17439}
17440
17441/* Implement TARGET_OPTION_PRINT. */
17442
17443static void
17444aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
17445{
17446 const struct processor *cpu
17447 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
28108a53 17448 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
361fb3ee 17449 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
054b4005 17450 std::string extension
04a99ebe 17451 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
361fb3ee
KT
17452
17453 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
054b4005
JG
17454 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
17455 arch->name, extension.c_str ());
361fb3ee
KT
17456}
17457
d78006d9
KT
17458static GTY(()) tree aarch64_previous_fndecl;
17459
e4ea20c8
KT
17460void
17461aarch64_reset_previous_fndecl (void)
17462{
17463 aarch64_previous_fndecl = NULL;
17464}
17465
acfc1ac1
KT
17466/* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
17467 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
17468 make sure optab availability predicates are recomputed when necessary. */
17469
17470void
17471aarch64_save_restore_target_globals (tree new_tree)
17472{
17473 if (TREE_TARGET_GLOBALS (new_tree))
17474 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
17475 else if (new_tree == target_option_default_node)
17476 restore_target_globals (&default_target_globals);
17477 else
17478 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
17479}
17480
d78006d9
KT
17481/* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
17482 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
17483 of the function, if such exists. This function may be called multiple
17484 times on a single function so use aarch64_previous_fndecl to avoid
17485 setting up identical state. */
17486
17487static void
17488aarch64_set_current_function (tree fndecl)
17489{
acfc1ac1
KT
17490 if (!fndecl || fndecl == aarch64_previous_fndecl)
17491 return;
17492
d78006d9
KT
17493 tree old_tree = (aarch64_previous_fndecl
17494 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
17495 : NULL_TREE);
17496
acfc1ac1 17497 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
d78006d9 17498
acfc1ac1
KT
17499 /* If current function has no attributes but the previous one did,
17500 use the default node. */
17501 if (!new_tree && old_tree)
17502 new_tree = target_option_default_node;
d78006d9 17503
acfc1ac1
KT
17504 /* If nothing to do, return. #pragma GCC reset_options or #pragma GCC pop_options to
17505 the default have been handled by aarch64_save_restore_target_globals from
17506 aarch64_pragma_target_parse. */
17507 if (old_tree == new_tree)
17508 return;
d78006d9 17509
acfc1ac1 17510 aarch64_previous_fndecl = fndecl;
6e17a23b 17511
acfc1ac1 17512 /* First set the target options. */
ba948b37
JJ
17513 cl_target_option_restore (&global_options, &global_options_set,
17514 TREE_TARGET_OPTION (new_tree));
6e17a23b 17515
acfc1ac1 17516 aarch64_save_restore_target_globals (new_tree);
d78006d9 17517}
361fb3ee 17518
5a2c8331
KT
17519/* Enum describing the various ways we can handle attributes.
17520 In many cases we can reuse the generic option handling machinery. */
17521
17522enum aarch64_attr_opt_type
17523{
17524 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
17525 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
17526 aarch64_attr_enum, /* Attribute sets an enum variable. */
17527 aarch64_attr_custom /* Attribute requires a custom handling function. */
17528};
17529
17530/* All the information needed to handle a target attribute.
17531 NAME is the name of the attribute.
9c582551 17532 ATTR_TYPE specifies the type of behavior of the attribute as described
5a2c8331
KT
17533 in the definition of enum aarch64_attr_opt_type.
17534 ALLOW_NEG is true if the attribute supports a "no-" form.
ab93e9b7
SE
17535 HANDLER is the function that takes the attribute string as an argument.
17536 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
5a2c8331 17537 OPT_NUM is the enum specifying the option that the attribute modifies.
9c582551 17538 This is needed for attributes that mirror the behavior of a command-line
5a2c8331
KT
17539 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
17540 aarch64_attr_enum. */
17541
17542struct aarch64_attribute_info
17543{
17544 const char *name;
17545 enum aarch64_attr_opt_type attr_type;
17546 bool allow_neg;
ab93e9b7 17547 bool (*handler) (const char *);
5a2c8331
KT
17548 enum opt_code opt_num;
17549};
17550
ab93e9b7 17551/* Handle the ARCH_STR argument to the arch= target attribute. */
5a2c8331
KT
17552
17553static bool
ab93e9b7 17554aarch64_handle_attr_arch (const char *str)
5a2c8331
KT
17555{
17556 const struct processor *tmp_arch = NULL;
c7887347 17557 std::string invalid_extension;
5a2c8331 17558 enum aarch64_parse_opt_result parse_res
c7887347 17559 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
5a2c8331
KT
17560
17561 if (parse_res == AARCH64_PARSE_OK)
17562 {
17563 gcc_assert (tmp_arch);
17564 selected_arch = tmp_arch;
17565 explicit_arch = selected_arch->arch;
17566 return true;
17567 }
17568
17569 switch (parse_res)
17570 {
17571 case AARCH64_PARSE_MISSING_ARG:
ab93e9b7 17572 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
5a2c8331
KT
17573 break;
17574 case AARCH64_PARSE_INVALID_ARG:
ab93e9b7 17575 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
01f44038 17576 aarch64_print_hint_for_arch (str);
5a2c8331
KT
17577 break;
17578 case AARCH64_PARSE_INVALID_FEATURE:
c7887347
ML
17579 error ("invalid feature modifier %s of value (\"%s\") in "
17580 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
17581 aarch64_print_hint_for_extensions (invalid_extension);
5a2c8331
KT
17582 break;
17583 default:
17584 gcc_unreachable ();
17585 }
17586
17587 return false;
17588}
17589
ab93e9b7 17590/* Handle the argument CPU_STR to the cpu= target attribute. */
5a2c8331
KT
17591
17592static bool
ab93e9b7 17593aarch64_handle_attr_cpu (const char *str)
5a2c8331
KT
17594{
17595 const struct processor *tmp_cpu = NULL;
c7887347 17596 std::string invalid_extension;
5a2c8331 17597 enum aarch64_parse_opt_result parse_res
c7887347 17598 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
5a2c8331
KT
17599
17600 if (parse_res == AARCH64_PARSE_OK)
17601 {
17602 gcc_assert (tmp_cpu);
17603 selected_tune = tmp_cpu;
17604 explicit_tune_core = selected_tune->ident;
17605
17606 selected_arch = &all_architectures[tmp_cpu->arch];
17607 explicit_arch = selected_arch->arch;
17608 return true;
17609 }
17610
17611 switch (parse_res)
17612 {
17613 case AARCH64_PARSE_MISSING_ARG:
ab93e9b7 17614 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
5a2c8331
KT
17615 break;
17616 case AARCH64_PARSE_INVALID_ARG:
ab93e9b7 17617 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
01f44038 17618 aarch64_print_hint_for_core (str);
5a2c8331
KT
17619 break;
17620 case AARCH64_PARSE_INVALID_FEATURE:
c7887347
ML
17621 error ("invalid feature modifier %s of value (\"%s\") in "
17622 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
17623 aarch64_print_hint_for_extensions (invalid_extension);
5a2c8331
KT
17624 break;
17625 default:
17626 gcc_unreachable ();
17627 }
17628
17629 return false;
17630}
17631
efac62a3
ST
17632/* Handle the argument STR to the branch-protection= attribute. */
17633
17634static bool
17635aarch64_handle_attr_branch_protection (const char *str)
17636{
81e40f3a 17637 char *err_str = (char *) xmalloc (strlen (str) + 1);
efac62a3
ST
17638 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
17639 &err_str);
17640 bool success = false;
17641 switch (res)
17642 {
17643 case AARCH64_PARSE_MISSING_ARG:
17644 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
17645 " attribute");
17646 break;
17647 case AARCH64_PARSE_INVALID_ARG:
17648 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
17649 "=\")%> pragma or attribute", err_str);
17650 break;
17651 case AARCH64_PARSE_OK:
17652 success = true;
17653 /* Fall through. */
17654 case AARCH64_PARSE_INVALID_FEATURE:
17655 break;
17656 default:
17657 gcc_unreachable ();
17658 }
17659 free (err_str);
17660 return success;
17661}
17662
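/* Illustrative example (editor's addition, not part of the GCC source): a
   per-function branch-protection string of the kind parsed above.  The value
   shown is one of the documented forms; treat it as a placeholder.  */
__attribute__ ((target ("branch-protection=pac-ret+leaf")))
void sensitive_entry (void);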
ab93e9b7 17663/* Handle the argument STR to the tune= target attribute. */
5a2c8331
KT
17664
17665static bool
ab93e9b7 17666aarch64_handle_attr_tune (const char *str)
5a2c8331
KT
17667{
17668 const struct processor *tmp_tune = NULL;
17669 enum aarch64_parse_opt_result parse_res
17670 = aarch64_parse_tune (str, &tmp_tune);
17671
17672 if (parse_res == AARCH64_PARSE_OK)
17673 {
17674 gcc_assert (tmp_tune);
17675 selected_tune = tmp_tune;
17676 explicit_tune_core = selected_tune->ident;
17677 return true;
17678 }
17679
17680 switch (parse_res)
17681 {
17682 case AARCH64_PARSE_INVALID_ARG:
ab93e9b7 17683 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
01f44038 17684 aarch64_print_hint_for_core (str);
5a2c8331
KT
17685 break;
17686 default:
17687 gcc_unreachable ();
17688 }
17689
17690 return false;
17691}
17692
17693/* Parse an architecture extensions target attribute string specified in STR.
17694 For example "+fp+nosimd". Show any errors if needed. Return TRUE
17695 if successful. Update aarch64_isa_flags to reflect the ISA features
ab93e9b7 17696 modified. */
5a2c8331
KT
17697
17698static bool
ab93e9b7 17699aarch64_handle_attr_isa_flags (char *str)
5a2c8331
KT
17700{
17701 enum aarch64_parse_opt_result parse_res;
28108a53 17702 uint64_t isa_flags = aarch64_isa_flags;
5a2c8331 17703
e4ea20c8
KT
17704 /* We allow "+nothing" in the beginning to clear out all architectural
17705 features if the user wants to handpick specific features. */
17706 if (strncmp ("+nothing", str, 8) == 0)
17707 {
17708 isa_flags = 0;
17709 str += 8;
17710 }
17711
c7887347
ML
17712 std::string invalid_extension;
17713 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
5a2c8331
KT
17714
17715 if (parse_res == AARCH64_PARSE_OK)
17716 {
17717 aarch64_isa_flags = isa_flags;
17718 return true;
17719 }
17720
17721 switch (parse_res)
17722 {
17723 case AARCH64_PARSE_MISSING_ARG:
ab93e9b7 17724 error ("missing value in %<target()%> pragma or attribute");
5a2c8331
KT
17725 break;
17726
17727 case AARCH64_PARSE_INVALID_FEATURE:
c7887347
ML
17728 error ("invalid feature modifier %s of value (\"%s\") in "
17729 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
5a2c8331
KT
17730 break;
17731
17732 default:
17733 gcc_unreachable ();
17734 }
17735
17736 return false;
17737}
17738
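/* Illustrative example (editor's addition): the bare "+..." extension form
   handled above.  "+nothing" first clears the architectural features, after
   which individual ones are re-enabled; the feature names below are only
   examples.  */
__attribute__ ((target ("+nothing+fp+simd")))
void fp_and_simd_only (void);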
17739/* The target attributes that we support. On top of these we also support just
17740 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
17741 handled explicitly in aarch64_process_one_target_attr. */
17742
17743static const struct aarch64_attribute_info aarch64_attributes[] =
17744{
17745 { "general-regs-only", aarch64_attr_mask, false, NULL,
17746 OPT_mgeneral_regs_only },
17747 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
17748 OPT_mfix_cortex_a53_835769 },
48bb1a55
CL
17749 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
17750 OPT_mfix_cortex_a53_843419 },
5a2c8331 17751 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
675d044c 17752 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
5a2c8331
KT
17753 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
17754 OPT_momit_leaf_frame_pointer },
17755 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
17756 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
17757 OPT_march_ },
17758 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
17759 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
17760 OPT_mtune_ },
efac62a3
ST
17761 { "branch-protection", aarch64_attr_custom, false,
17762 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
db58fd89
JW
17763 { "sign-return-address", aarch64_attr_enum, false, NULL,
17764 OPT_msign_return_address_ },
9e02b45f
ML
17765 { "outline-atomics", aarch64_attr_bool, true, NULL,
17766 OPT_moutline_atomics},
5a2c8331
KT
17767 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
17768};
17769
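/* Illustrative example (editor's addition): how some of the attributes in
   the table above might be spelled in user code.  The arch, cpu and feature
   names are placeholders.  */
__attribute__ ((target ("arch=armv8.2-a+crc")))
int with_arch (int x) { return x + 1; }

__attribute__ ((target ("cpu=cortex-a57,no-omit-leaf-frame-pointer")))
int with_cpu (int x) { return x * 2; }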
17770/* Parse ARG_STR which contains the definition of one target attribute.
ab93e9b7 17771 Show appropriate errors if any or return true if the attribute is valid. */
5a2c8331
KT
17772
17773static bool
ab93e9b7 17774aarch64_process_one_target_attr (char *arg_str)
5a2c8331
KT
17775{
17776 bool invert = false;
17777
17778 size_t len = strlen (arg_str);
17779
17780 if (len == 0)
17781 {
ab93e9b7 17782 error ("malformed %<target()%> pragma or attribute");
5a2c8331
KT
17783 return false;
17784 }
17785
17786 char *str_to_check = (char *) alloca (len + 1);
17787 strcpy (str_to_check, arg_str);
17788
5a2c8331
KT
17789 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
17790 It is easier to detect and handle it explicitly here rather than going
17791 through the machinery for the rest of the target attributes in this
17792 function. */
17793 if (*str_to_check == '+')
ab93e9b7 17794 return aarch64_handle_attr_isa_flags (str_to_check);
5a2c8331 17795
c0129e2d 17796 if (len > 3 && startswith (str_to_check, "no-"))
5a2c8331
KT
17797 {
17798 invert = true;
17799 str_to_check += 3;
17800 }
17801 char *arg = strchr (str_to_check, '=');
17802
17803 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
17804 and point ARG to "foo". */
17805 if (arg)
17806 {
17807 *arg = '\0';
17808 arg++;
17809 }
17810 const struct aarch64_attribute_info *p_attr;
16d12992 17811 bool found = false;
5a2c8331
KT
17812 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
17813 {
17814 /* If the names don't match up, or the user has given an argument
17815 to an attribute that doesn't accept one, or didn't give an argument
17816 to an attribute that expects one, fail to match. */
17817 if (strcmp (str_to_check, p_attr->name) != 0)
17818 continue;
17819
16d12992 17820 found = true;
5a2c8331
KT
17821 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
17822 || p_attr->attr_type == aarch64_attr_enum;
17823
17824 if (attr_need_arg_p ^ (arg != NULL))
17825 {
ab93e9b7 17826 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
5a2c8331
KT
17827 return false;
17828 }
17829
17830 /* If the name matches but the attribute does not allow "no-" versions
17831 then we can't match. */
17832 if (invert && !p_attr->allow_neg)
17833 {
ab93e9b7 17834 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
5a2c8331
KT
17835 return false;
17836 }
17837
17838 switch (p_attr->attr_type)
17839 {
17840 /* Has a custom handler registered.
17841 For example, cpu=, arch=, tune=. */
17842 case aarch64_attr_custom:
17843 gcc_assert (p_attr->handler);
ab93e9b7 17844 if (!p_attr->handler (arg))
5a2c8331
KT
17845 return false;
17846 break;
17847
17848 /* Either set or unset a boolean option. */
17849 case aarch64_attr_bool:
17850 {
17851 struct cl_decoded_option decoded;
17852
17853 generate_option (p_attr->opt_num, NULL, !invert,
17854 CL_TARGET, &decoded);
17855 aarch64_handle_option (&global_options, &global_options_set,
17856 &decoded, input_location);
17857 break;
17858 }
17859 /* Set or unset a bit in the target_flags. aarch64_handle_option
17860 should know what mask to apply given the option number. */
17861 case aarch64_attr_mask:
17862 {
17863 struct cl_decoded_option decoded;
17864 /* We only need to specify the option number.
17865 aarch64_handle_option will know which mask to apply. */
17866 decoded.opt_index = p_attr->opt_num;
17867 decoded.value = !invert;
17868 aarch64_handle_option (&global_options, &global_options_set,
17869 &decoded, input_location);
17870 break;
17871 }
17872 /* Use the option setting machinery to set an option to an enum. */
17873 case aarch64_attr_enum:
17874 {
17875 gcc_assert (arg);
17876 bool valid;
17877 int value;
17878 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
17879 &value, CL_TARGET);
17880 if (valid)
17881 {
17882 set_option (&global_options, NULL, p_attr->opt_num, value,
17883 NULL, DK_UNSPECIFIED, input_location,
17884 global_dc);
17885 }
17886 else
17887 {
ab93e9b7 17888 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
5a2c8331
KT
17889 }
17890 break;
17891 }
17892 default:
17893 gcc_unreachable ();
17894 }
17895 }
17896
16d12992
KT
17897 /* If we reached here we either have found an attribute and validated
17898 it or didn't match any. If we matched an attribute but its arguments
17899 were malformed we will have returned false already. */
17900 return found;
5a2c8331
KT
17901}
17902
17903/* Count how many times the character C appears in
17904 NULL-terminated string STR. */
17905
17906static unsigned int
17907num_occurences_in_str (char c, char *str)
17908{
17909 unsigned int res = 0;
17910 while (*str != '\0')
17911 {
17912 if (*str == c)
17913 res++;
17914
17915 str++;
17916 }
17917
17918 return res;
17919}
17920
17921/* Parse the tree in ARGS that contains the target attribute information
ab93e9b7 17922 and update the global target options space. */
5a2c8331
KT
17923
17924bool
ab93e9b7 17925aarch64_process_target_attr (tree args)
5a2c8331
KT
17926{
17927 if (TREE_CODE (args) == TREE_LIST)
17928 {
17929 do
17930 {
17931 tree head = TREE_VALUE (args);
17932 if (head)
17933 {
ab93e9b7 17934 if (!aarch64_process_target_attr (head))
5a2c8331
KT
17935 return false;
17936 }
17937 args = TREE_CHAIN (args);
17938 } while (args);
17939
17940 return true;
17941 }
3b6cb9e3
ML
17942
17943 if (TREE_CODE (args) != STRING_CST)
17944 {
17945 error ("attribute %<target%> argument not a string");
17946 return false;
17947 }
5a2c8331
KT
17948
17949 size_t len = strlen (TREE_STRING_POINTER (args));
17950 char *str_to_check = (char *) alloca (len + 1);
17951 strcpy (str_to_check, TREE_STRING_POINTER (args));
17952
17953 if (len == 0)
17954 {
ab93e9b7 17955 error ("malformed %<target()%> pragma or attribute");
5a2c8331
KT
17956 return false;
17957 }
17958
17959 /* Used to catch empty spaces between commas i.e.
17960 attribute ((target ("attr1,,attr2"))). */
17961 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
17962
17963 /* Handle multiple target attributes separated by ','. */
7185a4eb 17964 char *token = strtok_r (str_to_check, ",", &str_to_check);
5a2c8331
KT
17965
17966 unsigned int num_attrs = 0;
17967 while (token)
17968 {
17969 num_attrs++;
ab93e9b7 17970 if (!aarch64_process_one_target_attr (token))
5a2c8331 17971 {
145be5ef
PK
17972 /* Check if token is possibly an arch extension without
17973 leading '+'. */
17974 uint64_t isa_temp = 0;
17975 auto with_plus = std::string ("+") + token;
17976 enum aarch64_parse_opt_result ext_res
17977 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
17978
17979 if (ext_res == AARCH64_PARSE_OK)
17980 error ("arch extension %<%s%> should be prefixed by %<+%>",
17981 token);
17982 else
17983 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
5a2c8331
KT
17984 return false;
17985 }
17986
7185a4eb 17987 token = strtok_r (NULL, ",", &str_to_check);
5a2c8331
KT
17988 }
17989
17990 if (num_attrs != num_commas + 1)
17991 {
ab93e9b7 17992 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
5a2c8331
KT
17993 return false;
17994 }
17995
17996 return true;
17997}
17998
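/* Illustrative example (editor's addition): the same comma-separated strings
   are accepted through the target pragma, which ends up in the routine
   above; a bare extension name without its '+' prefix is diagnosed there.
   The names used here are examples only.  */
#pragma GCC push_options
#pragma GCC target ("arch=armv8-a+crc,strict-align")
int pragma_scoped (int x) { return x; }
#pragma GCC pop_options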
17999/* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
18000 process attribute ((target ("..."))). */
18001
18002static bool
18003aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
18004{
18005 struct cl_target_option cur_target;
18006 bool ret;
18007 tree old_optimize;
18008 tree new_target, new_optimize;
18009 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
91d0e8de
KT
18010
18011 /* If what we're processing is the current pragma string then the
18012 target option node is already stored in target_option_current_node
18013 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
18014 having to re-parse the string. This is especially useful to keep
18015 arm_neon.h compile times down since that header contains a lot
18016 of intrinsics enclosed in pragmas. */
18017 if (!existing_target && args == current_target_pragma)
18018 {
18019 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
18020 return true;
18021 }
5a2c8331
KT
18022 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
18023
ba948b37
JJ
18024 old_optimize
18025 = build_optimization_node (&global_options, &global_options_set);
5a2c8331
KT
18026 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
18027
18028 /* If the function changed the optimization levels as well as setting
18029 target options, start with the optimizations specified. */
18030 if (func_optimize && func_optimize != old_optimize)
ba948b37 18031 cl_optimization_restore (&global_options, &global_options_set,
5a2c8331
KT
18032 TREE_OPTIMIZATION (func_optimize));
18033
18034 /* Save the current target options to restore at the end. */
ba948b37 18035 cl_target_option_save (&cur_target, &global_options, &global_options_set);
5a2c8331
KT
18036
18037 /* If fndecl already has some target attributes applied to it, unpack
18038 them so that we add this attribute on top of them, rather than
18039 overwriting them. */
18040 if (existing_target)
18041 {
18042 struct cl_target_option *existing_options
18043 = TREE_TARGET_OPTION (existing_target);
18044
18045 if (existing_options)
ba948b37
JJ
18046 cl_target_option_restore (&global_options, &global_options_set,
18047 existing_options);
5a2c8331
KT
18048 }
18049 else
ba948b37
JJ
18050 cl_target_option_restore (&global_options, &global_options_set,
18051 TREE_TARGET_OPTION (target_option_current_node));
5a2c8331 18052
ab93e9b7 18053 ret = aarch64_process_target_attr (args);
5a2c8331
KT
18054
18055 /* Set up any additional state. */
18056 if (ret)
18057 {
18058 aarch64_override_options_internal (&global_options);
e95a988a
KT
18059 /* Initialize SIMD builtins if we haven't already.
18060 Set current_target_pragma to NULL for the duration so that
18061 the builtin initialization code doesn't try to tag the functions
18062 being built with the attributes specified by any current pragma, thus
18063 going into an infinite recursion. */
18064 if (TARGET_SIMD)
18065 {
18066 tree saved_current_target_pragma = current_target_pragma;
18067 current_target_pragma = NULL;
18068 aarch64_init_simd_builtins ();
18069 current_target_pragma = saved_current_target_pragma;
18070 }
ba948b37
JJ
18071 new_target = build_target_option_node (&global_options,
18072 &global_options_set);
5a2c8331
KT
18073 }
18074 else
18075 new_target = NULL;
18076
ba948b37
JJ
18077 new_optimize = build_optimization_node (&global_options,
18078 &global_options_set);
5a2c8331
KT
18079
18080 if (fndecl && ret)
18081 {
18082 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
18083
18084 if (old_optimize != new_optimize)
18085 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
18086 }
18087
ba948b37 18088 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
5a2c8331
KT
18089
18090 if (old_optimize != new_optimize)
ba948b37 18091 cl_optimization_restore (&global_options, &global_options_set,
5a2c8331
KT
18092 TREE_OPTIMIZATION (old_optimize));
18093 return ret;
18094}
18095
1fd8d40c
KT
18096/* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
18097 tri-bool option values (yes, no, don't care) and the default value is
18098 DEF, determine whether to reject inlining. */
18099
18100static bool
18101aarch64_tribools_ok_for_inlining_p (int caller, int callee,
18102 int dont_care, int def)
18103{
18104 /* If the callee doesn't care, always allow inlining. */
18105 if (callee == dont_care)
18106 return true;
18107
18108 /* If the caller doesn't care, always allow inlining. */
18109 if (caller == dont_care)
18110 return true;
18111
18112 /* Otherwise, allow inlining if either the callee and caller values
18113 agree, or if the callee is using the default value. */
18114 return (callee == caller || callee == def);
18115}
18116
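/* Illustrative example (editor's addition): one way the tri-bool rule above
   plays out in aarch64_can_inline_p below for -momit-leaf-frame-pointer.
   Both functions state an explicit, conflicting preference and the callee's
   choice differs from the default passed in, so inlining is refused.  */
__attribute__ ((target ("no-omit-leaf-frame-pointer")))
static inline int leaf (int x) { return x + 1; }

__attribute__ ((target ("omit-leaf-frame-pointer")))
int wrapper (int x) { return leaf (x); }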
18117/* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
18118 to inline CALLEE into CALLER based on target-specific info.
18119 Make sure that the caller and callee have compatible architectural
18120 features. Then go through the other possible target attributes
18121 and see if they can block inlining. Try not to reject always_inline
18122 callees unless they are incompatible architecturally. */
18123
18124static bool
18125aarch64_can_inline_p (tree caller, tree callee)
18126{
18127 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
18128 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
18129
1fd8d40c
KT
18130 struct cl_target_option *caller_opts
18131 = TREE_TARGET_OPTION (caller_tree ? caller_tree
18132 : target_option_default_node);
18133
675d044c
SD
18134 struct cl_target_option *callee_opts
18135 = TREE_TARGET_OPTION (callee_tree ? callee_tree
18136 : target_option_default_node);
1fd8d40c
KT
18137
18138 /* Callee's ISA flags should be a subset of the caller's. */
18139 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
18140 != callee_opts->x_aarch64_isa_flags)
18141 return false;
18142
18143 /* Allow non-strict aligned functions inlining into strict
18144 aligned ones. */
18145 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
18146 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
18147 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
18148 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
18149 return false;
18150
18151 bool always_inline = lookup_attribute ("always_inline",
18152 DECL_ATTRIBUTES (callee));
18153
18154 /* If the architectural features match up and the callee is always_inline
18155 then the other attributes don't matter. */
18156 if (always_inline)
18157 return true;
18158
18159 if (caller_opts->x_aarch64_cmodel_var
18160 != callee_opts->x_aarch64_cmodel_var)
18161 return false;
18162
18163 if (caller_opts->x_aarch64_tls_dialect
18164 != callee_opts->x_aarch64_tls_dialect)
18165 return false;
18166
18167 /* Honour explicit requests to work around errata. */
18168 if (!aarch64_tribools_ok_for_inlining_p (
18169 caller_opts->x_aarch64_fix_a53_err835769,
18170 callee_opts->x_aarch64_fix_a53_err835769,
18171 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
18172 return false;
18173
48bb1a55
CL
18174 if (!aarch64_tribools_ok_for_inlining_p (
18175 caller_opts->x_aarch64_fix_a53_err843419,
18176 callee_opts->x_aarch64_fix_a53_err843419,
18177 2, TARGET_FIX_ERR_A53_843419))
18178 return false;
18179
1fd8d40c
KT
18180 /* If the user explicitly specified -momit-leaf-frame-pointer for the
18181 caller and callee and they don't match up, reject inlining. */
18182 if (!aarch64_tribools_ok_for_inlining_p (
18183 caller_opts->x_flag_omit_leaf_frame_pointer,
18184 callee_opts->x_flag_omit_leaf_frame_pointer,
18185 2, 1))
18186 return false;
18187
18188 /* If the callee has specific tuning overrides, respect them. */
18189 if (callee_opts->x_aarch64_override_tune_string != NULL
18190 && caller_opts->x_aarch64_override_tune_string == NULL)
18191 return false;
18192
18193 /* If the user specified tuning override strings for the
18194 caller and callee and they don't match up, reject inlining.
18195 We just do a string compare here, we don't analyze the meaning
18196 of the string, as it would be too costly for little gain. */
18197 if (callee_opts->x_aarch64_override_tune_string
18198 && caller_opts->x_aarch64_override_tune_string
18199 && (strcmp (callee_opts->x_aarch64_override_tune_string,
18200 caller_opts->x_aarch64_override_tune_string) != 0))
18201 return false;
18202
18203 return true;
18204}
18205
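/* Illustrative example (editor's addition): the ISA-subset rule above in
   practice.  A callee built with an extra ISA feature (here "+crc", purely
   as an example) is not inlined into a caller that lacks it, while the
   reverse direction is allowed.  */
__attribute__ ((target ("+crc")))
static inline unsigned crc_step (unsigned a, unsigned b) { return a ^ b; }

unsigned plain_caller (unsigned a, unsigned b)
{
  return crc_step (a, b);  /* Callee's ISA is not a subset: no inlining.  */
}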
bb6ce448
RS
18206/* Return the ID of the TLSDESC ABI, initializing the descriptor if it
18207 hasn't been already. */
18208
18209unsigned int
18210aarch64_tlsdesc_abi_id ()
18211{
18212 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
18213 if (!tlsdesc_abi.initialized_p ())
18214 {
18215 HARD_REG_SET full_reg_clobbers;
18216 CLEAR_HARD_REG_SET (full_reg_clobbers);
18217 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
18218 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
18219 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
18220 SET_HARD_REG_BIT (full_reg_clobbers, regno);
18221 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
18222 }
18223 return tlsdesc_abi.id ();
18224}
18225
43e9d192
IB
18226/* Return true if SYMBOL_REF X binds locally. */
18227
18228static bool
18229aarch64_symbol_binds_local_p (const_rtx x)
18230{
18231 return (SYMBOL_REF_DECL (x)
18232 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
18233 : SYMBOL_REF_LOCAL_P (x));
18234}
18235
18236/* Return true if SYMBOL_REF X is thread local */
18237static bool
18238aarch64_tls_symbol_p (rtx x)
18239{
18240 if (! TARGET_HAVE_TLS)
18241 return false;
18242
74b27d8e 18243 x = strip_salt (x);
3793ecc1 18244 if (!SYMBOL_REF_P (x))
43e9d192
IB
18245 return false;
18246
18247 return SYMBOL_REF_TLS_MODEL (x) != 0;
18248}
18249
18250/* Classify a TLS symbol into one of the TLS kinds. */
18251enum aarch64_symbol_type
18252aarch64_classify_tls_symbol (rtx x)
18253{
18254 enum tls_model tls_kind = tls_symbolic_operand_type (x);
18255
18256 switch (tls_kind)
18257 {
18258 case TLS_MODEL_GLOBAL_DYNAMIC:
18259 case TLS_MODEL_LOCAL_DYNAMIC:
18260 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
18261
18262 case TLS_MODEL_INITIAL_EXEC:
5ae7caad
JW
18263 switch (aarch64_cmodel)
18264 {
18265 case AARCH64_CMODEL_TINY:
18266 case AARCH64_CMODEL_TINY_PIC:
18267 return SYMBOL_TINY_TLSIE;
18268 default:
79496620 18269 return SYMBOL_SMALL_TLSIE;
5ae7caad 18270 }
43e9d192
IB
18271
18272 case TLS_MODEL_LOCAL_EXEC:
cbf5629e
JW
18273 if (aarch64_tls_size == 12)
18274 return SYMBOL_TLSLE12;
18275 else if (aarch64_tls_size == 24)
18276 return SYMBOL_TLSLE24;
18277 else if (aarch64_tls_size == 32)
18278 return SYMBOL_TLSLE32;
18279 else if (aarch64_tls_size == 48)
18280 return SYMBOL_TLSLE48;
18281 else
18282 gcc_unreachable ();
43e9d192
IB
18283
18284 case TLS_MODEL_EMULATED:
18285 case TLS_MODEL_NONE:
18286 return SYMBOL_FORCE_TO_MEM;
18287
18288 default:
18289 gcc_unreachable ();
18290 }
18291}
18292
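/* Illustrative example (editor's addition): the TLS access models classified
   above correspond to user-visible controls such as -ftls-model= or the
   tls_model attribute.  The comments give the typical classification.  */
__thread int tls_dyn;		/* Usually SYMBOL_SMALL_TLSDESC or TLSGD.  */
__attribute__ ((tls_model ("initial-exec"))) __thread int tls_ie;  /* *_TLSIE.  */
__attribute__ ((tls_model ("local-exec"))) __thread int tls_le;    /* SYMBOL_TLSLE*.  */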
43cacb12
RS
18293/* Return the correct method for accessing X + OFFSET, where X is either
18294 a SYMBOL_REF or LABEL_REF. */
17f4d4bf 18295
43e9d192 18296enum aarch64_symbol_type
43cacb12 18297aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
43e9d192 18298{
74b27d8e
RS
18299 x = strip_salt (x);
18300
3793ecc1 18301 if (LABEL_REF_P (x))
43e9d192
IB
18302 {
18303 switch (aarch64_cmodel)
18304 {
18305 case AARCH64_CMODEL_LARGE:
18306 return SYMBOL_FORCE_TO_MEM;
18307
18308 case AARCH64_CMODEL_TINY_PIC:
18309 case AARCH64_CMODEL_TINY:
a5350ddc
CSS
18310 return SYMBOL_TINY_ABSOLUTE;
18311
1b1e81f8 18312 case AARCH64_CMODEL_SMALL_SPIC:
43e9d192
IB
18313 case AARCH64_CMODEL_SMALL_PIC:
18314 case AARCH64_CMODEL_SMALL:
18315 return SYMBOL_SMALL_ABSOLUTE;
18316
18317 default:
18318 gcc_unreachable ();
18319 }
18320 }
18321
3793ecc1 18322 if (SYMBOL_REF_P (x))
43e9d192 18323 {
43e9d192
IB
18324 if (aarch64_tls_symbol_p (x))
18325 return aarch64_classify_tls_symbol (x);
18326
17f4d4bf
CSS
18327 switch (aarch64_cmodel)
18328 {
fb0746f3 18329 case AARCH64_CMODEL_TINY_PIC:
17f4d4bf 18330 case AARCH64_CMODEL_TINY:
fb0746f3
WD
18331 /* With -fPIC non-local symbols use the GOT. For orthogonality
18332 always use the GOT for extern weak symbols. */
18333 if ((flag_pic || SYMBOL_REF_WEAK (x))
18334 && !aarch64_symbol_binds_local_p (x))
18335 return SYMBOL_TINY_GOT;
18336
15f6e0da 18337 /* When we retrieve symbol + offset address, we have to make sure
f8b756b7
TB
18338 the offset does not cause overflow of the final address. But
18339 we have no way of knowing the address of symbol at compile time
18340 so we can't accurately say if the distance between the PC and
7d3b27ff
WD
18341 symbol + offset is outside the addressable range of +/-1MB in the
18342 TINY code model. So we limit the maximum offset to +/-64KB and
18343 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
fb0746f3 18344 If offset_within_block_p is true we allow larger offsets. */
7d3b27ff
WD
18345 if (!(IN_RANGE (offset, -0x10000, 0x10000)
18346 || offset_within_block_p (x, offset)))
18347 return SYMBOL_FORCE_TO_MEM;
18348
a5350ddc
CSS
18349 return SYMBOL_TINY_ABSOLUTE;
18350
fb0746f3
WD
18351
18352 case AARCH64_CMODEL_SMALL_SPIC:
18353 case AARCH64_CMODEL_SMALL_PIC:
17f4d4bf 18354 case AARCH64_CMODEL_SMALL:
fb0746f3
WD
18355 if ((flag_pic || SYMBOL_REF_WEAK (x))
18356 && !aarch64_symbol_binds_local_p (x))
18357 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
18358 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
18359
f8b756b7 18360 /* Same reasoning as the tiny code model, but the offset cap here is
7d3b27ff 18361 1MB, allowing +/-3.9GB for the offset to the symbol. */
7d3b27ff
WD
18362 if (!(IN_RANGE (offset, -0x100000, 0x100000)
18363 || offset_within_block_p (x, offset)))
18364 return SYMBOL_FORCE_TO_MEM;
18365
17f4d4bf 18366 return SYMBOL_SMALL_ABSOLUTE;
43e9d192 18367
9ee6540a
WD
18368 case AARCH64_CMODEL_LARGE:
18369 /* This is alright even in PIC code as the constant
18370 pool reference is always PC relative and within
18371 the same translation unit. */
d47d34bb 18372 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
9ee6540a
WD
18373 return SYMBOL_SMALL_ABSOLUTE;
18374 else
18375 return SYMBOL_FORCE_TO_MEM;
18376
17f4d4bf
CSS
18377 default:
18378 gcc_unreachable ();
18379 }
43e9d192 18380 }
17f4d4bf 18381
43e9d192
IB
18382 /* By default push everything into the constant pool. */
18383 return SYMBOL_FORCE_TO_MEM;
18384}
18385
43e9d192
IB
18386bool
18387aarch64_constant_address_p (rtx x)
18388{
18389 return (CONSTANT_P (x) && memory_address_p (DImode, x));
18390}
18391
18392bool
18393aarch64_legitimate_pic_operand_p (rtx x)
18394{
74b27d8e
RS
18395 poly_int64 offset;
18396 x = strip_offset_and_salt (x, &offset);
3793ecc1 18397 if (SYMBOL_REF_P (x))
74b27d8e 18398 return false;
43e9d192
IB
18399
18400 return true;
18401}
18402
26895c21
WD
18403/* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
18404 that should be rematerialized rather than spilled. */
3520f7cc 18405
43e9d192 18406static bool
ef4bddc2 18407aarch64_legitimate_constant_p (machine_mode mode, rtx x)
43e9d192 18408{
26895c21 18409 /* Support CSE and rematerialization of common constants. */
c0bb5bc5 18410 if (CONST_INT_P (x)
1b5f74e8 18411 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT))
26895c21
WD
18412 return true;
18413
1b5f74e8
RS
18414 /* Only accept variable-length vector constants if they can be
18415 handled directly.
18416
18417 ??? It would be possible (but complex) to handle rematerialization
18418 of other constants via secondary reloads. */
18419 if (!GET_MODE_SIZE (mode).is_constant ())
18420 return aarch64_simd_valid_immediate (x, NULL);
18421
18422 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
18423 least be forced to memory and loaded from there. */
568b9c0e 18424 if (CONST_VECTOR_P (x))
1b5f74e8
RS
18425 return !targetm.cannot_force_const_mem (mode, x);
18426
43cacb12
RS
18427 /* Do not allow vector struct mode constants for Advanced SIMD.
18428 We could support 0 and -1 easily, but they need support in
18429 aarch64-simd.md. */
18430 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18431 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
43e9d192
IB
18432 return false;
18433
509bb9b6
RS
18434 if (GET_CODE (x) == HIGH)
18435 x = XEXP (x, 0);
18436
43cacb12
RS
18437 /* Accept polynomial constants that can be calculated by using the
18438 destination of a move as the sole temporary. Constants that
18439 require a second temporary cannot be rematerialized (they can't be
18440 forced to memory and also aren't legitimate constants). */
18441 poly_int64 offset;
18442 if (poly_int_rtx_p (x, &offset))
18443 return aarch64_offset_temporaries (false, offset) <= 1;
18444
18445 /* If an offset is being added to something else, we need to allow the
18446 base to be moved into the destination register, meaning that there
18447 are no free temporaries for the offset. */
74b27d8e 18448 x = strip_offset_and_salt (x, &offset);
43cacb12
RS
18449 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
18450 return false;
26895c21 18451
43cacb12
RS
18452 /* Do not allow const (plus (anchor_symbol, const_int)). */
18453 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
18454 return false;
26895c21 18455
f28e54bd
WD
18456 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
18457 so spilling them is better than rematerialization. */
18458 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
18459 return true;
18460
26895c21 18461 /* Label references are always constant. */
3793ecc1 18462 if (LABEL_REF_P (x))
26895c21
WD
18463 return true;
18464
18465 return false;
43e9d192
IB
18466}
18467
a5bc806c 18468rtx
43e9d192
IB
18469aarch64_load_tp (rtx target)
18470{
18471 if (!target
18472 || GET_MODE (target) != Pmode
18473 || !register_operand (target, Pmode))
18474 target = gen_reg_rtx (Pmode);
18475
18476 /* Can return in any reg. */
18477 emit_insn (gen_aarch64_load_tp_hard (target));
18478 return target;
18479}
18480
43e9d192
IB
18481/* On AAPCS systems, this is the "struct __va_list". */
18482static GTY(()) tree va_list_type;
18483
18484/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
18485 Return the type to use as __builtin_va_list.
18486
18487 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
18488
18489 struct __va_list
18490 {
18491 void *__stack;
18492 void *__gr_top;
18493 void *__vr_top;
18494 int __gr_offs;
18495 int __vr_offs;
18496 }; */
18497
18498static tree
18499aarch64_build_builtin_va_list (void)
18500{
18501 tree va_list_name;
18502 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
18503
18504 /* Create the type. */
18505 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
18506 /* Give it the required name. */
18507 va_list_name = build_decl (BUILTINS_LOCATION,
18508 TYPE_DECL,
18509 get_identifier ("__va_list"),
18510 va_list_type);
18511 DECL_ARTIFICIAL (va_list_name) = 1;
18512 TYPE_NAME (va_list_type) = va_list_name;
665c56c6 18513 TYPE_STUB_DECL (va_list_type) = va_list_name;
43e9d192
IB
18514
18515 /* Create the fields. */
18516 f_stack = build_decl (BUILTINS_LOCATION,
18517 FIELD_DECL, get_identifier ("__stack"),
18518 ptr_type_node);
18519 f_grtop = build_decl (BUILTINS_LOCATION,
18520 FIELD_DECL, get_identifier ("__gr_top"),
18521 ptr_type_node);
18522 f_vrtop = build_decl (BUILTINS_LOCATION,
18523 FIELD_DECL, get_identifier ("__vr_top"),
18524 ptr_type_node);
18525 f_groff = build_decl (BUILTINS_LOCATION,
18526 FIELD_DECL, get_identifier ("__gr_offs"),
18527 integer_type_node);
18528 f_vroff = build_decl (BUILTINS_LOCATION,
18529 FIELD_DECL, get_identifier ("__vr_offs"),
18530 integer_type_node);
18531
88e3bdd1 18532 /* Tell tree-stdarg pass about our internal offset fields.
3fd6b9cc
JW
18533 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
18534 purpose to identify whether the code is updating va_list internal
18535 offset fields through irregular way. */
18536 va_list_gpr_counter_field = f_groff;
18537 va_list_fpr_counter_field = f_vroff;
18538
43e9d192
IB
18539 DECL_ARTIFICIAL (f_stack) = 1;
18540 DECL_ARTIFICIAL (f_grtop) = 1;
18541 DECL_ARTIFICIAL (f_vrtop) = 1;
18542 DECL_ARTIFICIAL (f_groff) = 1;
18543 DECL_ARTIFICIAL (f_vroff) = 1;
18544
18545 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
18546 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
18547 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
18548 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
18549 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
18550
18551 TYPE_FIELDS (va_list_type) = f_stack;
18552 DECL_CHAIN (f_stack) = f_grtop;
18553 DECL_CHAIN (f_grtop) = f_vrtop;
18554 DECL_CHAIN (f_vrtop) = f_groff;
18555 DECL_CHAIN (f_groff) = f_vroff;
18556
18557 /* Compute its layout. */
18558 layout_type (va_list_type);
18559
18560 return va_list_type;
18561}
18562
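/* Illustrative example (editor's addition): ordinary varargs code whose
   va_list is the AAPCS64 structure built above and whose va_start/va_arg
   are expanded by the routines that follow.  */
#include <stdarg.h>

int sum_ints (int n, ...)
{
  va_list ap;                /* struct __va_list under AAPCS64.  */
  int s = 0;
  va_start (ap, n);
  for (int i = 0; i < n; i++)
    s += va_arg (ap, int);   /* Drains __gr_offs, then falls back to __stack.  */
  va_end (ap);
  return s;
}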
18563/* Implement TARGET_EXPAND_BUILTIN_VA_START. */
18564static void
18565aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
18566{
18567 const CUMULATIVE_ARGS *cum;
18568 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
18569 tree stack, grtop, vrtop, groff, vroff;
18570 tree t;
88e3bdd1
JW
18571 int gr_save_area_size = cfun->va_list_gpr_size;
18572 int vr_save_area_size = cfun->va_list_fpr_size;
43e9d192
IB
18573 int vr_offset;
18574
18575 cum = &crtl->args.info;
88e3bdd1
JW
18576 if (cfun->va_list_gpr_size)
18577 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
18578 cfun->va_list_gpr_size);
18579 if (cfun->va_list_fpr_size)
18580 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
18581 * UNITS_PER_VREG, cfun->va_list_fpr_size);
43e9d192 18582
d5726973 18583 if (!TARGET_FLOAT)
43e9d192 18584 {
261fb553 18585 gcc_assert (cum->aapcs_nvrn == 0);
43e9d192
IB
18586 vr_save_area_size = 0;
18587 }
18588
18589 f_stack = TYPE_FIELDS (va_list_type_node);
18590 f_grtop = DECL_CHAIN (f_stack);
18591 f_vrtop = DECL_CHAIN (f_grtop);
18592 f_groff = DECL_CHAIN (f_vrtop);
18593 f_vroff = DECL_CHAIN (f_groff);
18594
18595 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
18596 NULL_TREE);
18597 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
18598 NULL_TREE);
18599 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
18600 NULL_TREE);
18601 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
18602 NULL_TREE);
18603 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
18604 NULL_TREE);
18605
18606 /* Emit code to initialize STACK, which points to the next varargs stack
18607 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
18608 by named arguments. STACK is 8-byte aligned. */
18609 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
18610 if (cum->aapcs_stack_size > 0)
18611 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
18612 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
18613 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
18614
18615 /* Emit code to initialize GRTOP, the top of the GR save area.
18616 virtual_incoming_args_rtx should have been 16 byte aligned. */
18617 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
18618 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
18619 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
18620
18621 /* Emit code to initialize VRTOP, the top of the VR save area.
18622 This address is gr_save_area_size bytes below GRTOP, rounded
18623 down to the next 16-byte boundary. */
18624 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
4f59f9f2
UB
18625 vr_offset = ROUND_UP (gr_save_area_size,
18626 STACK_BOUNDARY / BITS_PER_UNIT);
43e9d192
IB
18627
18628 if (vr_offset)
18629 t = fold_build_pointer_plus_hwi (t, -vr_offset);
18630 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
18631 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
18632
18633 /* Emit code to initialize GROFF, the offset from GRTOP of the
18634 next GPR argument. */
18635 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
18636 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
18637 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
18638
18639 /* Likewise emit code to initialize VROFF, the offset from VRTOP
18640 of the next VR argument. */
18641 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
18642 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
18643 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
18644}
18645
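/* Editor's note (not part of the GCC source): in C-like pseudo-code, the
   statements emitted above for va_start (ap, last) amount to roughly

     ap.__stack   = incoming_args + named_stack_words * 8;
     ap.__gr_top  = incoming_args;
     ap.__vr_top  = incoming_args - ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;

   where "incoming_args" stands for virtual_incoming_args_rtx and
   "named_stack_words" for cum->aapcs_stack_size.  */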
18646/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
18647
18648static tree
18649aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
18650 gimple_seq *post_p ATTRIBUTE_UNUSED)
18651{
18652 tree addr;
18653 bool indirect_p;
18654 bool is_ha; /* is HFA or HVA. */
18655 bool dw_align; /* double-word align. */
ef4bddc2 18656 machine_mode ag_mode = VOIDmode;
43e9d192 18657 int nregs;
ef4bddc2 18658 machine_mode mode;
43e9d192
IB
18659
18660 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
18661 tree stack, f_top, f_off, off, arg, roundup, on_stack;
18662 HOST_WIDE_INT size, rsize, adjust, align;
18663 tree t, u, cond1, cond2;
18664
fde65a89 18665 indirect_p = pass_va_arg_by_reference (type);
43e9d192
IB
18666 if (indirect_p)
18667 type = build_pointer_type (type);
18668
18669 mode = TYPE_MODE (type);
18670
18671 f_stack = TYPE_FIELDS (va_list_type_node);
18672 f_grtop = DECL_CHAIN (f_stack);
18673 f_vrtop = DECL_CHAIN (f_grtop);
18674 f_groff = DECL_CHAIN (f_vrtop);
18675 f_vroff = DECL_CHAIN (f_groff);
18676
18677 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
18678 f_stack, NULL_TREE);
18679 size = int_size_in_bytes (type);
c590597c 18680
49813aad 18681 unsigned int abi_break;
c590597c
RE
18682 align
18683 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
43e9d192
IB
18684
18685 dw_align = false;
18686 adjust = 0;
56fe3ca3
RS
18687 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
18688 &is_ha, false))
43e9d192 18689 {
6a70badb
RS
18690 /* No frontends can create types with variable-sized modes, so we
18691 shouldn't be asked to pass or return them. */
18692 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
18693
43e9d192 18694 /* TYPE passed in fp/simd registers. */
d5726973 18695 if (!TARGET_FLOAT)
fc29dfc9 18696 aarch64_err_no_fpadvsimd (mode);
43e9d192
IB
18697
18698 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
18699 unshare_expr (valist), f_vrtop, NULL_TREE);
18700 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
18701 unshare_expr (valist), f_vroff, NULL_TREE);
18702
18703 rsize = nregs * UNITS_PER_VREG;
18704
18705 if (is_ha)
18706 {
6a70badb
RS
18707 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
18708 adjust = UNITS_PER_VREG - ag_size;
43e9d192 18709 }
76b0cbf8 18710 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
43e9d192
IB
18711 && size < UNITS_PER_VREG)
18712 {
18713 adjust = UNITS_PER_VREG - size;
18714 }
18715 }
18716 else
18717 {
18718 /* TYPE passed in general registers. */
18719 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
18720 unshare_expr (valist), f_grtop, NULL_TREE);
18721 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
18722 unshare_expr (valist), f_groff, NULL_TREE);
4f59f9f2 18723 rsize = ROUND_UP (size, UNITS_PER_WORD);
43e9d192
IB
18724 nregs = rsize / UNITS_PER_WORD;
18725
18726 if (align > 8)
c590597c
RE
18727 {
18728 if (abi_break && warn_psabi)
18729 inform (input_location, "parameter passing for argument of type "
18730 "%qT changed in GCC 9.1", type);
18731 dw_align = true;
18732 }
43e9d192 18733
76b0cbf8 18734 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
43e9d192
IB
18735 && size < UNITS_PER_WORD)
18736 {
18737 adjust = UNITS_PER_WORD - size;
18738 }
18739 }
18740
18741 /* Get a local temporary for the field value. */
18742 off = get_initialized_tmp_var (f_off, pre_p, NULL);
18743
18744 /* Emit code to branch if off >= 0. */
18745 t = build2 (GE_EXPR, boolean_type_node, off,
18746 build_int_cst (TREE_TYPE (off), 0));
18747 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
18748
18749 if (dw_align)
18750 {
18751 /* Emit: offs = (offs + 15) & -16. */
18752 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
18753 build_int_cst (TREE_TYPE (off), 15));
18754 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
18755 build_int_cst (TREE_TYPE (off), -16));
18756 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
18757 }
18758 else
18759 roundup = NULL;
18760
18761 /* Update ap.__[g|v]r_offs */
18762 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
18763 build_int_cst (TREE_TYPE (off), rsize));
18764 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
18765
18766 /* String up. */
18767 if (roundup)
18768 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
18769
18770 /* [cond2] if (ap.__[g|v]r_offs > 0) */
18771 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
18772 build_int_cst (TREE_TYPE (f_off), 0));
18773 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
18774
18775 /* String up: make sure the assignment happens before the use. */
18776 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
18777 COND_EXPR_ELSE (cond1) = t;
18778
18779 /* Prepare the trees handling the argument that is passed on the stack;
18780 the top level node will be stored in ON_STACK. */
18781 arg = get_initialized_tmp_var (stack, pre_p, NULL);
18782 if (align > 8)
18783 {
18784 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
4bdc2738 18785 t = fold_build_pointer_plus_hwi (arg, 15);
43e9d192
IB
18786 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
18787 build_int_cst (TREE_TYPE (t), -16));
43e9d192
IB
18788 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
18789 }
18790 else
18791 roundup = NULL;
18792 /* Advance ap.__stack */
4bdc2738 18793 t = fold_build_pointer_plus_hwi (arg, size + 7);
43e9d192
IB
18794 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
18795 build_int_cst (TREE_TYPE (t), -8));
43e9d192
IB
18796 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
18797 /* String up roundup and advance. */
18798 if (roundup)
18799 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
18800 /* String up with arg */
18801 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
18802 /* Big-endianness related address adjustment. */
76b0cbf8 18803 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
43e9d192
IB
18804 && size < UNITS_PER_WORD)
18805 {
18806 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
18807 size_int (UNITS_PER_WORD - size));
18808 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
18809 }
18810
18811 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
18812 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
18813
18814 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
18815 t = off;
18816 if (adjust)
18817 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
18818 build_int_cst (TREE_TYPE (off), adjust));
18819
18820 t = fold_convert (sizetype, t);
18821 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
18822
18823 if (is_ha)
18824 {
18825 /* type ha; // treat as "struct {ftype field[n];}"
18826 ... [computing offs]
18827 for (i = 0; i <nregs; ++i, offs += 16)
18828 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
18829 return ha; */
18830 int i;
18831 tree tmp_ha, field_t, field_ptr_t;
18832
18833 /* Declare a local variable. */
18834 tmp_ha = create_tmp_var_raw (type, "ha");
18835 gimple_add_tmp_var (tmp_ha);
18836
18837 /* Establish the base type. */
18838 switch (ag_mode)
18839 {
4e10a5a7 18840 case E_SFmode:
43e9d192
IB
18841 field_t = float_type_node;
18842 field_ptr_t = float_ptr_type_node;
18843 break;
4e10a5a7 18844 case E_DFmode:
43e9d192
IB
18845 field_t = double_type_node;
18846 field_ptr_t = double_ptr_type_node;
18847 break;
4e10a5a7 18848 case E_TFmode:
43e9d192
IB
18849 field_t = long_double_type_node;
18850 field_ptr_t = long_double_ptr_type_node;
18851 break;
4e10a5a7 18852 case E_HFmode:
1b62ed4f
JG
18853 field_t = aarch64_fp16_type_node;
18854 field_ptr_t = aarch64_fp16_ptr_type_node;
43e9d192 18855 break;
abbe1ed2
SMW
18856 case E_BFmode:
18857 field_t = aarch64_bf16_type_node;
18858 field_ptr_t = aarch64_bf16_ptr_type_node;
18859 break;
4e10a5a7
RS
18860 case E_V2SImode:
18861 case E_V4SImode:
43e9d192
IB
18862 {
18863 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
18864 field_t = build_vector_type_for_mode (innertype, ag_mode);
18865 field_ptr_t = build_pointer_type (field_t);
18866 }
18867 break;
18868 default:
18869 gcc_assert (0);
18870 }
18871
18872 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
ab563903 18873 TREE_ADDRESSABLE (tmp_ha) = 1;
43e9d192
IB
18874 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
18875 addr = t;
18876 t = fold_convert (field_ptr_t, addr);
18877 t = build2 (MODIFY_EXPR, field_t,
18878 build1 (INDIRECT_REF, field_t, tmp_ha),
18879 build1 (INDIRECT_REF, field_t, t));
18880
18881 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
18882 for (i = 1; i < nregs; ++i)
18883 {
18884 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
18885 u = fold_convert (field_ptr_t, addr);
18886 u = build2 (MODIFY_EXPR, field_t,
18887 build2 (MEM_REF, field_t, tmp_ha,
18888 build_int_cst (field_ptr_t,
18889 (i *
18890 int_size_in_bytes (field_t)))),
18891 build1 (INDIRECT_REF, field_t, u));
18892 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
18893 }
18894
18895 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
18896 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
18897 }
18898
18899 COND_EXPR_ELSE (cond2) = t;
18900 addr = fold_convert (build_pointer_type (type), cond1);
18901 addr = build_va_arg_indirect_ref (addr);
18902
18903 if (indirect_p)
18904 addr = build_va_arg_indirect_ref (addr);
18905
18906 return addr;
18907}
18908
18909/* Implement TARGET_SETUP_INCOMING_VARARGS. */
18910
18911static void
e7056ca4
RS
18912aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
18913 const function_arg_info &arg,
18914 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
43e9d192
IB
18915{
18916 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
18917 CUMULATIVE_ARGS local_cum;
88e3bdd1
JW
18918 int gr_saved = cfun->va_list_gpr_size;
18919 int vr_saved = cfun->va_list_fpr_size;
43e9d192
IB
18920
18921 /* The caller has advanced CUM up to, but not beyond, the last named
18922 argument. Advance a local copy of CUM past the last "real" named
18923 argument, to find out how many registers are left over. */
18924 local_cum = *cum;
6930c98c 18925 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), arg);
43e9d192 18926
88e3bdd1
JW
18927 /* Find out how many registers we need to save.
18928 Honor tree-stdarg analysis results. */
18929 if (cfun->va_list_gpr_size)
18930 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
18931 cfun->va_list_gpr_size / UNITS_PER_WORD);
18932 if (cfun->va_list_fpr_size)
18933 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
18934 cfun->va_list_fpr_size / UNITS_PER_VREG);
43e9d192 18935
d5726973 18936 if (!TARGET_FLOAT)
43e9d192 18937 {
261fb553 18938 gcc_assert (local_cum.aapcs_nvrn == 0);
43e9d192
IB
18939 vr_saved = 0;
18940 }
18941
18942 if (!no_rtl)
18943 {
18944 if (gr_saved > 0)
18945 {
18946 rtx ptr, mem;
18947
18948 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
18949 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
18950 - gr_saved * UNITS_PER_WORD);
18951 mem = gen_frame_mem (BLKmode, ptr);
18952 set_mem_alias_set (mem, get_varargs_alias_set ());
18953
18954 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
18955 mem, gr_saved);
18956 }
18957 if (vr_saved > 0)
18958 {
18959 /* We can't use move_block_from_reg, because it will use
18960 the wrong mode, storing D regs only. */
ef4bddc2 18961 machine_mode mode = TImode;
88e3bdd1 18962 int off, i, vr_start;
43e9d192
IB
18963
18964 /* Set OFF to the offset from virtual_incoming_args_rtx of
18965 the first vector register. The VR save area lies below
18966 the GR one, and is aligned to 16 bytes. */
4f59f9f2
UB
18967 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
18968 STACK_BOUNDARY / BITS_PER_UNIT);
43e9d192
IB
18969 off -= vr_saved * UNITS_PER_VREG;
18970
88e3bdd1
JW
18971 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
18972 for (i = 0; i < vr_saved; ++i)
43e9d192
IB
18973 {
18974 rtx ptr, mem;
18975
18976 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
18977 mem = gen_frame_mem (mode, ptr);
18978 set_mem_alias_set (mem, get_varargs_alias_set ());
88e3bdd1 18979 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
43e9d192
IB
18980 off += UNITS_PER_VREG;
18981 }
18982 }
18983 }
18984
18985 /* We don't save the size into *PRETEND_SIZE because we want to avoid
18986 any complication of having crtl->args.pretend_args_size changed. */
8799637a 18987 cfun->machine->frame.saved_varargs_size
4f59f9f2
UB
18988 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
18989 STACK_BOUNDARY / BITS_PER_UNIT)
43e9d192
IB
18990 + vr_saved * UNITS_PER_VREG);
18991}
18992
18993static void
18994aarch64_conditional_register_usage (void)
18995{
18996 int i;
18997 if (!TARGET_FLOAT)
18998 {
18999 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
19000 {
19001 fixed_regs[i] = 1;
19002 call_used_regs[i] = 1;
19003 }
19004 }
43cacb12
RS
19005 if (!TARGET_SVE)
19006 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
19007 {
19008 fixed_regs[i] = 1;
19009 call_used_regs[i] = 1;
19010 }
3751345d 19011
183bfdaf
RS
19012 /* Only allow the FFR and FFRT to be accessed via special patterns. */
19013 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
19014 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
19015
3751345d
RE
19016 /* When tracking speculation, we need a couple of call-clobbered registers
19017 to track the speculation state. It would be nice to just use
19018 IP0 and IP1, but currently there are numerous places that just
19019 assume these registers are free for other uses (eg pointer
19020 authentication). */
19021 if (aarch64_track_speculation)
19022 {
19023 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
19024 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
19025 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
19026 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
19027 }
43e9d192
IB
19028}
19029
38e62001
RS
19030/* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
19031
19032bool
19033aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
19034{
19035 /* For records we're passed a FIELD_DECL, for arrays we're passed
19036 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
19037 const_tree type = TREE_TYPE (field_or_array);
19038
19039 /* Assign BLKmode to anything that contains multiple SVE predicates.
19040 For structures, the "multiple" case is indicated by MODE being
19041 VOIDmode. */
19042 unsigned int num_zr, num_pr;
19043 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
19044 {
19045 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
19046 return !simple_cst_equal (TYPE_SIZE (field_or_array),
19047 TYPE_SIZE (type));
19048 return mode == VOIDmode;
19049 }
19050
19051 return default_member_type_forces_blk (field_or_array, mode);
19052}
19053
56fe3ca3
RS
19054/* Bitmasks that indicate whether earlier versions of GCC would have
19055 taken a different path through the ABI logic. This should result in
19056 a -Wpsabi warning if the earlier path led to a different ABI decision.
19057
19058 WARN_PSABI_EMPTY_CXX17_BASE
19059 Indicates that the type includes an artificial empty C++17 base field
19060 that, prior to GCC 10.1, would prevent the type from being treated as
19061 a HFA or HVA. See PR94383 for details.
19062
19063 WARN_PSABI_NO_UNIQUE_ADDRESS
19064 Indicates that the type includes an empty [[no_unique_address]] field
19065 that, prior to GCC 10.1, would prevent the type from being treated as
19066 a HFA or HVA. */
19067const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
19068const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
19069
43e9d192
IB
19070/* Walk down the type tree of TYPE counting consecutive base elements.
19071 If *MODEP is VOIDmode, then set it to the first valid floating point
19072 type. If a non-floating point type is found, or if a floating point
19073 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
e73a32d6
MM
19074 otherwise return the count in the sub-tree.
19075
56fe3ca3
RS
19076 The WARN_PSABI_FLAGS argument allows the caller to check whether this
19077 function has changed its behavior relative to earlier versions of GCC.
19078 Normally the argument should be nonnull and point to a zero-initialized
19079 variable. The function then records whether the ABI decision might
19080 be affected by a known fix to the ABI logic, setting the associated
19081 WARN_PSABI_* bits if so.
19082
19083 When the argument is instead a null pointer, the function tries to
19084 simulate the behavior of GCC before all such ABI fixes were made.
19085 This is useful to check whether the function returns something
19086 different after the ABI fixes. */
43e9d192 19087static int
e73a32d6 19088aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
56fe3ca3 19089 unsigned int *warn_psabi_flags)
43e9d192 19090{
ef4bddc2 19091 machine_mode mode;
43e9d192
IB
19092 HOST_WIDE_INT size;
19093
38e62001
RS
19094 if (aarch64_sve::builtin_type_p (type))
19095 return -1;
c600df9a 19096
43e9d192
IB
19097 switch (TREE_CODE (type))
19098 {
19099 case REAL_TYPE:
19100 mode = TYPE_MODE (type);
1b62ed4f
JG
19101 if (mode != DFmode && mode != SFmode
19102 && mode != TFmode && mode != HFmode)
43e9d192
IB
19103 return -1;
19104
19105 if (*modep == VOIDmode)
19106 *modep = mode;
19107
19108 if (*modep == mode)
19109 return 1;
19110
19111 break;
19112
19113 case COMPLEX_TYPE:
19114 mode = TYPE_MODE (TREE_TYPE (type));
1b62ed4f
JG
19115 if (mode != DFmode && mode != SFmode
19116 && mode != TFmode && mode != HFmode)
43e9d192
IB
19117 return -1;
19118
19119 if (*modep == VOIDmode)
19120 *modep = mode;
19121
19122 if (*modep == mode)
19123 return 2;
19124
19125 break;
19126
19127 case VECTOR_TYPE:
19128 /* Use V2SImode and V4SImode as representatives of all 64-bit
19129 and 128-bit vector types. */
19130 size = int_size_in_bytes (type);
19131 switch (size)
19132 {
19133 case 8:
19134 mode = V2SImode;
19135 break;
19136 case 16:
19137 mode = V4SImode;
19138 break;
19139 default:
19140 return -1;
19141 }
19142
19143 if (*modep == VOIDmode)
19144 *modep = mode;
19145
19146 /* Vector modes are considered to be opaque: two vectors are
19147 equivalent for the purposes of being homogeneous aggregates
19148 if they are the same size. */
19149 if (*modep == mode)
19150 return 1;
19151
19152 break;
19153
19154 case ARRAY_TYPE:
19155 {
19156 int count;
19157 tree index = TYPE_DOMAIN (type);
19158
807e902e
KZ
19159 /* Can't handle incomplete types nor sizes that are not
19160 fixed. */
19161 if (!COMPLETE_TYPE_P (type)
19162 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
43e9d192
IB
19163 return -1;
19164
e73a32d6 19165 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
56fe3ca3 19166 warn_psabi_flags);
43e9d192
IB
19167 if (count == -1
19168 || !index
19169 || !TYPE_MAX_VALUE (index)
cc269bb6 19170 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
43e9d192 19171 || !TYPE_MIN_VALUE (index)
cc269bb6 19172 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
43e9d192
IB
19173 || count < 0)
19174 return -1;
19175
ae7e9ddd
RS
19176 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
19177 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
43e9d192
IB
19178
19179 /* There must be no padding. */
6a70badb
RS
19180 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
19181 count * GET_MODE_BITSIZE (*modep)))
43e9d192
IB
19182 return -1;
19183
19184 return count;
19185 }
19186
19187 case RECORD_TYPE:
19188 {
19189 int count = 0;
19190 int sub_count;
19191 tree field;
19192
807e902e
KZ
19193 /* Can't handle incomplete types nor sizes that are not
19194 fixed. */
19195 if (!COMPLETE_TYPE_P (type)
19196 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
43e9d192
IB
19197 return -1;
19198
19199 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
19200 {
19201 if (TREE_CODE (field) != FIELD_DECL)
19202 continue;
19203
56fe3ca3 19204 if (DECL_FIELD_ABI_IGNORED (field))
e73a32d6 19205 {
56fe3ca3
RS
19206 /* See whether this is something that earlier versions of
19207 GCC failed to ignore. */
19208 unsigned int flag;
19209 if (lookup_attribute ("no_unique_address",
19210 DECL_ATTRIBUTES (field)))
19211 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
19212 else if (cxx17_empty_base_field_p (field))
19213 flag = WARN_PSABI_EMPTY_CXX17_BASE;
19214 else
19215 /* No compatibility problem. */
19216 continue;
19217
19218 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
19219 if (warn_psabi_flags)
19220 {
19221 *warn_psabi_flags |= flag;
19222 continue;
19223 }
e73a32d6
MM
19224 }
19225
19226 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
56fe3ca3 19227 warn_psabi_flags);
43e9d192
IB
19228 if (sub_count < 0)
19229 return -1;
19230 count += sub_count;
19231 }
19232
19233 /* There must be no padding. */
6a70badb
RS
19234 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
19235 count * GET_MODE_BITSIZE (*modep)))
43e9d192
IB
19236 return -1;
19237
19238 return count;
19239 }
19240
19241 case UNION_TYPE:
19242 case QUAL_UNION_TYPE:
19243 {
19244 /* These aren't very interesting except in a degenerate case. */
19245 int count = 0;
19246 int sub_count;
19247 tree field;
19248
807e902e
KZ
19249 /* Can't handle incomplete types nor sizes that are not
19250 fixed. */
19251 if (!COMPLETE_TYPE_P (type)
19252 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
43e9d192
IB
19253 return -1;
19254
19255 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
19256 {
19257 if (TREE_CODE (field) != FIELD_DECL)
19258 continue;
19259
e73a32d6 19260 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
56fe3ca3 19261 warn_psabi_flags);
43e9d192
IB
19262 if (sub_count < 0)
19263 return -1;
19264 count = count > sub_count ? count : sub_count;
19265 }
19266
19267 /* There must be no padding. */
6a70badb
RS
19268 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
19269 count * GET_MODE_BITSIZE (*modep)))
43e9d192
IB
19270 return -1;
19271
19272 return count;
19273 }
19274
19275 default:
19276 break;
19277 }
19278
19279 return -1;
19280}
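/* Some illustrative inputs and the values the recursion above produces
   (hypothetical examples, not taken from the testsuite):

     struct { float x, y, z; }      -> 3, *MODEP == SFmode   (candidate HFA)
     float[4]                       -> 4, *MODEP == SFmode   (candidate HFA)
     _Complex double                -> 2, *MODEP == DFmode
     struct { float f; double d; }  -> -1  (mixed element modes)
     struct { float f; int i; }     -> -1  (non-floating-point member)

   The caller below additionally limits the count to HA_MAX_NUM_FLDS.  */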
19281
b6ec6215
KT
19282/* Return TRUE if the type, as described by TYPE and MODE, is a short vector
19283 type as described in AAPCS64 \S 4.1.2.
19284
19285 See the comment above aarch64_composite_type_p for the notes on MODE. */
19286
19287static bool
19288aarch64_short_vector_p (const_tree type,
19289 machine_mode mode)
19290{
6a70badb 19291 poly_int64 size = -1;
b6ec6215
KT
19292
19293 if (type && TREE_CODE (type) == VECTOR_TYPE)
38e62001
RS
19294 {
19295 if (aarch64_sve::builtin_type_p (type))
19296 return false;
19297 size = int_size_in_bytes (type);
19298 }
b6ec6215 19299 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
38e62001
RS
19300 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
19301 {
19302 /* Rely only on the type, not the mode, when processing SVE types. */
19303 if (type && aarch64_some_values_include_pst_objects_p (type))
b2672dd6
FY
19304 /* Leave later code to report an error if SVE is disabled. */
19305 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
38e62001
RS
19306 else
19307 size = GET_MODE_SIZE (mode);
19308 }
19309 if (known_eq (size, 8) || known_eq (size, 16))
19310 {
19311 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
19312 they are being treated as scalable AAPCS64 types. */
19313 gcc_assert (!aarch64_sve_mode_p (mode));
19314 return true;
19315 }
19316 return false;
b6ec6215
KT
19317}
19318
43e9d192
IB
19319/* Return TRUE if the type, as described by TYPE and MODE, is a composite
19320 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
19321 array types. The C99 floating-point complex types are also considered
19322 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
19323 types, which are GCC extensions and out of the scope of AAPCS64, are
19324 treated as composite types here as well.
19325
19326 Note that MODE itself is not sufficient in determining whether a type
19327 is such a composite type or not. This is because
19328 stor-layout.c:compute_record_mode may have already changed the MODE
19329 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
19330 structure with only one field may have its MODE set to the mode of the
19331 field. Also an integer mode whose size matches the size of the
19332 RECORD_TYPE type may be used to substitute the original mode
19333 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
19334 solely relied on. */
19335
19336static bool
19337aarch64_composite_type_p (const_tree type,
ef4bddc2 19338 machine_mode mode)
43e9d192 19339{
b6ec6215
KT
19340 if (aarch64_short_vector_p (type, mode))
19341 return false;
19342
43e9d192
IB
19343 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
19344 return true;
19345
19346 if (mode == BLKmode
19347 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
19348 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
19349 return true;
19350
19351 return false;
19352}
19353
43e9d192
IB
19354/* Return TRUE if an argument, whose type is described by TYPE and MODE,
 19355 shall be passed or returned in simd/fp register(s) (provided that these
19356 parameter passing registers are available).
19357
19358 Upon successful return, *COUNT returns the number of needed registers,
b6073c9f 19359 *BASE_MODE returns the mode of the individual register and when IS_HA
43e9d192 19360 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
56fe3ca3
RS
19361 floating-point aggregate or a homogeneous short-vector aggregate.
19362
19363 SILENT_P is true if the function should refrain from reporting any
19364 diagnostics. This should only be used if the caller is certain that
19365 any ABI decisions would eventually come through this function with
19366 SILENT_P set to false. */
43e9d192
IB
19367
19368static bool
ef4bddc2 19369aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
43e9d192 19370 const_tree type,
ef4bddc2 19371 machine_mode *base_mode,
43e9d192 19372 int *count,
56fe3ca3
RS
19373 bool *is_ha,
19374 bool silent_p)
43e9d192 19375{
c600df9a
RS
19376 if (is_ha != NULL) *is_ha = false;
19377
ef4bddc2 19378 machine_mode new_mode = VOIDmode;
43e9d192
IB
19379 bool composite_p = aarch64_composite_type_p (type, mode);
19380
43e9d192
IB
19381 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
19382 || aarch64_short_vector_p (type, mode))
19383 {
19384 *count = 1;
19385 new_mode = mode;
19386 }
19387 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
19388 {
19389 if (is_ha != NULL) *is_ha = true;
19390 *count = 2;
19391 new_mode = GET_MODE_INNER (mode);
19392 }
19393 else if (type && composite_p)
19394 {
56fe3ca3
RS
19395 unsigned int warn_psabi_flags = 0;
19396 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
19397 &warn_psabi_flags);
43e9d192
IB
19398 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
19399 {
e73a32d6
MM
19400 static unsigned last_reported_type_uid;
19401 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
19402 int alt;
56fe3ca3
RS
19403 if (!silent_p
19404 && warn_psabi
19405 && warn_psabi_flags
e73a32d6
MM
19406 && uid != last_reported_type_uid
19407 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
19408 != ag_count))
19409 {
e33a1eae
JJ
19410 const char *url
19411 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
e73a32d6
MM
19412 gcc_assert (alt == -1);
19413 last_reported_type_uid = uid;
56fe3ca3
RS
19414 /* Use TYPE_MAIN_VARIANT to strip any redundant const
19415 qualification. */
19416 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
19417 inform (input_location, "parameter passing for argument of "
19418 "type %qT with %<[[no_unique_address]]%> members "
691eeb65
JJ
19419 "changed %{in GCC 10.1%}",
19420 TYPE_MAIN_VARIANT (type), url);
56fe3ca3
RS
19421 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
19422 inform (input_location, "parameter passing for argument of "
19423 "type %qT when C++17 is enabled changed to match "
691eeb65
JJ
19424 "C++14 %{in GCC 10.1%}",
19425 TYPE_MAIN_VARIANT (type), url);
e73a32d6
MM
19426 }
19427
43e9d192
IB
19428 if (is_ha != NULL) *is_ha = true;
19429 *count = ag_count;
19430 }
19431 else
19432 return false;
19433 }
19434 else
19435 return false;
19436
38e62001 19437 gcc_assert (!aarch64_sve_mode_p (new_mode));
43e9d192
IB
19438 *base_mode = new_mode;
19439 return true;
19440}
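/* For illustration, assuming the FP/SIMD argument registers are available,
   the query above yields:

     double                   -> *COUNT 1, *BASE_MODE DFmode
     _Complex float           -> *COUNT 2, *BASE_MODE SFmode, *IS_HA true
     struct { double d[4]; }  -> *COUNT 4, *BASE_MODE DFmode, *IS_HA true

   The -Wpsabi notes are emitted for cases such as an aggregate with an
   empty C++17 base or an empty [[no_unique_address]] member, whose
   treatment changed in GCC 10.1 as described above.  */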
19441
19442/* Implement TARGET_STRUCT_VALUE_RTX. */
19443
19444static rtx
19445aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
19446 int incoming ATTRIBUTE_UNUSED)
19447{
19448 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
19449}
19450
19451/* Implements target hook vector_mode_supported_p. */
19452static bool
ef4bddc2 19453aarch64_vector_mode_supported_p (machine_mode mode)
43e9d192 19454{
43cacb12 19455 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
cc68f7c2 19456 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
43e9d192
IB
19457}
19458
4aeb1ba7
RS
19459/* Return the full-width SVE vector mode for element mode MODE, if one
19460 exists. */
19461opt_machine_mode
19462aarch64_full_sve_mode (scalar_mode mode)
19463{
19464 switch (mode)
19465 {
19466 case E_DFmode:
19467 return VNx2DFmode;
19468 case E_SFmode:
19469 return VNx4SFmode;
19470 case E_HFmode:
19471 return VNx8HFmode;
02fcd8ac
RS
19472 case E_BFmode:
19473 return VNx8BFmode;
4aeb1ba7 19474 case E_DImode:
02fcd8ac 19475 return VNx2DImode;
4aeb1ba7
RS
19476 case E_SImode:
19477 return VNx4SImode;
19478 case E_HImode:
19479 return VNx8HImode;
19480 case E_QImode:
19481 return VNx16QImode;
19482 default:
19483 return opt_machine_mode ();
19484 }
19485}
19486
19487/* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
19488 if it exists. */
19489opt_machine_mode
19490aarch64_vq_mode (scalar_mode mode)
19491{
19492 switch (mode)
19493 {
19494 case E_DFmode:
19495 return V2DFmode;
19496 case E_SFmode:
19497 return V4SFmode;
19498 case E_HFmode:
19499 return V8HFmode;
abbe1ed2
SMW
19500 case E_BFmode:
19501 return V8BFmode;
4aeb1ba7
RS
19502 case E_SImode:
19503 return V4SImode;
19504 case E_HImode:
19505 return V8HImode;
19506 case E_QImode:
19507 return V16QImode;
19508 case E_DImode:
19509 return V2DImode;
19510 default:
19511 return opt_machine_mode ();
19512 }
19513}
19514
b7342d25
IB
19515/* Return appropriate SIMD container
19516 for MODE within a vector of WIDTH bits. */
ef4bddc2 19517static machine_mode
43cacb12 19518aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
43e9d192 19519{
9b070057
RS
19520 if (TARGET_SVE
19521 && maybe_ne (width, 128)
19522 && known_eq (width, BITS_PER_SVE_VECTOR))
4aeb1ba7 19523 return aarch64_full_sve_mode (mode).else_mode (word_mode);
43cacb12
RS
19524
19525 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
43e9d192 19526 if (TARGET_SIMD)
b7342d25 19527 {
43cacb12 19528 if (known_eq (width, 128))
4aeb1ba7 19529 return aarch64_vq_mode (mode).else_mode (word_mode);
b7342d25
IB
19530 else
19531 switch (mode)
19532 {
4e10a5a7 19533 case E_SFmode:
b7342d25 19534 return V2SFmode;
4e10a5a7 19535 case E_HFmode:
b719f884 19536 return V4HFmode;
abbe1ed2
SMW
19537 case E_BFmode:
19538 return V4BFmode;
4e10a5a7 19539 case E_SImode:
b7342d25 19540 return V2SImode;
4e10a5a7 19541 case E_HImode:
b7342d25 19542 return V4HImode;
4e10a5a7 19543 case E_QImode:
b7342d25
IB
19544 return V8QImode;
19545 default:
19546 break;
19547 }
19548 }
43e9d192
IB
19549 return word_mode;
19550}
19551
5f29f3d5
KT
19552/* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
19553 and return whether the SVE mode should be preferred over the
19554 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
19555static bool
19556aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
19557{
19558 /* Take into account the aarch64-autovec-preference param if non-zero. */
19559 bool only_asimd_p = aarch64_autovec_preference == 1;
19560 bool only_sve_p = aarch64_autovec_preference == 2;
19561
19562 if (only_asimd_p)
19563 return false;
19564 if (only_sve_p)
19565 return true;
19566
19567 /* The preference in case of a tie in costs. */
19568 bool prefer_asimd = aarch64_autovec_preference == 3;
19569 bool prefer_sve = aarch64_autovec_preference == 4;
19570
5f29f3d5
KT
19571 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
19572 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
 19573 /* If the CPU information does not have an SVE width registered, use the
 19574 generic poly_int comparison that prefers SVE. If a preference is
 19575 explicitly requested, avoid this path. */
fa3ca615 19576 if (aarch64_tune_params.sve_width == SVE_SCALABLE
5f29f3d5
KT
19577 && !prefer_asimd
19578 && !prefer_sve)
19579 return maybe_gt (nunits_sve, nunits_asimd);
19580
19581 /* Otherwise estimate the runtime width of the modes involved. */
64432b68
KT
19582 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
19583 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
5f29f3d5
KT
19584
19585 /* Preferring SVE means picking it first unless the Advanced SIMD mode
19586 is clearly wider. */
19587 if (prefer_sve)
19588 return est_sve >= est_asimd;
19589 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
19590 is clearly wider. */
19591 if (prefer_asimd)
19592 return est_sve > est_asimd;
19593
19594 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
19595 return est_sve > est_asimd;
19596}
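/* For example, with the aarch64-autovec-preference param left at its
   default and a tuning that reports sve_width == 128, VNx4SImode and
   V4SImode both have an estimated width of four elements, so the tie goes
   to Advanced SIMD and this function returns false; with sve_width == 256
   the SVE estimate is eight elements and the function returns true.  */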
19597
b7342d25 19598/* Return 128-bit container as the preferred SIMD mode for MODE. */
ef4bddc2 19599static machine_mode
005ba29c 19600aarch64_preferred_simd_mode (scalar_mode mode)
b7342d25 19601{
5f29f3d5
KT
19602 /* Take into account explicit auto-vectorization ISA preferences through
19603 aarch64_cmp_autovec_modes. */
7ff5706f
RS
19604 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
19605 return aarch64_full_sve_mode (mode).else_mode (word_mode);
19606 if (TARGET_SIMD)
19607 return aarch64_vq_mode (mode).else_mode (word_mode);
19608 return word_mode;
b7342d25
IB
19609}
19610
86e36728 19611/* Return a list of possible vector sizes for the vectorizer
3b357264 19612 to iterate over. */
bcc7e346 19613static unsigned int
e021fb86 19614aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
3b357264 19615{
cc68f7c2
RS
19616 static const machine_mode sve_modes[] = {
19617 /* Try using full vectors for all element types. */
19618 VNx16QImode,
19619
19620 /* Try using 16-bit containers for 8-bit elements and full vectors
19621 for wider elements. */
19622 VNx8QImode,
19623
19624 /* Try using 32-bit containers for 8-bit and 16-bit elements and
19625 full vectors for wider elements. */
19626 VNx4QImode,
74166aab 19627
cc68f7c2
RS
19628 /* Try using 64-bit containers for all element types. */
19629 VNx2QImode
19630 };
19631
19632 static const machine_mode advsimd_modes[] = {
19633 /* Try using 128-bit vectors for all element types. */
19634 V16QImode,
19635
19636 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
19637 for wider elements. */
19638 V8QImode,
19639
19640 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
19641 for wider elements.
19642
19643 TODO: We could support a limited form of V4QImode too, so that
19644 we use 32-bit vectors for 8-bit elements. */
19645 V4HImode,
19646
19647 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
19648 for 64-bit elements.
74166aab 19649
cc68f7c2
RS
19650 TODO: We could similarly support limited forms of V2QImode and V2HImode
19651 for this case. */
19652 V2SImode
19653 };
74166aab 19654
cc68f7c2
RS
19655 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
19656 This is because:
74166aab 19657
cc68f7c2
RS
19658 - If we can't use N-byte Advanced SIMD vectors then the placement
19659 doesn't matter; we'll just continue as though the Advanced SIMD
19660 entry didn't exist.
74166aab 19661
cc68f7c2
RS
19662 - If an SVE main loop with N bytes ends up being cheaper than an
19663 Advanced SIMD main loop with N bytes then by default we'll replace
19664 the Advanced SIMD version with the SVE one.
74166aab 19665
cc68f7c2
RS
19666 - If an Advanced SIMD main loop with N bytes ends up being cheaper
19667 than an SVE main loop with N bytes then by default we'll try to
19668 use the SVE loop to vectorize the epilogue instead. */
5f29f3d5
KT
19669
19670 bool only_asimd_p = aarch64_autovec_preference == 1;
19671 bool only_sve_p = aarch64_autovec_preference == 2;
19672
19673 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
cc68f7c2 19674 unsigned int advsimd_i = 0;
5f29f3d5
KT
19675
19676 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
cc68f7c2
RS
19677 {
19678 if (sve_i < ARRAY_SIZE (sve_modes)
5f29f3d5
KT
19679 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
19680 advsimd_modes[advsimd_i]))
cc68f7c2
RS
19681 modes->safe_push (sve_modes[sve_i++]);
19682 else
19683 modes->safe_push (advsimd_modes[advsimd_i++]);
19684 }
19685 while (sve_i < ARRAY_SIZE (sve_modes))
5f29f3d5 19686 modes->safe_push (sve_modes[sve_i++]);
bcc7e346 19687
eb23241b
RS
19688 unsigned int flags = 0;
19689 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
19690 can compare SVE against Advanced SIMD and so that we can compare
19691 multiple SVE vectorization approaches against each other. There's
19692 not really any point doing this for Advanced SIMD only, since the
19693 first mode that works should always be the best. */
19694 if (TARGET_SVE && aarch64_sve_compare_costs)
19695 flags |= VECT_COMPARE_COSTS;
19696 return flags;
3b357264
JG
19697}
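/* As a rough illustration of the interleaving above: with a tuning that
   reports sve_width == 128 and no explicit aarch64-autovec-preference, the
   modes are pushed roughly in the order

     V16QI, VNx16QI, V8QI, VNx8QI, V4HI, VNx4QI, V2SI, VNx2QI

   so each Advanced SIMD mode is tried just before the SVE mode of the same
   estimated size.  With SVE_SCALABLE width the SVE modes all compare as
   possibly wider and end up ahead of the Advanced SIMD ones.  */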
19698
ac2b960f
YZ
19699/* Implement TARGET_MANGLE_TYPE. */
19700
6f549691 19701static const char *
ac2b960f
YZ
19702aarch64_mangle_type (const_tree type)
19703{
19704 /* The AArch64 ABI documents say that "__va_list" has to be
17f8ace2 19705 mangled as if it is in the "std" namespace. */
ac2b960f
YZ
19706 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
19707 return "St9__va_list";
19708
abbe1ed2 19709 /* Half-precision floating point types. */
c2ec330c 19710 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
abbe1ed2
SMW
19711 {
19712 if (TYPE_MODE (type) == BFmode)
19713 return "u6__bf16";
19714 else
19715 return "Dh";
19716 }
c2ec330c 19717
f9d53c27
TB
19718 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
19719 builtin types. */
19720 if (TYPE_NAME (type) != NULL)
624d0f07
RS
19721 {
19722 const char *res;
19723 if ((res = aarch64_general_mangle_builtin_type (type))
19724 || (res = aarch64_sve::mangle_builtin_type (type)))
19725 return res;
19726 }
c6fc9e43 19727
ac2b960f
YZ
19728 /* Use the default mangling. */
19729 return NULL;
19730}
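/* Example manglings produced by the rules above (C++, hypothetical
   declarations):

     void f (__fp16);             mangles to  _Z1fDh
     void g (bfloat16_t);         mangles to  _Z1gu6__bf16
     void h (__builtin_va_list);  mangles to  _Z1hSt9__va_list

   Builtin SIMD and SVE types are handled by aarch64_general_mangle_builtin_type
   and aarch64_sve::mangle_builtin_type respectively.  */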
19731
65ef05d0
RS
19732/* Implement TARGET_VERIFY_TYPE_CONTEXT. */
19733
19734static bool
19735aarch64_verify_type_context (location_t loc, type_context_kind context,
19736 const_tree type, bool silent_p)
19737{
19738 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
19739}
19740
75cf1494
KT
19741/* Find the first rtx_insn before insn that will generate an assembly
19742 instruction. */
19743
19744static rtx_insn *
19745aarch64_prev_real_insn (rtx_insn *insn)
19746{
19747 if (!insn)
19748 return NULL;
19749
19750 do
19751 {
19752 insn = prev_real_insn (insn);
19753 }
19754 while (insn && recog_memoized (insn) < 0);
19755
19756 return insn;
19757}
19758
19759static bool
19760is_madd_op (enum attr_type t1)
19761{
19762 unsigned int i;
19763 /* A number of these may be AArch32 only. */
19764 enum attr_type mlatypes[] = {
19765 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
19766 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
 19767 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
19768 };
19769
19770 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
19771 {
19772 if (t1 == mlatypes[i])
19773 return true;
19774 }
19775
19776 return false;
19777}
19778
19779/* Check if there is a register dependency between a load and the insn
19780 for which we hold recog_data. */
19781
19782static bool
19783dep_between_memop_and_curr (rtx memop)
19784{
19785 rtx load_reg;
19786 int opno;
19787
8baff86e 19788 gcc_assert (GET_CODE (memop) == SET);
75cf1494
KT
19789
19790 if (!REG_P (SET_DEST (memop)))
19791 return false;
19792
19793 load_reg = SET_DEST (memop);
8baff86e 19794 for (opno = 1; opno < recog_data.n_operands; opno++)
75cf1494
KT
19795 {
19796 rtx operand = recog_data.operand[opno];
19797 if (REG_P (operand)
19798 && reg_overlap_mentioned_p (load_reg, operand))
19799 return true;
19800
19801 }
19802 return false;
19803}
19804
8baff86e
KT
19805
19806/* When working around the Cortex-A53 erratum 835769,
19807 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
19808 instruction and has a preceding memory instruction such that a NOP
19809 should be inserted between them. */
19810
75cf1494
KT
19811bool
19812aarch64_madd_needs_nop (rtx_insn* insn)
19813{
19814 enum attr_type attr_type;
19815 rtx_insn *prev;
19816 rtx body;
19817
b32c1043 19818 if (!TARGET_FIX_ERR_A53_835769)
75cf1494
KT
19819 return false;
19820
e322d6e3 19821 if (!INSN_P (insn) || recog_memoized (insn) < 0)
75cf1494
KT
19822 return false;
19823
19824 attr_type = get_attr_type (insn);
19825 if (!is_madd_op (attr_type))
19826 return false;
19827
19828 prev = aarch64_prev_real_insn (insn);
3fea1a75
KT
19829 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
19830 Restore recog state to INSN to avoid state corruption. */
19831 extract_constrain_insn_cached (insn);
19832
550e2205 19833 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
75cf1494
KT
19834 return false;
19835
19836 body = single_set (prev);
19837
19838 /* If the previous insn is a memory op and there is no dependency between
8baff86e
KT
19839 it and the DImode madd, emit a NOP between them. If body is NULL then we
19840 have a complex memory operation, probably a load/store pair.
19841 Be conservative for now and emit a NOP. */
19842 if (GET_MODE (recog_data.operand[0]) == DImode
19843 && (!body || !dep_between_memop_and_curr (body)))
75cf1494
KT
19844 return true;
19845
19846 return false;
19847
19848}
19849
8baff86e
KT
19850
19851/* Implement FINAL_PRESCAN_INSN. */
19852
75cf1494
KT
19853void
19854aarch64_final_prescan_insn (rtx_insn *insn)
19855{
19856 if (aarch64_madd_needs_nop (insn))
19857 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
19858}
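/* With -mfix-cortex-a53-835769 enabled, the overall effect is to turn a
   sequence such as

       ldr   x1, [x2]
       madd  x0, x3, x4, x5

   into

       ldr   x1, [x2]
       nop   // between mem op and mult-accumulate
       madd  x0, x3, x4, x5

   whenever the multiply-accumulate operates on DImode values (registers
   chosen here only for illustration).  */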
19859
19860
43cacb12
RS
19861/* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
19862 instruction. */
19863
19864bool
19865aarch64_sve_index_immediate_p (rtx base_or_step)
19866{
19867 return (CONST_INT_P (base_or_step)
19868 && IN_RANGE (INTVAL (base_or_step), -16, 15));
19869}
19870
f3582fda
RS
19871/* Return true if X is a valid immediate for the SVE ADD and SUB instructions
19872 when applied to mode MODE. Negate X first if NEGATE_P is true. */
43cacb12
RS
19873
19874bool
f3582fda 19875aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
43cacb12 19876{
f3582fda
RS
19877 rtx elt = unwrap_const_vec_duplicate (x);
19878 if (!CONST_INT_P (elt))
43cacb12
RS
19879 return false;
19880
19881 HOST_WIDE_INT val = INTVAL (elt);
19882 if (negate_p)
19883 val = -val;
f3582fda 19884 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
43cacb12
RS
19885
19886 if (val & 0xff)
19887 return IN_RANGE (val, 0, 0xff);
19888 return IN_RANGE (val, 0, 0xff00);
19889}
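/* Worked examples for the range test above (NEGATE_P false):

     255    -> low byte nonzero, 255 in [0, 0xff]       -> valid
     256    -> low byte zero,    256 in [0, 0xff00]     -> valid (1 << 8)
     257    -> low byte nonzero, 257 not in [0, 0xff]   -> not valid
     0xff00 -> low byte zero,    in [0, 0xff00]         -> valid  */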
19890
624d0f07 19891/* Return true if X is a valid immediate for the SVE SQADD and SQSUB
f3582fda
RS
19892 instructions when applied to mode MODE. Negate X first if NEGATE_P
19893 is true. */
624d0f07
RS
19894
19895bool
f3582fda 19896aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
624d0f07 19897{
f3582fda 19898 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
624d0f07
RS
19899 return false;
19900
19901 /* After the optional negation, the immediate must be nonnegative.
19902 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
19903 instead of SQADD Zn.B, Zn.B, #129. */
f3582fda 19904 rtx elt = unwrap_const_vec_duplicate (x);
624d0f07
RS
19905 return negate_p == (INTVAL (elt) < 0);
19906}
19907
43cacb12
RS
19908/* Return true if X is a valid immediate operand for an SVE logical
19909 instruction such as AND. */
19910
19911bool
19912aarch64_sve_bitmask_immediate_p (rtx x)
19913{
19914 rtx elt;
19915
19916 return (const_vec_duplicate_p (x, &elt)
19917 && CONST_INT_P (elt)
19918 && aarch64_bitmask_imm (INTVAL (elt),
19919 GET_MODE_INNER (GET_MODE (x))));
19920}
19921
19922/* Return true if X is a valid immediate for the SVE DUP and CPY
19923 instructions. */
19924
19925bool
19926aarch64_sve_dup_immediate_p (rtx x)
19927{
d29f7dd5
RS
19928 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
19929 if (!CONST_INT_P (x))
43cacb12
RS
19930 return false;
19931
d29f7dd5 19932 HOST_WIDE_INT val = INTVAL (x);
43cacb12
RS
19933 if (val & 0xff)
19934 return IN_RANGE (val, -0x80, 0x7f);
19935 return IN_RANGE (val, -0x8000, 0x7f00);
19936}
19937
19938/* Return true if X is a valid immediate operand for an SVE CMP instruction.
19939 SIGNED_P says whether the operand is signed rather than unsigned. */
19940
19941bool
19942aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
19943{
6bc67182
RS
19944 x = unwrap_const_vec_duplicate (x);
19945 return (CONST_INT_P (x)
43cacb12 19946 && (signed_p
6bc67182
RS
19947 ? IN_RANGE (INTVAL (x), -16, 15)
19948 : IN_RANGE (INTVAL (x), 0, 127)));
43cacb12
RS
19949}
19950
19951/* Return true if X is a valid immediate operand for an SVE FADD or FSUB
19952 instruction. Negate X first if NEGATE_P is true. */
19953
19954bool
19955aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
19956{
19957 rtx elt;
19958 REAL_VALUE_TYPE r;
19959
19960 if (!const_vec_duplicate_p (x, &elt)
3793ecc1 19961 || !CONST_DOUBLE_P (elt))
43cacb12
RS
19962 return false;
19963
19964 r = *CONST_DOUBLE_REAL_VALUE (elt);
19965
19966 if (negate_p)
19967 r = real_value_negate (&r);
19968
19969 if (real_equal (&r, &dconst1))
19970 return true;
19971 if (real_equal (&r, &dconsthalf))
19972 return true;
19973 return false;
19974}
19975
19976/* Return true if X is a valid immediate operand for an SVE FMUL
19977 instruction. */
19978
19979bool
19980aarch64_sve_float_mul_immediate_p (rtx x)
19981{
19982 rtx elt;
19983
43cacb12 19984 return (const_vec_duplicate_p (x, &elt)
3793ecc1 19985 && CONST_DOUBLE_P (elt)
a19ba9e1
RS
19986 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
19987 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
43cacb12
RS
19988}
19989
b187677b
RS
19990/* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
19991 for the Advanced SIMD operation described by WHICH and INSN. If INFO
19992 is nonnull, use it to describe valid immediates. */
3520f7cc 19993static bool
b187677b
RS
19994aarch64_advsimd_valid_immediate_hs (unsigned int val32,
19995 simd_immediate_info *info,
19996 enum simd_immediate_check which,
19997 simd_immediate_info::insn_type insn)
19998{
19999 /* Try a 4-byte immediate with LSL. */
20000 for (unsigned int shift = 0; shift < 32; shift += 8)
20001 if ((val32 & (0xff << shift)) == val32)
20002 {
20003 if (info)
20004 *info = simd_immediate_info (SImode, val32 >> shift, insn,
20005 simd_immediate_info::LSL, shift);
20006 return true;
20007 }
3520f7cc 20008
b187677b
RS
20009 /* Try a 2-byte immediate with LSL. */
20010 unsigned int imm16 = val32 & 0xffff;
20011 if (imm16 == (val32 >> 16))
20012 for (unsigned int shift = 0; shift < 16; shift += 8)
20013 if ((imm16 & (0xff << shift)) == imm16)
48063b9d 20014 {
b187677b
RS
20015 if (info)
20016 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
20017 simd_immediate_info::LSL, shift);
20018 return true;
48063b9d 20019 }
3520f7cc 20020
b187677b
RS
20021 /* Try a 4-byte immediate with MSL, except for cases that MVN
20022 can handle. */
20023 if (which == AARCH64_CHECK_MOV)
20024 for (unsigned int shift = 8; shift < 24; shift += 8)
20025 {
20026 unsigned int low = (1 << shift) - 1;
20027 if (((val32 & (0xff << shift)) | low) == val32)
20028 {
20029 if (info)
20030 *info = simd_immediate_info (SImode, val32 >> shift, insn,
20031 simd_immediate_info::MSL, shift);
20032 return true;
20033 }
20034 }
43e9d192 20035
b187677b
RS
20036 return false;
20037}
20038
20039/* Return true if replicating VAL64 is a valid immediate for the
20040 Advanced SIMD operation described by WHICH. If INFO is nonnull,
20041 use it to describe valid immediates. */
20042static bool
20043aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
20044 simd_immediate_info *info,
20045 enum simd_immediate_check which)
20046{
20047 unsigned int val32 = val64 & 0xffffffff;
20048 unsigned int val16 = val64 & 0xffff;
20049 unsigned int val8 = val64 & 0xff;
20050
20051 if (val32 == (val64 >> 32))
43e9d192 20052 {
b187677b
RS
20053 if ((which & AARCH64_CHECK_ORR) != 0
20054 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
20055 simd_immediate_info::MOV))
20056 return true;
43e9d192 20057
b187677b
RS
20058 if ((which & AARCH64_CHECK_BIC) != 0
20059 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
20060 simd_immediate_info::MVN))
20061 return true;
ee78df47 20062
b187677b
RS
20063 /* Try using a replicated byte. */
20064 if (which == AARCH64_CHECK_MOV
20065 && val16 == (val32 >> 16)
20066 && val8 == (val16 >> 8))
ee78df47 20067 {
b187677b
RS
20068 if (info)
20069 *info = simd_immediate_info (QImode, val8);
20070 return true;
ee78df47 20071 }
43e9d192
IB
20072 }
20073
b187677b
RS
20074 /* Try using a bit-to-bytemask. */
20075 if (which == AARCH64_CHECK_MOV)
43e9d192 20076 {
b187677b
RS
20077 unsigned int i;
20078 for (i = 0; i < 64; i += 8)
ab6501d7 20079 {
b187677b
RS
20080 unsigned char byte = (val64 >> i) & 0xff;
20081 if (byte != 0 && byte != 0xff)
20082 break;
ab6501d7 20083 }
b187677b 20084 if (i == 64)
ab6501d7 20085 {
b187677b
RS
20086 if (info)
20087 *info = simd_immediate_info (DImode, val64);
20088 return true;
ab6501d7 20089 }
43e9d192 20090 }
b187677b
RS
20091 return false;
20092}
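/* A few worked examples for the checks above (values already replicated to
   64 bits as in aarch64_simd_valid_immediate; instructions shown only as an
   illustration for a .4s/.8h context):

     0x0000ab000000ab00 -> 4-byte form, shift 8:  movi v0.4s, #0xab, lsl #8
     0x00ab00ab00ab00ab -> 2-byte form, shift 0:  movi v0.8h, #0xab
     0xff0000ffff0000ff -> no LSL/MSL form, but every byte is 0 or 0xff,
                           so the bit-to-bytemask DImode form applies.  */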
43e9d192 20093
43cacb12
RS
20094/* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
20095 instruction. If INFO is nonnull, use it to describe valid immediates. */
20096
20097static bool
20098aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
20099 simd_immediate_info *info)
20100{
20101 scalar_int_mode mode = DImode;
20102 unsigned int val32 = val64 & 0xffffffff;
20103 if (val32 == (val64 >> 32))
20104 {
20105 mode = SImode;
20106 unsigned int val16 = val32 & 0xffff;
20107 if (val16 == (val32 >> 16))
20108 {
20109 mode = HImode;
20110 unsigned int val8 = val16 & 0xff;
20111 if (val8 == (val16 >> 8))
20112 mode = QImode;
20113 }
20114 }
20115 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
20116 if (IN_RANGE (val, -0x80, 0x7f))
20117 {
20118 /* DUP with no shift. */
20119 if (info)
20120 *info = simd_immediate_info (mode, val);
20121 return true;
20122 }
20123 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
20124 {
20125 /* DUP with LSL #8. */
20126 if (info)
20127 *info = simd_immediate_info (mode, val);
20128 return true;
20129 }
20130 if (aarch64_bitmask_imm (val64, mode))
20131 {
20132 /* DUPM. */
20133 if (info)
20134 *info = simd_immediate_info (mode, val);
20135 return true;
20136 }
20137 return false;
20138}
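/* Examples for the SVE path above:

     0x0101010101010101 -> narrows to QImode, val 1          -> DUP
     0xff00ff00ff00ff00 -> narrows to HImode, val -256       -> DUP, LSL #8
     0x00ff00ff00ff00ff -> narrows to HImode, val 255; neither DUP form
                           matches, but the value is a valid bitmask
                           immediate                         -> DUPM  */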
20139
624d0f07
RS
20140/* Return true if X is an UNSPEC_PTRUE constant of the form:
20141
20142 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
20143
20144 where PATTERN is the svpattern as a CONST_INT and where ZERO
20145 is a zero constant of the required PTRUE mode (which can have
20146 fewer elements than X's mode, if zero bits are significant).
20147
20148 If so, and if INFO is nonnull, describe the immediate in INFO. */
20149bool
20150aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
20151{
20152 if (GET_CODE (x) != CONST)
20153 return false;
20154
20155 x = XEXP (x, 0);
20156 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
20157 return false;
20158
20159 if (info)
20160 {
20161 aarch64_svpattern pattern
20162 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
20163 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
20164 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
20165 *info = simd_immediate_info (int_mode, pattern);
20166 }
20167 return true;
20168}
20169
0b1fe8cf
RS
20170/* Return true if X is a valid SVE predicate. If INFO is nonnull, use
20171 it to describe valid immediates. */
20172
20173static bool
20174aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
20175{
624d0f07
RS
20176 if (aarch64_sve_ptrue_svpattern_p (x, info))
20177 return true;
20178
0b1fe8cf
RS
20179 if (x == CONST0_RTX (GET_MODE (x)))
20180 {
20181 if (info)
20182 *info = simd_immediate_info (DImode, 0);
20183 return true;
20184 }
20185
20186 /* Analyze the value as a VNx16BImode. This should be relatively
20187 efficient, since rtx_vector_builder has enough built-in capacity
20188 to store all VLA predicate constants without needing the heap. */
20189 rtx_vector_builder builder;
20190 if (!aarch64_get_sve_pred_bits (builder, x))
20191 return false;
20192
20193 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
20194 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
20195 {
20196 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
20197 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
20198 if (pattern != AARCH64_NUM_SVPATTERNS)
20199 {
20200 if (info)
20201 {
20202 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
20203 *info = simd_immediate_info (int_mode, pattern);
20204 }
20205 return true;
20206 }
20207 }
20208 return false;
20209}
20210
b187677b
RS
20211/* Return true if OP is a valid SIMD immediate for the operation
20212 described by WHICH. If INFO is nonnull, use it to describe valid
20213 immediates. */
20214bool
20215aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
20216 enum simd_immediate_check which)
20217{
43cacb12
RS
20218 machine_mode mode = GET_MODE (op);
20219 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
20220 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
20221 return false;
20222
0b1fe8cf
RS
20223 if (vec_flags & VEC_SVE_PRED)
20224 return aarch64_sve_pred_valid_immediate (op, info);
20225
43cacb12 20226 scalar_mode elt_mode = GET_MODE_INNER (mode);
f9093f23 20227 rtx base, step;
b187677b 20228 unsigned int n_elts;
568b9c0e 20229 if (CONST_VECTOR_P (op)
f9093f23
RS
20230 && CONST_VECTOR_DUPLICATE_P (op))
20231 n_elts = CONST_VECTOR_NPATTERNS (op);
43cacb12
RS
20232 else if ((vec_flags & VEC_SVE_DATA)
20233 && const_vec_series_p (op, &base, &step))
20234 {
20235 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
20236 if (!aarch64_sve_index_immediate_p (base)
20237 || !aarch64_sve_index_immediate_p (step))
20238 return false;
20239
20240 if (info)
cc68f7c2
RS
20241 {
20242 /* Get the corresponding container mode. E.g. an INDEX on V2SI
20243 should yield two integer values per 128-bit block, meaning
20244 that we need to treat it in the same way as V2DI and then
20245 ignore the upper 32 bits of each element. */
20246 elt_mode = aarch64_sve_container_int_mode (mode);
20247 *info = simd_immediate_info (elt_mode, base, step);
20248 }
43cacb12
RS
20249 return true;
20250 }
568b9c0e 20251 else if (CONST_VECTOR_P (op)
6a70badb
RS
20252 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
20253 /* N_ELTS set above. */;
b187677b 20254 else
d8edd899 20255 return false;
43e9d192 20256
b187677b 20257 scalar_float_mode elt_float_mode;
f9093f23
RS
20258 if (n_elts == 1
20259 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
43e9d192 20260 {
f9093f23
RS
20261 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
20262 if (aarch64_float_const_zero_rtx_p (elt)
20263 || aarch64_float_const_representable_p (elt))
20264 {
20265 if (info)
20266 *info = simd_immediate_info (elt_float_mode, elt);
20267 return true;
20268 }
b187677b 20269 }
43e9d192 20270
b23c6a2c
RS
20271 /* If all elements in an SVE vector have the same value, we have a free
20272 choice between using the element mode and using the container mode.
20273 Using the element mode means that unused parts of the vector are
20274 duplicates of the used elements, while using the container mode means
20275 that the unused parts are an extension of the used elements. Using the
20276 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
20277 for its container mode VNx4SI while 0x00000101 isn't.
20278
20279 If not all elements in an SVE vector have the same value, we need the
20280 transition from one element to the next to occur at container boundaries.
20281 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
20282 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
20283 scalar_int_mode elt_int_mode;
20284 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
20285 elt_int_mode = aarch64_sve_container_int_mode (mode);
20286 else
20287 elt_int_mode = int_mode_for_mode (elt_mode).require ();
20288
20289 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
b187677b
RS
20290 if (elt_size > 8)
20291 return false;
e4f0f84d 20292
b187677b
RS
20293 /* Expand the vector constant out into a byte vector, with the least
20294 significant byte of the register first. */
20295 auto_vec<unsigned char, 16> bytes;
20296 bytes.reserve (n_elts * elt_size);
20297 for (unsigned int i = 0; i < n_elts; i++)
20298 {
f9093f23
RS
20299 /* The vector is provided in gcc endian-neutral fashion.
20300 For aarch64_be Advanced SIMD, it must be laid out in the vector
20301 register in reverse order. */
20302 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
20303 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
43e9d192 20304
b187677b
RS
20305 if (elt_mode != elt_int_mode)
20306 elt = gen_lowpart (elt_int_mode, elt);
43e9d192 20307
b187677b
RS
20308 if (!CONST_INT_P (elt))
20309 return false;
43e9d192 20310
b187677b
RS
20311 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
20312 for (unsigned int byte = 0; byte < elt_size; byte++)
48063b9d 20313 {
b187677b
RS
20314 bytes.quick_push (elt_val & 0xff);
20315 elt_val >>= BITS_PER_UNIT;
48063b9d 20316 }
43e9d192
IB
20317 }
20318
b187677b
RS
20319 /* The immediate must repeat every eight bytes. */
20320 unsigned int nbytes = bytes.length ();
20321 for (unsigned i = 8; i < nbytes; ++i)
20322 if (bytes[i] != bytes[i - 8])
20323 return false;
20324
20325 /* Get the repeating 8-byte value as an integer. No endian correction
20326 is needed here because bytes is already in lsb-first order. */
20327 unsigned HOST_WIDE_INT val64 = 0;
20328 for (unsigned int i = 0; i < 8; i++)
20329 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
20330 << (i * BITS_PER_UNIT));
20331
43cacb12
RS
20332 if (vec_flags & VEC_SVE_DATA)
20333 return aarch64_sve_valid_immediate (val64, info);
20334 else
20335 return aarch64_advsimd_valid_immediate (val64, info, which);
20336}
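/* For example, a fixed-length V4SImode constant { 1, 2, 1, 2 } expands to
   the byte sequence 01 00 00 00 02 00 00 00 01 00 ... (lsb first), which
   repeats every eight bytes, giving val64 == 0x0000000200000001; the
   Advanced SIMD checks then reject it, since no MOVI/MVNI encoding exists
   for that value.  A constant such as { 1, 2, 3, 4 } fails the repetition
   test directly.  */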
20337
20338/* Check whether X is a VEC_SERIES-like constant that starts at 0 and
 20339 has a step in the range of an SVE INDEX instruction. Return the step if so,
20340 otherwise return null. */
20341rtx
20342aarch64_check_zero_based_sve_index_immediate (rtx x)
20343{
20344 rtx base, step;
20345 if (const_vec_series_p (x, &base, &step)
20346 && base == const0_rtx
20347 && aarch64_sve_index_immediate_p (step))
20348 return step;
20349 return NULL_RTX;
43e9d192
IB
20350}
20351
43e9d192
IB
 20352/* Check that immediate shift constants are within range. */
20353bool
ef4bddc2 20354aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
43e9d192 20355{
6bc67182
RS
20356 x = unwrap_const_vec_duplicate (x);
20357 if (!CONST_INT_P (x))
20358 return false;
43e9d192
IB
20359 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
20360 if (left)
6bc67182 20361 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
43e9d192 20362 else
6bc67182 20363 return IN_RANGE (INTVAL (x), 1, bit_width);
43e9d192
IB
20364}
20365
7325d85a
KT
20366/* Return the bitmask CONST_INT to select the bits required by a zero extract
20367 operation of width WIDTH at bit position POS. */
20368
20369rtx
20370aarch64_mask_from_zextract_ops (rtx width, rtx pos)
20371{
20372 gcc_assert (CONST_INT_P (width));
20373 gcc_assert (CONST_INT_P (pos));
20374
20375 unsigned HOST_WIDE_INT mask
20376 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
20377 return GEN_INT (mask << UINTVAL (pos));
20378}
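/* E.g. WIDTH == 8 and POS == 16 give ((1 << 8) - 1) << 16 == 0xff0000,
   the mask that selects bits 16-23.  */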
20379
83f8c414 20380bool
a6e0bfa7 20381aarch64_mov_operand_p (rtx x, machine_mode mode)
83f8c414 20382{
83f8c414
CSS
20383 if (GET_CODE (x) == HIGH
20384 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
20385 return true;
20386
82614948 20387 if (CONST_INT_P (x))
83f8c414
CSS
20388 return true;
20389
43cacb12 20390 if (VECTOR_MODE_P (GET_MODE (x)))
678faefc
RS
20391 {
20392 /* Require predicate constants to be VNx16BI before RA, so that we
20393 force everything to have a canonical form. */
20394 if (!lra_in_progress
20395 && !reload_completed
20396 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
20397 && GET_MODE (x) != VNx16BImode)
20398 return false;
20399
20400 return aarch64_simd_valid_immediate (x, NULL);
20401 }
43cacb12 20402
b33b2678
WD
20403 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
20404 x = strip_salt (x);
20405
a195c727
WD
20406 /* GOT accesses are valid moves. */
20407 if (SYMBOL_REF_P (x)
20408 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
20409 return true;
20410
3793ecc1 20411 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
83f8c414
CSS
20412 return true;
20413
c0e0174b 20414 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
43cacb12
RS
20415 return true;
20416
a6e0bfa7 20417 return aarch64_classify_symbolic_expression (x)
a5350ddc 20418 == SYMBOL_TINY_ABSOLUTE;
83f8c414
CSS
20419}
20420
9b8830b6
TC
20421/* Create a 0 constant that is based on V4SI to allow CSE to optimally share
20422 the constant creation. */
20423
20424rtx
20425aarch64_gen_shareable_zero (machine_mode mode)
20426{
20427 machine_mode zmode = V4SImode;
20428 rtx tmp = gen_reg_rtx (zmode);
20429 emit_move_insn (tmp, CONST0_RTX (zmode));
20430 return lowpart_subreg (mode, tmp, zmode);
20431}
20432
43e9d192
IB
20433/* Return a const_int vector of VAL. */
20434rtx
ab014eb3 20435aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
43e9d192 20436{
59d06c05
RS
20437 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
20438 return gen_const_vec_duplicate (mode, c);
43e9d192
IB
20439}
20440
051d0e2f
SN
20441/* Check OP is a legal scalar immediate for the MOVI instruction. */
20442
20443bool
77e994c9 20444aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
051d0e2f 20445{
ef4bddc2 20446 machine_mode vmode;
051d0e2f 20447
43cacb12 20448 vmode = aarch64_simd_container_mode (mode, 64);
051d0e2f 20449 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
b187677b 20450 return aarch64_simd_valid_immediate (op_v, NULL);
051d0e2f
SN
20451}
20452
988fa693
JG
20453/* Construct and return a PARALLEL RTX vector with elements numbering the
20454 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
20455 the vector - from the perspective of the architecture. This does not
20456 line up with GCC's perspective on lane numbers, so we end up with
20457 different masks depending on our target endian-ness. The diagram
20458 below may help. We must draw the distinction when building masks
20459 which select one half of the vector. An instruction selecting
20460 architectural low-lanes for a big-endian target, must be described using
20461 a mask selecting GCC high-lanes.
20462
20463 Big-Endian Little-Endian
20464
20465GCC 0 1 2 3 3 2 1 0
20466 | x | x | x | x | | x | x | x | x |
20467Architecture 3 2 1 0 3 2 1 0
20468
20469Low Mask: { 2, 3 } { 0, 1 }
20470High Mask: { 0, 1 } { 2, 3 }
f5cbabc1
RS
20471
20472 MODE Is the mode of the vector and NUNITS is the number of units in it. */
988fa693 20473
43e9d192 20474rtx
f5cbabc1 20475aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
43e9d192 20476{
43e9d192 20477 rtvec v = rtvec_alloc (nunits / 2);
988fa693
JG
20478 int high_base = nunits / 2;
20479 int low_base = 0;
20480 int base;
43e9d192
IB
20481 rtx t1;
20482 int i;
20483
988fa693
JG
20484 if (BYTES_BIG_ENDIAN)
20485 base = high ? low_base : high_base;
20486 else
20487 base = high ? high_base : low_base;
20488
20489 for (i = 0; i < nunits / 2; i++)
43e9d192
IB
20490 RTVEC_ELT (v, i) = GEN_INT (base + i);
20491
20492 t1 = gen_rtx_PARALLEL (mode, v);
20493 return t1;
20494}
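/* For example, for V4SImode (NUNITS == 4) and HIGH == true this returns
   the PARALLEL [ 2 3 ] on little-endian and [ 0 1 ] on big-endian,
   matching the "High Mask" row of the diagram above.  */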
20495
988fa693
JG
20496/* Check OP for validity as a PARALLEL RTX vector with elements
20497 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
20498 from the perspective of the architecture. See the diagram above
20499 aarch64_simd_vect_par_cnst_half for more details. */
20500
20501bool
ef4bddc2 20502aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
988fa693
JG
20503 bool high)
20504{
6a70badb
RS
20505 int nelts;
20506 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
f5cbabc1
RS
20507 return false;
20508
6a70badb 20509 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
988fa693
JG
20510 HOST_WIDE_INT count_op = XVECLEN (op, 0);
20511 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
20512 int i = 0;
20513
988fa693
JG
20514 if (count_op != count_ideal)
20515 return false;
20516
20517 for (i = 0; i < count_ideal; i++)
20518 {
20519 rtx elt_op = XVECEXP (op, 0, i);
20520 rtx elt_ideal = XVECEXP (ideal, 0, i);
20521
4aa81c2e 20522 if (!CONST_INT_P (elt_op)
988fa693
JG
20523 || INTVAL (elt_ideal) != INTVAL (elt_op))
20524 return false;
20525 }
20526 return true;
20527}
20528
4aeb1ba7
RS
20529/* Return a PARALLEL containing NELTS elements, with element I equal
20530 to BASE + I * STEP. */
20531
20532rtx
20533aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
20534{
20535 rtvec vec = rtvec_alloc (nelts);
20536 for (unsigned int i = 0; i < nelts; ++i)
20537 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
20538 return gen_rtx_PARALLEL (VOIDmode, vec);
20539}
20540
20541/* Return true if OP is a PARALLEL of CONST_INTs that form a linear
20542 series with step STEP. */
20543
20544bool
20545aarch64_stepped_int_parallel_p (rtx op, int step)
20546{
20547 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
20548 return false;
20549
20550 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
20551 for (int i = 1; i < XVECLEN (op, 0); ++i)
20552 if (!CONST_INT_P (XVECEXP (op, 0, i))
20553 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
20554 return false;
20555
20556 return true;
20557}
20558
43e9d192
IB
20559/* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
20560 HIGH (exclusive). */
20561void
46ed6024
CB
20562aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
20563 const_tree exp)
43e9d192
IB
20564{
20565 HOST_WIDE_INT lane;
4aa81c2e 20566 gcc_assert (CONST_INT_P (operand));
43e9d192
IB
20567 lane = INTVAL (operand);
20568
20569 if (lane < low || lane >= high)
46ed6024
CB
20570 {
20571 if (exp)
06357071
MS
20572 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
20573 lane, low, high - 1);
46ed6024 20574 else
cf0c27ef 20575 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
46ed6024 20576 }
43e9d192
IB
20577}
20578
7ac29c0f
RS
 20579/* Perform endian correction on lane number N, which indexes a vector
20580 of mode MODE, and return the result as an SImode rtx. */
20581
20582rtx
20583aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
20584{
20585 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
20586}
20587
43e9d192 20588/* Return TRUE if OP is a valid vector addressing mode. */
43cacb12 20589
43e9d192
IB
20590bool
20591aarch64_simd_mem_operand_p (rtx op)
20592{
20593 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
4aa81c2e 20594 || REG_P (XEXP (op, 0)));
43e9d192
IB
20595}
20596
43cacb12
RS
20597/* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
20598
20599bool
20600aarch64_sve_ld1r_operand_p (rtx op)
20601{
20602 struct aarch64_address_info addr;
20603 scalar_mode mode;
20604
20605 return (MEM_P (op)
20606 && is_a <scalar_mode> (GET_MODE (op), &mode)
20607 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
20608 && addr.type == ADDRESS_REG_IMM
20609 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
20610}
20611
9ceec73f
MM
20612/* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
20613 where the size of the read data is specified by `mode` and the size of the
20614 vector elements are specified by `elem_mode`. */
4aeb1ba7 20615bool
9ceec73f
MM
20616aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
20617 scalar_mode elem_mode)
4aeb1ba7
RS
20618{
20619 struct aarch64_address_info addr;
4aeb1ba7
RS
20620 if (!MEM_P (op)
20621 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
20622 return false;
20623
20624 if (addr.type == ADDRESS_REG_IMM)
9ceec73f 20625 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
4aeb1ba7
RS
20626
20627 if (addr.type == ADDRESS_REG_REG)
20628 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
20629
20630 return false;
20631}
20632
9ceec73f
MM
20633/* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
20634bool
20635aarch64_sve_ld1rq_operand_p (rtx op)
20636{
20637 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
20638 GET_MODE_INNER (GET_MODE (op)));
20639}
20640
20641/* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
20642 accessing a vector where the element size is specified by `elem_mode`. */
20643bool
20644aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
20645{
20646 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
20647}
20648
624d0f07
RS
20649/* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
20650bool
20651aarch64_sve_ldff1_operand_p (rtx op)
20652{
20653 if (!MEM_P (op))
20654 return false;
20655
20656 struct aarch64_address_info addr;
20657 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
20658 return false;
20659
20660 if (addr.type == ADDRESS_REG_IMM)
20661 return known_eq (addr.const_offset, 0);
20662
20663 return addr.type == ADDRESS_REG_REG;
20664}
20665
20666/* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
20667bool
20668aarch64_sve_ldnf1_operand_p (rtx op)
20669{
20670 struct aarch64_address_info addr;
20671
20672 return (MEM_P (op)
20673 && aarch64_classify_address (&addr, XEXP (op, 0),
20674 GET_MODE (op), false)
20675 && addr.type == ADDRESS_REG_IMM);
20676}
20677
43cacb12
RS
20678/* Return true if OP is a valid MEM operand for an SVE LDR instruction.
20679 The conditions for STR are the same. */
20680bool
20681aarch64_sve_ldr_operand_p (rtx op)
20682{
20683 struct aarch64_address_info addr;
20684
20685 return (MEM_P (op)
20686 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
20687 false, ADDR_QUERY_ANY)
20688 && addr.type == ADDRESS_REG_IMM);
20689}
20690
624d0f07
RS
20691/* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
20692 addressing memory of mode MODE. */
20693bool
20694aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
20695{
20696 struct aarch64_address_info addr;
ba15b0fa 20697 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
624d0f07
RS
20698 return false;
20699
20700 if (addr.type == ADDRESS_REG_IMM)
ba15b0fa 20701 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
624d0f07
RS
20702
20703 return addr.type == ADDRESS_REG_REG;
20704}
20705
9f4cbab8
RS
20706/* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
20707 We need to be able to access the individual pieces, so the range
20708 is different from LD[234] and ST[234]. */
20709bool
20710aarch64_sve_struct_memory_operand_p (rtx op)
20711{
20712 if (!MEM_P (op))
20713 return false;
20714
20715 machine_mode mode = GET_MODE (op);
20716 struct aarch64_address_info addr;
20717 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
20718 ADDR_QUERY_ANY)
20719 || addr.type != ADDRESS_REG_IMM)
20720 return false;
20721
20722 poly_int64 first = addr.const_offset;
20723 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
20724 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
20725 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
20726}
20727
2d8c6dc1
AH
20728/* Emit a register copy from operand to operand, taking care not to
20729 early-clobber source registers in the process.
43e9d192 20730
2d8c6dc1
AH
20731 COUNT is the number of components into which the copy needs to be
20732 decomposed. */
43e9d192 20733void
b8506a8a 20734aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
2d8c6dc1 20735 unsigned int count)
43e9d192
IB
20736{
20737 unsigned int i;
2d8c6dc1
AH
20738 int rdest = REGNO (operands[0]);
20739 int rsrc = REGNO (operands[1]);
43e9d192
IB
20740
20741 if (!reg_overlap_mentioned_p (operands[0], operands[1])
2d8c6dc1
AH
20742 || rdest < rsrc)
20743 for (i = 0; i < count; i++)
20744 emit_move_insn (gen_rtx_REG (mode, rdest + i),
20745 gen_rtx_REG (mode, rsrc + i));
43e9d192 20746 else
2d8c6dc1
AH
20747 for (i = 0; i < count; i++)
20748 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
20749 gen_rtx_REG (mode, rsrc + count - i - 1));
43e9d192
IB
20750}
20751
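/* Illustrative sketch, not part of the backend: the ordering rule above is
   the same overlap handling as memmove.  With COUNT == 2, copying {q2, q3}
   into {q1, q2} is safe in forward order (q1 <- q2 happens before q2 is
   overwritten), whereas copying {q2, q3} into {q3, q4} must go backwards
   (q4 <- q3 first, then q3 <- q2).  A standalone model of the rule, using a
   plain array in place of the register file (a hypothetical helper, for
   exposition only; the real code additionally allows a forward copy
   whenever the two ranges do not overlap at all):  */

static void
copy_regs_model (int *regs, int rdest, int rsrc, int count)
{
  if (rdest < rsrc)
    for (int i = 0; i < count; i++)
      regs[rdest + i] = regs[rsrc + i];	/* Forward copy.  */
  else
    for (int i = count - 1; i >= 0; i--)
      regs[rdest + i] = regs[rsrc + i];	/* Backward copy.  */
}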
668046d1 20752/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
6ec0e5b9 20753 one of the VSTRUCT modes: OI, CI, or XI. */
668046d1 20754int
b8506a8a 20755aarch64_simd_attr_length_rglist (machine_mode mode)
668046d1 20756{
6a70badb
RS
20757 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
20758 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
668046d1
DS
20759}
20760
db0253a4 20761/* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
43cacb12
RS
20762 alignment of a vector to 128 bits. SVE predicates have an alignment of
20763 16 bits. */
db0253a4
TB
20764static HOST_WIDE_INT
20765aarch64_simd_vector_alignment (const_tree type)
20766{
07108a9e
RS
20767 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
20768 be set for non-predicate vectors of booleans. Modes are the most
20769 direct way we have of identifying real SVE predicate types. */
20770 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
20771 return 16;
cc68f7c2
RS
20772 widest_int min_size
20773 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
20774 return wi::umin (min_size, 128).to_uhwi ();
db0253a4
TB
20775}
20776
43cacb12 20777/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
ca31798e 20778static poly_uint64
43cacb12
RS
20779aarch64_vectorize_preferred_vector_alignment (const_tree type)
20780{
20781 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
20782 {
1393938e
RS
20783 /* If the length of the vector is a fixed power of 2, try to align
20784 to that length, otherwise don't try to align at all. */
43cacb12 20785 HOST_WIDE_INT result;
1393938e
RS
20786 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
20787 || !pow2p_hwi (result))
43cacb12
RS
20788 result = TYPE_ALIGN (TREE_TYPE (type));
20789 return result;
20790 }
20791 return TYPE_ALIGN (type);
20792}
20793
db0253a4
TB
20794/* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
20795static bool
20796aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
20797{
20798 if (is_packed)
20799 return false;
20800
43cacb12
RS
20801 /* For fixed-length vectors, check that the vectorizer will aim for
20802 full-vector alignment. This isn't true for generic GCC vectors
20803 that are wider than the ABI maximum of 128 bits. */
ca31798e
AV
20804 poly_uint64 preferred_alignment =
20805 aarch64_vectorize_preferred_vector_alignment (type);
43cacb12 20806 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
ca31798e
AV
20807 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
20808 preferred_alignment))
db0253a4
TB
20809 return false;
20810
20811 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
20812 return true;
20813}
20814
7df76747
N
20815/* Return true if the vector misalignment factor is supported by the
20816 target. */
20817static bool
20818aarch64_builtin_support_vector_misalignment (machine_mode mode,
20819 const_tree type, int misalignment,
20820 bool is_packed)
20821{
20822 if (TARGET_SIMD && STRICT_ALIGNMENT)
20823 {
 20824      /* Return false if the movmisalign pattern is not supported for this mode. */
20825 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
20826 return false;
20827
a509c571 20828 /* Misalignment factor is unknown at compile time. */
7df76747 20829 if (misalignment == -1)
a509c571 20830 return false;
7df76747
N
20831 }
20832 return default_builtin_support_vector_misalignment (mode, type, misalignment,
20833 is_packed);
20834}
20835
4369c11e
TB
20836/* If VALS is a vector constant that can be loaded into a register
20837 using DUP, generate instructions to do so and return an RTX to
20838 assign to the register. Otherwise return NULL_RTX. */
20839static rtx
20840aarch64_simd_dup_constant (rtx vals)
20841{
ef4bddc2
RS
20842 machine_mode mode = GET_MODE (vals);
20843 machine_mode inner_mode = GET_MODE_INNER (mode);
4369c11e 20844 rtx x;
4369c11e 20845
92695fbb 20846 if (!const_vec_duplicate_p (vals, &x))
4369c11e
TB
20847 return NULL_RTX;
20848
20849 /* We can load this constant by using DUP and a constant in a
20850 single ARM register. This will be cheaper than a vector
20851 load. */
92695fbb 20852 x = copy_to_mode_reg (inner_mode, x);
59d06c05 20853 return gen_vec_duplicate (mode, x);
4369c11e
TB
20854}
20855
20856
20857/* Generate code to load VALS, which is a PARALLEL containing only
20858 constants (for vec_init) or CONST_VECTOR, efficiently into a
20859 register. Returns an RTX to copy into the register, or NULL_RTX
67914693 20860 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
1df3f464 20861static rtx
4369c11e
TB
20862aarch64_simd_make_constant (rtx vals)
20863{
ef4bddc2 20864 machine_mode mode = GET_MODE (vals);
4369c11e
TB
20865 rtx const_dup;
20866 rtx const_vec = NULL_RTX;
4369c11e
TB
20867 int n_const = 0;
20868 int i;
20869
568b9c0e 20870 if (CONST_VECTOR_P (vals))
4369c11e
TB
20871 const_vec = vals;
20872 else if (GET_CODE (vals) == PARALLEL)
20873 {
20874 /* A CONST_VECTOR must contain only CONST_INTs and
20875 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
20876 Only store valid constants in a CONST_VECTOR. */
6a70badb 20877 int n_elts = XVECLEN (vals, 0);
4369c11e
TB
20878 for (i = 0; i < n_elts; ++i)
20879 {
20880 rtx x = XVECEXP (vals, 0, i);
20881 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
20882 n_const++;
20883 }
20884 if (n_const == n_elts)
20885 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
20886 }
20887 else
20888 gcc_unreachable ();
20889
20890 if (const_vec != NULL_RTX
b187677b 20891 && aarch64_simd_valid_immediate (const_vec, NULL))
4369c11e
TB
20892 /* Load using MOVI/MVNI. */
20893 return const_vec;
20894 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
20895 /* Loaded using DUP. */
20896 return const_dup;
20897 else if (const_vec != NULL_RTX)
67914693 20898 /* Load from constant pool. We cannot take advantage of single-cycle
4369c11e
TB
20899 LD1 because we need a PC-relative addressing mode. */
20900 return const_vec;
20901 else
20902 /* A PARALLEL containing something not valid inside CONST_VECTOR.
67914693 20903 We cannot construct an initializer. */
4369c11e
TB
20904 return NULL_RTX;
20905}
20906
35a093b6
JG
20907/* Expand a vector initialisation sequence, such that TARGET is
20908 initialised to contain VALS. */
20909
4369c11e
TB
20910void
20911aarch64_expand_vector_init (rtx target, rtx vals)
20912{
ef4bddc2 20913 machine_mode mode = GET_MODE (target);
146c2e3a 20914 scalar_mode inner_mode = GET_MODE_INNER (mode);
35a093b6 20915 /* The number of vector elements. */
6a70badb 20916 int n_elts = XVECLEN (vals, 0);
35a093b6 20917 /* The number of vector elements which are not constant. */
8b66a2d4
AL
20918 int n_var = 0;
20919 rtx any_const = NULL_RTX;
35a093b6
JG
20920 /* The first element of vals. */
20921 rtx v0 = XVECEXP (vals, 0, 0);
4369c11e 20922 bool all_same = true;
4369c11e 20923
41dab855
KT
20924 /* This is a special vec_init<M><N> where N is not an element mode but a
20925 vector mode with half the elements of M. We expect to find two entries
 20926     of mode N in VALS and we must put their concatenation into TARGET. */
20927 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
20928 {
20929 gcc_assert (known_eq (GET_MODE_SIZE (mode),
20930 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
20931 rtx lo = XVECEXP (vals, 0, 0);
20932 rtx hi = XVECEXP (vals, 0, 1);
20933 machine_mode narrow_mode = GET_MODE (lo);
20934 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
20935 gcc_assert (narrow_mode == GET_MODE (hi));
20936
20937 /* When we want to concatenate a half-width vector with zeroes we can
20938 use the aarch64_combinez[_be] patterns. Just make sure that the
20939 zeroes are in the right half. */
20940 if (BYTES_BIG_ENDIAN
20941 && aarch64_simd_imm_zero (lo, narrow_mode)
20942 && general_operand (hi, narrow_mode))
20943 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
20944 else if (!BYTES_BIG_ENDIAN
20945 && aarch64_simd_imm_zero (hi, narrow_mode)
20946 && general_operand (lo, narrow_mode))
20947 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
20948 else
20949 {
20950 /* Else create the two half-width registers and combine them. */
20951 if (!REG_P (lo))
20952 lo = force_reg (GET_MODE (lo), lo);
20953 if (!REG_P (hi))
20954 hi = force_reg (GET_MODE (hi), hi);
20955
20956 if (BYTES_BIG_ENDIAN)
20957 std::swap (lo, hi);
20958 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
20959 }
20960 return;
20961 }
20962
35a093b6 20963 /* Count the number of variable elements to initialise. */
8b66a2d4 20964 for (int i = 0; i < n_elts; ++i)
4369c11e 20965 {
8b66a2d4 20966 rtx x = XVECEXP (vals, 0, i);
35a093b6 20967 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
8b66a2d4
AL
20968 ++n_var;
20969 else
20970 any_const = x;
4369c11e 20971
35a093b6 20972 all_same &= rtx_equal_p (x, v0);
4369c11e
TB
20973 }
20974
35a093b6
JG
20975 /* No variable elements, hand off to aarch64_simd_make_constant which knows
20976 how best to handle this. */
4369c11e
TB
20977 if (n_var == 0)
20978 {
20979 rtx constant = aarch64_simd_make_constant (vals);
20980 if (constant != NULL_RTX)
20981 {
20982 emit_move_insn (target, constant);
20983 return;
20984 }
20985 }
20986
20987 /* Splat a single non-constant element if we can. */
20988 if (all_same)
20989 {
35a093b6 20990 rtx x = copy_to_mode_reg (inner_mode, v0);
59d06c05 20991 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
4369c11e
TB
20992 return;
20993 }
20994
85c1b6d7
AP
20995 enum insn_code icode = optab_handler (vec_set_optab, mode);
20996 gcc_assert (icode != CODE_FOR_nothing);
20997
20998 /* If there are only variable elements, try to optimize
20999 the insertion using dup for the most common element
21000 followed by insertions. */
21001
21002 /* The algorithm will fill matches[*][0] with the earliest matching element,
21003 and matches[X][1] with the count of duplicate elements (if X is the
21004 earliest element which has duplicates). */
21005
21006 if (n_var == n_elts && n_elts <= 16)
21007 {
21008 int matches[16][2] = {0};
21009 for (int i = 0; i < n_elts; i++)
21010 {
21011 for (int j = 0; j <= i; j++)
21012 {
21013 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
21014 {
21015 matches[i][0] = j;
21016 matches[j][1]++;
21017 break;
21018 }
21019 }
21020 }
21021 int maxelement = 0;
21022 int maxv = 0;
21023 for (int i = 0; i < n_elts; i++)
21024 if (matches[i][1] > maxv)
21025 {
21026 maxelement = i;
21027 maxv = matches[i][1];
21028 }
21029
b4e2cd5b
JG
21030 /* Create a duplicate of the most common element, unless all elements
21031 are equally useless to us, in which case just immediately set the
21032 vector register using the first element. */
21033
21034 if (maxv == 1)
21035 {
21036 /* For vectors of two 64-bit elements, we can do even better. */
21037 if (n_elts == 2
21038 && (inner_mode == E_DImode
21039 || inner_mode == E_DFmode))
21040
21041 {
21042 rtx x0 = XVECEXP (vals, 0, 0);
21043 rtx x1 = XVECEXP (vals, 0, 1);
21044 /* Combine can pick up this case, but handling it directly
21045 here leaves clearer RTL.
21046
21047 This is load_pair_lanes<mode>, and also gives us a clean-up
21048 for store_pair_lanes<mode>. */
21049 if (memory_operand (x0, inner_mode)
21050 && memory_operand (x1, inner_mode)
21051 && !STRICT_ALIGNMENT
21052 && rtx_equal_p (XEXP (x1, 0),
21053 plus_constant (Pmode,
21054 XEXP (x0, 0),
21055 GET_MODE_SIZE (inner_mode))))
21056 {
21057 rtx t;
21058 if (inner_mode == DFmode)
21059 t = gen_load_pair_lanesdf (target, x0, x1);
21060 else
21061 t = gen_load_pair_lanesdi (target, x0, x1);
21062 emit_insn (t);
21063 return;
21064 }
21065 }
21066 /* The subreg-move sequence below will move into lane zero of the
21067 vector register. For big-endian we want that position to hold
21068 the last element of VALS. */
21069 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
21070 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
21071 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
21072 }
21073 else
21074 {
21075 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
21076 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
21077 }
85c1b6d7
AP
21078
21079 /* Insert the rest. */
21080 for (int i = 0; i < n_elts; i++)
21081 {
21082 rtx x = XVECEXP (vals, 0, i);
21083 if (matches[i][0] == maxelement)
21084 continue;
21085 x = copy_to_mode_reg (inner_mode, x);
21086 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
21087 }
21088 return;
21089 }
21090
35a093b6
JG
21091 /* Initialise a vector which is part-variable. We want to first try
21092 to build those lanes which are constant in the most efficient way we
21093 can. */
21094 if (n_var != n_elts)
4369c11e
TB
21095 {
21096 rtx copy = copy_rtx (vals);
4369c11e 21097
8b66a2d4
AL
21098 /* Load constant part of vector. We really don't care what goes into the
21099 parts we will overwrite, but we're more likely to be able to load the
21100 constant efficiently if it has fewer, larger, repeating parts
21101 (see aarch64_simd_valid_immediate). */
21102 for (int i = 0; i < n_elts; i++)
21103 {
21104 rtx x = XVECEXP (vals, 0, i);
21105 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
21106 continue;
21107 rtx subst = any_const;
21108 for (int bit = n_elts / 2; bit > 0; bit /= 2)
21109 {
21110 /* Look in the copied vector, as more elements are const. */
21111 rtx test = XVECEXP (copy, 0, i ^ bit);
21112 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
21113 {
21114 subst = test;
21115 break;
21116 }
21117 }
21118 XVECEXP (copy, 0, i) = subst;
21119 }
4369c11e 21120 aarch64_expand_vector_init (target, copy);
35a093b6 21121 }
4369c11e 21122
35a093b6 21123 /* Insert the variable lanes directly. */
8b66a2d4 21124 for (int i = 0; i < n_elts; i++)
35a093b6
JG
21125 {
21126 rtx x = XVECEXP (vals, 0, i);
21127 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
21128 continue;
21129 x = copy_to_mode_reg (inner_mode, x);
21130 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
21131 }
4369c11e
TB
21132}
21133
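/* Illustrative sketch, not part of the backend: the matches[][] bookkeeping
   above, modelled on a plain integer array (a hypothetical helper, for
   exposition only).  For the all-variable vector {x, y, x, x} it reports
   element 0 as the most frequent value (three copies), so the expansion
   above becomes one DUP of x into every lane followed by a single vector
   insert of y into lane 1.  As in the code above, N_ELTS is assumed to be
   at most 16.  */

static int
most_common_element_model (const int *vals, int n_elts)
{
  int matches[16][2] = {0};
  for (int i = 0; i < n_elts; i++)
    for (int j = 0; j <= i; j++)
      if (vals[i] == vals[j])
	{
	  matches[i][0] = j;	/* Earliest element equal to element I.  */
	  matches[j][1]++;	/* Number of copies of element J.  */
	  break;
	}
  int maxelement = 0, maxv = 0;
  for (int i = 0; i < n_elts; i++)
    if (matches[i][1] > maxv)
      {
	maxelement = i;
	maxv = matches[i][1];
      }
  return maxelement;
}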
3a0afad0
PK
21134/* Emit RTL corresponding to:
21135 insr TARGET, ELEM. */
21136
21137static void
21138emit_insr (rtx target, rtx elem)
21139{
21140 machine_mode mode = GET_MODE (target);
21141 scalar_mode elem_mode = GET_MODE_INNER (mode);
21142 elem = force_reg (elem_mode, elem);
21143
21144 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
21145 gcc_assert (icode != CODE_FOR_nothing);
21146 emit_insn (GEN_FCN (icode) (target, target, elem));
21147}
21148
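/* For illustration: INSR shifts every element of the destination vector up
   by one lane and writes ELEM into lane 0, discarding the old top lane.
   With z0.s initially {1, 2, 3, 4} (lane 0 first), "insr z0.s, w1" with
   w1 == 9 leaves {9, 1, 2, 3}.  The expanders below rely on this by
   inserting elements in reverse order.  */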
21149/* Subroutine of aarch64_sve_expand_vector_init for handling
21150 trailing constants.
21151 This function works as follows:
21152 (a) Create a new vector consisting of trailing constants.
21153 (b) Initialize TARGET with the constant vector using emit_move_insn.
21154 (c) Insert remaining elements in TARGET using insr.
 21155   NELTS is the total number of elements in the original vector,
 21156   while NELTS_REQD is the number of elements that are actually
 21157   significant.
 21158
 21159   ??? The heuristic used is to do the above only if the number of constants
 21160   is at least half the total number of elements.  May need fine-tuning. */
21161
21162static bool
21163aarch64_sve_expand_vector_init_handle_trailing_constants
21164 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
21165{
21166 machine_mode mode = GET_MODE (target);
21167 scalar_mode elem_mode = GET_MODE_INNER (mode);
21168 int n_trailing_constants = 0;
21169
21170 for (int i = nelts_reqd - 1;
5da301cb 21171 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
3a0afad0
PK
21172 i--)
21173 n_trailing_constants++;
21174
21175 if (n_trailing_constants >= nelts_reqd / 2)
21176 {
5da301cb
RS
21177 /* Try to use the natural pattern of BUILDER to extend the trailing
21178 constant elements to a full vector. Replace any variables in the
21179 extra elements with zeros.
21180
21181 ??? It would be better if the builders supported "don't care"
21182 elements, with the builder filling in whichever elements
21183 give the most compact encoding. */
21184 rtx_vector_builder v (mode, nelts, 1);
3a0afad0 21185 for (int i = 0; i < nelts; i++)
5da301cb
RS
21186 {
21187 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
21188 if (!valid_for_const_vector_p (elem_mode, x))
21189 x = const0_rtx;
21190 v.quick_push (x);
21191 }
3a0afad0
PK
21192 rtx const_vec = v.build ();
21193 emit_move_insn (target, const_vec);
21194
21195 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
21196 emit_insr (target, builder.elt (i));
21197
21198 return true;
21199 }
21200
21201 return false;
21202}
21203
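/* Worked example (for illustration): for BUILDER = {a, b, 1, 2} with
   NELTS == NELTS_REQD == 4, the two trailing constants meet the
   "at least half" threshold.  The constant vector {1, 2, 0, 0} (variable
   positions zeroed) is moved into TARGET first, and then "insr b" followed
   by "insr a" shifts it into the final {a, b, 1, 2, ...}.  */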
21204/* Subroutine of aarch64_sve_expand_vector_init.
21205 Works as follows:
21206 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
21207 (b) Skip trailing elements from BUILDER, which are the same as
21208 element NELTS_REQD - 1.
21209 (c) Insert earlier elements in reverse order in TARGET using insr. */
21210
21211static void
21212aarch64_sve_expand_vector_init_insert_elems (rtx target,
21213 const rtx_vector_builder &builder,
21214 int nelts_reqd)
21215{
21216 machine_mode mode = GET_MODE (target);
21217 scalar_mode elem_mode = GET_MODE_INNER (mode);
21218
21219 struct expand_operand ops[2];
21220 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
21221 gcc_assert (icode != CODE_FOR_nothing);
21222
21223 create_output_operand (&ops[0], target, mode);
21224 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
21225 expand_insn (icode, 2, ops);
21226
21227 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
21228 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
21229 emit_insr (target, builder.elt (i));
21230}
21231
 21232/* Subroutine of aarch64_sve_expand_vector_init to handle the case
 21233   when all trailing elements of BUILDER are the same.
21234 This works as follows:
21235 (a) Use expand_insn interface to broadcast last vector element in TARGET.
21236 (b) Insert remaining elements in TARGET using insr.
21237
 21238   ??? The heuristic used is to do the above if the number of identical
 21239   trailing elements is at least 3/4 of the total number of elements,
 21240   loosely based on the heuristic from mostly_zeros_p.  May need fine-tuning. */
21241
21242static bool
21243aarch64_sve_expand_vector_init_handle_trailing_same_elem
21244 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
21245{
21246 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
21247 if (ndups >= (3 * nelts_reqd) / 4)
21248 {
21249 aarch64_sve_expand_vector_init_insert_elems (target, builder,
21250 nelts_reqd - ndups + 1);
21251 return true;
21252 }
21253
21254 return false;
21255}
21256
21257/* Initialize register TARGET from BUILDER. NELTS is the constant number
21258 of elements in BUILDER.
21259
21260 The function tries to initialize TARGET from BUILDER if it fits one
21261 of the special cases outlined below.
21262
21263 Failing that, the function divides BUILDER into two sub-vectors:
21264 v_even = even elements of BUILDER;
21265 v_odd = odd elements of BUILDER;
21266
21267 and recursively calls itself with v_even and v_odd.
21268
21269 if (recursive call succeeded for v_even or v_odd)
21270 TARGET = zip (v_even, v_odd)
21271
21272 The function returns true if it managed to build TARGET from BUILDER
21273 with one of the special cases, false otherwise.
21274
21275 Example: {a, 1, b, 2, c, 3, d, 4}
21276
21277 The vector gets divided into:
21278 v_even = {a, b, c, d}
21279 v_odd = {1, 2, 3, 4}
21280
21281 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
 21282   initializes tmp2 from the constant vector v_odd using emit_move_insn.
21283
21284 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
21285 4 elements, so we construct tmp1 from v_even using insr:
21286 tmp1 = dup(d)
21287 insr tmp1, c
21288 insr tmp1, b
21289 insr tmp1, a
21290
21291 And finally:
21292 TARGET = zip (tmp1, tmp2)
21293 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
21294
21295static bool
21296aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
21297 int nelts, int nelts_reqd)
21298{
21299 machine_mode mode = GET_MODE (target);
21300
21301 /* Case 1: Vector contains trailing constants. */
21302
21303 if (aarch64_sve_expand_vector_init_handle_trailing_constants
21304 (target, builder, nelts, nelts_reqd))
21305 return true;
21306
21307 /* Case 2: Vector contains leading constants. */
21308
5da301cb 21309 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
3a0afad0
PK
21310 for (int i = 0; i < nelts_reqd; i++)
21311 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
21312 rev_builder.finalize ();
21313
21314 if (aarch64_sve_expand_vector_init_handle_trailing_constants
21315 (target, rev_builder, nelts, nelts_reqd))
21316 {
21317 emit_insn (gen_aarch64_sve_rev (mode, target, target));
21318 return true;
21319 }
21320
21321 /* Case 3: Vector contains trailing same element. */
21322
21323 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
21324 (target, builder, nelts_reqd))
21325 return true;
21326
21327 /* Case 4: Vector contains leading same element. */
21328
21329 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
21330 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
21331 {
21332 emit_insn (gen_aarch64_sve_rev (mode, target, target));
21333 return true;
21334 }
21335
21336 /* Avoid recursing below 4-elements.
21337 ??? The threshold 4 may need fine-tuning. */
21338
21339 if (nelts_reqd <= 4)
21340 return false;
21341
5da301cb
RS
21342 rtx_vector_builder v_even (mode, nelts, 1);
21343 rtx_vector_builder v_odd (mode, nelts, 1);
3a0afad0
PK
21344
21345 for (int i = 0; i < nelts * 2; i += 2)
21346 {
21347 v_even.quick_push (builder.elt (i));
21348 v_odd.quick_push (builder.elt (i + 1));
21349 }
21350
21351 v_even.finalize ();
21352 v_odd.finalize ();
21353
21354 rtx tmp1 = gen_reg_rtx (mode);
21355 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
21356 nelts, nelts_reqd / 2);
21357
21358 rtx tmp2 = gen_reg_rtx (mode);
21359 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
21360 nelts, nelts_reqd / 2);
21361
21362 if (!did_even_p && !did_odd_p)
21363 return false;
21364
21365 /* Initialize v_even and v_odd using INSR if it didn't match any of the
21366 special cases and zip v_even, v_odd. */
21367
21368 if (!did_even_p)
21369 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
21370
21371 if (!did_odd_p)
21372 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
21373
21374 rtvec v = gen_rtvec (2, tmp1, tmp2);
21375 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
21376 return true;
21377}
21378
21379/* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
21380
21381void
21382aarch64_sve_expand_vector_init (rtx target, rtx vals)
21383{
21384 machine_mode mode = GET_MODE (target);
21385 int nelts = XVECLEN (vals, 0);
21386
5da301cb 21387 rtx_vector_builder v (mode, nelts, 1);
3a0afad0
PK
21388 for (int i = 0; i < nelts; i++)
21389 v.quick_push (XVECEXP (vals, 0, i));
21390 v.finalize ();
21391
 21392  /* If neither sub-vector of v could be initialized specially,
21393 then use INSR to insert all elements from v into TARGET.
21394 ??? This might not be optimal for vectors with large
21395 initializers like 16-element or above.
21396 For nelts < 4, it probably isn't useful to handle specially. */
21397
21398 if (nelts < 4
21399 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
21400 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
21401}
21402
b6c3aea1
RS
21403/* Check whether VALUE is a vector constant in which every element
21404 is either a power of 2 or a negated power of 2. If so, return
21405 a constant vector of log2s, and flip CODE between PLUS and MINUS
21406 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
21407
21408static rtx
21409aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
21410{
568b9c0e 21411 if (!CONST_VECTOR_P (value))
b6c3aea1
RS
21412 return NULL_RTX;
21413
21414 rtx_vector_builder builder;
21415 if (!builder.new_unary_operation (GET_MODE (value), value, false))
21416 return NULL_RTX;
21417
21418 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
21419 /* 1 if the result of the multiplication must be negated,
21420 0 if it mustn't, or -1 if we don't yet care. */
21421 int negate = -1;
21422 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
21423 for (unsigned int i = 0; i < encoded_nelts; ++i)
21424 {
21425 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
21426 if (!CONST_SCALAR_INT_P (elt))
21427 return NULL_RTX;
21428 rtx_mode_t val (elt, int_mode);
21429 wide_int pow2 = wi::neg (val);
21430 if (val != pow2)
21431 {
21432 /* It matters whether we negate or not. Make that choice,
21433 and make sure that it's consistent with previous elements. */
21434 if (negate == !wi::neg_p (val))
21435 return NULL_RTX;
21436 negate = wi::neg_p (val);
21437 if (!negate)
21438 pow2 = val;
21439 }
21440 /* POW2 is now the value that we want to be a power of 2. */
21441 int shift = wi::exact_log2 (pow2);
21442 if (shift < 0)
21443 return NULL_RTX;
21444 builder.quick_push (gen_int_mode (shift, int_mode));
21445 }
21446 if (negate == -1)
21447 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
21448 code = PLUS;
21449 else if (negate == 1)
21450 code = code == PLUS ? MINUS : PLUS;
21451 return builder.build ();
21452}
21453
21454/* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
21455 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
21456 operands array, in the same order as for fma_optab. Return true if
21457 the function emitted all the necessary instructions, false if the caller
21458 should generate the pattern normally with the new OPERANDS array. */
21459
21460bool
21461aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
21462{
21463 machine_mode mode = GET_MODE (operands[0]);
21464 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
21465 {
21466 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
21467 NULL_RTX, true, OPTAB_DIRECT);
21468 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
21469 operands[3], product, operands[0], true,
21470 OPTAB_DIRECT);
21471 return true;
21472 }
21473 operands[2] = force_reg (mode, operands[2]);
21474 return false;
21475}
21476
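/* Worked example (for illustration): for an SVE integer MLA whose
   multiplier operand is the constant vector {8, 8, ...}, the conversion
   above yields the shift vector {3, 3, ...} and CODE stays PLUS, so
   a * 8 + b is emitted as b + (a << 3).  For {-8, -8, ...} the shifts are
   the same but CODE is flipped to MINUS, so a * -8 + b becomes
   b - (a << 3).  */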
21477/* Likewise, but for a conditional pattern. */
21478
21479bool
21480aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
21481{
21482 machine_mode mode = GET_MODE (operands[0]);
21483 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
21484 {
21485 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
21486 NULL_RTX, true, OPTAB_DIRECT);
21487 emit_insn (gen_cond (code, mode, operands[0], operands[1],
21488 operands[4], product, operands[5]));
21489 return true;
21490 }
21491 operands[3] = force_reg (mode, operands[3]);
21492 return false;
21493}
21494
43e9d192 21495static unsigned HOST_WIDE_INT
ef4bddc2 21496aarch64_shift_truncation_mask (machine_mode mode)
43e9d192 21497{
43cacb12
RS
21498 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
21499 return 0;
21500 return GET_MODE_UNIT_BITSIZE (mode) - 1;
43e9d192
IB
21501}
21502
43e9d192
IB
21503/* Select a format to encode pointers in exception handling data. */
21504int
21505aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
21506{
21507 int type;
21508 switch (aarch64_cmodel)
21509 {
21510 case AARCH64_CMODEL_TINY:
21511 case AARCH64_CMODEL_TINY_PIC:
21512 case AARCH64_CMODEL_SMALL:
21513 case AARCH64_CMODEL_SMALL_PIC:
1b1e81f8 21514 case AARCH64_CMODEL_SMALL_SPIC:
43e9d192
IB
21515 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
21516 for everything. */
21517 type = DW_EH_PE_sdata4;
21518 break;
21519 default:
21520 /* No assumptions here. 8-byte relocs required. */
21521 type = DW_EH_PE_sdata8;
21522 break;
21523 }
21524 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
21525}
21526
b07fc91c
SN
21527/* Output .variant_pcs for aarch64_vector_pcs function symbols. */
21528
21529static void
21530aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
21531{
c600df9a 21532 if (TREE_CODE (decl) == FUNCTION_DECL)
b07fc91c 21533 {
c600df9a
RS
21534 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
21535 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
21536 {
21537 fprintf (stream, "\t.variant_pcs\t");
21538 assemble_name (stream, name);
21539 fprintf (stream, "\n");
21540 }
b07fc91c
SN
21541 }
21542}
21543
e1c1ecb0
KT
21544/* The last .arch and .tune assembly strings that we printed. */
21545static std::string aarch64_last_printed_arch_string;
21546static std::string aarch64_last_printed_tune_string;
21547
361fb3ee
KT
21548/* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
21549 by the function fndecl. */
21550
21551void
21552aarch64_declare_function_name (FILE *stream, const char* name,
21553 tree fndecl)
21554{
21555 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
21556
21557 struct cl_target_option *targ_options;
21558 if (target_parts)
21559 targ_options = TREE_TARGET_OPTION (target_parts);
21560 else
21561 targ_options = TREE_TARGET_OPTION (target_option_current_node);
21562 gcc_assert (targ_options);
21563
21564 const struct processor *this_arch
21565 = aarch64_get_arch (targ_options->x_explicit_arch);
21566
28108a53 21567 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
054b4005 21568 std::string extension
04a99ebe
JG
21569 = aarch64_get_extension_string_for_isa_flags (isa_flags,
21570 this_arch->flags);
e1c1ecb0
KT
21571 /* Only update the assembler .arch string if it is distinct from the last
21572 such string we printed. */
21573 std::string to_print = this_arch->name + extension;
21574 if (to_print != aarch64_last_printed_arch_string)
21575 {
21576 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
21577 aarch64_last_printed_arch_string = to_print;
21578 }
361fb3ee
KT
21579
 21580  /* Print the cpu name we're tuning for in the comments; it might be
e1c1ecb0
KT
21581 useful to readers of the generated asm. Do it only when it changes
21582 from function to function and verbose assembly is requested. */
361fb3ee
KT
21583 const struct processor *this_tune
21584 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
21585
e1c1ecb0
KT
21586 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
21587 {
21588 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
21589 this_tune->name);
21590 aarch64_last_printed_tune_string = this_tune->name;
21591 }
361fb3ee 21592
b07fc91c
SN
21593 aarch64_asm_output_variant_pcs (stream, fndecl, name);
21594
361fb3ee
KT
21595 /* Don't forget the type directive for ELF. */
21596 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
21597 ASM_OUTPUT_LABEL (stream, name);
c292cfe5
SN
21598
21599 cfun->machine->label_is_assembled = true;
21600}
21601
21602/* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
21603 the function label and emit a BTI if necessary. */
21604
21605void
21606aarch64_print_patchable_function_entry (FILE *file,
21607 unsigned HOST_WIDE_INT patch_area_size,
21608 bool record_p)
21609{
21610 if (cfun->machine->label_is_assembled
21611 && aarch64_bti_enabled ()
21612 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
21613 {
21614 /* Remove the BTI that follows the patch area and insert a new BTI
21615 before the patch area right after the function label. */
21616 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
21617 if (insn
21618 && INSN_P (insn)
21619 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
21620 && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
21621 delete_insn (insn);
21622 asm_fprintf (file, "\thint\t34 // bti c\n");
21623 }
21624
21625 default_print_patchable_function_entry (file, patch_area_size, record_p);
361fb3ee
KT
21626}
21627
b07fc91c
SN
21628/* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
21629
21630void
21631aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
21632{
21633 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
21634 const char *value = IDENTIFIER_POINTER (target);
21635 aarch64_asm_output_variant_pcs (stream, decl, name);
21636 ASM_OUTPUT_DEF (stream, name, value);
21637}
21638
21639/* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
21640 function symbol references. */
21641
21642void
e8c47069 21643aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
b07fc91c 21644{
e8c47069 21645 default_elf_asm_output_external (stream, decl, name);
b07fc91c
SN
21646 aarch64_asm_output_variant_pcs (stream, decl, name);
21647}
21648
8fc16d72
ST
21649/* Triggered after a .cfi_startproc directive is emitted into the assembly file.
21650 Used to output the .cfi_b_key_frame directive when signing the current
21651 function with the B key. */
21652
21653void
21654aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
21655{
2bdc7dcb 21656 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
8fc16d72
ST
21657 && aarch64_ra_sign_key == AARCH64_KEY_B)
21658 asm_fprintf (f, "\t.cfi_b_key_frame\n");
21659}
21660
e1c1ecb0
KT
21661/* Implements TARGET_ASM_FILE_START. Output the assembly header. */
21662
21663static void
21664aarch64_start_file (void)
21665{
21666 struct cl_target_option *default_options
21667 = TREE_TARGET_OPTION (target_option_default_node);
21668
21669 const struct processor *default_arch
21670 = aarch64_get_arch (default_options->x_explicit_arch);
28108a53 21671 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
e1c1ecb0 21672 std::string extension
04a99ebe
JG
21673 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
21674 default_arch->flags);
e1c1ecb0
KT
21675
21676 aarch64_last_printed_arch_string = default_arch->name + extension;
21677 aarch64_last_printed_tune_string = "";
21678 asm_fprintf (asm_out_file, "\t.arch %s\n",
21679 aarch64_last_printed_arch_string.c_str ());
21680
21681 default_file_start ();
21682}
21683
0462169c
SN
21684/* Emit load exclusive. */
21685
21686static void
ef4bddc2 21687aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
0462169c
SN
21688 rtx mem, rtx model_rtx)
21689{
4a2095eb
RH
21690 if (mode == TImode)
21691 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
21692 gen_highpart (DImode, rval),
21693 mem, model_rtx));
21694 else
21695 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
0462169c
SN
21696}
21697
21698/* Emit store exclusive. */
21699
21700static void
ef4bddc2 21701aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
4a2095eb 21702 rtx mem, rtx rval, rtx model_rtx)
0462169c 21703{
4a2095eb
RH
21704 if (mode == TImode)
21705 emit_insn (gen_aarch64_store_exclusive_pair
21706 (bval, mem, operand_subword (rval, 0, 0, TImode),
21707 operand_subword (rval, 1, 0, TImode), model_rtx));
21708 else
21709 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
0462169c
SN
21710}
21711
21712/* Mark the previous jump instruction as unlikely. */
21713
21714static void
21715aarch64_emit_unlikely_jump (rtx insn)
21716{
f370536c 21717 rtx_insn *jump = emit_jump_insn (insn);
5fa396ad 21718 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
0462169c
SN
21719}
21720
3950b229
RH
21721/* We store the names of the various atomic helpers in a 5x4 array.
21722 Return the libcall function given MODE, MODEL and NAMES. */
21723
21724rtx
21725aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
21726 const atomic_ool_names *names)
21727{
21728 memmodel model = memmodel_base (INTVAL (model_rtx));
21729 int mode_idx, model_idx;
21730
21731 switch (mode)
21732 {
21733 case E_QImode:
21734 mode_idx = 0;
21735 break;
21736 case E_HImode:
21737 mode_idx = 1;
21738 break;
21739 case E_SImode:
21740 mode_idx = 2;
21741 break;
21742 case E_DImode:
21743 mode_idx = 3;
21744 break;
21745 case E_TImode:
21746 mode_idx = 4;
21747 break;
21748 default:
21749 gcc_unreachable ();
21750 }
21751
21752 switch (model)
21753 {
21754 case MEMMODEL_RELAXED:
21755 model_idx = 0;
21756 break;
21757 case MEMMODEL_CONSUME:
21758 case MEMMODEL_ACQUIRE:
21759 model_idx = 1;
21760 break;
21761 case MEMMODEL_RELEASE:
21762 model_idx = 2;
21763 break;
21764 case MEMMODEL_ACQ_REL:
21765 case MEMMODEL_SEQ_CST:
21766 model_idx = 3;
21767 break;
21768 default:
21769 gcc_unreachable ();
21770 }
21771
21772 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
21773 VISIBILITY_HIDDEN);
21774}
21775
21776#define DEF0(B, N) \
21777 { "__aarch64_" #B #N "_relax", \
21778 "__aarch64_" #B #N "_acq", \
21779 "__aarch64_" #B #N "_rel", \
21780 "__aarch64_" #B #N "_acq_rel" }
21781
21782#define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
21783 { NULL, NULL, NULL, NULL }
21784#define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
21785
21786static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
21787const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
21788const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
21789const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
21790const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
21791const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
21792
21793#undef DEF0
21794#undef DEF4
21795#undef DEF5
21796
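/* Worked examples (for illustration): with the tables above,
   aarch64_atomic_ool_func resolves an SImode compare-and-swap with
   MEMMODEL_ACQUIRE to "__aarch64_cas4_acq", and a TImode one with
   MEMMODEL_SEQ_CST to "__aarch64_cas16_acq_rel".  The 16-byte row only
   exists for CAS; the DEF4 tables leave it NULL.  */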
0462169c
SN
21797/* Expand a compare and swap pattern. */
21798
21799void
21800aarch64_expand_compare_and_swap (rtx operands[])
21801{
d400fda3
RH
21802 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
21803 machine_mode mode, r_mode;
0462169c
SN
21804
21805 bval = operands[0];
21806 rval = operands[1];
21807 mem = operands[2];
21808 oldval = operands[3];
21809 newval = operands[4];
21810 is_weak = operands[5];
21811 mod_s = operands[6];
21812 mod_f = operands[7];
21813 mode = GET_MODE (mem);
0462169c
SN
21814
21815 /* Normally the succ memory model must be stronger than fail, but in the
21816 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
21817 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
46b35980
AM
21818 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
21819 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
0462169c
SN
21820 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
21821
d400fda3
RH
21822 r_mode = mode;
21823 if (mode == QImode || mode == HImode)
0462169c 21824 {
d400fda3
RH
21825 r_mode = SImode;
21826 rval = gen_reg_rtx (r_mode);
0462169c
SN
21827 }
21828
b0770c0f 21829 if (TARGET_LSE)
77f33f44
RH
21830 {
 21831      /* The CAS insn requires that oldval and rval overlap, but we need to
21832 have a copy of oldval saved across the operation to tell if
21833 the operation is successful. */
d400fda3
RH
21834 if (reg_overlap_mentioned_p (rval, oldval))
21835 rval = copy_to_mode_reg (r_mode, oldval);
77f33f44 21836 else
d400fda3
RH
21837 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
21838
77f33f44
RH
21839 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
21840 newval, mod_s));
d400fda3 21841 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
77f33f44 21842 }
3950b229
RH
21843 else if (TARGET_OUTLINE_ATOMICS)
21844 {
21845 /* Oldval must satisfy compare afterward. */
21846 if (!aarch64_plus_operand (oldval, mode))
21847 oldval = force_reg (mode, oldval);
21848 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
21849 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
21850 oldval, mode, newval, mode,
21851 XEXP (mem, 0), Pmode);
21852 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
21853 }
b0770c0f 21854 else
d400fda3
RH
21855 {
21856 /* The oldval predicate varies by mode. Test it and force to reg. */
21857 insn_code code = code_for_aarch64_compare_and_swap (mode);
21858 if (!insn_data[code].operand[2].predicate (oldval, mode))
21859 oldval = force_reg (mode, oldval);
0462169c 21860
d400fda3
RH
21861 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
21862 is_weak, mod_s, mod_f));
21863 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
21864 }
21865
21866 if (r_mode != mode)
77f33f44
RH
21867 rval = gen_lowpart (mode, rval);
21868 emit_move_insn (operands[1], rval);
0462169c 21869
d400fda3 21870 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
f7df4a84 21871 emit_insn (gen_rtx_SET (bval, x));
0462169c
SN
21872}
21873
f70fb3b6
MW
21874/* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
21875 sequence implementing an atomic operation. */
21876
21877static void
21878aarch64_emit_post_barrier (enum memmodel model)
21879{
21880 const enum memmodel base_model = memmodel_base (model);
21881
21882 if (is_mm_sync (model)
21883 && (base_model == MEMMODEL_ACQUIRE
21884 || base_model == MEMMODEL_ACQ_REL
21885 || base_model == MEMMODEL_SEQ_CST))
21886 {
21887 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
21888 }
21889}
21890
0462169c
SN
21891/* Split a compare and swap pattern. */
21892
21893void
21894aarch64_split_compare_and_swap (rtx operands[])
21895{
e5e07b68
WD
21896 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
21897 gcc_assert (epilogue_completed);
21898
b7e560de 21899 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
ef4bddc2 21900 machine_mode mode;
0462169c 21901 bool is_weak;
5d8a22a5 21902 rtx_code_label *label1, *label2;
ab876106 21903 enum memmodel model;
0462169c
SN
21904
21905 rval = operands[0];
21906 mem = operands[1];
21907 oldval = operands[2];
21908 newval = operands[3];
21909 is_weak = (operands[4] != const0_rtx);
ab876106 21910 model_rtx = operands[5];
0462169c
SN
21911 scratch = operands[7];
21912 mode = GET_MODE (mem);
ab876106 21913 model = memmodel_from_int (INTVAL (model_rtx));
0462169c 21914
17f47f86
KT
21915 /* When OLDVAL is zero and we want the strong version we can emit a tighter
21916 loop:
21917 .label1:
21918 LD[A]XR rval, [mem]
21919 CBNZ rval, .label2
21920 ST[L]XR scratch, newval, [mem]
21921 CBNZ scratch, .label1
21922 .label2:
21923 CMP rval, 0. */
b7e560de
RH
21924 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
21925 oldval == const0_rtx && mode != TImode);
17f47f86 21926
5d8a22a5 21927 label1 = NULL;
0462169c
SN
21928 if (!is_weak)
21929 {
21930 label1 = gen_label_rtx ();
21931 emit_label (label1);
21932 }
21933 label2 = gen_label_rtx ();
21934
ab876106
MW
21935 /* The initial load can be relaxed for a __sync operation since a final
21936 barrier will be emitted to stop code hoisting. */
21937 if (is_mm_sync (model))
b7e560de 21938 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
ab876106
MW
21939 else
21940 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
0462169c 21941
17f47f86 21942 if (strong_zero_p)
b7e560de 21943 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
17f47f86
KT
21944 else
21945 {
b7e560de
RH
21946 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
21947 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
17f47f86 21948 }
b7e560de
RH
21949 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
21950 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
21951 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
0462169c 21952
ab876106 21953 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
0462169c
SN
21954
21955 if (!is_weak)
21956 {
6e1eaca9
RE
21957 if (aarch64_track_speculation)
21958 {
21959 /* Emit an explicit compare instruction, so that we can correctly
21960 track the condition codes. */
21961 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
21962 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
21963 }
21964 else
21965 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
21966
0462169c
SN
21967 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
21968 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
f7df4a84 21969 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
0462169c
SN
21970 }
21971 else
b7e560de 21972 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
0462169c
SN
21973
21974 emit_label (label2);
b7e560de 21975
17f47f86
KT
21976 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
21977 to set the condition flags. If this is not used it will be removed by
21978 later passes. */
21979 if (strong_zero_p)
b7e560de
RH
21980 aarch64_gen_compare_reg (NE, rval, const0_rtx);
21981
ab876106
MW
21982 /* Emit any final barrier needed for a __sync operation. */
21983 if (is_mm_sync (model))
21984 aarch64_emit_post_barrier (model);
0462169c 21985}
9cd7b720 21986
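/* For illustration, the generic (non-LSE, non-outline) strong form splits
   into a load/store-exclusive loop along these lines, assuming an SImode
   access with acquire/release semantics and arbitrary register choices:

	.L1:	ldaxr	w0, [x1]	// rval
		cmp	w0, w2		// oldval
		b.ne	.L2
		stlxr	w3, w4, [x1]	// scratch, newval
		cbnz	w3, .L1
	.L2:

   The weak form omits the backward branch, and the OLDVAL == 0 strong case
   uses the tighter CBNZ loop shown in the comment above.  */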
0462169c
SN
21987/* Split an atomic operation. */
21988
21989void
21990aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9cd7b720 21991 rtx value, rtx model_rtx, rtx cond)
0462169c 21992{
e5e07b68
WD
21993 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
21994 gcc_assert (epilogue_completed);
21995
ef4bddc2
RS
21996 machine_mode mode = GET_MODE (mem);
21997 machine_mode wmode = (mode == DImode ? DImode : SImode);
f70fb3b6
MW
21998 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
21999 const bool is_sync = is_mm_sync (model);
5d8a22a5
DM
22000 rtx_code_label *label;
22001 rtx x;
0462169c 22002
9cd7b720 22003 /* Split the atomic operation into a sequence. */
0462169c
SN
22004 label = gen_label_rtx ();
22005 emit_label (label);
22006
22007 if (new_out)
22008 new_out = gen_lowpart (wmode, new_out);
22009 if (old_out)
22010 old_out = gen_lowpart (wmode, old_out);
22011 else
22012 old_out = new_out;
22013 value = simplify_gen_subreg (wmode, value, mode, 0);
22014
f70fb3b6
MW
22015 /* The initial load can be relaxed for a __sync operation since a final
22016 barrier will be emitted to stop code hoisting. */
22017 if (is_sync)
22018 aarch64_emit_load_exclusive (mode, old_out, mem,
22019 GEN_INT (MEMMODEL_RELAXED));
22020 else
22021 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
0462169c
SN
22022
22023 switch (code)
22024 {
22025 case SET:
22026 new_out = value;
22027 break;
22028
22029 case NOT:
22030 x = gen_rtx_AND (wmode, old_out, value);
f7df4a84 22031 emit_insn (gen_rtx_SET (new_out, x));
0462169c 22032 x = gen_rtx_NOT (wmode, new_out);
f7df4a84 22033 emit_insn (gen_rtx_SET (new_out, x));
0462169c
SN
22034 break;
22035
22036 case MINUS:
22037 if (CONST_INT_P (value))
22038 {
618ae596 22039 value = GEN_INT (-UINTVAL (value));
0462169c
SN
22040 code = PLUS;
22041 }
22042 /* Fall through. */
22043
22044 default:
22045 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
f7df4a84 22046 emit_insn (gen_rtx_SET (new_out, x));
0462169c
SN
22047 break;
22048 }
22049
22050 aarch64_emit_store_exclusive (mode, cond, mem,
22051 gen_lowpart (mode, new_out), model_rtx);
22052
6e1eaca9
RE
22053 if (aarch64_track_speculation)
22054 {
22055 /* Emit an explicit compare instruction, so that we can correctly
22056 track the condition codes. */
22057 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
22058 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
22059 }
22060 else
22061 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
22062
0462169c
SN
22063 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
22064 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
f7df4a84 22065 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
f70fb3b6
MW
22066
22067 /* Emit any final barrier needed for a __sync operation. */
22068 if (is_sync)
22069 aarch64_emit_post_barrier (model);
0462169c
SN
22070}
22071
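/* For illustration, an SImode __atomic_fetch_add with MEMMODEL_SEQ_CST and
   neither LSE nor the outline helpers available splits into a loop along
   these lines (register choices are arbitrary):

	.L1:	ldaxr	w0, [x2]	// old_out
		add	w1, w0, w3	// new_out = old_out + value
		stlxr	w4, w1, [x2]	// cond
		cbnz	w4, .L1
 */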
c2ec330c
AL
22072static void
22073aarch64_init_libfuncs (void)
22074{
22075 /* Half-precision float operations. The compiler handles all operations
22076 with NULL libfuncs by converting to SFmode. */
22077
22078 /* Conversions. */
22079 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
22080 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
22081
22082 /* Arithmetic. */
22083 set_optab_libfunc (add_optab, HFmode, NULL);
22084 set_optab_libfunc (sdiv_optab, HFmode, NULL);
22085 set_optab_libfunc (smul_optab, HFmode, NULL);
22086 set_optab_libfunc (neg_optab, HFmode, NULL);
22087 set_optab_libfunc (sub_optab, HFmode, NULL);
22088
22089 /* Comparisons. */
22090 set_optab_libfunc (eq_optab, HFmode, NULL);
22091 set_optab_libfunc (ne_optab, HFmode, NULL);
22092 set_optab_libfunc (lt_optab, HFmode, NULL);
22093 set_optab_libfunc (le_optab, HFmode, NULL);
22094 set_optab_libfunc (ge_optab, HFmode, NULL);
22095 set_optab_libfunc (gt_optab, HFmode, NULL);
22096 set_optab_libfunc (unord_optab, HFmode, NULL);
22097}
22098
43e9d192 22099/* Target hook for c_mode_for_suffix. */
ef4bddc2 22100static machine_mode
43e9d192
IB
22101aarch64_c_mode_for_suffix (char suffix)
22102{
22103 if (suffix == 'q')
22104 return TFmode;
22105
22106 return VOIDmode;
22107}
22108
3520f7cc
JG
22109/* We can only represent floating point constants which will fit in
22110 "quarter-precision" values. These values are characterised by
 22111   a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
22112 by:
22113
22114 (-1)^s * (n/16) * 2^r
22115
22116 Where:
22117 's' is the sign bit.
22118 'n' is an integer in the range 16 <= n <= 31.
22119 'r' is an integer in the range -3 <= r <= 4. */
22120
22121/* Return true iff X can be represented by a quarter-precision
 22122   floating point immediate operand.  Note, we cannot represent 0.0. */
22123bool
22124aarch64_float_const_representable_p (rtx x)
22125{
22126 /* This represents our current view of how many bits
22127 make up the mantissa. */
22128 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
ba96cdfb 22129 int exponent;
3520f7cc 22130 unsigned HOST_WIDE_INT mantissa, mask;
3520f7cc 22131 REAL_VALUE_TYPE r, m;
807e902e 22132 bool fail;
3520f7cc 22133
d29f7dd5 22134 x = unwrap_const_vec_duplicate (x);
3520f7cc
JG
22135 if (!CONST_DOUBLE_P (x))
22136 return false;
22137
a4518821
RS
22138 if (GET_MODE (x) == VOIDmode
22139 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
94bfa2da
TV
22140 return false;
22141
34a72c33 22142 r = *CONST_DOUBLE_REAL_VALUE (x);
3520f7cc
JG
22143
22144 /* We cannot represent infinities, NaNs or +/-zero. We won't
22145 know if we have +zero until we analyse the mantissa, but we
22146 can reject the other invalid values. */
22147 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
22148 || REAL_VALUE_MINUS_ZERO (r))
22149 return false;
22150
ba96cdfb 22151 /* Extract exponent. */
3520f7cc
JG
22152 r = real_value_abs (&r);
22153 exponent = REAL_EXP (&r);
22154
22155 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
22156 highest (sign) bit, with a fixed binary point at bit point_pos.
22157 m1 holds the low part of the mantissa, m2 the high part.
22158 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
22159 bits for the mantissa, this can fail (low bits will be lost). */
22160 real_ldexp (&m, &r, point_pos - exponent);
807e902e 22161 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
3520f7cc
JG
22162
22163 /* If the low part of the mantissa has bits set we cannot represent
22164 the value. */
d9074b29 22165 if (w.ulow () != 0)
3520f7cc
JG
22166 return false;
22167 /* We have rejected the lower HOST_WIDE_INT, so update our
22168 understanding of how many bits lie in the mantissa and
22169 look only at the high HOST_WIDE_INT. */
807e902e 22170 mantissa = w.elt (1);
3520f7cc
JG
22171 point_pos -= HOST_BITS_PER_WIDE_INT;
22172
22173 /* We can only represent values with a mantissa of the form 1.xxxx. */
22174 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
22175 if ((mantissa & mask) != 0)
22176 return false;
22177
22178 /* Having filtered unrepresentable values, we may now remove all
22179 but the highest 5 bits. */
22180 mantissa >>= point_pos - 5;
22181
22182 /* We cannot represent the value 0.0, so reject it. This is handled
22183 elsewhere. */
22184 if (mantissa == 0)
22185 return false;
22186
22187 /* Then, as bit 4 is always set, we can mask it off, leaving
22188 the mantissa in the range [0, 15]. */
22189 mantissa &= ~(1 << 4);
22190 gcc_assert (mantissa <= 15);
22191
22192 /* GCC internally does not use IEEE754-like encoding (where normalized
22193 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
22194 Our mantissa values are shifted 4 places to the left relative to
22195 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
22196 by 5 places to correct for GCC's representation. */
22197 exponent = 5 - exponent;
22198
22199 return (exponent >= 0 && exponent <= 7);
22200}
22201
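/* Illustrative sketch, not part of the backend: a standalone check of the
   same property, written directly from the (-1)^s * (n/16) * 2^r definition
   above (a hypothetical helper, for exposition only).  For example
   2.5 == (20/16) * 2^1 and 31.0 == (31/16) * 2^4 are representable, while
   0.0 and 33.0 are not.  */

static int
quarter_precision_representable_p (double x)
{
  if (x < 0)
    x = -x;				/* The sign bit is free.  */
  /* SCALE steps through 2^-3 .. 2^4; all values involved are exact.  */
  for (double scale = 0.125; scale <= 16.0; scale *= 2.0)
    for (int n = 16; n <= 31; n++)
      if (x == n / 16.0 * scale)
	return 1;
  return 0;
}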
ab6501d7
SD
22202/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
22203 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
22204 output MOVI/MVNI, ORR or BIC immediate. */
3520f7cc 22205char*
b187677b 22206aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
ab6501d7 22207 enum simd_immediate_check which)
3520f7cc 22208{
3ea63f60 22209 bool is_valid;
3520f7cc 22210 static char templ[40];
3520f7cc 22211 const char *mnemonic;
e4f0f84d 22212 const char *shift_op;
3520f7cc 22213 unsigned int lane_count = 0;
81c2dfb9 22214 char element_char;
3520f7cc 22215
b187677b 22216 struct simd_immediate_info info;
48063b9d
IB
22217
22218 /* This will return true to show const_vector is legal for use as either
ab6501d7
SD
 22219      an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
22220 It will also update INFO to show how the immediate should be generated.
22221 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
b187677b 22222 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
3520f7cc
JG
22223 gcc_assert (is_valid);
22224
b187677b
RS
22225 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
22226 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
48063b9d 22227
b187677b 22228 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
3520f7cc 22229 {
1da83cce
RS
22230 gcc_assert (info.insn == simd_immediate_info::MOV
22231 && info.u.mov.shift == 0);
0d8e1702
KT
22232 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
22233 move immediate path. */
1da83cce
RS
22234 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
22235 info.u.mov.value = GEN_INT (0);
48063b9d
IB
22236 else
22237 {
83faf7d0 22238 const unsigned int buf_size = 20;
48063b9d 22239 char float_buf[buf_size] = {'\0'};
34a72c33 22240 real_to_decimal_for_mode (float_buf,
1da83cce 22241 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
b187677b 22242 buf_size, buf_size, 1, info.elt_mode);
48063b9d
IB
22243
22244 if (lane_count == 1)
22245 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
22246 else
22247 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
81c2dfb9 22248 lane_count, element_char, float_buf);
48063b9d
IB
22249 return templ;
22250 }
3520f7cc 22251 }
3520f7cc 22252
1da83cce 22253 gcc_assert (CONST_INT_P (info.u.mov.value));
ab6501d7
SD
22254
22255 if (which == AARCH64_CHECK_MOV)
22256 {
b187677b 22257 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
1da83cce
RS
22258 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
22259 ? "msl" : "lsl");
ab6501d7
SD
22260 if (lane_count == 1)
22261 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
1da83cce
RS
22262 mnemonic, UINTVAL (info.u.mov.value));
22263 else if (info.u.mov.shift)
ab6501d7
SD
22264 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
22265 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
1da83cce
RS
22266 element_char, UINTVAL (info.u.mov.value), shift_op,
22267 info.u.mov.shift);
ab6501d7
SD
22268 else
22269 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
22270 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
1da83cce 22271 element_char, UINTVAL (info.u.mov.value));
ab6501d7 22272 }
3520f7cc 22273 else
ab6501d7
SD
22274 {
22275 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
b187677b 22276 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
1da83cce 22277 if (info.u.mov.shift)
ab6501d7
SD
22278 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
22279 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
1da83cce
RS
22280 element_char, UINTVAL (info.u.mov.value), "lsl",
22281 info.u.mov.shift);
ab6501d7
SD
22282 else
22283 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
22284 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
1da83cce 22285 element_char, UINTVAL (info.u.mov.value));
ab6501d7 22286 }
3520f7cc
JG
22287 return templ;
22288}
22289
b7342d25 22290char*
77e994c9 22291aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
b7342d25 22292{
a2170965
TC
22293
22294 /* If a floating-point number was passed and we want to use it in an
22295 integer mode, do the conversion to integer. */
22296 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
22297 {
22298 unsigned HOST_WIDE_INT ival;
22299 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
22300 gcc_unreachable ();
22301 immediate = gen_int_mode (ival, mode);
22302 }
22303
ef4bddc2 22304 machine_mode vmode;
a2170965
TC
22305 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we
22306 use a 128-bit vector mode. */
22307 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
b7342d25 22308
a2170965 22309 vmode = aarch64_simd_container_mode (mode, width);
b7342d25 22310 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
b187677b 22311 return aarch64_output_simd_mov_immediate (v_op, width);
b7342d25
IB
22312}
22313
43cacb12
RS
22314/* Return the output string to use for moving immediate CONST_VECTOR
22315 into an SVE register. */
22316
22317char *
22318aarch64_output_sve_mov_immediate (rtx const_vector)
22319{
22320 static char templ[40];
22321 struct simd_immediate_info info;
22322 char element_char;
22323
22324 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
22325 gcc_assert (is_valid);
22326
22327 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
22328
1044fa32
RS
22329 machine_mode vec_mode = GET_MODE (const_vector);
22330 if (aarch64_sve_pred_mode_p (vec_mode))
22331 {
22332 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
0b1fe8cf
RS
22333 if (info.insn == simd_immediate_info::MOV)
22334 {
22335 gcc_assert (info.u.mov.value == const0_rtx);
22336 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
22337 }
1044fa32 22338 else
0b1fe8cf
RS
22339 {
22340 gcc_assert (info.insn == simd_immediate_info::PTRUE);
22341 unsigned int total_bytes;
22342 if (info.u.pattern == AARCH64_SV_ALL
22343 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
22344 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
22345 total_bytes / GET_MODE_SIZE (info.elt_mode));
22346 else
22347 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
22348 svpattern_token (info.u.pattern));
22349 }
1044fa32
RS
22350 return buf;
22351 }
22352
1da83cce 22353 if (info.insn == simd_immediate_info::INDEX)
43cacb12
RS
22354 {
22355 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
22356 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
1da83cce
RS
22357 element_char, INTVAL (info.u.index.base),
22358 INTVAL (info.u.index.step));
43cacb12
RS
22359 return templ;
22360 }
22361
22362 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
22363 {
1da83cce
RS
22364 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
22365 info.u.mov.value = GEN_INT (0);
43cacb12
RS
22366 else
22367 {
22368 const int buf_size = 20;
22369 char float_buf[buf_size] = {};
22370 real_to_decimal_for_mode (float_buf,
1da83cce 22371 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
43cacb12
RS
22372 buf_size, buf_size, 1, info.elt_mode);
22373
22374 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
22375 element_char, float_buf);
22376 return templ;
22377 }
22378 }
22379
22380 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
1da83cce 22381 element_char, INTVAL (info.u.mov.value));
43cacb12
RS
22382 return templ;
22383}
22384
624d0f07
RS
22385/* Return the asm template for a PTRUES. CONST_UNSPEC is the
22386 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
22387 pattern. */
22388
22389char *
22390aarch64_output_sve_ptrues (rtx const_unspec)
22391{
22392 static char templ[40];
22393
22394 struct simd_immediate_info info;
22395 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
22396 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
22397
22398 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
22399 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
22400 svpattern_token (info.u.pattern));
22401 return templ;
22402}
22403
88b08073
JG
22404/* Split operands into moves from op[1] + op[2] into op[0]. */
22405
22406void
22407aarch64_split_combinev16qi (rtx operands[3])
22408{
22409 unsigned int dest = REGNO (operands[0]);
22410 unsigned int src1 = REGNO (operands[1]);
22411 unsigned int src2 = REGNO (operands[2]);
ef4bddc2 22412 machine_mode halfmode = GET_MODE (operands[1]);
462a99aa 22413 unsigned int halfregs = REG_NREGS (operands[1]);
88b08073
JG
22414 rtx destlo, desthi;
22415
22416 gcc_assert (halfmode == V16QImode);
22417
22418 if (src1 == dest && src2 == dest + halfregs)
22419 {
22420 /* No-op move. Can't split to nothing; emit something. */
22421 emit_note (NOTE_INSN_DELETED);
22422 return;
22423 }
22424
22425 /* Preserve register attributes for variable tracking. */
22426 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
22427 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
22428 GET_MODE_SIZE (halfmode));
22429
22430 /* Special case of reversed high/low parts. */
22431 if (reg_overlap_mentioned_p (operands[2], destlo)
22432 && reg_overlap_mentioned_p (operands[1], desthi))
22433 {
22434 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
22435 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
22436 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
22437 }
22438 else if (!reg_overlap_mentioned_p (operands[2], destlo))
22439 {
22440 /* Try to avoid unnecessary moves if part of the result
22441 is in the right place already. */
22442 if (src1 != dest)
22443 emit_move_insn (destlo, operands[1]);
22444 if (src2 != dest + halfregs)
22445 emit_move_insn (desthi, operands[2]);
22446 }
22447 else
22448 {
22449 if (src2 != dest + halfregs)
22450 emit_move_insn (desthi, operands[2]);
22451 if (src1 != dest)
22452 emit_move_insn (destlo, operands[1]);
22453 }
22454}
22455
22456/* vec_perm support. */
22457
88b08073
JG
22458struct expand_vec_perm_d
22459{
22460 rtx target, op0, op1;
e3342de4 22461 vec_perm_indices perm;
ef4bddc2 22462 machine_mode vmode;
43cacb12 22463 unsigned int vec_flags;
88b08073
JG
22464 bool one_vector_p;
22465 bool testing_p;
22466};
22467
7efc03fd
DP
22468static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
22469
88b08073
JG
22470/* Generate a variable permutation. */
22471
22472static void
22473aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
22474{
ef4bddc2 22475 machine_mode vmode = GET_MODE (target);
88b08073
JG
22476 bool one_vector_p = rtx_equal_p (op0, op1);
22477
22478 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
22479 gcc_checking_assert (GET_MODE (op0) == vmode);
22480 gcc_checking_assert (GET_MODE (op1) == vmode);
22481 gcc_checking_assert (GET_MODE (sel) == vmode);
22482 gcc_checking_assert (TARGET_SIMD);
22483
22484 if (one_vector_p)
22485 {
22486 if (vmode == V8QImode)
22487 {
22488 /* Expand the argument to a V16QI mode by duplicating it. */
22489 rtx pair = gen_reg_rtx (V16QImode);
22490 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
b7e450c9 22491 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
88b08073
JG
22492 }
22493 else
22494 {
b7e450c9 22495 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
88b08073
JG
22496 }
22497 }
22498 else
22499 {
22500 rtx pair;
22501
22502 if (vmode == V8QImode)
22503 {
22504 pair = gen_reg_rtx (V16QImode);
22505 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
b7e450c9 22506 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
88b08073
JG
22507 }
22508 else
22509 {
66f206b8 22510 pair = gen_reg_rtx (V2x16QImode);
88b08073 22511 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
b7e450c9 22512 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
88b08073
JG
22513 }
22514 }
22515}
22516
80940017
RS
22517/* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
22518 NELT is the number of elements in the vector. */
22519
88b08073 22520void
80940017
RS
22521aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
22522 unsigned int nelt)
88b08073 22523{
ef4bddc2 22524 machine_mode vmode = GET_MODE (target);
88b08073 22525 bool one_vector_p = rtx_equal_p (op0, op1);
f7c4e5b8 22526 rtx mask;
88b08073
JG
22527
22528 /* The TBL instruction does not use a modulo index, so we must take care
22529 of that ourselves. */
f7c4e5b8
AL
22530 mask = aarch64_simd_gen_const_vector_dup (vmode,
22531 one_vector_p ? nelt - 1 : 2 * nelt - 1);
88b08073
JG
22532 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
22533
f7c4e5b8
AL
22534 /* For big-endian, we also need to reverse the index within the vector
22535 (but not which vector). */
22536 if (BYTES_BIG_ENDIAN)
22537 {
22538 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
22539 if (!one_vector_p)
22540 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
22541 sel = expand_simple_binop (vmode, XOR, sel, mask,
22542 NULL, 0, OPTAB_LIB_WIDEN);
22543 }
88b08073
JG
22544 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
22545}
22546
43cacb12
RS
22547/* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
22548
22549static void
22550emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
22551{
22552 emit_insn (gen_rtx_SET (target,
22553 gen_rtx_UNSPEC (GET_MODE (target),
22554 gen_rtvec (2, op0, op1), code)));
22555}
22556
22557/* Expand an SVE vec_perm with the given operands. */
22558
22559void
22560aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
22561{
22562 machine_mode data_mode = GET_MODE (target);
22563 machine_mode sel_mode = GET_MODE (sel);
22564 /* Enforced by the pattern condition. */
22565 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
22566
22567 /* Note: vec_perm indices are supposed to wrap when they go beyond the
22568 size of the two value vectors, i.e. the upper bits of the indices
22569 are effectively ignored. SVE TBL instead produces 0 for any
22570 out-of-range indices, so we need to modulo all the vec_perm indices
22571 to ensure they are all in range. */
22572 rtx sel_reg = force_reg (sel_mode, sel);
22573
22574 /* Check if the sel only references the first values vector. */
568b9c0e 22575 if (CONST_VECTOR_P (sel)
43cacb12
RS
22576 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
22577 {
22578 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
22579 return;
22580 }
22581
22582 /* Check if the two values vectors are the same. */
22583 if (rtx_equal_p (op0, op1))
22584 {
22585 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
22586 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
22587 NULL, 0, OPTAB_DIRECT);
22588 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
22589 return;
22590 }
22591
22592 /* Run TBL on each value vector and combine the results. */
22593
22594 rtx res0 = gen_reg_rtx (data_mode);
22595 rtx res1 = gen_reg_rtx (data_mode);
22596 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
568b9c0e 22597 if (!CONST_VECTOR_P (sel)
43cacb12
RS
22598 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
22599 {
22600 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
22601 2 * nunits - 1);
22602 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
22603 NULL, 0, OPTAB_DIRECT);
22604 }
22605 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
22606 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
22607 NULL, 0, OPTAB_DIRECT);
22608 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
22609 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
22610 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
22611 else
22612 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
22613}
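/* Illustrative walk-through of the general two-vector case above, using a
   fixed length of four elements for concreteness (SVE vectors are really
   variable-length):

     sel            = { 1, 6, 3, 4 }      indices into op0:op1
     TBL (op0, sel) = { op0[1], 0, op0[3], 0 }   out-of-range lanes -> 0
     sel - 4        = { -3, 2, -1, 0 }    negative values wrap to large
                                          unsigned indices, hence -> 0
     TBL (op1, ...) = { 0, op1[2], 0, op1[0] }
     IOR            = { op0[1], op1[2], op0[3], op1[0] }

   which is the required permutation.  */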
22614
cc4d934f
JG
22615/* Recognize patterns suitable for the TRN instructions. */
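/* As an illustrative example (assuming V4SI and the little-endian
   numbering used below), the permutations recognized here are:

     TRN1: { 0, 4, 2, 6 }
     TRN2: { 1, 5, 3, 7 }

   i.e. perm[0] is 0 or 1 and the indices then step by two, alternating
   between the two input vectors.  */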
22616static bool
22617aarch64_evpc_trn (struct expand_vec_perm_d *d)
22618{
6a70badb
RS
22619 HOST_WIDE_INT odd;
22620 poly_uint64 nelt = d->perm.length ();
cc4d934f 22621 rtx out, in0, in1, x;
ef4bddc2 22622 machine_mode vmode = d->vmode;
cc4d934f
JG
22623
22624 if (GET_MODE_UNIT_SIZE (vmode) > 8)
22625 return false;
22626
22627 /* Note that these are little-endian tests.
22628 We correct for big-endian later. */
6a70badb
RS
22629 if (!d->perm[0].is_constant (&odd)
22630 || (odd != 0 && odd != 1)
326ac20e
RS
22631 || !d->perm.series_p (0, 2, odd, 2)
22632 || !d->perm.series_p (1, 2, nelt + odd, 2))
cc4d934f 22633 return false;
cc4d934f
JG
22634
22635 /* Success! */
22636 if (d->testing_p)
22637 return true;
22638
22639 in0 = d->op0;
22640 in1 = d->op1;
43cacb12
RS
22641 /* We don't need a big-endian lane correction for SVE; see the comment
22642 at the head of aarch64-sve.md for details. */
22643 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
cc4d934f
JG
22644 {
22645 x = in0, in0 = in1, in1 = x;
22646 odd = !odd;
22647 }
22648 out = d->target;
22649
3f8334a5
RS
22650 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
22651 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
cc4d934f
JG
22652 return true;
22653}
22654
7efc03fd
DP
22655/* Try to re-encode the PERM constant so it combines odd and even elements.
22656 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
22657 We retry with this new constant with the full suite of patterns. */
22658static bool
22659aarch64_evpc_reencode (struct expand_vec_perm_d *d)
22660{
22661 expand_vec_perm_d newd;
22662 unsigned HOST_WIDE_INT nelt;
22663
22664 if (d->vec_flags != VEC_ADVSIMD)
22665 return false;
22666
22667 /* Get the new mode. Always twice the size of the inner
22668 and half the elements. */
22669 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
22670 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
22671 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
22672 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
22673
22674 if (new_mode == word_mode)
22675 return false;
22676
22677 /* to_constant is safe since this routine is specific to Advanced SIMD
22678 vectors. */
22679 nelt = d->perm.length ().to_constant ();
22680
22681 vec_perm_builder newpermconst;
22682 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
22683
22684 /* Convert the perm constant if we can. Require each pair of elements to be (even, even + 1). */
22685 for (unsigned int i = 0; i < nelt; i += 2)
22686 {
22687 poly_int64 elt0 = d->perm[i];
22688 poly_int64 elt1 = d->perm[i + 1];
22689 poly_int64 newelt;
22690 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
22691 return false;
22692 newpermconst.quick_push (newelt.to_constant ());
22693 }
22694 newpermconst.finalize ();
22695
22696 newd.vmode = new_mode;
22697 newd.vec_flags = VEC_ADVSIMD;
22698 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
22699 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
22700 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
22701 newd.testing_p = d->testing_p;
22702 newd.one_vector_p = d->one_vector_p;
22703
22704 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
22705 return aarch64_expand_vec_perm_const_1 (&newd);
22706}
22707
cc4d934f
JG
22708/* Recognize patterns suitable for the UZP instructions. */
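/* As an illustrative example (assuming V4SI, little-endian numbering),
   the permutations recognized here are:

     UZP1: { 0, 2, 4, 6 }
     UZP2: { 1, 3, 5, 7 }

   i.e. perm[0] is 0 or 1 and the whole sequence then steps by two,
   concatenating the even (or odd) elements of op0 and op1.  */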
22709static bool
22710aarch64_evpc_uzp (struct expand_vec_perm_d *d)
22711{
6a70badb 22712 HOST_WIDE_INT odd;
cc4d934f 22713 rtx out, in0, in1, x;
ef4bddc2 22714 machine_mode vmode = d->vmode;
cc4d934f
JG
22715
22716 if (GET_MODE_UNIT_SIZE (vmode) > 8)
22717 return false;
22718
22719 /* Note that these are little-endian tests.
22720 We correct for big-endian later. */
6a70badb
RS
22721 if (!d->perm[0].is_constant (&odd)
22722 || (odd != 0 && odd != 1)
326ac20e 22723 || !d->perm.series_p (0, 1, odd, 2))
cc4d934f 22724 return false;
cc4d934f
JG
22725
22726 /* Success! */
22727 if (d->testing_p)
22728 return true;
22729
22730 in0 = d->op0;
22731 in1 = d->op1;
43cacb12
RS
22732 /* We don't need a big-endian lane correction for SVE; see the comment
22733 at the head of aarch64-sve.md for details. */
22734 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
cc4d934f
JG
22735 {
22736 x = in0, in0 = in1, in1 = x;
22737 odd = !odd;
22738 }
22739 out = d->target;
22740
3f8334a5
RS
22741 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
22742 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
cc4d934f
JG
22743 return true;
22744}
22745
22746/* Recognize patterns suitable for the ZIP instructions. */
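/* As an illustrative example (assuming V4SI, little-endian numbering),
   the permutations recognized here are:

     ZIP1: { 0, 4, 1, 5 }
     ZIP2: { 2, 6, 3, 7 }

   i.e. perm[0] is 0 or nelt/2 and the low (or high) halves of op0 and
   op1 are then interleaved element by element.  */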
22747static bool
22748aarch64_evpc_zip (struct expand_vec_perm_d *d)
22749{
6a70badb
RS
22750 unsigned int high;
22751 poly_uint64 nelt = d->perm.length ();
cc4d934f 22752 rtx out, in0, in1, x;
ef4bddc2 22753 machine_mode vmode = d->vmode;
cc4d934f
JG
22754
22755 if (GET_MODE_UNIT_SIZE (vmode) > 8)
22756 return false;
22757
22758 /* Note that these are little-endian tests.
22759 We correct for big-endian later. */
6a70badb
RS
22760 poly_uint64 first = d->perm[0];
22761 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
22762 || !d->perm.series_p (0, 2, first, 1)
22763 || !d->perm.series_p (1, 2, first + nelt, 1))
cc4d934f 22764 return false;
6a70badb 22765 high = maybe_ne (first, 0U);
cc4d934f
JG
22766
22767 /* Success! */
22768 if (d->testing_p)
22769 return true;
22770
22771 in0 = d->op0;
22772 in1 = d->op1;
43cacb12
RS
22773 /* We don't need a big-endian lane correction for SVE; see the comment
22774 at the head of aarch64-sve.md for details. */
22775 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
cc4d934f
JG
22776 {
22777 x = in0, in0 = in1, in1 = x;
22778 high = !high;
22779 }
22780 out = d->target;
22781
3f8334a5
RS
22782 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
22783 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
cc4d934f
JG
22784 return true;
22785}
22786
ae0533da
AL
22787/* Recognize patterns for the EXT insn. */
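/* As an illustrative example (assuming V4SI, little-endian numbering),
   the permutation { 1, 2, 3, 4 } takes the top three elements of op0
   followed by the first element of op1, i.e. an EXT with an element
   offset of one.  The indices must simply increase by one from a
   constant starting point.  */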
22788
22789static bool
22790aarch64_evpc_ext (struct expand_vec_perm_d *d)
22791{
6a70badb 22792 HOST_WIDE_INT location;
ae0533da
AL
22793 rtx offset;
22794
6a70badb
RS
22795 /* The first element always refers to the first vector.
22796 Check if the extracted indices are increasing by one. */
43cacb12
RS
22797 if (d->vec_flags == VEC_SVE_PRED
22798 || !d->perm[0].is_constant (&location)
6a70badb 22799 || !d->perm.series_p (0, 1, location, 1))
326ac20e 22800 return false;
ae0533da 22801
ae0533da
AL
22802 /* Success! */
22803 if (d->testing_p)
22804 return true;
22805
b31e65bb 22806 /* The case where (location == 0) is a no-op for both big- and little-endian,
43cacb12 22807 and is removed by the mid-end at optimization levels -O1 and higher.
b31e65bb 22808
43cacb12
RS
22809 We don't need a big-endian lane correction for SVE; see the comment
22810 at the head of aarch64-sve.md for details. */
22811 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
ae0533da
AL
22812 {
22813 /* After setup, we want the high elements of the first vector (stored
22814 at the LSB end of the register), and the low elements of the second
22815 vector (stored at the MSB end of the register). So swap. */
cb5c6c29 22816 std::swap (d->op0, d->op1);
6a70badb
RS
22817 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
22818 to_constant () is safe since this is restricted to Advanced SIMD
22819 vectors. */
22820 location = d->perm.length ().to_constant () - location;
ae0533da
AL
22821 }
22822
22823 offset = GEN_INT (location);
3f8334a5
RS
22824 emit_set_insn (d->target,
22825 gen_rtx_UNSPEC (d->vmode,
22826 gen_rtvec (3, d->op0, d->op1, offset),
22827 UNSPEC_EXT));
ae0533da
AL
22828 return true;
22829}
22830
43cacb12
RS
22831/* Recognize patterns for the REV{64,32,16} insns, which reverse elements
22832 within each 64-bit, 32-bit or 16-bit granule. */
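/* As an illustrative example (assuming V8HI, little-endian numbering):
   { 3, 2, 1, 0, 7, 6, 5, 4 } reverses the 16-bit elements within each
   64-bit granule and so maps to REV64, while { 1, 0, 3, 2, 5, 4, 7, 6 }
   reverses within each 32-bit granule and maps to REV32.  */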
923fcec3
AL
22833
22834static bool
43cacb12 22835aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
923fcec3 22836{
6a70badb
RS
22837 HOST_WIDE_INT diff;
22838 unsigned int i, size, unspec;
43cacb12 22839 machine_mode pred_mode;
923fcec3 22840
43cacb12
RS
22841 if (d->vec_flags == VEC_SVE_PRED
22842 || !d->one_vector_p
98452668
AC
22843 || !d->perm[0].is_constant (&diff)
22844 || !diff)
923fcec3
AL
22845 return false;
22846
6c3ce63b
RS
22847 if (d->vec_flags & VEC_SVE_DATA)
22848 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
22849 else
22850 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
22851 if (size == 64)
43cacb12
RS
22852 {
22853 unspec = UNSPEC_REV64;
22854 pred_mode = VNx2BImode;
22855 }
6c3ce63b 22856 else if (size == 32)
43cacb12
RS
22857 {
22858 unspec = UNSPEC_REV32;
22859 pred_mode = VNx4BImode;
22860 }
6c3ce63b 22861 else if (size == 16)
43cacb12
RS
22862 {
22863 unspec = UNSPEC_REV16;
22864 pred_mode = VNx8BImode;
22865 }
3f8334a5
RS
22866 else
22867 return false;
923fcec3 22868
326ac20e
RS
22869 unsigned int step = diff + 1;
22870 for (i = 0; i < step; ++i)
22871 if (!d->perm.series_p (i, step, diff - i, step))
22872 return false;
923fcec3
AL
22873
22874 /* Success! */
22875 if (d->testing_p)
22876 return true;
22877
6c3ce63b
RS
22878 if (d->vec_flags & VEC_SVE_DATA)
22879 {
22880 rtx pred = aarch64_ptrue_reg (pred_mode);
22881 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
22882 d->target, pred, d->op0));
d7a09c44 22883 return true;
43cacb12 22884 }
d7a09c44 22885 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
43cacb12
RS
22886 emit_set_insn (d->target, src);
22887 return true;
22888}
22889
22890/* Recognize patterns for the REV insn, which reverses elements within
22891 a full vector. */
22892
22893static bool
22894aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
22895{
22896 poly_uint64 nelt = d->perm.length ();
22897
28350fd1 22898 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
43cacb12
RS
22899 return false;
22900
22901 if (!d->perm.series_p (0, 1, nelt - 1, -1))
22902 return false;
22903
22904 /* Success! */
22905 if (d->testing_p)
22906 return true;
22907
22908 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
22909 emit_set_insn (d->target, src);
923fcec3
AL
22910 return true;
22911}
22912
91bd4114
JG
22913static bool
22914aarch64_evpc_dup (struct expand_vec_perm_d *d)
22915{
91bd4114
JG
22916 rtx out = d->target;
22917 rtx in0;
6a70badb 22918 HOST_WIDE_INT elt;
ef4bddc2 22919 machine_mode vmode = d->vmode;
91bd4114
JG
22920 rtx lane;
22921
43cacb12
RS
22922 if (d->vec_flags == VEC_SVE_PRED
22923 || d->perm.encoding ().encoded_nelts () != 1
6a70badb 22924 || !d->perm[0].is_constant (&elt))
326ac20e
RS
22925 return false;
22926
6c3ce63b
RS
22927 if ((d->vec_flags & VEC_SVE_DATA)
22928 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
43cacb12
RS
22929 return false;
22930
326ac20e
RS
22931 /* Success! */
22932 if (d->testing_p)
22933 return true;
22934
91bd4114
JG
22935 /* The generic preparation in aarch64_expand_vec_perm_const_1
22936 swaps the operand order and the permute indices if it finds
22937 d->perm[0] to be in the second operand. Thus, we can always
22938 use d->op0 and need not do any extra arithmetic to get the
22939 correct lane number. */
22940 in0 = d->op0;
f901401e 22941 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
91bd4114 22942
3f8334a5
RS
22943 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
22944 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
22945 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
91bd4114
JG
22946 return true;
22947}
22948
88b08073
JG
22949static bool
22950aarch64_evpc_tbl (struct expand_vec_perm_d *d)
22951{
43cacb12 22952 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
ef4bddc2 22953 machine_mode vmode = d->vmode;
6a70badb
RS
22954
22955 /* Make sure that the indices are constant. */
22956 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
22957 for (unsigned int i = 0; i < encoded_nelts; ++i)
22958 if (!d->perm[i].is_constant ())
22959 return false;
88b08073 22960
88b08073
JG
22961 if (d->testing_p)
22962 return true;
22963
22964 /* Generic code will try constant permutation twice. Once with the
22965 original mode and again with the elements lowered to QImode.
22966 So wait and don't do the selector expansion ourselves. */
22967 if (vmode != V8QImode && vmode != V16QImode)
22968 return false;
22969
6a70badb
RS
22970 /* to_constant is safe since this routine is specific to Advanced SIMD
22971 vectors. */
22972 unsigned int nelt = d->perm.length ().to_constant ();
22973 for (unsigned int i = 0; i < nelt; ++i)
22974 /* If big-endian and two vectors we end up with a weird mixed-endian
22975 mode on NEON. Reverse the index within each word but not the word
22976 itself. to_constant is safe because we checked is_constant above. */
22977 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
22978 ? d->perm[i].to_constant () ^ (nelt - 1)
22979 : d->perm[i].to_constant ());
bbcc9c00 22980
88b08073
JG
22981 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
22982 sel = force_reg (vmode, sel);
22983
22984 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
22985 return true;
22986}
22987
43cacb12
RS
22988/* Try to implement D using an SVE TBL instruction. */
22989
22990static bool
22991aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
22992{
22993 unsigned HOST_WIDE_INT nelt;
22994
22995 /* Permuting two variable-length vectors could overflow the
22996 index range. */
22997 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
22998 return false;
22999
23000 if (d->testing_p)
23001 return true;
23002
d083ee47 23003 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
43cacb12 23004 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
e25c95ef
RS
23005 if (d->one_vector_p)
23006 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
23007 else
23008 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
43cacb12
RS
23009 return true;
23010}
23011
9556ef20
PK
23012/* Try to implement D using an SVE SEL instruction. */
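/* As an illustrative example (written for a fixed length of four
   elements, although SVE vectors are really variable-length): the
   permutation { 0, NELT + 1, 2, NELT + 3 } keeps the even lanes of op0
   and takes the odd lanes from op1, so it can be implemented as a SEL
   governed by the predicate { 1, 0, 1, 0 }.  */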
23013
23014static bool
23015aarch64_evpc_sel (struct expand_vec_perm_d *d)
23016{
23017 machine_mode vmode = d->vmode;
23018 int unit_size = GET_MODE_UNIT_SIZE (vmode);
23019
23020 if (d->vec_flags != VEC_SVE_DATA
23021 || unit_size > 8)
23022 return false;
23023
23024 int n_patterns = d->perm.encoding ().npatterns ();
23025 poly_int64 vec_len = d->perm.length ();
23026
23027 for (int i = 0; i < n_patterns; ++i)
23028 if (!known_eq (d->perm[i], i)
23029 && !known_eq (d->perm[i], vec_len + i))
23030 return false;
23031
23032 for (int i = n_patterns; i < n_patterns * 2; i++)
23033 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
23034 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
23035 return false;
23036
23037 if (d->testing_p)
23038 return true;
23039
cc68f7c2 23040 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
9556ef20 23041
b2f5b380 23042 /* Build a predicate that is true when op0 elements should be used. */
9556ef20
PK
23043 rtx_vector_builder builder (pred_mode, n_patterns, 2);
23044 for (int i = 0; i < n_patterns * 2; i++)
23045 {
23046 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
23047 : CONST0_RTX (BImode);
23048 builder.quick_push (elem);
23049 }
23050
23051 rtx const_vec = builder.build ();
23052 rtx pred = force_reg (pred_mode, const_vec);
b2f5b380
RS
23053 /* TARGET = PRED ? OP0 : OP1. */
23054 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
9556ef20
PK
23055 return true;
23056}
23057
c9c87e6f
DP
23058/* Recognize patterns suitable for the INS instructions. */
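/* As an illustrative example (assuming V4SI, little-endian numbering),
   the permutation { 0, 1, 6, 3 } is the identity except for element 2,
   which comes from element 2 of op1; that corresponds to a single INS
   of one lane into an otherwise unmodified vector.  */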
23059static bool
23060aarch64_evpc_ins (struct expand_vec_perm_d *d)
23061{
23062 machine_mode mode = d->vmode;
23063 unsigned HOST_WIDE_INT nelt;
23064
23065 if (d->vec_flags != VEC_ADVSIMD)
23066 return false;
23067
23068 /* to_constant is safe since this routine is specific to Advanced SIMD
23069 vectors. */
23070 nelt = d->perm.length ().to_constant ();
23071 rtx insv = d->op0;
23072
23073 HOST_WIDE_INT idx = -1;
23074
23075 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
23076 {
23077 HOST_WIDE_INT elt;
23078 if (!d->perm[i].is_constant (&elt))
23079 return false;
23080 if (elt == (HOST_WIDE_INT) i)
23081 continue;
23082 if (idx != -1)
23083 {
23084 idx = -1;
23085 break;
23086 }
23087 idx = i;
23088 }
23089
23090 if (idx == -1)
23091 {
23092 insv = d->op1;
23093 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
23094 {
23095 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
23096 continue;
23097 if (idx != -1)
23098 return false;
23099 idx = i;
23100 }
23101
23102 if (idx == -1)
23103 return false;
23104 }
23105
23106 if (d->testing_p)
23107 return true;
23108
23109 gcc_assert (idx != -1);
23110
23111 unsigned extractindex = d->perm[idx].to_constant ();
23112 rtx extractv = d->op0;
23113 if (extractindex >= nelt)
23114 {
23115 extractv = d->op1;
23116 extractindex -= nelt;
23117 }
23118 gcc_assert (extractindex < nelt);
23119
c9c87e6f
DP
23120 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
23121 expand_operand ops[5];
23122 create_output_operand (&ops[0], d->target, mode);
52fa7717 23123 create_input_operand (&ops[1], insv, mode);
c9c87e6f
DP
23124 create_integer_operand (&ops[2], 1 << idx);
23125 create_input_operand (&ops[3], extractv, mode);
23126 create_integer_operand (&ops[4], extractindex);
23127 expand_insn (icode, 5, ops);
23128
23129 return true;
23130}
23131
88b08073
JG
23132static bool
23133aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
23134{
23135 /* The pattern matching functions above are written to look for a small
23136 number to begin the sequence (0, 1, N/2). If we begin with an index
23137 from the second operand, we can swap the operands. */
6a70badb
RS
23138 poly_int64 nelt = d->perm.length ();
23139 if (known_ge (d->perm[0], nelt))
88b08073 23140 {
e3342de4 23141 d->perm.rotate_inputs (1);
cb5c6c29 23142 std::swap (d->op0, d->op1);
88b08073
JG
23143 }
23144
43cacb12
RS
23145 if ((d->vec_flags == VEC_ADVSIMD
23146 || d->vec_flags == VEC_SVE_DATA
6c3ce63b 23147 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
43cacb12
RS
23148 || d->vec_flags == VEC_SVE_PRED)
23149 && known_gt (nelt, 1))
cc4d934f 23150 {
43cacb12
RS
23151 if (aarch64_evpc_rev_local (d))
23152 return true;
23153 else if (aarch64_evpc_rev_global (d))
923fcec3
AL
23154 return true;
23155 else if (aarch64_evpc_ext (d))
ae0533da 23156 return true;
f901401e
AL
23157 else if (aarch64_evpc_dup (d))
23158 return true;
ae0533da 23159 else if (aarch64_evpc_zip (d))
cc4d934f
JG
23160 return true;
23161 else if (aarch64_evpc_uzp (d))
23162 return true;
23163 else if (aarch64_evpc_trn (d))
23164 return true;
9556ef20
PK
23165 else if (aarch64_evpc_sel (d))
23166 return true;
c9c87e6f
DP
23167 else if (aarch64_evpc_ins (d))
23168 return true;
7efc03fd
DP
23169 else if (aarch64_evpc_reencode (d))
23170 return true;
43cacb12
RS
23171 if (d->vec_flags == VEC_SVE_DATA)
23172 return aarch64_evpc_sve_tbl (d);
4ec8bb67 23173 else if (d->vec_flags == VEC_ADVSIMD)
43cacb12 23174 return aarch64_evpc_tbl (d);
cc4d934f 23175 }
88b08073
JG
23176 return false;
23177}
23178
f151c9e1 23179/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
88b08073 23180
f151c9e1
RS
23181static bool
23182aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
23183 rtx op1, const vec_perm_indices &sel)
88b08073
JG
23184{
23185 struct expand_vec_perm_d d;
88b08073 23186
326ac20e 23187 /* Check whether the mask can be applied to a single vector. */
e25c95ef
RS
23188 if (sel.ninputs () == 1
23189 || (op0 && rtx_equal_p (op0, op1)))
326ac20e
RS
23190 d.one_vector_p = true;
23191 else if (sel.all_from_input_p (0))
88b08073 23192 {
326ac20e
RS
23193 d.one_vector_p = true;
23194 op1 = op0;
88b08073 23195 }
326ac20e 23196 else if (sel.all_from_input_p (1))
88b08073 23197 {
88b08073 23198 d.one_vector_p = true;
326ac20e 23199 op0 = op1;
88b08073 23200 }
326ac20e
RS
23201 else
23202 d.one_vector_p = false;
88b08073 23203
326ac20e
RS
23204 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
23205 sel.nelts_per_input ());
23206 d.vmode = vmode;
43cacb12 23207 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
326ac20e 23208 d.target = target;
b1d1e2b5
JJ
23209 d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
23210 if (op0 == op1)
23211 d.op1 = d.op0;
23212 else
23213 d.op1 = op1 ? force_reg (vmode, op1) : NULL_RTX;
326ac20e 23214 d.testing_p = !target;
e3342de4 23215
f151c9e1
RS
23216 if (!d.testing_p)
23217 return aarch64_expand_vec_perm_const_1 (&d);
88b08073 23218
326ac20e 23219 rtx_insn *last = get_last_insn ();
f151c9e1 23220 bool ret = aarch64_expand_vec_perm_const_1 (&d);
326ac20e 23221 gcc_assert (last == get_last_insn ());
88b08073
JG
23222
23223 return ret;
23224}
23225
73e3da51
RS
23226/* Generate a byte permute mask for a register of mode MODE,
23227 which has NUNITS units. */
23228
668046d1 23229rtx
73e3da51 23230aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
668046d1
DS
23231{
23232 /* We have to reverse each vector because we don't have
23233 a permuted load that can reverse-load according to ABI rules. */
23234 rtx mask;
23235 rtvec v = rtvec_alloc (16);
73e3da51
RS
23236 unsigned int i, j;
23237 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
668046d1
DS
23238
23239 gcc_assert (BYTES_BIG_ENDIAN);
23240 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
23241
23242 for (i = 0; i < nunits; i++)
23243 for (j = 0; j < usize; j++)
23244 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
23245 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
23246 return force_reg (V16QImode, mask);
23247}
23248
4a942af6 23249/* Expand an SVE integer comparison using the SVE equivalent of:
f22d7973 23250
4a942af6
RS
23251 (set TARGET (CODE OP0 OP1)). */
23252
23253void
23254aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
f22d7973 23255{
4a942af6
RS
23256 machine_mode pred_mode = GET_MODE (target);
23257 machine_mode data_mode = GET_MODE (op0);
00fa90d9
RS
23258 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
23259 op0, op1);
23260 if (!rtx_equal_p (target, res))
23261 emit_move_insn (target, res);
f22d7973
RS
23262}
23263
43cacb12
RS
23264/* Return the UNSPEC_COND_* code for comparison CODE. */
23265
23266static unsigned int
23267aarch64_unspec_cond_code (rtx_code code)
23268{
23269 switch (code)
23270 {
23271 case NE:
cb18e86d 23272 return UNSPEC_COND_FCMNE;
43cacb12 23273 case EQ:
cb18e86d 23274 return UNSPEC_COND_FCMEQ;
43cacb12 23275 case LT:
cb18e86d 23276 return UNSPEC_COND_FCMLT;
43cacb12 23277 case GT:
cb18e86d 23278 return UNSPEC_COND_FCMGT;
43cacb12 23279 case LE:
cb18e86d 23280 return UNSPEC_COND_FCMLE;
43cacb12 23281 case GE:
cb18e86d 23282 return UNSPEC_COND_FCMGE;
4a942af6
RS
23283 case UNORDERED:
23284 return UNSPEC_COND_FCMUO;
43cacb12
RS
23285 default:
23286 gcc_unreachable ();
23287 }
23288}
23289
f22d7973 23290/* Emit:
43cacb12 23291
4a942af6 23292 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
f22d7973 23293
4a942af6
RS
23294 where <X> is the operation associated with comparison CODE.
23295 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
f22d7973
RS
23296
23297static void
4a942af6
RS
23298aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
23299 bool known_ptrue_p, rtx op0, rtx op1)
43cacb12 23300{
4a942af6 23301 rtx flag = gen_int_mode (known_ptrue_p, SImode);
f22d7973 23302 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
4a942af6 23303 gen_rtvec (4, pred, flag, op0, op1),
f22d7973
RS
23304 aarch64_unspec_cond_code (code));
23305 emit_set_insn (target, unspec);
43cacb12
RS
23306}
23307
f22d7973 23308/* Emit the SVE equivalent of:
43cacb12 23309
4a942af6
RS
23310 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
23311 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
f22d7973 23312 (set TARGET (ior:PRED_MODE TMP1 TMP2))
43cacb12 23313
4a942af6
RS
23314 where <Xi> is the operation associated with comparison CODEi.
23315 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
43cacb12
RS
23316
23317static void
4a942af6
RS
23318aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
23319 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
43cacb12 23320{
4a942af6 23321 machine_mode pred_mode = GET_MODE (pred);
43cacb12 23322 rtx tmp1 = gen_reg_rtx (pred_mode);
4a942af6 23323 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
43cacb12 23324 rtx tmp2 = gen_reg_rtx (pred_mode);
4a942af6 23325 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
f22d7973 23326 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
43cacb12
RS
23327}
23328
f22d7973 23329/* Emit the SVE equivalent of:
43cacb12 23330
4a942af6 23331 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
f22d7973 23332 (set TARGET (not TMP))
43cacb12 23333
4a942af6
RS
23334 where <X> is the operation associated with comparison CODE.
23335 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
43cacb12
RS
23336
23337static void
4a942af6
RS
23338aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
23339 bool known_ptrue_p, rtx op0, rtx op1)
43cacb12 23340{
4a942af6 23341 machine_mode pred_mode = GET_MODE (pred);
f22d7973 23342 rtx tmp = gen_reg_rtx (pred_mode);
4a942af6 23343 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
f22d7973 23344 aarch64_emit_unop (target, one_cmpl_optab, tmp);
43cacb12
RS
23345}
23346
f22d7973 23347/* Expand an SVE floating-point comparison using the SVE equivalent of:
43cacb12 23348
f22d7973 23349 (set TARGET (CODE OP0 OP1))
43cacb12
RS
23350
23351 If CAN_INVERT_P is true, the caller can also handle inverted results;
23352 return true if the result is in fact inverted. */
23353
23354bool
23355aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
23356 rtx op0, rtx op1, bool can_invert_p)
23357{
23358 machine_mode pred_mode = GET_MODE (target);
23359 machine_mode data_mode = GET_MODE (op0);
23360
16de3637 23361 rtx ptrue = aarch64_ptrue_reg (pred_mode);
43cacb12
RS
23362 switch (code)
23363 {
23364 case UNORDERED:
23365 /* UNORDERED has no immediate form. */
23366 op1 = force_reg (data_mode, op1);
f22d7973 23367 /* fall through */
43cacb12
RS
23368 case LT:
23369 case LE:
23370 case GT:
23371 case GE:
23372 case EQ:
23373 case NE:
f22d7973
RS
23374 {
23375 /* There is native support for the comparison. */
4a942af6 23376 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
f22d7973
RS
23377 return false;
23378 }
43cacb12
RS
23379
23380 case LTGT:
23381 /* This is a trapping operation (LT or GT). */
4a942af6 23382 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
43cacb12
RS
23383 return false;
23384
23385 case UNEQ:
23386 if (!flag_trapping_math)
23387 {
23388 /* This would trap for signaling NaNs. */
23389 op1 = force_reg (data_mode, op1);
4a942af6
RS
23390 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
23391 ptrue, true, op0, op1);
43cacb12
RS
23392 return false;
23393 }
23394 /* fall through */
43cacb12
RS
23395 case UNLT:
23396 case UNLE:
23397 case UNGT:
23398 case UNGE:
f22d7973
RS
23399 if (flag_trapping_math)
23400 {
23401 /* Work out which elements are ordered. */
23402 rtx ordered = gen_reg_rtx (pred_mode);
23403 op1 = force_reg (data_mode, op1);
4a942af6
RS
23404 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
23405 ptrue, true, op0, op1);
f22d7973
RS
23406
23407 /* Test the opposite condition for the ordered elements,
23408 then invert the result. */
23409 if (code == UNEQ)
23410 code = NE;
23411 else
23412 code = reverse_condition_maybe_unordered (code);
23413 if (can_invert_p)
23414 {
4a942af6
RS
23415 aarch64_emit_sve_fp_cond (target, code,
23416 ordered, false, op0, op1);
f22d7973
RS
23417 return true;
23418 }
4a942af6
RS
23419 aarch64_emit_sve_invert_fp_cond (target, code,
23420 ordered, false, op0, op1);
f22d7973
RS
23421 return false;
23422 }
23423 break;
23424
23425 case ORDERED:
23426 /* ORDERED has no immediate form. */
23427 op1 = force_reg (data_mode, op1);
23428 break;
43cacb12
RS
23429
23430 default:
23431 gcc_unreachable ();
23432 }
f22d7973
RS
23433
23434 /* There is native support for the inverse comparison. */
23435 code = reverse_condition_maybe_unordered (code);
23436 if (can_invert_p)
23437 {
4a942af6 23438 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
f22d7973
RS
23439 return true;
23440 }
4a942af6 23441 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
f22d7973 23442 return false;
43cacb12
RS
23443}
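/* Illustrative walk-through of the trapping-math path above, assuming
   the requested comparison is UNGE (greater-than-or-equal or unordered):

     ordered = ~ FCMUO (ptrue, op0, op1)    predicate of ordered lanes
     tmp     =   FCMLT (ordered, op0, op1)  LT, evaluated only on
                                            ordered lanes
     result  = ~ tmp

   On ordered lanes ~LT is GE; on unordered lanes the governed FCMLT
   yields false, so the inversion makes them true.  The union is exactly
   UNGE, and only the ordered lanes ever see the potentially trapping
   FCMLT.  */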
23444
23445/* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
23446 of the data being selected and CMP_MODE is the mode of the values being
23447 compared. */
23448
23449void
23450aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
23451 rtx *ops)
23452{
10116ec1 23453 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
43cacb12
RS
23454 rtx pred = gen_reg_rtx (pred_mode);
23455 if (FLOAT_MODE_P (cmp_mode))
23456 {
23457 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
23458 ops[4], ops[5], true))
23459 std::swap (ops[1], ops[2]);
23460 }
23461 else
23462 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
23463
d29f7dd5
RS
23464 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
23465 ops[1] = force_reg (data_mode, ops[1]);
23466 /* The "false" value can only be zero if the "true" value is a constant. */
23467 if (register_operand (ops[1], data_mode)
23468 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
23469 ops[2] = force_reg (data_mode, ops[2]);
23470
43cacb12
RS
23471 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
23472 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
23473}
23474
99e1629f
RS
23475/* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
23476 true. However, due to issues with register allocation it is preferable
23477 to avoid tying integer scalar and FP scalar modes. Executing integer
23478 operations in general registers is better than treating them as scalar
23479 vector operations. This reduces latency and avoids redundant int<->FP
23480 moves. So tie modes if they are either the same class, or vector modes
23481 with other vector modes, vector structs or any scalar mode. */
97e1ad78 23482
99e1629f 23483static bool
ef4bddc2 23484aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
97e1ad78 23485{
66f206b8
JW
23486 if ((aarch64_advsimd_partial_struct_mode_p (mode1)
23487 != aarch64_advsimd_partial_struct_mode_p (mode2))
23488 && maybe_gt (GET_MODE_SIZE (mode1), 8)
23489 && maybe_gt (GET_MODE_SIZE (mode2), 8))
23490 return false;
23491
97e1ad78
JG
23492 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
23493 return true;
23494
23495 /* We specifically want to allow elements of "structure" modes to
23496 be tieable to the structure. This more general condition allows
43cacb12
RS
23497 other rarer situations too. The reason we don't extend this to
23498 predicate modes is that there are no predicate structure modes
23499 nor any specific instructions for extracting part of a predicate
23500 register. */
23501 if (aarch64_vector_data_mode_p (mode1)
23502 && aarch64_vector_data_mode_p (mode2))
61f17a5c
WD
23503 return true;
23504
23505 /* Also allow any scalar modes with vectors. */
23506 if (aarch64_vector_mode_supported_p (mode1)
23507 || aarch64_vector_mode_supported_p (mode2))
97e1ad78
JG
23508 return true;
23509
23510 return false;
23511}
23512
e2c75eea
JG
23513/* Return a new RTX holding the result of moving POINTER forward by
23514 AMOUNT bytes. */
23515
23516static rtx
6a70badb 23517aarch64_move_pointer (rtx pointer, poly_int64 amount)
e2c75eea
JG
23518{
23519 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
23520
23521 return adjust_automodify_address (pointer, GET_MODE (pointer),
23522 next, amount);
23523}
23524
23525/* Return a new RTX holding the result of moving POINTER forward by the
23526 size of the mode it points to. */
23527
23528static rtx
23529aarch64_progress_pointer (rtx pointer)
23530{
6a70badb 23531 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
e2c75eea
JG
23532}
23533
23534/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
23535 MODE bytes. */
23536
23537static void
23538aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
ef4bddc2 23539 machine_mode mode)
e2c75eea 23540{
7cda9e08
SD
23541 /* Handle a 256-bit copy separately. We do this by emitting two adjacent
23542 128-bit copies in V4SImode so that we can use Q registers. */
23543 if (known_eq (GET_MODE_BITSIZE (mode), 256))
23544 {
23545 mode = V4SImode;
23546 rtx reg1 = gen_reg_rtx (mode);
23547 rtx reg2 = gen_reg_rtx (mode);
23548 /* "Cast" the pointers to the correct mode. */
23549 *src = adjust_address (*src, mode, 0);
23550 *dst = adjust_address (*dst, mode, 0);
23551 /* Emit the memcpy. */
23552 emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
23553 aarch64_progress_pointer (*src)));
23554 emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
23555 aarch64_progress_pointer (*dst), reg2));
23556 /* Move the pointers forward. */
23557 *src = aarch64_move_pointer (*src, 32);
23558 *dst = aarch64_move_pointer (*dst, 32);
23559 return;
23560 }
23561
e2c75eea
JG
23562 rtx reg = gen_reg_rtx (mode);
23563
23564 /* "Cast" the pointers to the correct mode. */
23565 *src = adjust_address (*src, mode, 0);
23566 *dst = adjust_address (*dst, mode, 0);
23567 /* Emit the memcpy. */
23568 emit_move_insn (reg, *src);
23569 emit_move_insn (*dst, reg);
23570 /* Move the pointers forward. */
23571 *src = aarch64_progress_pointer (*src);
23572 *dst = aarch64_progress_pointer (*dst);
23573}
23574
0caf592d
KT
23575/* Expand a cpymem using the MOPS extension. OPERANDS are taken
23576 from the cpymem pattern. Return true iff we succeeded. */
23577static bool
23578aarch64_expand_cpymem_mops (rtx *operands)
23579{
23580 if (!TARGET_MOPS)
23581 return false;
23582 rtx addr_dst = XEXP (operands[0], 0);
23583 rtx addr_src = XEXP (operands[1], 0);
23584 rtx sz_reg = operands[2];
23585
23586 if (!REG_P (sz_reg))
23587 sz_reg = force_reg (DImode, sz_reg);
23588 if (!REG_P (addr_dst))
23589 addr_dst = force_reg (DImode, addr_dst);
23590 if (!REG_P (addr_src))
23591 addr_src = force_reg (DImode, addr_src);
23592 emit_insn (gen_aarch64_cpymemdi (addr_dst, addr_src, sz_reg));
23593
23594 return true;
23595}
23596
76715c32 23597/* Expand cpymem, as if from a __builtin_memcpy. Return true if
a459ee44
KT
23598 we succeed, otherwise return false, indicating that a libcall to
23599 memcpy should be emitted. */
e2c75eea
JG
23600
23601bool
76715c32 23602aarch64_expand_cpymem (rtx *operands)
e2c75eea 23603{
1d77928f 23604 int mode_bits;
e2c75eea
JG
23605 rtx dst = operands[0];
23606 rtx src = operands[1];
23607 rtx base;
1d77928f 23608 machine_mode cur_mode = BLKmode;
e2c75eea 23609
0caf592d 23610 /* Variable-sized memcpy can go through the MOPS expansion if available. */
e2c75eea 23611 if (!CONST_INT_P (operands[2]))
0caf592d 23612 return aarch64_expand_cpymem_mops (operands);
e2c75eea 23613
1d77928f 23614 unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
e2c75eea 23615
0caf592d
KT
23616 /* Try to inline up to 256 bytes or use the MOPS threshold if available. */
23617 unsigned HOST_WIDE_INT max_copy_size
23618 = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
1d77928f 23619
a459ee44
KT
23620 bool size_p = optimize_function_for_size_p (cfun);
23621
0caf592d
KT
23622 /* Large constant-sized cpymem should go through MOPS when possible.
23623 It should be a win even for size optimization in the general case.
23624 For speed optimization the choice between MOPS and the SIMD sequence
23625 depends on the size of the copy, rather than number of instructions,
23626 alignment etc. */
a459ee44 23627 if (size > max_copy_size)
0caf592d 23628 return aarch64_expand_cpymem_mops (operands);
e2c75eea 23629
1d77928f
WD
23630 int copy_bits = 256;
23631
23632 /* Default to 256-bit LDP/STP on large copies; however, small copies, lack of
23633 SIMD support or slow 256-bit LDP/STP fall back to 128-bit chunks. */
23634 if (size <= 24
23635 || !TARGET_SIMD
23636 || (aarch64_tune_params.extra_tuning_flags
23637 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
a459ee44 23638 copy_bits = 128;
1d77928f 23639
a459ee44
KT
23640 /* Emit an inline load+store sequence and count the number of operations
23641 involved. We use a simple count of just the loads and stores emitted
23642 rather than rtx_insn count as all the pointer adjustments and reg copying
23643 in this function will get optimized away later in the pipeline. */
23644 start_sequence ();
23645 unsigned nops = 0;
0f801e0b 23646
e2c75eea
JG
23647 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
23648 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
23649
23650 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
23651 src = adjust_automodify_address (src, VOIDmode, base, 0);
23652
1d77928f
WD
23653 /* Convert size to bits to make the rest of the code simpler. */
23654 int n = size * BITS_PER_UNIT;
f7e1d19d 23655
89c52e5e 23656 while (n > 0)
e2c75eea 23657 {
89c52e5e
TC
23658 /* Find the largest mode in which to do the copy without over-reading
23659 or over-writing. */
23660 opt_scalar_int_mode mode_iter;
23661 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
1d77928f 23662 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
89c52e5e 23663 cur_mode = mode_iter.require ();
e2c75eea 23664
89c52e5e 23665 gcc_assert (cur_mode != BLKmode);
e2c75eea 23666
89c52e5e 23667 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
1d77928f
WD
23668
23669 /* Prefer Q-register accesses for the last bytes. */
23670 if (mode_bits == 128 && copy_bits == 256)
23671 cur_mode = V4SImode;
23672
89c52e5e 23673 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
a459ee44
KT
23674 /* A single block copy is 1 load + 1 store. */
23675 nops += 2;
89c52e5e 23676 n -= mode_bits;
e2c75eea 23677
0caf592d
KT
23678 /* Emit trailing copies using overlapping unaligned accesses
23679 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
23680 if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
89c52e5e 23681 {
1d77928f 23682 machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
f7e1d19d 23683 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
1d77928f 23684 gcc_assert (n_bits <= mode_bits);
89c52e5e
TC
23685 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
23686 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
23687 n = n_bits;
e2c75eea
JG
23688 }
23689 }
a459ee44
KT
23690 rtx_insn *seq = get_insns ();
23691 end_sequence ();
0caf592d
KT
23692 /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
23693 the constant size into a register. */
23694 unsigned mops_cost = 3 + 1;
23695
23696 /* If MOPS is available at this point we don't consider the libcall as it's
23697 not a win even on code size. At this point only consider MOPS if
23698 optimizing for size. For speed optimizations we will have chosen between
23699 the two based on copy size already. */
23700 if (TARGET_MOPS)
23701 {
23702 if (size_p && mops_cost < nops)
23703 return aarch64_expand_cpymem_mops (operands);
23704 emit_insn (seq);
23705 return true;
23706 }
a459ee44
KT
23707
23708 /* A memcpy libcall in the worst case takes 3 instructions to prepare the
0caf592d
KT
23709 arguments + 1 for the call. When MOPS is not available and we're
23710 optimizing for size a libcall may be preferable. */
a459ee44
KT
23711 unsigned libcall_cost = 4;
23712 if (size_p && libcall_cost < nops)
23713 return false;
e2c75eea 23714
a459ee44 23715 emit_insn (seq);
e2c75eea
JG
23716 return true;
23717}
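/* Worked example of the copy loop above (an editorial sketch assuming
   !STRICT_ALIGNMENT and the 128-bit copy_bits case): for a 15-byte copy,
   the first iteration emits an 8-byte (DImode) load/store; the remaining
   7 bytes are less than half of copy_bits, so the pointers are moved
   back by one byte and a second, overlapping 8-byte load/store copies
   bytes 7..14.  That is two load/store pairs instead of the 8 + 4 + 2 + 1
   split that non-overlapping copies would need.  */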
23718
54bbde55
SD
23719/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
23720 SRC is a register we have created with the duplicated value to be set. */
23721static void
23722aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
23723 machine_mode mode)
23724{
23725 /* If we are setting 128 bits or 256 bits, we can do that straight from
23726 the SIMD register we prepared. */
23727 if (known_eq (GET_MODE_BITSIZE (mode), 256))
23728 {
23729 mode = GET_MODE (src);
23730 /* "Cast" the *dst to the correct mode. */
23731 *dst = adjust_address (*dst, mode, 0);
23732 /* Emit the memset. */
23733 emit_insn (aarch64_gen_store_pair (mode, *dst, src,
23734 aarch64_progress_pointer (*dst), src));
23735
23736 /* Move the pointers forward. */
23737 *dst = aarch64_move_pointer (*dst, 32);
23738 return;
23739 }
23740 if (known_eq (GET_MODE_BITSIZE (mode), 128))
23741 {
23742 /* "Cast" the *dst to the correct mode. */
23743 *dst = adjust_address (*dst, GET_MODE (src), 0);
23744 /* Emit the memset. */
23745 emit_move_insn (*dst, src);
23746 /* Move the pointers forward. */
23747 *dst = aarch64_move_pointer (*dst, 16);
23748 return;
23749 }
23750 /* For copying less, we have to extract the right amount from src. */
23751 rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
23752
23753 /* "Cast" the *dst to the correct mode. */
23754 *dst = adjust_address (*dst, mode, 0);
23755 /* Emit the memset. */
23756 emit_move_insn (*dst, reg);
23757 /* Move the pointer forward. */
23758 *dst = aarch64_progress_pointer (*dst);
23759}
23760
d3bd985e
KT
23761/* Expand a setmem using the MOPS instructions. OPERANDS are the same
23762 as for the setmem pattern. Return true iff we succeed. */
23763static bool
23764aarch64_expand_setmem_mops (rtx *operands)
23765{
23766 if (!TARGET_MOPS)
23767 return false;
23768
23769 rtx addr_dst = XEXP (operands[0], 0);
23770 rtx sz_reg = operands[1];
23771 rtx val = operands[2];
23772
23773 if (!REG_P (sz_reg))
23774 sz_reg = force_reg (DImode, sz_reg);
23775 if (!REG_P (addr_dst))
23776 addr_dst = force_reg (DImode, addr_dst);
23777 if (!REG_P (val) && val != CONST0_RTX (QImode))
23778 val = force_reg (QImode, val);
23779 emit_insn (gen_aarch64_setmemdi (addr_dst, val, sz_reg));
23780 return true;
23781}
23782
54bbde55
SD
23783/* Expand setmem, as if from a __builtin_memset. Return true if
23784 we succeed, otherwise return false. */
23785
23786bool
23787aarch64_expand_setmem (rtx *operands)
23788{
23789 int n, mode_bits;
23790 unsigned HOST_WIDE_INT len;
23791 rtx dst = operands[0];
23792 rtx val = operands[2], src;
23793 rtx base;
23794 machine_mode cur_mode = BLKmode, next_mode;
23795
d3bd985e
KT
23796 /* If we don't have SIMD registers or the size is variable, use the MOPS
23797 inlined sequence if possible. */
23798 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
23799 return aarch64_expand_setmem_mops (operands);
54bbde55 23800
8f95e3c0 23801 bool size_p = optimize_function_for_size_p (cfun);
54bbde55 23802
d3bd985e
KT
23803 /* Default the maximum to 256 bytes when considering only a libcall vs
23804 the SIMD broadcast sequence. */
54bbde55
SD
23805 unsigned max_set_size = 256;
23806
54bbde55 23807 len = INTVAL (operands[1]);
d3bd985e 23808 if (len > max_set_size && !TARGET_MOPS)
54bbde55
SD
23809 return false;
23810
d3bd985e
KT
23811 int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
23812 /* The MOPS sequence takes:
23813 3 instructions for the memory storing
23814 + 1 to move the constant size into a reg
23815 + 1 if VAL is a non-zero constant to move into a reg
23816 (zero constants can use XZR directly). */
23817 unsigned mops_cost = 3 + 1 + cst_val;
23818 /* A libcall to memset in the worst case takes 3 instructions to prepare
23819 the arguments + 1 for the call. */
23820 unsigned libcall_cost = 4;
23821
23822 /* Upper bound check. For large constant-sized setmem use the MOPS sequence
23823 when available. */
23824 if (TARGET_MOPS
23825 && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
23826 return aarch64_expand_setmem_mops (operands);
23827
8f95e3c0 23828 /* Attempt a sequence with a vector broadcast followed by stores.
d3bd985e
KT
23829 Count the number of operations involved to see if it's worth it
23830 against the alternatives. A simple counter simd_ops on the
23831 algorithmically-relevant operations is used rather than an rtx_insn count
23832 as all the pointer adjustments and mode reinterpretations will be optimized
23833 away later. */
8f95e3c0 23834 start_sequence ();
d3bd985e
KT
23835 unsigned simd_ops = 0;
23836
54bbde55
SD
23837 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
23838 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
23839
23840 /* Prepare the val using a DUP/MOVI v0.16B, val. */
23841 src = expand_vector_broadcast (V16QImode, val);
23842 src = force_reg (V16QImode, src);
d3bd985e 23843 simd_ops++;
54bbde55
SD
23844 /* Convert len to bits to make the rest of the code simpler. */
23845 n = len * BITS_PER_UNIT;
23846
23847 /* Maximum amount to set in one go. We allow 256-bit chunks based on the
d3bd985e 23848 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */
8f95e3c0
KT
23849 const int copy_limit = (aarch64_tune_params.extra_tuning_flags
23850 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
54bbde55
SD
23851 ? GET_MODE_BITSIZE (TImode) : 256;
23852
23853 while (n > 0)
23854 {
23855 /* Find the largest mode in which to do the store without
23856 overwriting. */
23857 opt_scalar_int_mode mode_iter;
23858 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
23859 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
23860 cur_mode = mode_iter.require ();
23861
23862 gcc_assert (cur_mode != BLKmode);
23863
23864 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
23865 aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
d3bd985e 23866 simd_ops++;
54bbde55
SD
23867 n -= mode_bits;
23868
23869 /* Do certain trailing stores as overlapping if it's going to be
23870 cheaper, i.e. fewer instructions. For instance, for a 15-byte
23871 set it's more efficient to do two overlapping 8-byte stores than
a45786e9
AP
23872 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */
23873 if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
54bbde55
SD
23874 {
23875 next_mode = smallest_mode_for_size (n, MODE_INT);
23876 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
23877 gcc_assert (n_bits <= mode_bits);
23878 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
23879 n = n_bits;
23880 }
23881 }
8f95e3c0
KT
23882 rtx_insn *seq = get_insns ();
23883 end_sequence ();
54bbde55 23884
d3bd985e
KT
23885 if (size_p)
23886 {
23887 /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
23888 call to memset or the MOPS expansion. */
23889 if (TARGET_MOPS
23890 && mops_cost <= libcall_cost
23891 && mops_cost <= simd_ops)
23892 return aarch64_expand_setmem_mops (operands);
23893 /* If MOPS is not available or not shorter pick a libcall if the SIMD
23894 sequence is too long. */
23895 else if (libcall_cost < simd_ops)
23896 return false;
23897 emit_insn (seq);
23898 return true;
23899 }
23900
23901 /* At this point the SIMD broadcast sequence is the best choice when
23902 optimizing for speed. */
8f95e3c0 23903 emit_insn (seq);
54bbde55
SD
23904 return true;
23905}
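/* A worked example of the SIMD path above (a sketch, assuming the default
   256-bit copy_limit, !STRICT_ALIGNMENT and illustrative register choices):
   a 35-byte memset of value C would typically expand to

     dup v0.16b, w1        // broadcast C
     stp q0, q0, [x0]      // bytes 0-31
     str s0, [x0, 31]      // bytes 31-34, overlapping trailing store

   giving simd_ops == 3 for the comparison against the libcall and MOPS
   costs when optimizing for size.  */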
23906
23907
141a3ccf
KT
23908/* Split a DImode store of a CONST_INT SRC to MEM DST as two
23909 SImode stores. Handle the case when the constant has identical
23910 bottom and top halves. This is beneficial when the two stores can be
23911 merged into an STP and we avoid synthesising potentially expensive
23912 immediates twice. Return true if such a split is possible. */
23913
23914bool
23915aarch64_split_dimode_const_store (rtx dst, rtx src)
23916{
23917 rtx lo = gen_lowpart (SImode, src);
23918 rtx hi = gen_highpart_mode (SImode, DImode, src);
23919
23920 bool size_p = optimize_function_for_size_p (cfun);
23921
23922 if (!rtx_equal_p (lo, hi))
23923 return false;
23924
23925 unsigned int orig_cost
23926 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
23927 unsigned int lo_cost
23928 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
23929
23930 /* We want to transform:
23931 MOV x1, 49370
23932 MOVK x1, 0x140, lsl 16
23933 MOVK x1, 0xc0da, lsl 32
23934 MOVK x1, 0x140, lsl 48
23935 STR x1, [x0]
23936 into:
23937 MOV w1, 49370
23938 MOVK w1, 0x140, lsl 16
23939 STP w1, w1, [x0]
23940 So we want to perform this only when we save two instructions
23941 or more. When optimizing for size, however, accept any code size
23942 savings we can. */
23943 if (size_p && orig_cost <= lo_cost)
23944 return false;
23945
23946 if (!size_p
23947 && (orig_cost <= lo_cost + 1))
23948 return false;
23949
23950 rtx mem_lo = adjust_address (dst, SImode, 0);
23951 if (!aarch64_mem_pair_operand (mem_lo, SImode))
23952 return false;
23953
23954 rtx tmp_reg = gen_reg_rtx (SImode);
23955 aarch64_expand_mov_immediate (tmp_reg, lo);
23956 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
23957 /* Don't emit an explicit store pair as this may not be always profitable.
23958 Let the sched-fusion logic decide whether to merge them. */
23959 emit_move_insn (mem_lo, tmp_reg);
23960 emit_move_insn (mem_hi, tmp_reg);
23961
23962 return true;
23963}
23964
30c46053
MC
23965/* Generate RTL for a conditional branch with rtx comparison CODE in
23966 mode CC_MODE. The destination of the unlikely conditional branch
23967 is LABEL_REF. */
23968
23969void
23970aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
23971 rtx label_ref)
23972{
23973 rtx x;
23974 x = gen_rtx_fmt_ee (code, VOIDmode,
23975 gen_rtx_REG (cc_mode, CC_REGNUM),
23976 const0_rtx);
23977
23978 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23979 gen_rtx_LABEL_REF (VOIDmode, label_ref),
23980 pc_rtx);
23981 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23982}
23983
23984/* Generate DImode scratch registers for 128-bit (TImode) addition.
23985
23986 OP1 represents the TImode destination operand 1
23987 OP2 represents the TImode destination operand 2
23988 LOW_DEST represents the low half (DImode) of TImode operand 0
23989 LOW_IN1 represents the low half (DImode) of TImode operand 1
23990 LOW_IN2 represents the low half (DImode) of TImode operand 2
23991 HIGH_DEST represents the high half (DImode) of TImode operand 0
23992 HIGH_IN1 represents the high half (DImode) of TImode operand 1
23993 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
23994
23995void
23996aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
23997 rtx *low_in1, rtx *low_in2,
23998 rtx *high_dest, rtx *high_in1,
23999 rtx *high_in2)
24000{
24001 *low_dest = gen_reg_rtx (DImode);
24002 *low_in1 = gen_lowpart (DImode, op1);
24003 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
24004 subreg_lowpart_offset (DImode, TImode));
24005 *high_dest = gen_reg_rtx (DImode);
24006 *high_in1 = gen_highpart (DImode, op1);
24007 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
24008 subreg_highpart_offset (DImode, TImode));
24009}
24010
24011/* Generate DImode scratch registers for 128-bit (TImode) subtraction.
24012
24013 This function differs from 'aarch64_addti_scratch_regs' in that
24014 OP1 can be an immediate constant (zero). We must call
24015 subreg_highpart_offset with DImode and TImode arguments, otherwise
24016 VOIDmode will be used for the const_int, which triggers an internal
24017 error from subreg_size_highpart_offset, which does not expect a size of zero.
24018
24019 OP1 represents the TImode destination operand 1
24020 OP2 represents the TImode destination operand 2
24021 LOW_DEST represents the low half (DImode) of TImode operand 0
24022 LOW_IN1 represents the low half (DImode) of TImode operand 1
24023 LOW_IN2 represents the low half (DImode) of TImode operand 2
24024 HIGH_DEST represents the high half (DImode) of TImode operand 0
24025 HIGH_IN1 represents the high half (DImode) of TImode operand 1
24026 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
24027
24028
24029void
24030aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
24031 rtx *low_in1, rtx *low_in2,
24032 rtx *high_dest, rtx *high_in1,
24033 rtx *high_in2)
24034{
24035 *low_dest = gen_reg_rtx (DImode);
24036 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
24037 subreg_lowpart_offset (DImode, TImode));
24038
24039 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
24040 subreg_lowpart_offset (DImode, TImode));
24041 *high_dest = gen_reg_rtx (DImode);
24042
24043 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
24044 subreg_highpart_offset (DImode, TImode));
24045 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
24046 subreg_highpart_offset (DImode, TImode));
24047}
24048
24049/* Generate RTL for 128-bit (TImode) subtraction with overflow.
24050
24051 OP0 represents the TImode destination operand 0
24052 LOW_DEST represents the low half (DImode) of TImode operand 0
24053 LOW_IN1 represents the low half (DImode) of TImode operand 1
24054 LOW_IN2 represents the low half (DImode) of TImode operand 2
24055 HIGH_DEST represents the high half (DImode) of TImode operand 0
24056 HIGH_IN1 represents the high half (DImode) of TImode operand 1
a58fe3c5
RE
24057 HIGH_IN2 represents the high half (DImode) of TImode operand 2
24058 UNSIGNED_P is true if the operation is being performed on unsigned
24059 values. */
30c46053
MC
24060void
24061aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
24062 rtx low_in2, rtx high_dest, rtx high_in1,
a58fe3c5 24063 rtx high_in2, bool unsigned_p)
30c46053
MC
24064{
24065 if (low_in2 == const0_rtx)
24066 {
24067 low_dest = low_in1;
a58fe3c5
RE
24068 high_in2 = force_reg (DImode, high_in2);
24069 if (unsigned_p)
24070 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
24071 else
24072 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
30c46053
MC
24073 }
24074 else
24075 {
d80f0a8d
JJ
24076 if (aarch64_plus_immediate (low_in2, DImode))
24077 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
618ae596 24078 GEN_INT (-UINTVAL (low_in2))));
d80f0a8d 24079 else
30c46053 24080 {
d80f0a8d
JJ
24081 low_in2 = force_reg (DImode, low_in2);
24082 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
30c46053 24083 }
d80f0a8d 24084 high_in2 = force_reg (DImode, high_in2);
a58fe3c5
RE
24085
24086 if (unsigned_p)
24087 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
24088 else
24089 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
30c46053
MC
24090 }
24091
24092 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
24093 emit_move_insn (gen_highpart (DImode, op0), high_dest);
24094
24095}
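/* In the common register/register case the sequence emitted above is, as a
   sketch (register names illustrative), the usual flag-setting 128-bit
   subtraction:

     subs xlo_dest, xlo_in1, xlo_in2
     sbcs xhi_dest, xhi_in1, xhi_in2

   with the C or V flag (depending on UNSIGNED_P) left in the condition
   flags for the caller's overflow check.  */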
24096
a3125fc2
CL
24097/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
24098
24099static unsigned HOST_WIDE_INT
24100aarch64_asan_shadow_offset (void)
24101{
10078f3e
AP
24102 if (TARGET_ILP32)
24103 return (HOST_WIDE_INT_1 << 29);
24104 else
24105 return (HOST_WIDE_INT_1 << 36);
a3125fc2
CL
24106}
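/* For reference (generic AddressSanitizer behaviour, not specific to this
   port): the shadow byte for an address is computed as
   shadow = (addr >> 3) + offset, so the returns above place the shadow
   region at 1 << 36 for LP64 and at 1 << 29 for ILP32.  */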
24107
5f3bc026 24108static rtx
cb4347e8 24109aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
5f3bc026
ZC
24110 int code, tree treeop0, tree treeop1)
24111{
c8012fbc
WD
24112 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
24113 rtx op0, op1;
5f3bc026 24114 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
c8012fbc 24115 insn_code icode;
5f3bc026
ZC
24116 struct expand_operand ops[4];
24117
5f3bc026
ZC
24118 start_sequence ();
24119 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
24120
24121 op_mode = GET_MODE (op0);
24122 if (op_mode == VOIDmode)
24123 op_mode = GET_MODE (op1);
24124
24125 switch (op_mode)
24126 {
4e10a5a7
RS
24127 case E_QImode:
24128 case E_HImode:
24129 case E_SImode:
5f3bc026
ZC
24130 cmp_mode = SImode;
24131 icode = CODE_FOR_cmpsi;
24132 break;
24133
4e10a5a7 24134 case E_DImode:
5f3bc026
ZC
24135 cmp_mode = DImode;
24136 icode = CODE_FOR_cmpdi;
24137 break;
24138
4e10a5a7 24139 case E_SFmode:
786e3c06
WD
24140 cmp_mode = SFmode;
24141 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
24142 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
24143 break;
24144
4e10a5a7 24145 case E_DFmode:
786e3c06
WD
24146 cmp_mode = DFmode;
24147 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
24148 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
24149 break;
24150
5f3bc026
ZC
24151 default:
24152 end_sequence ();
24153 return NULL_RTX;
24154 }
24155
c8012fbc
WD
24156 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
24157 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
5f3bc026
ZC
24158 if (!op0 || !op1)
24159 {
24160 end_sequence ();
24161 return NULL_RTX;
24162 }
24163 *prep_seq = get_insns ();
24164 end_sequence ();
24165
c8012fbc
WD
24166 create_fixed_operand (&ops[0], op0);
24167 create_fixed_operand (&ops[1], op1);
5f3bc026
ZC
24168
24169 start_sequence ();
c8012fbc 24170 if (!maybe_expand_insn (icode, 2, ops))
5f3bc026
ZC
24171 {
24172 end_sequence ();
24173 return NULL_RTX;
24174 }
24175 *gen_seq = get_insns ();
24176 end_sequence ();
24177
c8012fbc
WD
24178 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
24179 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
5f3bc026
ZC
24180}
24181
24182static rtx
cb4347e8
TS
24183aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
24184 int cmp_code, tree treeop0, tree treeop1, int bit_code)
5f3bc026 24185{
c8012fbc
WD
24186 rtx op0, op1, target;
24187 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
5f3bc026 24188 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
c8012fbc 24189 insn_code icode;
5f3bc026 24190 struct expand_operand ops[6];
c8012fbc 24191 int aarch64_cond;
5f3bc026 24192
cb4347e8 24193 push_to_sequence (*prep_seq);
5f3bc026
ZC
24194 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
24195
24196 op_mode = GET_MODE (op0);
24197 if (op_mode == VOIDmode)
24198 op_mode = GET_MODE (op1);
24199
24200 switch (op_mode)
24201 {
4e10a5a7
RS
24202 case E_QImode:
24203 case E_HImode:
24204 case E_SImode:
5f3bc026 24205 cmp_mode = SImode;
5f3bc026
ZC
24206 break;
24207
4e10a5a7 24208 case E_DImode:
5f3bc026 24209 cmp_mode = DImode;
5f3bc026
ZC
24210 break;
24211
4e10a5a7 24212 case E_SFmode:
786e3c06
WD
24213 cmp_mode = SFmode;
24214 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
786e3c06
WD
24215 break;
24216
4e10a5a7 24217 case E_DFmode:
786e3c06
WD
24218 cmp_mode = DFmode;
24219 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
786e3c06
WD
24220 break;
24221
5f3bc026
ZC
24222 default:
24223 end_sequence ();
24224 return NULL_RTX;
24225 }
24226
865257c4
RS
24227 icode = code_for_ccmp (cc_mode, cmp_mode);
24228
5f3bc026
ZC
24229 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
24230 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
24231 if (!op0 || !op1)
24232 {
24233 end_sequence ();
24234 return NULL_RTX;
24235 }
24236 *prep_seq = get_insns ();
24237 end_sequence ();
24238
24239 target = gen_rtx_REG (cc_mode, CC_REGNUM);
c8012fbc 24240 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
5f3bc026 24241
c8012fbc
WD
24242 if (bit_code != AND)
24243 {
865257c4
RS
24244 /* Treat the ccmp patterns as canonical and use them where possible,
24245 but fall back to ccmp_rev patterns if there's no other option. */
24246 rtx_code prev_code = GET_CODE (prev);
24247 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
24248 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
24249 && !(prev_code == EQ
24250 || prev_code == NE
24251 || prev_code == ORDERED
24252 || prev_code == UNORDERED))
24253 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
24254 else
24255 {
24256 rtx_code code = reverse_condition (prev_code);
24257 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
24258 }
c8012fbc
WD
24259 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
24260 }
24261
24262 create_fixed_operand (&ops[0], XEXP (prev, 0));
5f3bc026
ZC
24263 create_fixed_operand (&ops[1], target);
24264 create_fixed_operand (&ops[2], op0);
24265 create_fixed_operand (&ops[3], op1);
c8012fbc
WD
24266 create_fixed_operand (&ops[4], prev);
24267 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
5f3bc026 24268
cb4347e8 24269 push_to_sequence (*gen_seq);
5f3bc026
ZC
24270 if (!maybe_expand_insn (icode, 6, ops))
24271 {
24272 end_sequence ();
24273 return NULL_RTX;
24274 }
24275
24276 *gen_seq = get_insns ();
24277 end_sequence ();
24278
c8012fbc 24279 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
5f3bc026
ZC
24280}
24281
24282#undef TARGET_GEN_CCMP_FIRST
24283#define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
24284
24285#undef TARGET_GEN_CCMP_NEXT
24286#define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
24287
6a569cdd
KT
24288/* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
24289 instruction fusion of some sort. */
24290
24291static bool
24292aarch64_macro_fusion_p (void)
24293{
b175b679 24294 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
6a569cdd
KT
24295}
24296
24297
24298/* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
24299 should be kept together during scheduling. */
24300
24301static bool
24302aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
24303{
24304 rtx set_dest;
24305 rtx prev_set = single_set (prev);
24306 rtx curr_set = single_set (curr);
24307 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
24308 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
24309
24310 if (!aarch64_macro_fusion_p ())
24311 return false;
24312
d7b03373 24313 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
6a569cdd
KT
24314 {
24315 /* We are trying to match:
24316 prev (mov) == (set (reg r0) (const_int imm16))
24317 curr (movk) == (set (zero_extract (reg r0)
24318 (const_int 16)
24319 (const_int 16))
24320 (const_int imm16_1)) */
24321
24322 set_dest = SET_DEST (curr_set);
24323
24324 if (GET_CODE (set_dest) == ZERO_EXTRACT
24325 && CONST_INT_P (SET_SRC (curr_set))
24326 && CONST_INT_P (SET_SRC (prev_set))
24327 && CONST_INT_P (XEXP (set_dest, 2))
24328 && INTVAL (XEXP (set_dest, 2)) == 16
24329 && REG_P (XEXP (set_dest, 0))
24330 && REG_P (SET_DEST (prev_set))
24331 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
24332 {
24333 return true;
24334 }
24335 }
24336
d7b03373 24337 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
9bbe08fe
KT
24338 {
24339
24340 /* We're trying to match:
24341 prev (adrp) == (set (reg r1)
24342 (high (symbol_ref ("SYM"))))
24343 curr (add) == (set (reg r0)
24344 (lo_sum (reg r1)
24345 (symbol_ref ("SYM"))))
24346 Note that r0 need not necessarily be the same as r1, especially
24347 during pre-regalloc scheduling. */
24348
24349 if (satisfies_constraint_Ush (SET_SRC (prev_set))
24350 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
24351 {
24352 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
24353 && REG_P (XEXP (SET_SRC (curr_set), 0))
24354 && REGNO (XEXP (SET_SRC (curr_set), 0))
24355 == REGNO (SET_DEST (prev_set))
24356 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
24357 XEXP (SET_SRC (curr_set), 1)))
24358 return true;
24359 }
24360 }
24361
d7b03373 24362 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
cd0cb232
KT
24363 {
24364
24365 /* We're trying to match:
24366 prev (movk) == (set (zero_extract (reg r0)
24367 (const_int 16)
24368 (const_int 32))
24369 (const_int imm16_1))
24370 curr (movk) == (set (zero_extract (reg r0)
24371 (const_int 16)
24372 (const_int 48))
24373 (const_int imm16_2)) */
24374
24375 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
24376 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
24377 && REG_P (XEXP (SET_DEST (prev_set), 0))
24378 && REG_P (XEXP (SET_DEST (curr_set), 0))
24379 && REGNO (XEXP (SET_DEST (prev_set), 0))
24380 == REGNO (XEXP (SET_DEST (curr_set), 0))
24381 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
24382 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
24383 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
24384 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
24385 && CONST_INT_P (SET_SRC (prev_set))
24386 && CONST_INT_P (SET_SRC (curr_set)))
24387 return true;
24388
24389 }
d7b03373 24390 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
d8354ad7
KT
24391 {
24392 /* We're trying to match:
24393 prev (adrp) == (set (reg r0)
24394 (high (symbol_ref ("SYM"))))
24395 curr (ldr) == (set (reg r1)
24396 (mem (lo_sum (reg r0)
24397 (symbol_ref ("SYM")))))
24398 or
24399 curr (ldr) == (set (reg r1)
24400 (zero_extend (mem
24401 (lo_sum (reg r0)
24402 (symbol_ref ("SYM")))))) */
24403 if (satisfies_constraint_Ush (SET_SRC (prev_set))
24404 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
24405 {
24406 rtx curr_src = SET_SRC (curr_set);
24407
24408 if (GET_CODE (curr_src) == ZERO_EXTEND)
24409 curr_src = XEXP (curr_src, 0);
24410
24411 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
24412 && REG_P (XEXP (XEXP (curr_src, 0), 0))
24413 && REGNO (XEXP (XEXP (curr_src, 0), 0))
24414 == REGNO (SET_DEST (prev_set))
24415 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
24416 XEXP (SET_SRC (prev_set), 0)))
24417 return true;
24418 }
24419 }
cd0cb232 24420
a4f3fa71 24421 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
d7b03373 24422 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
a4f3fa71
WD
24423 && prev_set && curr_set && any_condjump_p (curr)
24424 && GET_CODE (SET_SRC (prev_set)) == COMPARE
24425 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
24426 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
24427 return true;
24428
24429 /* Fuse flag-setting ALU instructions and conditional branch. */
24430 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
3759108f
AP
24431 && any_condjump_p (curr))
24432 {
509f819a
N
24433 unsigned int condreg1, condreg2;
24434 rtx cc_reg_1;
24435 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
24436 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
24437
24438 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
24439 && prev
24440 && modified_in_p (cc_reg_1, prev))
24441 {
f8a27206
AP
24442 enum attr_type prev_type = get_attr_type (prev);
24443
509f819a
N
24444 /* FIXME: this misses some instructions that are considered simple
24445 arithmetic for ThunderX. Simple shifts are missed here. */
24446 if (prev_type == TYPE_ALUS_SREG
24447 || prev_type == TYPE_ALUS_IMM
24448 || prev_type == TYPE_LOGICS_REG
24449 || prev_type == TYPE_LOGICS_IMM)
24450 return true;
24451 }
3759108f
AP
24452 }
24453
a4f3fa71 24454 /* Fuse ALU instructions and CBZ/CBNZ. */
bee7e0fc
AP
24455 if (prev_set
24456 && curr_set
a4f3fa71 24457 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
00c7c57f
JB
24458 && any_condjump_p (curr))
24459 {
24460 /* We're trying to match:
24461 prev (alu_insn) == (set (r0) (plus (r0) (r1/imm)))
24462 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
24463 (const_int 0))
24464 (label_ref ("SYM"))
24465 (pc)) */
24466 if (SET_DEST (curr_set) == (pc_rtx)
24467 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
24468 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
24469 && REG_P (SET_DEST (prev_set))
24470 && REGNO (SET_DEST (prev_set))
24471 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
24472 {
24473 /* Fuse ALU operations followed by conditional branch instruction. */
24474 switch (get_attr_type (prev))
24475 {
24476 case TYPE_ALU_IMM:
24477 case TYPE_ALU_SREG:
24478 case TYPE_ADC_REG:
24479 case TYPE_ADC_IMM:
24480 case TYPE_ADCS_REG:
24481 case TYPE_ADCS_IMM:
24482 case TYPE_LOGIC_REG:
24483 case TYPE_LOGIC_IMM:
24484 case TYPE_CSEL:
24485 case TYPE_ADR:
24486 case TYPE_MOV_IMM:
24487 case TYPE_SHIFT_REG:
24488 case TYPE_SHIFT_IMM:
24489 case TYPE_BFM:
24490 case TYPE_RBIT:
24491 case TYPE_REV:
24492 case TYPE_EXTEND:
24493 return true;
24494
24495 default:;
24496 }
24497 }
24498 }
24499
6a569cdd
KT
24500 return false;
24501}
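/* For illustration, the kinds of pairs matched above include:

     mov  w0, #0x1234  +  movk w0, #0x5678, lsl 16   (AARCH64_FUSE_MOV_MOVK)
     adrp x0, sym      +  add  x0, x0, :lo12:sym     (AARCH64_FUSE_ADRP_ADD)
     cmp  w0, w1       +  b.ne label                 (AARCH64_FUSE_CMP_BRANCH)

   Whether a given pair is actually kept together depends on the
   fusible_ops mask of the active tuning, checked via
   aarch64_fusion_enabled_p below.  */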
24502
f2879a90
KT
24503/* Return true iff the instruction fusion described by OP is enabled. */
24504
24505bool
24506aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
24507{
24508 return (aarch64_tune_params.fusible_ops & op) != 0;
24509}
24510
350013bc
BC
24511/* If MEM is in the form of [base+offset], extract the two parts
24512 of address and set to BASE and OFFSET, otherwise return false
24513 after clearing BASE and OFFSET. */
24514
24515bool
24516extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
24517{
24518 rtx addr;
24519
24520 gcc_assert (MEM_P (mem));
24521
24522 addr = XEXP (mem, 0);
24523
24524 if (REG_P (addr))
24525 {
24526 *base = addr;
24527 *offset = const0_rtx;
24528 return true;
24529 }
24530
24531 if (GET_CODE (addr) == PLUS
24532 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
24533 {
24534 *base = XEXP (addr, 0);
24535 *offset = XEXP (addr, 1);
24536 return true;
24537 }
24538
24539 *base = NULL_RTX;
24540 *offset = NULL_RTX;
24541
24542 return false;
24543}
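/* For example, (mem (plus (reg x1) (const_int 16))) yields *BASE == the x1
   reg rtx and *OFFSET == (const_int 16), while a bare (mem (reg x1)) yields
   an offset of (const_int 0).  Any other address form fails.  */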
24544
24545/* Types for scheduling fusion. */
24546enum sched_fusion_type
24547{
24548 SCHED_FUSION_NONE = 0,
24549 SCHED_FUSION_LD_SIGN_EXTEND,
24550 SCHED_FUSION_LD_ZERO_EXTEND,
24551 SCHED_FUSION_LD,
24552 SCHED_FUSION_ST,
24553 SCHED_FUSION_NUM
24554};
24555
24556/* If INSN is a load or store of address in the form of [base+offset],
24557 extract the two parts and set to BASE and OFFSET. Return scheduling
24558 fusion type this INSN is. */
24559
24560static enum sched_fusion_type
24561fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
24562{
24563 rtx x, dest, src;
24564 enum sched_fusion_type fusion = SCHED_FUSION_LD;
24565
24566 gcc_assert (INSN_P (insn));
24567 x = PATTERN (insn);
24568 if (GET_CODE (x) != SET)
24569 return SCHED_FUSION_NONE;
24570
24571 src = SET_SRC (x);
24572 dest = SET_DEST (x);
24573
abc52318
KT
24574 machine_mode dest_mode = GET_MODE (dest);
24575
24576 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
350013bc
BC
24577 return SCHED_FUSION_NONE;
24578
24579 if (GET_CODE (src) == SIGN_EXTEND)
24580 {
24581 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
24582 src = XEXP (src, 0);
3793ecc1 24583 if (!MEM_P (src) || GET_MODE (src) != SImode)
350013bc
BC
24584 return SCHED_FUSION_NONE;
24585 }
24586 else if (GET_CODE (src) == ZERO_EXTEND)
24587 {
24588 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
24589 src = XEXP (src, 0);
3793ecc1 24590 if (!MEM_P (src) || GET_MODE (src) != SImode)
350013bc
BC
24591 return SCHED_FUSION_NONE;
24592 }
24593
3793ecc1 24594 if (MEM_P (src) && REG_P (dest))
350013bc 24595 extract_base_offset_in_addr (src, base, offset);
3793ecc1 24596 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
350013bc
BC
24597 {
24598 fusion = SCHED_FUSION_ST;
24599 extract_base_offset_in_addr (dest, base, offset);
24600 }
24601 else
24602 return SCHED_FUSION_NONE;
24603
24604 if (*base == NULL_RTX || *offset == NULL_RTX)
24605 fusion = SCHED_FUSION_NONE;
24606
24607 return fusion;
24608}
24609
24610/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
24611
24612 Currently we only support fusing ldr or str instructions, so FUSION_PRI
24613 and PRI are only calculated for these instructions. For other instructions,
24614 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
24615 types of instruction fusion can be added by returning different priorities.
24616
24617 It's important that irrelevant instructions get the largest FUSION_PRI. */
24618
24619static void
24620aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
24621 int *fusion_pri, int *pri)
24622{
24623 int tmp, off_val;
24624 rtx base, offset;
24625 enum sched_fusion_type fusion;
24626
24627 gcc_assert (INSN_P (insn));
24628
24629 tmp = max_pri - 1;
24630 fusion = fusion_load_store (insn, &base, &offset);
24631 if (fusion == SCHED_FUSION_NONE)
24632 {
24633 *pri = tmp;
24634 *fusion_pri = tmp;
24635 return;
24636 }
24637
24638 /* Set FUSION_PRI according to fusion type and base register. */
24639 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
24640
24641 /* Calculate PRI. */
24642 tmp /= 2;
24643
24644 /* INSN with smaller offset goes first. */
24645 off_val = (int)(INTVAL (offset));
24646 if (off_val >= 0)
24647 tmp -= (off_val & 0xfffff);
24648 else
24649 tmp += ((- off_val) & 0xfffff);
24650
24651 *pri = tmp;
24652 return;
24653}
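/* As an illustration of the scheme above: two SImode loads from [x1, 4]
   and [x1, 8] receive the same FUSION_PRI (same fusion type and base
   register) but different PRI, with the lower-offset access given the
   larger priority so that the pair is brought together in ascending-offset
   order, ready for the ldp peephole.  */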
24654
9bca63d4
WD
24655/* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
24656 Adjust priority of sha1h instructions so they are scheduled before
24657 other SHA1 instructions. */
24658
24659static int
24660aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
24661{
24662 rtx x = PATTERN (insn);
24663
24664 if (GET_CODE (x) == SET)
24665 {
24666 x = SET_SRC (x);
24667
24668 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
24669 return priority + 10;
24670 }
24671
24672 return priority;
24673}
24674
526e1639
RS
24675/* Check if *MEM1 and *MEM2 are consecutive memory references and,
24676 if they are, try to make them use constant offsets from the same base
24677 register. Return true on success. When returning true, set *REVERSED
24678 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
24679static bool
24680aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
24681{
24682 *reversed = false;
24683 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
24684 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
24685 return false;
24686
24687 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
24688 return false;
24689
24690 auto size1 = MEM_SIZE (*mem1);
24691 auto size2 = MEM_SIZE (*mem2);
24692
24693 rtx base1, base2, offset1, offset2;
24694 extract_base_offset_in_addr (*mem1, &base1, &offset1);
24695 extract_base_offset_in_addr (*mem2, &base2, &offset2);
24696
24697 /* Make sure at least one memory is in base+offset form. */
24698 if (!(base1 && offset1) && !(base2 && offset2))
24699 return false;
24700
24701 /* If both mems already use the same base register, just check the
24702 offsets. */
24703 if (base1 && base2 && rtx_equal_p (base1, base2))
24704 {
24705 if (!offset1 || !offset2)
24706 return false;
24707
24708 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
24709 return true;
24710
24711 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)))
24712 {
24713 *reversed = true;
24714 return true;
24715 }
24716
24717 return false;
24718 }
24719
24720 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
24721 guarantee that the values are consecutive. */
24722 if (MEM_EXPR (*mem1)
24723 && MEM_EXPR (*mem2)
24724 && MEM_OFFSET_KNOWN_P (*mem1)
24725 && MEM_OFFSET_KNOWN_P (*mem2))
24726 {
24727 poly_int64 expr_offset1;
24728 poly_int64 expr_offset2;
24729 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
24730 &expr_offset1);
24731 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
24732 &expr_offset2);
24733 if (!expr_base1
24734 || !expr_base2
24735 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
24736 return false;
24737
24738 expr_offset1 += MEM_OFFSET (*mem1);
24739 expr_offset2 += MEM_OFFSET (*mem2);
24740
24741 if (known_eq (expr_offset1 + size1, expr_offset2))
24742 ;
24743 else if (known_eq (expr_offset2 + size2, expr_offset1))
24744 *reversed = true;
24745 else
24746 return false;
24747
24748 if (base2)
24749 {
24750 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
24751 expr_offset1 - expr_offset2);
24752 *mem1 = replace_equiv_address_nv (*mem1, addr1);
24753 }
24754 else
24755 {
24756 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
24757 expr_offset2 - expr_offset1);
24758 *mem2 = replace_equiv_address_nv (*mem2, addr2);
24759 }
24760 return true;
24761 }
24762
24763 return false;
24764}
24765
350013bc
BC
24766/* Given OPERANDS of consecutive load/store, check if we can merge
24767 them into ldp/stp. LOAD is true if they are load instructions.
24768 MODE is the mode of memory operands. */
24769
24770bool
24771aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
b8506a8a 24772 machine_mode mode)
350013bc 24773{
350013bc 24774 enum reg_class rclass_1, rclass_2;
526e1639 24775 rtx mem_1, mem_2, reg_1, reg_2;
350013bc
BC
24776
24777 if (load)
24778 {
24779 mem_1 = operands[1];
24780 mem_2 = operands[3];
24781 reg_1 = operands[0];
24782 reg_2 = operands[2];
24783 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
24784 if (REGNO (reg_1) == REGNO (reg_2))
24785 return false;
526e1639
RS
24786 if (reg_overlap_mentioned_p (reg_1, mem_2))
24787 return false;
350013bc
BC
24788 }
24789 else
24790 {
24791 mem_1 = operands[0];
24792 mem_2 = operands[2];
24793 reg_1 = operands[1];
24794 reg_2 = operands[3];
24795 }
24796
bf84ac44
AP
24797 /* The mems cannot be volatile. */
24798 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
24799 return false;
24800
54700e2e
AP
24801 /* If we have SImode and slow unaligned ldp,
24802 check that the alignment is at least 8 bytes. */
24803 if (mode == SImode
24804 && (aarch64_tune_params.extra_tuning_flags
24805 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
24806 && !optimize_size
24807 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
24808 return false;
24809
350013bc 24810 /* Check if the addresses are in the form of [base+offset]. */
526e1639
RS
24811 bool reversed = false;
24812 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
350013bc
BC
24813 return false;
24814
dfe1da23
JW
24815 /* The operands must be of the same size. */
24816 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
526e1639 24817 GET_MODE_SIZE (GET_MODE (mem_2))));
350013bc 24818
9b56ec11
JW
24819 /* One of the memory accesses must be a mempair operand.
24820 If it is not the first one, they need to be swapped by the
24821 peephole. */
24822 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
24823 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
24824 return false;
24825
350013bc
BC
24826 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
24827 rclass_1 = FP_REGS;
24828 else
24829 rclass_1 = GENERAL_REGS;
24830
24831 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
24832 rclass_2 = FP_REGS;
24833 else
24834 rclass_2 = GENERAL_REGS;
24835
24836 /* Check if the registers are of same class. */
24837 if (rclass_1 != rclass_2)
24838 return false;
24839
24840 return true;
24841}
24842
9b56ec11
JW
24843/* Given OPERANDS of consecutive load/store that can be merged,
24844 swap them if they are not in ascending order. */
24845void
24846aarch64_swap_ldrstr_operands (rtx* operands, bool load)
24847{
526e1639
RS
24848 int mem_op = load ? 1 : 0;
24849 bool reversed = false;
24850 if (!aarch64_check_consecutive_mems (operands + mem_op,
24851 operands + mem_op + 2, &reversed))
24852 gcc_unreachable ();
9b56ec11 24853
526e1639 24854 if (reversed)
9b56ec11
JW
24855 {
24856 /* Irrespective of whether this is a load or a store,
24857 we do the same swap. */
24858 std::swap (operands[0], operands[2]);
24859 std::swap (operands[1], operands[3]);
24860 }
24861}
24862
d0b51297
JW
24863/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
24864 comparison between the two. */
24865int
24866aarch64_host_wide_int_compare (const void *x, const void *y)
24867{
24868 return wi::cmps (* ((const HOST_WIDE_INT *) x),
24869 * ((const HOST_WIDE_INT *) y));
24870}
24871
24872/* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
24873 other pointing to a REG rtx containing an offset, compare the offsets
24874 of the two pairs.
24875
24876 Return:
24877
24878 1 iff offset (X) > offset (Y)
24879 0 iff offset (X) == offset (Y)
24880 -1 iff offset (X) < offset (Y) */
24881int
24882aarch64_ldrstr_offset_compare (const void *x, const void *y)
24883{
24884 const rtx * operands_1 = (const rtx *) x;
24885 const rtx * operands_2 = (const rtx *) y;
24886 rtx mem_1, mem_2, base, offset_1, offset_2;
24887
24888 if (MEM_P (operands_1[0]))
24889 mem_1 = operands_1[0];
24890 else
24891 mem_1 = operands_1[1];
24892
24893 if (MEM_P (operands_2[0]))
24894 mem_2 = operands_2[0];
24895 else
24896 mem_2 = operands_2[1];
24897
24898 /* Extract the offsets. */
24899 extract_base_offset_in_addr (mem_1, &base, &offset_1);
24900 extract_base_offset_in_addr (mem_2, &base, &offset_2);
24901
24902 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
24903
24904 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
24905}
24906
350013bc
BC
24907/* Given OPERANDS of consecutive load/store, check if we can merge
24908 them into ldp/stp by adjusting the offset. LOAD is true if they
24909 are load instructions. MODE is the mode of memory operands.
24910
24911 Given below consecutive stores:
24912
24913 str w1, [xb, 0x100]
24914 str w1, [xb, 0x104]
24915 str w1, [xb, 0x108]
24916 str w1, [xb, 0x10c]
24917
24918 Though the offsets are out of the range supported by stp, we can
24919 still pair them after adjusting the offset, like:
24920
24921 add scratch, xb, 0x100
24922 stp w1, w1, [scratch]
24923 stp w1, w1, [scratch, 0x8]
24924
24925 The peephole patterns detecting this opportunity should guarantee
24926 the scratch register is available. */
24927
24928bool
24929aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
cd91a084 24930 machine_mode mode)
350013bc 24931{
34d7854d
JW
24932 const int num_insns = 4;
24933 enum reg_class rclass;
24934 HOST_WIDE_INT offvals[num_insns], msize;
24935 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
350013bc
BC
24936
24937 if (load)
24938 {
34d7854d
JW
24939 for (int i = 0; i < num_insns; i++)
24940 {
24941 reg[i] = operands[2 * i];
24942 mem[i] = operands[2 * i + 1];
24943
24944 gcc_assert (REG_P (reg[i]));
24945 }
d0b51297
JW
24946
24947 /* Do not attempt to merge the loads if the loads clobber each other. */
24948 for (int i = 0; i < 8; i += 2)
24949 for (int j = i + 2; j < 8; j += 2)
24950 if (reg_overlap_mentioned_p (operands[i], operands[j]))
24951 return false;
350013bc
BC
24952 }
24953 else
34d7854d
JW
24954 for (int i = 0; i < num_insns; i++)
24955 {
24956 mem[i] = operands[2 * i];
24957 reg[i] = operands[2 * i + 1];
24958 }
350013bc 24959
34d7854d
JW
24960 /* Skip if memory operand is by itself valid for ldp/stp. */
24961 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
bf84ac44
AP
24962 return false;
24963
34d7854d
JW
24964 for (int i = 0; i < num_insns; i++)
24965 {
24966 /* The mems cannot be volatile. */
24967 if (MEM_VOLATILE_P (mem[i]))
24968 return false;
24969
24970 /* Check if the addresses are in the form of [base+offset]. */
24971 extract_base_offset_in_addr (mem[i], base + i, offset + i);
24972 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
24973 return false;
24974 }
24975
363b395b
JW
24976 /* Check if the registers are of same class. */
24977 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
24978 ? FP_REGS : GENERAL_REGS;
24979
24980 for (int i = 1; i < num_insns; i++)
24981 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
24982 {
24983 if (rclass != FP_REGS)
24984 return false;
24985 }
24986 else
24987 {
24988 if (rclass != GENERAL_REGS)
24989 return false;
24990 }
24991
24992 /* Only the last register in the order in which they occur
24993 may be clobbered by the load. */
24994 if (rclass == GENERAL_REGS && load)
24995 for (int i = 0; i < num_insns - 1; i++)
34d7854d
JW
24996 if (reg_mentioned_p (reg[i], mem[i]))
24997 return false;
350013bc
BC
24998
24999 /* Check if the bases are the same. */
34d7854d
JW
25000 for (int i = 0; i < num_insns - 1; i++)
25001 if (!rtx_equal_p (base[i], base[i + 1]))
25002 return false;
25003
25004 for (int i = 0; i < num_insns; i++)
25005 offvals[i] = INTVAL (offset[i]);
350013bc 25006
cd91a084 25007 msize = GET_MODE_SIZE (mode).to_constant ();
d0b51297
JW
25008
25009 /* Check if the offsets can be put in the right order to do a ldp/stp. */
34d7854d
JW
25010 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
25011 aarch64_host_wide_int_compare);
d0b51297
JW
25012
25013 if (!(offvals[1] == offvals[0] + msize
25014 && offvals[3] == offvals[2] + msize))
350013bc
BC
25015 return false;
25016
d0b51297
JW
25017 /* Check that offsets are within range of each other. The ldp/stp
25018 instructions have 7 bit immediate offsets, so use 0x80. */
25019 if (offvals[2] - offvals[0] >= msize * 0x80)
25020 return false;
350013bc 25021
d0b51297
JW
25022 /* The offsets must be aligned with respect to each other. */
25023 if (offvals[0] % msize != offvals[2] % msize)
25024 return false;
25025
54700e2e
AP
25026 /* If we have SImode and slow unaligned ldp,
25027 check that the alignment is at least 8 bytes. */
25028 if (mode == SImode
25029 && (aarch64_tune_params.extra_tuning_flags
34d7854d 25030 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
54700e2e 25031 && !optimize_size
34d7854d 25032 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
54700e2e
AP
25033 return false;
25034
350013bc
BC
25035 return true;
25036}
25037
25038/* Given OPERANDS of consecutive load/store, this function pairs them
d0b51297
JW
25039 into LDP/STP after adjusting the offset. It depends on the fact
25040 that the operands can be sorted so the offsets are correct for STP.
350013bc
BC
25041 MODE is the mode of memory operands. CODE is the rtl operator
25042 which should be applied to all memory operands, it's SIGN_EXTEND,
25043 ZERO_EXTEND or UNKNOWN. */
25044
25045bool
25046aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
cd91a084 25047 machine_mode mode, RTX_CODE code)
350013bc 25048{
d0b51297 25049 rtx base, offset_1, offset_3, t1, t2;
350013bc 25050 rtx mem_1, mem_2, mem_3, mem_4;
d0b51297
JW
25051 rtx temp_operands[8];
25052 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
25053 stp_off_upper_limit, stp_off_lower_limit, msize;
9b56ec11 25054
d0b51297
JW
25055 /* We make changes on a copy as we may still bail out. */
25056 for (int i = 0; i < 8; i ++)
25057 temp_operands[i] = operands[i];
9b56ec11 25058
b662250c
BC
25059 /* Sort the operands. Note for cases such as the following:
25060 [base + 0x310] = A
25061 [base + 0x320] = B
25062 [base + 0x330] = C
25063 [base + 0x320] = D
25064 We need stable sorting, otherwise wrong data may be stored to offset 0x320.
25065 Also note the dead store in the above case should be optimized away, but
25066 there is no guarantee of that here. */
25067 gcc_stablesort (temp_operands, 4, 2 * sizeof (rtx *),
25068 aarch64_ldrstr_offset_compare);
9b56ec11 25069
f6af9c21
RE
25070 /* Copy the memory operands so that if we have to bail for some
25071 reason the original addresses are unchanged. */
350013bc
BC
25072 if (load)
25073 {
f6af9c21
RE
25074 mem_1 = copy_rtx (temp_operands[1]);
25075 mem_2 = copy_rtx (temp_operands[3]);
25076 mem_3 = copy_rtx (temp_operands[5]);
25077 mem_4 = copy_rtx (temp_operands[7]);
350013bc
BC
25078 }
25079 else
25080 {
f6af9c21
RE
25081 mem_1 = copy_rtx (temp_operands[0]);
25082 mem_2 = copy_rtx (temp_operands[2]);
25083 mem_3 = copy_rtx (temp_operands[4]);
25084 mem_4 = copy_rtx (temp_operands[6]);
350013bc
BC
25085 gcc_assert (code == UNKNOWN);
25086 }
25087
9b56ec11 25088 extract_base_offset_in_addr (mem_1, &base, &offset_1);
d0b51297
JW
25089 extract_base_offset_in_addr (mem_3, &base, &offset_3);
25090 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
25091 && offset_3 != NULL_RTX);
350013bc 25092
d0b51297 25093 /* Adjust offset so it can fit in LDP/STP instruction. */
cd91a084 25094 msize = GET_MODE_SIZE (mode).to_constant ();
d0b51297
JW
25095 stp_off_upper_limit = msize * (0x40 - 1);
25096 stp_off_lower_limit = - msize * 0x40;
350013bc 25097
d0b51297
JW
25098 off_val_1 = INTVAL (offset_1);
25099 off_val_3 = INTVAL (offset_3);
25100
25101 /* The base offset is optimally half way between the two STP/LDP offsets. */
25102 if (msize <= 4)
25103 base_off = (off_val_1 + off_val_3) / 2;
25104 else
25105 /* However, due to issues with negative LDP/STP offset generation for
25106 larger modes, for DF, DI and vector modes. we must not use negative
25107 addresses smaller than 9 signed unadjusted bits can store. This
25108 provides the most range in this case. */
25109 base_off = off_val_1;
25110
25111 /* Adjust the base so that it is aligned with the addresses but still
25112 optimal. */
25113 if (base_off % msize != off_val_1 % msize)
25114 /* Fix the offset, bearing in mind we want to make it bigger not
25115 smaller. */
25116 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
25117 else if (msize <= 4)
25118 /* The negative range of LDP/STP is one larger than the positive range. */
25119 base_off += msize;
25120
25121 /* Check if base offset is too big or too small. We can attempt to resolve
25122 this issue by setting it to the maximum value and seeing if the offsets
25123 still fit. */
25124 if (base_off >= 0x1000)
350013bc 25125 {
d0b51297
JW
25126 base_off = 0x1000 - 1;
25127 /* We must still make sure that the base offset is aligned with respect
700d4cb0 25128 to the address. But it may not be made any bigger. */
d0b51297 25129 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
350013bc
BC
25130 }
25131
d0b51297
JW
25132 /* Likewise for the case where the base is too small. */
25133 if (base_off <= -0x1000)
350013bc 25134 {
d0b51297
JW
25135 base_off = -0x1000 + 1;
25136 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
350013bc
BC
25137 }
25138
d0b51297
JW
25139 /* Offset of the first STP/LDP. */
25140 new_off_1 = off_val_1 - base_off;
25141
25142 /* Offset of the second STP/LDP. */
25143 new_off_3 = off_val_3 - base_off;
350013bc 25144
d0b51297
JW
25145 /* The offsets must be within the range of the LDP/STP instructions. */
25146 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
25147 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
350013bc
BC
25148 return false;
25149
d0b51297
JW
25150 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
25151 new_off_1), true);
25152 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
25153 new_off_1 + msize), true);
25154 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
25155 new_off_3), true);
25156 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
25157 new_off_3 + msize), true);
25158
25159 if (!aarch64_mem_pair_operand (mem_1, mode)
25160 || !aarch64_mem_pair_operand (mem_3, mode))
25161 return false;
350013bc
BC
25162
25163 if (code == ZERO_EXTEND)
25164 {
25165 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
25166 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
25167 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
25168 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
25169 }
25170 else if (code == SIGN_EXTEND)
25171 {
25172 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
25173 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
25174 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
25175 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
25176 }
25177
25178 if (load)
25179 {
d0b51297 25180 operands[0] = temp_operands[0];
350013bc 25181 operands[1] = mem_1;
d0b51297 25182 operands[2] = temp_operands[2];
350013bc 25183 operands[3] = mem_2;
d0b51297 25184 operands[4] = temp_operands[4];
350013bc 25185 operands[5] = mem_3;
d0b51297 25186 operands[6] = temp_operands[6];
350013bc
BC
25187 operands[7] = mem_4;
25188 }
25189 else
25190 {
25191 operands[0] = mem_1;
d0b51297 25192 operands[1] = temp_operands[1];
350013bc 25193 operands[2] = mem_2;
d0b51297 25194 operands[3] = temp_operands[3];
350013bc 25195 operands[4] = mem_3;
d0b51297 25196 operands[5] = temp_operands[5];
350013bc 25197 operands[6] = mem_4;
d0b51297 25198 operands[7] = temp_operands[7];
350013bc
BC
25199 }
25200
25201 /* Emit adjusting instruction. */
d0b51297 25202 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
350013bc 25203 /* Emit ldp/stp instructions. */
f7df4a84
RS
25204 t1 = gen_rtx_SET (operands[0], operands[1]);
25205 t2 = gen_rtx_SET (operands[2], operands[3]);
350013bc 25206 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
f7df4a84
RS
25207 t1 = gen_rtx_SET (operands[4], operands[5]);
25208 t2 = gen_rtx_SET (operands[6], operands[7]);
350013bc
BC
25209 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
25210 return true;
25211}
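/* Worked example (a sketch; the scratch register name is illustrative) for
   the SImode stores in the comment above aarch64_operands_adjust_ok_for_ldpstp,
   with offsets 0x100/0x104/0x108/0x10c and msize == 4: base_off starts at
   the midpoint 0x104 and is then biased by msize to 0x108 to favour the
   slightly larger negative range, giving STP offsets of -8 and 0:

     add xscratch, xb, #0x108
     stp w1, w1, [xscratch, -8]
     stp w1, w1, [xscratch]

   The exact base chosen also depends on the alignment and range clamping
   above.  */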
25212
76a34e3f
RS
25213/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
25214 it isn't worth branching around empty masked ops (including masked
25215 stores). */
25216
25217static bool
25218aarch64_empty_mask_is_expensive (unsigned)
25219{
25220 return false;
25221}
25222
1b1e81f8
JW
25223/* Return 1 if pseudo register should be created and used to hold
25224 GOT address for PIC code. */
25225
25226bool
25227aarch64_use_pseudo_pic_reg (void)
25228{
25229 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
25230}
25231
7b841a12
JW
25232/* Implement TARGET_UNSPEC_MAY_TRAP_P. */
25233
25234static int
25235aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
25236{
25237 switch (XINT (x, 1))
25238 {
25239 case UNSPEC_GOTSMALLPIC:
25240 case UNSPEC_GOTSMALLPIC28K:
25241 case UNSPEC_GOTTINYPIC:
25242 return 0;
25243 default:
25244 break;
25245 }
25246
25247 return default_unspec_may_trap_p (x, flags);
25248}
25249
39252973
KT
25250
25251/* If X is a positive CONST_DOUBLE with a value that is a power of 2
25252 return the log2 of that value. Otherwise return -1. */
25253
25254int
25255aarch64_fpconst_pow_of_2 (rtx x)
25256{
25257 const REAL_VALUE_TYPE *r;
25258
25259 if (!CONST_DOUBLE_P (x))
25260 return -1;
25261
25262 r = CONST_DOUBLE_REAL_VALUE (x);
25263
25264 if (REAL_VALUE_NEGATIVE (*r)
25265 || REAL_VALUE_ISNAN (*r)
25266 || REAL_VALUE_ISINF (*r)
25267 || !real_isinteger (r, DFmode))
25268 return -1;
25269
25270 return exact_log2 (real_to_integer (r));
25271}
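/* Illustrative values: 1.0 -> 0, 8.0 -> 3, 65536.0 -> 16; 6.0, 0.5, -4.0,
   NaNs and infinities all return -1.  */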
25272
188d0079
JH
25273/* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
25274 power of 2 (i.e 1/2^n) return the number of float bits. e.g. for x==(1/2^n)
25275 return n. Otherwise return -1. */
25276
25277int
25278aarch64_fpconst_pow2_recip (rtx x)
25279{
25280 REAL_VALUE_TYPE r0;
25281
25282 if (!CONST_DOUBLE_P (x))
25283 return -1;
25284
25285 r0 = *CONST_DOUBLE_REAL_VALUE (x);
25286 if (exact_real_inverse (DFmode, &r0)
25287 && !REAL_VALUE_NEGATIVE (r0))
25288 {
25289 int ret = exact_log2 (real_to_integer (&r0));
25290 if (ret >= 1 && ret <= 32)
25291 return ret;
25292 }
25293 return -1;
25294}
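/* Illustrative values: 0.25 (== 1/2^2) -> 2, 0.125 -> 3, 2^-32 -> 32;
   1.0, 0.75, negative values and 2^-33 all return -1.  */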
25295
39252973
KT
25296/* If X is a vector of equal CONST_DOUBLE values and that value is
25297 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
25298
25299int
25300aarch64_vec_fpconst_pow_of_2 (rtx x)
25301{
6a70badb 25302 int nelts;
568b9c0e 25303 if (!CONST_VECTOR_P (x)
6a70badb 25304 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
39252973
KT
25305 return -1;
25306
25307 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
25308 return -1;
25309
25310 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
25311 if (firstval <= 0)
25312 return -1;
25313
6a70badb 25314 for (int i = 1; i < nelts; i++)
39252973
KT
25315 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
25316 return -1;
25317
25318 return firstval;
25319}
25320
11e554b3
JG
25321/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
25322 to float.
25323
25324 __fp16 always promotes through this hook.
25325 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
25326 through the generic excess precision logic rather than here. */
25327
c2ec330c
AL
25328static tree
25329aarch64_promoted_type (const_tree t)
25330{
11e554b3
JG
25331 if (SCALAR_FLOAT_TYPE_P (t)
25332 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
c2ec330c 25333 return float_type_node;
11e554b3 25334
c2ec330c
AL
25335 return NULL_TREE;
25336}
ee62a5a6
RS
25337
25338/* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
25339
25340static bool
9acc9cbe 25341aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
ee62a5a6
RS
25342 optimization_type opt_type)
25343{
25344 switch (op)
25345 {
25346 case rsqrt_optab:
9acc9cbe 25347 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
ee62a5a6
RS
25348
25349 default:
25350 return true;
25351 }
25352}
25353
43cacb12
RS
25354/* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
25355
25356static unsigned int
25357aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
25358 int *offset)
25359{
25360 /* Polynomial invariant 1 == (VG / 2) - 1. */
25361 gcc_assert (i == 1);
25362 *factor = 2;
25363 *offset = 1;
25364 return AARCH64_DWARF_VG;
25365}
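/* For example (illustrative): with 256-bit SVE vectors the DWARF VG
   pseudo-register holds 4 (the vector length in 64-bit granules), so the
   runtime value of indeterminate 1 is 4 / 2 - 1 == 1, i.e. the number of
   additional 128-bit blocks beyond the minimum vector length.  */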
25366
11e554b3
JG
25367/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
25368 if MODE is HFmode, and punt to the generic implementation otherwise. */
25369
25370static bool
7c5bd57a 25371aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
11e554b3
JG
25372{
25373 return (mode == HFmode
25374 ? true
25375 : default_libgcc_floating_mode_supported_p (mode));
25376}
25377
2e5f8203
JG
25378/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
25379 if MODE is HFmode, and punt to the generic implementation otherwise. */
25380
25381static bool
18e2a8b8 25382aarch64_scalar_mode_supported_p (scalar_mode mode)
2e5f8203
JG
25383{
25384 return (mode == HFmode
25385 ? true
25386 : default_scalar_mode_supported_p (mode));
25387}
25388
11e554b3
JG
25389/* Set the value of FLT_EVAL_METHOD.
25390 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
25391
25392 0: evaluate all operations and constants, whose semantic type has at
25393 most the range and precision of type float, to the range and
25394 precision of float; evaluate all other operations and constants to
25395 the range and precision of the semantic type;
25396
25397 N, where _FloatN is a supported interchange floating type
25398 evaluate all operations and constants, whose semantic type has at
25399 most the range and precision of _FloatN type, to the range and
25400 precision of the _FloatN type; evaluate all other operations and
25401 constants to the range and precision of the semantic type;
25402
25403 If we have the ARMv8.2-A extensions then we support _Float16 in native
25404 precision, so we should set this to 16. Otherwise, we support the type,
25405 but want to evaluate expressions in float precision, so set this to
25406 0. */
25407
25408static enum flt_eval_method
25409aarch64_excess_precision (enum excess_precision_type type)
25410{
25411 switch (type)
25412 {
25413 case EXCESS_PRECISION_TYPE_FAST:
25414 case EXCESS_PRECISION_TYPE_STANDARD:
25415 /* We can calculate either in 16-bit range and precision or
25416 32-bit range and precision. Make that decision based on whether
25417 we have native support for the ARMv8.2-A 16-bit floating-point
25418 instructions or not. */
25419 return (TARGET_FP_F16INST
25420 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
25421 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
25422 case EXCESS_PRECISION_TYPE_IMPLICIT:
f19a3270 25423 case EXCESS_PRECISION_TYPE_FLOAT16:
11e554b3
JG
25424 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
25425 default:
25426 gcc_unreachable ();
25427 }
25428 return FLT_EVAL_METHOD_UNPREDICTABLE;
25429}
25430
b48d6421
KT
25431/* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
25432 scheduled for speculative execution. Reject the long-running division
25433 and square-root instructions. */
25434
25435static bool
25436aarch64_sched_can_speculate_insn (rtx_insn *insn)
25437{
25438 switch (get_attr_type (insn))
25439 {
25440 case TYPE_SDIV:
25441 case TYPE_UDIV:
25442 case TYPE_FDIVS:
25443 case TYPE_FDIVD:
25444 case TYPE_FSQRTS:
25445 case TYPE_FSQRTD:
25446 case TYPE_NEON_FP_SQRT_S:
25447 case TYPE_NEON_FP_SQRT_D:
25448 case TYPE_NEON_FP_SQRT_S_Q:
25449 case TYPE_NEON_FP_SQRT_D_Q:
25450 case TYPE_NEON_FP_DIV_S:
25451 case TYPE_NEON_FP_DIV_D:
25452 case TYPE_NEON_FP_DIV_S_Q:
25453 case TYPE_NEON_FP_DIV_D_Q:
25454 return false;
25455 default:
25456 return true;
25457 }
25458}
25459
43cacb12
RS
25460/* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
25461
25462static int
25463aarch64_compute_pressure_classes (reg_class *classes)
25464{
25465 int i = 0;
25466 classes[i++] = GENERAL_REGS;
25467 classes[i++] = FP_REGS;
25468 /* PR_REGS isn't a useful pressure class because many predicate pseudo
25469 registers need to go in PR_LO_REGS at some point during their
25470 lifetime. Splitting it into two halves has the effect of making
25471 all predicates count against PR_LO_REGS, so that we try whenever
25472 possible to restrict the number of live predicates to 8. This
25473 greatly reduces the amount of spilling in certain loops. */
25474 classes[i++] = PR_LO_REGS;
25475 classes[i++] = PR_HI_REGS;
25476 return i;
25477}
25478
25479/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
25480
25481static bool
25482aarch64_can_change_mode_class (machine_mode from,
25483 machine_mode to, reg_class_t)
25484{
76607e7e
RS
25485 unsigned int from_flags = aarch64_classify_vector_mode (from);
25486 unsigned int to_flags = aarch64_classify_vector_mode (to);
25487
25488 bool from_sve_p = (from_flags & VEC_ANY_SVE);
25489 bool to_sve_p = (to_flags & VEC_ANY_SVE);
25490
25491 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
25492 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
25493
38e62001
RS
25494 bool from_pred_p = (from_flags & VEC_SVE_PRED);
25495 bool to_pred_p = (to_flags & VEC_SVE_PRED);
25496
66f206b8
JW
25497 bool from_full_advsimd_struct_p = (from_flags == (VEC_ADVSIMD | VEC_STRUCT));
25498 bool to_partial_advsimd_struct_p = (to_flags == (VEC_ADVSIMD | VEC_STRUCT
25499 | VEC_PARTIAL));
25500
38e62001
RS
25501 /* Don't allow changes between predicate modes and other modes.
25502 Only predicate registers can hold predicate modes and only
25503 non-predicate registers can hold non-predicate modes, so any
25504 attempt to mix them would require a round trip through memory. */
25505 if (from_pred_p != to_pred_p)
25506 return false;
25507
76607e7e
RS
25508 /* Don't allow changes between partial SVE modes and other modes.
25509 The contents of partial SVE modes are distributed evenly across
25510 the register, whereas GCC expects them to be clustered together. */
25511 if (from_partial_sve_p != to_partial_sve_p)
25512 return false;
25513
25514 /* Similarly reject changes between partial SVE modes that have
25515 different patterns of significant and insignificant bits. */
25516 if (from_partial_sve_p
25517 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
25518 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
25519 return false;
25520
66f206b8
JW
25521 /* Don't allow changes between partial and full Advanced SIMD structure
25522 modes. */
25523 if (from_full_advsimd_struct_p && to_partial_advsimd_struct_p)
25524 return false;
25525
38e62001
RS
25526 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
25527 {
25528 /* Don't allow changes between SVE modes and other modes that might
25529 be bigger than 128 bits. In particular, OImode, CImode and XImode
25530 divide into 128-bit quantities while SVE modes divide into
25531 BITS_PER_SVE_VECTOR quantities. */
25532 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
25533 return false;
25534 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
25535 return false;
25536 }
25537
002092be
RS
25538 if (BYTES_BIG_ENDIAN)
25539 {
002092be
RS
25540 /* Don't allow changes between SVE data modes and non-SVE modes.
25541 See the comment at the head of aarch64-sve.md for details. */
25542 if (from_sve_p != to_sve_p)
25543 return false;
25544
25545 /* Don't allow changes in element size: lane 0 of the new vector
25546 would not then be lane 0 of the old vector. See the comment
25547 above aarch64_maybe_expand_sve_subreg_move for a more detailed
25548 description.
25549
25550 In the worst case, this forces a register to be spilled in
25551 one mode and reloaded in the other, which handles the
25552 endianness correctly. */
25553 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
25554 return false;
25555 }
43cacb12
RS
25556 return true;
25557}
25558
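/* Editorial examples of the rules above (not part of the original source):
   VNx16BI (an SVE predicate mode) cannot change to VNx16QI (SVE data),
   since that would mix predicate and non-predicate registers; VNx2SI
   (a partial SVE mode, 32-bit elements in 64-bit containers) cannot
   change to VNx4SI (the full mode), because the significant bits are laid
   out differently; a change such as V4SI <-> V2DI is not rejected here,
   since neither mode is SVE or partial.  */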
5cce8171
RS
25559/* Implement TARGET_EARLY_REMAT_MODES. */
25560
25561static void
25562aarch64_select_early_remat_modes (sbitmap modes)
25563{
25564 /* SVE values are not normally live across a call, so it should be
25565 worth doing early rematerialization even in VL-specific mode. */
25566 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
5c38705d
RS
25567 if (aarch64_sve_mode_p ((machine_mode) i))
25568 bitmap_set_bit (modes, i);
5cce8171
RS
25569}
25570
c0111dc4
RE
25571/* Override the default target speculation_safe_value. */
25572static rtx
25573aarch64_speculation_safe_value (machine_mode mode,
25574 rtx result, rtx val, rtx failval)
25575{
25576 /* Maybe we should warn if falling back to hard barriers. They are
25577 likely to be noticeably more expensive than the alternative below. */
25578 if (!aarch64_track_speculation)
25579 return default_speculation_safe_value (mode, result, val, failval);
25580
25581 if (!REG_P (val))
25582 val = copy_to_mode_reg (mode, val);
25583
25584 if (!aarch64_reg_or_zero (failval, mode))
25585 failval = copy_to_mode_reg (mode, failval);
25586
21cebf90 25587 emit_insn (gen_despeculate_copy (mode, result, val, failval));
c0111dc4
RE
25588 return result;
25589}
25590
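/* Editorial illustration (not from the original source): the hook above
   backs __builtin_speculation_safe_value.  In user code such as

       if (idx < size)
         val = array[__builtin_speculation_safe_value (idx, 0)];

   the builtin yields IDX on the architecturally taken path but the
   failval 0 under misspeculation.  With -mtrack-speculation this is
   typically lowered to a conditional select guarded by the tracked
   speculation state followed by a CSDB; otherwise the hook falls back to
   default_speculation_safe_value, which emits a full speculation
   barrier.  */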
2d56d6ba
KT
25591/* Implement TARGET_ESTIMATED_POLY_VALUE.
25592 Look into the tuning structure for an estimate.
64432b68
KT
25593 KIND specifies the type of requested estimate: min, max or likely.
25594 For cores with a known SVE width all three estimates are the same.
25595 For generic SVE tuning we want to distinguish the maximum estimate from
25596 the minimum and likely ones.
25597 The likely estimate is the same as the minimum in that case to give a
25598 conservative behavior of auto-vectorizing with SVE when it is a win
25599 even for 128-bit SVE.
25600 When SVE width information is available VAL.coeffs[1] is multiplied by
25601 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
2d56d6ba
KT
25602
25603static HOST_WIDE_INT
64432b68
KT
25604aarch64_estimated_poly_value (poly_int64 val,
25605 poly_value_estimate_kind kind
25606 = POLY_VALUE_LIKELY)
2d56d6ba 25607{
fa3ca615 25608 unsigned int width_source = aarch64_tune_params.sve_width;
2d56d6ba 25609
64432b68
KT
25610 /* If there is no core-specific information then the minimum and likely
25611 values are based on 128-bit vectors and the maximum is based on
25612 the architectural maximum of 2048 bits. */
2d56d6ba 25613 if (width_source == SVE_SCALABLE)
64432b68
KT
25614 switch (kind)
25615 {
25616 case POLY_VALUE_MIN:
25617 case POLY_VALUE_LIKELY:
25618 return val.coeffs[0];
25619 case POLY_VALUE_MAX:
25620 return val.coeffs[0] + val.coeffs[1] * 15;
25621 }
2d56d6ba 25622
fa3ca615
RS
25623 /* Allow sve_width to be a bitmask of different VLs, treating the lowest
25624 as likely. This could be made more general if future -mtune options
25625 need it to be. */
25626 if (kind == POLY_VALUE_MAX)
25627 width_source = 1 << floor_log2 (width_source);
25628 else
25629 width_source = least_bit_hwi (width_source);
25630
64432b68 25631 /* If the core provides width information, use that. */
2d56d6ba
KT
25632 HOST_WIDE_INT over_128 = width_source - 128;
25633 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
25634}
25635
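/* Worked example (editorial, not part of the original source): for the
   generic SVE tuning (sve_width == SVE_SCALABLE) and a vector size of
   16 + 16x bytes, the minimum and likely estimates are 16 while the
   maximum is 16 + 16 * 15 = 256, matching the 2048-bit architectural
   ceiling.  For a core tuned with sve_width == 256, over_128 is 128 and
   every estimate becomes 16 + 16 * 128 / 128 = 32 bytes.  */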
d9186814
SE
25636
25637/* Return true for types that could be supported as SIMD return or
25638 argument types. */
25639
25640static bool
25641supported_simd_type (tree t)
25642{
25643 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
25644 {
25645 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
25646 return s == 1 || s == 2 || s == 4 || s == 8;
25647 }
25648 return false;
25649}
25650
25651/* Return true for types that currently are supported as SIMD return
25652 or argument types. */
25653
25654static bool
25655currently_supported_simd_type (tree t, tree b)
25656{
25657 if (COMPLEX_FLOAT_TYPE_P (t))
25658 return false;
25659
25660 if (TYPE_SIZE (t) != TYPE_SIZE (b))
25661 return false;
25662
25663 return supported_simd_type (t);
25664}
25665
25666/* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
25667
25668static int
25669aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
25670 struct cgraph_simd_clone *clonei,
25671 tree base_type, int num)
25672{
39916cea 25673 tree t, ret_type;
abe93733
YY
25674 unsigned int elt_bits, count;
25675 unsigned HOST_WIDE_INT const_simdlen;
25676 poly_uint64 vec_bits;
d9186814
SE
25677
25678 if (!TARGET_SIMD)
25679 return 0;
25680
abe93733
YY
25681 /* For now, SVE simd clones won't produce an illegal simdlen, so only
25682 check constant simdlens here. */
25683 if (maybe_ne (clonei->simdlen, 0U)
25684 && clonei->simdlen.is_constant (&const_simdlen)
25685 && (const_simdlen < 2
25686 || const_simdlen > 1024
25687 || (const_simdlen & (const_simdlen - 1)) != 0))
d9186814
SE
25688 {
25689 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
abe93733 25690 "unsupported simdlen %wd", const_simdlen);
d9186814
SE
25691 return 0;
25692 }
25693
25694 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
25695 if (TREE_CODE (ret_type) != VOID_TYPE
25696 && !currently_supported_simd_type (ret_type, base_type))
25697 {
25698 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
25699 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25700 "GCC does not currently support mixed size types "
25701 "for %<simd%> functions");
25702 else if (supported_simd_type (ret_type))
25703 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25704 "GCC does not currently support return type %qT "
25705 "for %<simd%> functions", ret_type);
25706 else
25707 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25708 "unsupported return type %qT for %<simd%> functions",
25709 ret_type);
25710 return 0;
25711 }
25712
fcefc59b
JJ
25713 int i;
25714 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
25715 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
25716
25717 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
25718 t && t != void_list_node; t = TREE_CHAIN (t), i++)
d9186814 25719 {
fcefc59b 25720 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
d9186814 25721
fcefc59b
JJ
25722 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
25723 && !currently_supported_simd_type (arg_type, base_type))
d9186814
SE
25724 {
25725 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
25726 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25727 "GCC does not currently support mixed size types "
25728 "for %<simd%> functions");
25729 else
25730 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25731 "GCC does not currently support argument type %qT "
25732 "for %<simd%> functions", arg_type);
25733 return 0;
25734 }
25735 }
25736
25737 clonei->vecsize_mangle = 'n';
25738 clonei->mask_mode = VOIDmode;
25739 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
abe93733 25740 if (known_eq (clonei->simdlen, 0U))
d9186814
SE
25741 {
25742 count = 2;
25743 vec_bits = (num == 0 ? 64 : 128);
abe93733 25744 clonei->simdlen = exact_div (vec_bits, elt_bits);
d9186814
SE
25745 }
25746 else
25747 {
25748 count = 1;
25749 vec_bits = clonei->simdlen * elt_bits;
abe93733
YY
25750 /* For now, SVE simd clones won't produce an illegal simdlen, so only
25751 check constant simdlens here. */
25752 if (clonei->simdlen.is_constant (&const_simdlen)
25753 && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
d9186814
SE
25754 {
25755 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
abe93733
YY
25756 "GCC does not currently support simdlen %wd for type %qT",
25757 const_simdlen, base_type);
d9186814
SE
25758 return 0;
25759 }
25760 }
25761 clonei->vecsize_int = vec_bits;
25762 clonei->vecsize_float = vec_bits;
25763 return count;
25764}
25765
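/* Editorial illustration (not from the original source): for

       #pragma omp declare simd notinbranch
       float f (float x);

   the base type is 32 bits wide and no simdlen is given, so the hook
   above requests two Advanced SIMD clones ('n' mangling): a 64-bit one
   with simdlen 2 and a 128-bit one with simdlen 4, mangled roughly as
   _ZGVnN2v_f and _ZGVnN4v_f.  An explicit simdlen(8) on a float argument
   would require a 256-bit vector and is rejected with the warning
   above.  */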
25766/* Implement TARGET_SIMD_CLONE_ADJUST. */
25767
25768static void
25769aarch64_simd_clone_adjust (struct cgraph_node *node)
25770{
25771 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
25772 use the correct ABI. */
25773
25774 tree t = TREE_TYPE (node->decl);
25775 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
25776 TYPE_ATTRIBUTES (t));
25777}
25778
25779/* Implement TARGET_SIMD_CLONE_USABLE. */
25780
25781static int
25782aarch64_simd_clone_usable (struct cgraph_node *node)
25783{
25784 switch (node->simdclone->vecsize_mangle)
25785 {
25786 case 'n':
25787 if (!TARGET_SIMD)
25788 return -1;
25789 return 0;
25790 default:
25791 gcc_unreachable ();
25792 }
25793}
25794
497f281c
SE
25795/* Implement TARGET_COMP_TYPE_ATTRIBUTES */
25796
25797static int
25798aarch64_comp_type_attributes (const_tree type1, const_tree type2)
25799{
31427b97
RS
25800 auto check_attr = [&](const char *name) {
25801 tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
25802 tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
25803 if (!attr1 && !attr2)
25804 return true;
25805
25806 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
25807 };
25808
25809 if (!check_attr ("aarch64_vector_pcs"))
25810 return 0;
25811 if (!check_attr ("Advanced SIMD type"))
497f281c 25812 return 0;
4cea5b8c
RS
25813 if (!check_attr ("SVE type"))
25814 return 0;
25815 if (!check_attr ("SVE sizeless type"))
25816 return 0;
497f281c
SE
25817 return 1;
25818}
25819
3bac1e20
SE
25820/* Implement TARGET_GET_MULTILIB_ABI_NAME */
25821
25822static const char *
25823aarch64_get_multilib_abi_name (void)
25824{
25825 if (TARGET_BIG_END)
25826 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
25827 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
25828}
25829
e76c8e56
JJ
25830/* Implement TARGET_STACK_PROTECT_GUARD. In case of a
25831 global variable based guard use the default else
25832 return a null tree. */
25833static tree
25834aarch64_stack_protect_guard (void)
25835{
25836 if (aarch64_stack_protector_guard == SSP_GLOBAL)
25837 return default_stack_protect_guard ();
25838
25839 return NULL_TREE;
25840}
25841
98698967
SMW
25842/* Return the diagnostic message string if conversion from FROMTYPE to
25843 TOTYPE is not allowed, NULL otherwise. */
25844
25845static const char *
25846aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
25847{
25848 if (element_mode (fromtype) != element_mode (totype))
25849 {
25850 /* Do not allow conversions to/from BFmode scalar types. */
25851 if (TYPE_MODE (fromtype) == BFmode)
25852 return N_("invalid conversion from type %<bfloat16_t%>");
25853 if (TYPE_MODE (totype) == BFmode)
25854 return N_("invalid conversion to type %<bfloat16_t%>");
25855 }
25856
25857 /* Conversion allowed. */
25858 return NULL;
25859}
25860
25861/* Return the diagnostic message string if the unary operation OP is
25862 not permitted on TYPE, NULL otherwise. */
25863
25864static const char *
25865aarch64_invalid_unary_op (int op, const_tree type)
25866{
25867 /* Reject all single-operand operations on BFmode except for &. */
25868 if (element_mode (type) == BFmode && op != ADDR_EXPR)
25869 return N_("operation not permitted on type %<bfloat16_t%>");
25870
25871 /* Operation allowed. */
25872 return NULL;
25873}
25874
25875/* Return the diagnostic message string if the binary operation OP is
25876 not permitted on TYPE1 and TYPE2, NULL otherwise. */
25877
25878static const char *
25879aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
25880 const_tree type2)
25881{
25882 /* Reject all 2-operand operations on BFmode. */
25883 if (element_mode (type1) == BFmode
25884 || element_mode (type2) == BFmode)
25885 return N_("operation not permitted on type %<bfloat16_t%>");
25886
38e62001
RS
25887 if (VECTOR_TYPE_P (type1)
25888 && VECTOR_TYPE_P (type2)
25889 && !TYPE_INDIVISIBLE_P (type1)
25890 && !TYPE_INDIVISIBLE_P (type2)
25891 && (aarch64_sve::builtin_type_p (type1)
25892 != aarch64_sve::builtin_type_p (type2)))
25893 return N_("cannot combine GNU and SVE vectors in a binary operation");
25894
98698967
SMW
25895 /* Operation allowed. */
25896 return NULL;
25897}
25898
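/* Editorial illustration (not from the original source): with these
   hooks, user code such as

       bfloat16_t x;
       float f = x;        // error: invalid conversion from type 'bfloat16_t'
       bfloat16_t y = -x;  // error: operation not permitted on type 'bfloat16_t'
       bfloat16_t *p = &x; // OK: taking the address is the one allowed unary op

   is diagnosed, while moves, loads and stores of bfloat16_t values and
   the dedicated BF16 intrinsics remain available.  */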
3bd87832
MM
25899/* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
25900 compiler that we automatically ignore the top byte of our pointers, which
25901 allows using -fsanitize=hwaddress. */
25902bool
25903aarch64_can_tag_addresses ()
25904{
25905 return !TARGET_ILP32;
25906}
25907
32efff9f
SD
25908/* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
25909 section at the end if needed. */
25910#define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
25911#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
25912#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
25913void
25914aarch64_file_end_indicate_exec_stack ()
25915{
25916 file_end_indicate_exec_stack ();
25917
25918 unsigned feature_1_and = 0;
25919 if (aarch64_bti_enabled ())
25920 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
25921
25922 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
25923 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
25924
25925 if (feature_1_and)
25926 {
25927 /* Generate .note.gnu.property section. */
25928 switch_to_section (get_section (".note.gnu.property",
25929 SECTION_NOTYPE, NULL));
25930
25931 /* PT_NOTE header: namesz, descsz, type.
25932 namesz = 4 ("GNU\0")
25933 descsz = 16 (Size of the program property array)
25934 [(12 + padding) * Number of array elements]
25935 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
25936 assemble_align (POINTER_SIZE);
25937 assemble_integer (GEN_INT (4), 4, 32, 1);
25938 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
25939 assemble_integer (GEN_INT (5), 4, 32, 1);
25940
25941 /* PT_NOTE name. */
25942 assemble_string ("GNU", 4);
25943
25944 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
25945 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
25946 datasz = 4
25947 data = feature_1_and. */
25948 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
25949 assemble_integer (GEN_INT (4), 4, 32, 1);
25950 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
25951
25952 /* Pad the size of the note to the required alignment. */
25953 assemble_align (POINTER_SIZE);
25954 }
25955}
25956#undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
25957#undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
25958#undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
e76c8e56 25959
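/* Editorial sketch (not from the original source) of the note emitted
   above when both BTI and PAC-RET are enabled (feature_1_and == 3); the
   exact directives depend on the configured assembler:

	.section  .note.gnu.property
	.align  3
	.word  4		// namesz ("GNU\0")
	.word  16		// descsz (12 rounded up to POINTER_BYTES)
	.word  5		// NT_GNU_PROPERTY_TYPE_0
	.string  "GNU"
	.word  0xc0000000	// GNU_PROPERTY_AARCH64_FEATURE_1_AND
	.word  4		// datasz
	.word  3		// BTI | PAC
	.align  3		// pad the note to 8 bytes
*/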
be178ecd
MM
25960/* Helper function for straight line speculation.
25961 Return what barrier should be emitted for straight line speculation
25962 mitigation.
25963 When not mitigating against straight line speculation this function returns
25964 an empty string.
25965 When mitigating against straight line speculation, use:
25966 * SB when the v8.5-A SB extension is enabled.
25967 * DSB+ISB otherwise. */
25968const char *
25969aarch64_sls_barrier (int mitigation_required)
25970{
25971 return mitigation_required
25972 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
25973 : "";
25974}
25975
96b7f495
MM
25976static GTY (()) tree aarch64_sls_shared_thunks[30];
25977static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
25978const char *indirect_symbol_names[30] = {
25979 "__call_indirect_x0",
25980 "__call_indirect_x1",
25981 "__call_indirect_x2",
25982 "__call_indirect_x3",
25983 "__call_indirect_x4",
25984 "__call_indirect_x5",
25985 "__call_indirect_x6",
25986 "__call_indirect_x7",
25987 "__call_indirect_x8",
25988 "__call_indirect_x9",
25989 "__call_indirect_x10",
25990 "__call_indirect_x11",
25991 "__call_indirect_x12",
25992 "__call_indirect_x13",
25993 "__call_indirect_x14",
25994 "__call_indirect_x15",
25995 "", /* "__call_indirect_x16", */
25996 "", /* "__call_indirect_x17", */
25997 "__call_indirect_x18",
25998 "__call_indirect_x19",
25999 "__call_indirect_x20",
26000 "__call_indirect_x21",
26001 "__call_indirect_x22",
26002 "__call_indirect_x23",
26003 "__call_indirect_x24",
26004 "__call_indirect_x25",
26005 "__call_indirect_x26",
26006 "__call_indirect_x27",
26007 "__call_indirect_x28",
26008 "__call_indirect_x29",
26009};
26010
26011/* Function to create a BLR thunk. This thunk is used to mitigate straight
26012 line speculation. Instead of a simple BLR that can be speculated past,
26013 we emit a BL to this thunk, and this thunk contains a BR to the relevant
26014 register. These thunks have the relevant speculation barriers put after
26015 their indirect branch so that speculation is blocked.
26016
26017 We use such a thunk so the speculation barriers are kept off the
26018 architecturally executed path in order to reduce the performance overhead.
26019
26020 When optimizing for size we use stubs shared by the linked object.
26021 When optimizing for performance we emit stubs for each function in the hope
26022 that the branch predictor can better train on jumps specific to a given
26023 function. */
26024rtx
26025aarch64_sls_create_blr_label (int regnum)
26026{
26027 gcc_assert (STUB_REGNUM_P (regnum));
26028 if (optimize_function_for_size_p (cfun))
26029 {
26030 /* For the thunks shared between different functions in this compilation
26031 unit we use a named symbol -- this is just for users to more easily
26032 understand the generated assembly. */
26033 aarch64_sls_shared_thunks_needed = true;
26034 const char *thunk_name = indirect_symbol_names[regnum];
26035 if (aarch64_sls_shared_thunks[regnum] == NULL)
26036 {
26037 /* Build a decl representing this function stub and record it for
26038 later. We build a decl here so we can use the GCC machinery for
26039 handling sections automatically (through `get_named_section` and
26040 `make_decl_one_only`). That saves us a lot of trouble handling
26041 the specifics of different output file formats. */
26042 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
26043 get_identifier (thunk_name),
26044 build_function_type_list (void_type_node,
26045 NULL_TREE));
26046 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
26047 NULL_TREE, void_type_node);
26048 TREE_PUBLIC (decl) = 1;
26049 TREE_STATIC (decl) = 1;
26050 DECL_IGNORED_P (decl) = 1;
26051 DECL_ARTIFICIAL (decl) = 1;
26052 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
26053 resolve_unique_section (decl, 0, false);
26054 aarch64_sls_shared_thunks[regnum] = decl;
26055 }
26056
26057 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
26058 }
26059
26060 if (cfun->machine->call_via[regnum] == NULL)
26061 cfun->machine->call_via[regnum]
26062 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
26063 return cfun->machine->call_via[regnum];
26064}
26065
26066/* Helper function for aarch64_sls_emit_blr_function_thunks and
26067 aarch64_sls_emit_shared_blr_thunks below. */
26068static void
26069aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
26070{
26071 /* Save in x16 and branch to that function so this transformation does
26072 not prevent jumping to `BTI c` instructions. */
26073 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
26074 asm_fprintf (out_file, "\tbr\tx16\n");
26075}
26076
26077/* Emit all BLR stubs for this particular function.
26078 Here we emit all the BLR stubs needed for the current function. Since we
26079 emit these stubs in a consecutive block we know there will be no speculation
26080 gadgets between each stub, and hence we only emit a speculation barrier at
26081 the end of the stub sequences.
26082
26083 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
26084void
26085aarch64_sls_emit_blr_function_thunks (FILE *out_file)
26086{
26087 if (! aarch64_harden_sls_blr_p ())
26088 return;
26089
26090 bool any_functions_emitted = false;
26091 /* We must save and restore the current function section since this assembly
26092 is emitted at the end of the function. This means it can be emitted *just
26093 after* the cold section of a function. That cold part would be emitted in
26094 a different section. That switch would trigger a `.cfi_endproc` directive
26095 to be emitted in the original section and a `.cfi_startproc` directive to
26096 be emitted in the new section. Switching to the original section without
26097 restoring would mean that the `.cfi_endproc` emitted as a function ends
26098 would happen in a different section -- leaving an unmatched
26099 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
26100 in the standard text section. */
26101 section *save_text_section = in_section;
26102 switch_to_section (function_section (current_function_decl));
26103 for (int regnum = 0; regnum < 30; ++regnum)
26104 {
26105 rtx specu_label = cfun->machine->call_via[regnum];
26106 if (specu_label == NULL)
26107 continue;
26108
26109 targetm.asm_out.print_operand (out_file, specu_label, 0);
26110 asm_fprintf (out_file, ":\n");
26111 aarch64_sls_emit_function_stub (out_file, regnum);
26112 any_functions_emitted = true;
26113 }
26114 if (any_functions_emitted)
26115 /* Can use the SB if needs be here, since this stub will only be used
26116 by the current function, and hence for the current target. */
26117 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
26118 switch_to_section (save_text_section);
26119}
26120
26121/* Emit shared BLR stubs for the current compilation unit.
26122 Over the course of compiling this unit we may have converted some BLR
26123 instructions to a BL to a shared stub function. This is where we emit those
26124 stub functions.
26125 This function is for the stubs shared between different functions in this
26126 compilation unit. We share when optimizing for size instead of speed.
26127
26128 This function is called through the TARGET_ASM_FILE_END hook. */
26129void
26130aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
26131{
26132 if (! aarch64_sls_shared_thunks_needed)
26133 return;
26134
26135 for (int regnum = 0; regnum < 30; ++regnum)
26136 {
26137 tree decl = aarch64_sls_shared_thunks[regnum];
26138 if (!decl)
26139 continue;
26140
26141 const char *name = indirect_symbol_names[regnum];
26142 switch_to_section (get_named_section (decl, NULL, 0));
26143 ASM_OUTPUT_ALIGN (out_file, 2);
26144 targetm.asm_out.globalize_label (out_file, name);
26145 /* Only emits if the compiler is configured for an assembler that can
26146 handle visibility directives. */
26147 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
26148 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
26149 ASM_OUTPUT_LABEL (out_file, name);
26150 aarch64_sls_emit_function_stub (out_file, regnum);
26151 /* Use the most conservative target to ensure it can always be used by any
26152 function in the translation unit. */
26153 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
26154 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
26155 }
26156}
26157
26158/* Implement TARGET_ASM_FILE_END. */
26159void
26160aarch64_asm_file_end ()
26161{
26162 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
26163 /* Since this function will be called for the ASM_FILE_END hook, we ensure
26164 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
26165 for FreeBSD) still gets called. */
26166#ifdef TARGET_ASM_FILE_END
26167 TARGET_ASM_FILE_END ();
26168#endif
26169}
26170
26171const char *
26172aarch64_indirect_call_asm (rtx addr)
26173{
26174 gcc_assert (REG_P (addr));
26175 if (aarch64_harden_sls_blr_p ())
26176 {
26177 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
26178 output_asm_insn ("bl\t%0", &stub_label);
26179 }
26180 else
26181 output_asm_insn ("blr\t%0", &addr);
26182 return "";
26183}
26184
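/* Editorial sketch (not from the original source): with -mharden-sls=blr
   and -Os, an indirect call through x1 becomes

	bl	__call_indirect_x1

   and the shared stub emitted by aarch64_sls_emit_shared_blr_thunks is

   __call_indirect_x1:
	mov	x16, x1
	br	x16
	dsb	sy
	isb

   so the only indirect branch is immediately followed by a speculation
   barrier, and because the stub uses BR rather than BLR the callee's RET
   still returns straight to the instruction after the original call.  */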
51b86113
DM
26185/* Target-specific selftests. */
26186
26187#if CHECKING_P
26188
26189namespace selftest {
26190
26191/* Selftest for the RTL loader.
26192 Verify that the RTL loader copes with a dump from
26193 print_rtx_function. This is essentially just a test that class
26194 function_reader can handle a real dump, but it also verifies
26195 that lookup_reg_by_dump_name correctly handles hard regs.
26196 The presence of hard reg names in the dump means that the test is
26197 target-specific, hence it is in this file. */
26198
26199static void
26200aarch64_test_loading_full_dump ()
26201{
26202 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
26203
26204 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
26205
26206 rtx_insn *insn_1 = get_insn_by_uid (1);
26207 ASSERT_EQ (NOTE, GET_CODE (insn_1));
26208
26209 rtx_insn *insn_15 = get_insn_by_uid (15);
26210 ASSERT_EQ (INSN, GET_CODE (insn_15));
26211 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
26212
26213 /* Verify crtl->return_rtx. */
26214 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
26215 ASSERT_EQ (0, REGNO (crtl->return_rtx));
26216 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
26217}
26218
83d796d3
RS
26219/* Test the fractional_cost class. */
26220
26221static void
26222aarch64_test_fractional_cost ()
26223{
26224 using cf = fractional_cost;
26225
26226 ASSERT_EQ (cf (0, 20), 0);
26227
26228 ASSERT_EQ (cf (4, 2), 2);
26229 ASSERT_EQ (3, cf (9, 3));
26230
26231 ASSERT_NE (cf (5, 2), 2);
26232 ASSERT_NE (3, cf (8, 3));
26233
26234 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
26235 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
26236 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
26237
26238 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
26239 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
26240 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
26241 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
26242 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
26243 ASSERT_EQ (3 - cf (10, 3), 0);
26244
26245 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
26246 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
26247
26248 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
26249 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
26250 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
26251 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
26252 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
26253 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
26254 ASSERT_TRUE (cf (239, 240) < 1);
26255 ASSERT_FALSE (cf (240, 240) < 1);
26256 ASSERT_FALSE (cf (241, 240) < 1);
26257 ASSERT_FALSE (2 < cf (207, 104));
26258 ASSERT_FALSE (2 < cf (208, 104));
26259 ASSERT_TRUE (2 < cf (209, 104));
26260
 26261 ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
 26262 ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
 26263 ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
 26264 ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
 26265 ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
 26266 ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
 26267 ASSERT_TRUE (cf (239, 240) <= 1);
 26268 ASSERT_TRUE (cf (240, 240) <= 1);
 26269 ASSERT_FALSE (cf (241, 240) <= 1);
 26270 ASSERT_FALSE (2 <= cf (207, 104));
 26271 ASSERT_TRUE (2 <= cf (208, 104));
 26272 ASSERT_TRUE (2 <= cf (209, 104));
26273
26274 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
26275 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
26276 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
26277 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
26278 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
26279 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
26280 ASSERT_FALSE (cf (239, 240) >= 1);
26281 ASSERT_TRUE (cf (240, 240) >= 1);
26282 ASSERT_TRUE (cf (241, 240) >= 1);
26283 ASSERT_TRUE (2 >= cf (207, 104));
26284 ASSERT_TRUE (2 >= cf (208, 104));
26285 ASSERT_FALSE (2 >= cf (209, 104));
26286
26287 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
26288 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
26289 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
26290 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
26291 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
26292 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
26293 ASSERT_FALSE (cf (239, 240) > 1);
26294 ASSERT_FALSE (cf (240, 240) > 1);
26295 ASSERT_TRUE (cf (241, 240) > 1);
26296 ASSERT_TRUE (2 > cf (207, 104));
26297 ASSERT_FALSE (2 > cf (208, 104));
26298 ASSERT_FALSE (2 > cf (209, 104));
26299
26300 ASSERT_EQ (cf (1, 2).ceil (), 1);
26301 ASSERT_EQ (cf (11, 7).ceil (), 2);
26302 ASSERT_EQ (cf (20, 1).ceil (), 20);
26303 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
26304 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
26305 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
26306 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
26307 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
26308
26309 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
26310}
26311
51b86113
DM
26312/* Run all target-specific selftests. */
26313
26314static void
26315aarch64_run_selftests (void)
26316{
26317 aarch64_test_loading_full_dump ();
83d796d3 26318 aarch64_test_fractional_cost ();
51b86113
DM
26319}
26320
26321} // namespace selftest
26322
26323#endif /* #if CHECKING_P */
26324
cd0b2d36
RR
26325#undef TARGET_STACK_PROTECT_GUARD
26326#define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
26327
43e9d192
IB
26328#undef TARGET_ADDRESS_COST
26329#define TARGET_ADDRESS_COST aarch64_address_cost
26330
26331/* This hook determines whether unnamed bitfields affect the alignment
26332 of the containing structure. The hook returns true if the structure
26333 should inherit the alignment requirements of an unnamed bitfield's
26334 type. */
26335#undef TARGET_ALIGN_ANON_BITFIELD
26336#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
26337
26338#undef TARGET_ASM_ALIGNED_DI_OP
26339#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
26340
26341#undef TARGET_ASM_ALIGNED_HI_OP
26342#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
26343
26344#undef TARGET_ASM_ALIGNED_SI_OP
26345#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
26346
26347#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
26348#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
26349 hook_bool_const_tree_hwi_hwi_const_tree_true
26350
e1c1ecb0
KT
26351#undef TARGET_ASM_FILE_START
26352#define TARGET_ASM_FILE_START aarch64_start_file
26353
43e9d192
IB
26354#undef TARGET_ASM_OUTPUT_MI_THUNK
26355#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
26356
26357#undef TARGET_ASM_SELECT_RTX_SECTION
26358#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
26359
26360#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
26361#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
26362
c292cfe5
SN
26363#undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
26364#define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
26365
43e9d192
IB
26366#undef TARGET_BUILD_BUILTIN_VA_LIST
26367#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
26368
26369#undef TARGET_CALLEE_COPIES
7256c719 26370#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
43e9d192
IB
26371
26372#undef TARGET_CAN_ELIMINATE
26373#define TARGET_CAN_ELIMINATE aarch64_can_eliminate
26374
1fd8d40c
KT
26375#undef TARGET_CAN_INLINE_P
26376#define TARGET_CAN_INLINE_P aarch64_can_inline_p
26377
43e9d192
IB
26378#undef TARGET_CANNOT_FORCE_CONST_MEM
26379#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
26380
50487d79
EM
26381#undef TARGET_CASE_VALUES_THRESHOLD
26382#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
26383
43e9d192
IB
26384#undef TARGET_CONDITIONAL_REGISTER_USAGE
26385#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
26386
38e62001
RS
26387#undef TARGET_MEMBER_TYPE_FORCES_BLK
26388#define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
26389
43e9d192
IB
26390/* Only the least significant bit is used for initialization guard
26391 variables. */
26392#undef TARGET_CXX_GUARD_MASK_BIT
26393#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
26394
26395#undef TARGET_C_MODE_FOR_SUFFIX
26396#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
26397
26398#ifdef TARGET_BIG_ENDIAN_DEFAULT
26399#undef TARGET_DEFAULT_TARGET_FLAGS
26400#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
26401#endif
26402
26403#undef TARGET_CLASS_MAX_NREGS
26404#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
26405
119103ca
JG
26406#undef TARGET_BUILTIN_DECL
26407#define TARGET_BUILTIN_DECL aarch64_builtin_decl
26408
a6fc00da
BH
26409#undef TARGET_BUILTIN_RECIPROCAL
26410#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
26411
11e554b3
JG
26412#undef TARGET_C_EXCESS_PRECISION
26413#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
26414
43e9d192
IB
26415#undef TARGET_EXPAND_BUILTIN
26416#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
26417
26418#undef TARGET_EXPAND_BUILTIN_VA_START
26419#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
26420
9697e620
JG
26421#undef TARGET_FOLD_BUILTIN
26422#define TARGET_FOLD_BUILTIN aarch64_fold_builtin
26423
43e9d192
IB
26424#undef TARGET_FUNCTION_ARG
26425#define TARGET_FUNCTION_ARG aarch64_function_arg
26426
26427#undef TARGET_FUNCTION_ARG_ADVANCE
26428#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
26429
26430#undef TARGET_FUNCTION_ARG_BOUNDARY
26431#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
26432
76b0cbf8
RS
26433#undef TARGET_FUNCTION_ARG_PADDING
26434#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
26435
43cacb12
RS
26436#undef TARGET_GET_RAW_RESULT_MODE
26437#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
26438#undef TARGET_GET_RAW_ARG_MODE
26439#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
26440
43e9d192
IB
26441#undef TARGET_FUNCTION_OK_FOR_SIBCALL
26442#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
26443
26444#undef TARGET_FUNCTION_VALUE
26445#define TARGET_FUNCTION_VALUE aarch64_function_value
26446
26447#undef TARGET_FUNCTION_VALUE_REGNO_P
26448#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
26449
fc72cba7
AL
26450#undef TARGET_GIMPLE_FOLD_BUILTIN
26451#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
0ac198d3 26452
43e9d192
IB
26453#undef TARGET_GIMPLIFY_VA_ARG_EXPR
26454#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
26455
26456#undef TARGET_INIT_BUILTINS
26457#define TARGET_INIT_BUILTINS aarch64_init_builtins
26458
c64f7d37
WD
26459#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
26460#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
26461 aarch64_ira_change_pseudo_allocno_class
26462
43e9d192
IB
26463#undef TARGET_LEGITIMATE_ADDRESS_P
26464#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
26465
26466#undef TARGET_LEGITIMATE_CONSTANT_P
26467#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
26468
491ec060
WD
26469#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
26470#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
26471 aarch64_legitimize_address_displacement
26472
43e9d192
IB
26473#undef TARGET_LIBGCC_CMP_RETURN_MODE
26474#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
26475
11e554b3
JG
26476#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
26477#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
26478aarch64_libgcc_floating_mode_supported_p
26479
ac2b960f
YZ
26480#undef TARGET_MANGLE_TYPE
26481#define TARGET_MANGLE_TYPE aarch64_mangle_type
26482
98698967
SMW
26483#undef TARGET_INVALID_CONVERSION
26484#define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
26485
26486#undef TARGET_INVALID_UNARY_OP
26487#define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
26488
26489#undef TARGET_INVALID_BINARY_OP
26490#define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
26491
65ef05d0
RS
26492#undef TARGET_VERIFY_TYPE_CONTEXT
26493#define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
26494
43e9d192
IB
26495#undef TARGET_MEMORY_MOVE_COST
26496#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
26497
26e0ff94
WD
26498#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
26499#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
26500
43e9d192
IB
26501#undef TARGET_MUST_PASS_IN_STACK
26502#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
26503
26504/* This target hook should return true if accesses to volatile bitfields
26505 should use the narrowest mode possible. It should return false if these
26506 accesses should use the bitfield container type. */
26507#undef TARGET_NARROW_VOLATILE_BITFIELD
26508#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
26509
26510#undef TARGET_OPTION_OVERRIDE
26511#define TARGET_OPTION_OVERRIDE aarch64_override_options
26512
26513#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
26514#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
26515 aarch64_override_options_after_change
26516
29a14a1a
MK
26517#undef TARGET_OFFLOAD_OPTIONS
26518#define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
26519
361fb3ee
KT
26520#undef TARGET_OPTION_SAVE
26521#define TARGET_OPTION_SAVE aarch64_option_save
26522
26523#undef TARGET_OPTION_RESTORE
26524#define TARGET_OPTION_RESTORE aarch64_option_restore
26525
26526#undef TARGET_OPTION_PRINT
26527#define TARGET_OPTION_PRINT aarch64_option_print
26528
5a2c8331
KT
26529#undef TARGET_OPTION_VALID_ATTRIBUTE_P
26530#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
26531
d78006d9
KT
26532#undef TARGET_SET_CURRENT_FUNCTION
26533#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
26534
43e9d192
IB
26535#undef TARGET_PASS_BY_REFERENCE
26536#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
26537
26538#undef TARGET_PREFERRED_RELOAD_CLASS
26539#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
26540
cee66c68
WD
26541#undef TARGET_SCHED_REASSOCIATION_WIDTH
26542#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
26543
c2ec330c
AL
26544#undef TARGET_PROMOTED_TYPE
26545#define TARGET_PROMOTED_TYPE aarch64_promoted_type
26546
43e9d192
IB
26547#undef TARGET_SECONDARY_RELOAD
26548#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
26549
26550#undef TARGET_SHIFT_TRUNCATION_MASK
26551#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
26552
26553#undef TARGET_SETUP_INCOMING_VARARGS
26554#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
26555
26556#undef TARGET_STRUCT_VALUE_RTX
26557#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
26558
26559#undef TARGET_REGISTER_MOVE_COST
26560#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
26561
26562#undef TARGET_RETURN_IN_MEMORY
26563#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
26564
26565#undef TARGET_RETURN_IN_MSB
26566#define TARGET_RETURN_IN_MSB aarch64_return_in_msb
26567
26568#undef TARGET_RTX_COSTS
7cc2145f 26569#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
43e9d192 26570
2e5f8203
JG
26571#undef TARGET_SCALAR_MODE_SUPPORTED_P
26572#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
26573
d126a4ae
AP
26574#undef TARGET_SCHED_ISSUE_RATE
26575#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
26576
d0bc0cb6
RS
26577#undef TARGET_SCHED_VARIABLE_ISSUE
26578#define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
26579
d03f7e44
MK
26580#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
26581#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
26582 aarch64_sched_first_cycle_multipass_dfa_lookahead
26583
2d6bc7fa
KT
26584#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
26585#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
26586 aarch64_first_cycle_multipass_dfa_lookahead_guard
26587
827ab47a
KT
26588#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
26589#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
26590 aarch64_get_separate_components
26591
26592#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
26593#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
26594 aarch64_components_for_bb
26595
26596#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
26597#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
26598 aarch64_disqualify_components
26599
26600#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
26601#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
26602 aarch64_emit_prologue_components
26603
26604#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
26605#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
26606 aarch64_emit_epilogue_components
26607
26608#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
26609#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
26610 aarch64_set_handled_components
26611
43e9d192
IB
26612#undef TARGET_TRAMPOLINE_INIT
26613#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
26614
26615#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
26616#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
26617
26618#undef TARGET_VECTOR_MODE_SUPPORTED_P
26619#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
26620
482b2b43
RS
26621#undef TARGET_COMPATIBLE_VECTOR_TYPES_P
26622#define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
26623
7df76747
N
26624#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
26625#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
26626 aarch64_builtin_support_vector_misalignment
26627
9f4cbab8
RS
26628#undef TARGET_ARRAY_MODE
26629#define TARGET_ARRAY_MODE aarch64_array_mode
26630
43e9d192
IB
26631#undef TARGET_ARRAY_MODE_SUPPORTED_P
26632#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
26633
6239dd05
RS
26634#undef TARGET_VECTORIZE_CREATE_COSTS
26635#define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
50a525b5 26636
8990e73a
TB
26637#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
26638#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
26639 aarch64_builtin_vectorization_cost
26640
43e9d192
IB
26641#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
26642#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
26643
42fc9a7f
JG
26644#undef TARGET_VECTORIZE_BUILTINS
26645#define TARGET_VECTORIZE_BUILTINS
26646
26647#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
26648#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
26649 aarch64_builtin_vectorized_function
26650
e021fb86
RS
26651#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
26652#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
26653 aarch64_autovectorize_vector_modes
3b357264 26654
aa87aced
KV
26655#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
26656#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
26657 aarch64_atomic_assign_expand_fenv
26658
43e9d192
IB
26659/* Section anchor support. */
26660
26661#undef TARGET_MIN_ANCHOR_OFFSET
26662#define TARGET_MIN_ANCHOR_OFFSET -256
26663
26664/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
26665 byte offset; we can do much more for larger data types, but have no way
26666 to determine the size of the access. We assume accesses are aligned. */
26667#undef TARGET_MAX_ANCHOR_OFFSET
26668#define TARGET_MAX_ANCHOR_OFFSET 4095
26669
db0253a4
TB
26670#undef TARGET_VECTOR_ALIGNMENT
26671#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
26672
43cacb12
RS
26673#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
26674#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
26675 aarch64_vectorize_preferred_vector_alignment
db0253a4
TB
26676#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
26677#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
26678 aarch64_simd_vector_alignment_reachable
26679
88b08073
JG
26680/* vec_perm support. */
26681
f151c9e1
RS
26682#undef TARGET_VECTORIZE_VEC_PERM_CONST
26683#define TARGET_VECTORIZE_VEC_PERM_CONST \
26684 aarch64_vectorize_vec_perm_const
88b08073 26685
74166aab
RS
26686#undef TARGET_VECTORIZE_RELATED_MODE
26687#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
43cacb12
RS
26688#undef TARGET_VECTORIZE_GET_MASK_MODE
26689#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
76a34e3f
RS
26690#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
26691#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
26692 aarch64_empty_mask_is_expensive
6a86928d
RS
26693#undef TARGET_PREFERRED_ELSE_VALUE
26694#define TARGET_PREFERRED_ELSE_VALUE \
26695 aarch64_preferred_else_value
43cacb12 26696
c2ec330c
AL
26697#undef TARGET_INIT_LIBFUNCS
26698#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
70f09188 26699
706b2314 26700#undef TARGET_FIXED_CONDITION_CODE_REGS
70f09188
AP
26701#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
26702
5cb74e90
RR
26703#undef TARGET_FLAGS_REGNUM
26704#define TARGET_FLAGS_REGNUM CC_REGNUM
26705
78607708
TV
26706#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
26707#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
26708
a3125fc2
CL
26709#undef TARGET_ASAN_SHADOW_OFFSET
26710#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
26711
0c4ec427
RE
26712#undef TARGET_LEGITIMIZE_ADDRESS
26713#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
26714
b48d6421
KT
26715#undef TARGET_SCHED_CAN_SPECULATE_INSN
26716#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
26717
594bdd53
FY
26718#undef TARGET_CAN_USE_DOLOOP_P
26719#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
26720
9bca63d4
WD
26721#undef TARGET_SCHED_ADJUST_PRIORITY
26722#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
26723
6a569cdd
KT
26724#undef TARGET_SCHED_MACRO_FUSION_P
26725#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
26726
26727#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
26728#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
26729
350013bc
BC
26730#undef TARGET_SCHED_FUSION_PRIORITY
26731#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
26732
7b841a12
JW
26733#undef TARGET_UNSPEC_MAY_TRAP_P
26734#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
26735
1b1e81f8
JW
26736#undef TARGET_USE_PSEUDO_PIC_REG
26737#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
26738
cc8ca59e
JB
26739#undef TARGET_PRINT_OPERAND
26740#define TARGET_PRINT_OPERAND aarch64_print_operand
26741
26742#undef TARGET_PRINT_OPERAND_ADDRESS
26743#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
26744
74b27d8e
RS
26745#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
26746#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
26747
ee62a5a6
RS
26748#undef TARGET_OPTAB_SUPPORTED_P
26749#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
26750
43203dea
RR
26751#undef TARGET_OMIT_STRUCT_RETURN_REG
26752#define TARGET_OMIT_STRUCT_RETURN_REG true
26753
43cacb12
RS
26754#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
26755#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
26756 aarch64_dwarf_poly_indeterminate_value
26757
f46fe37e
EB
26758/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
26759#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
26760#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
26761
c43f4279
RS
26762#undef TARGET_HARD_REGNO_NREGS
26763#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
f939c3e6
RS
26764#undef TARGET_HARD_REGNO_MODE_OK
26765#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
26766
99e1629f
RS
26767#undef TARGET_MODES_TIEABLE_P
26768#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
26769
80ec73f4
RS
26770#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
26771#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
26772 aarch64_hard_regno_call_part_clobbered
26773
5a5a3bc5
RS
26774#undef TARGET_INSN_CALLEE_ABI
26775#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
b3650d40 26776
58e17cf8
RS
26777#undef TARGET_CONSTANT_ALIGNMENT
26778#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
26779
8c6e3b23
TC
26780#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
26781#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
26782 aarch64_stack_clash_protection_alloca_probe_range
26783
43cacb12
RS
26784#undef TARGET_COMPUTE_PRESSURE_CLASSES
26785#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
26786
26787#undef TARGET_CAN_CHANGE_MODE_CLASS
26788#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
26789
5cce8171
RS
26790#undef TARGET_SELECT_EARLY_REMAT_MODES
26791#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
26792
c0111dc4
RE
26793#undef TARGET_SPECULATION_SAFE_VALUE
26794#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
26795
2d56d6ba
KT
26796#undef TARGET_ESTIMATED_POLY_VALUE
26797#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
26798
a0d0b980
SE
26799#undef TARGET_ATTRIBUTE_TABLE
26800#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
26801
d9186814
SE
26802#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
26803#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
26804 aarch64_simd_clone_compute_vecsize_and_simdlen
26805
26806#undef TARGET_SIMD_CLONE_ADJUST
26807#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
26808
26809#undef TARGET_SIMD_CLONE_USABLE
26810#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
26811
497f281c
SE
26812#undef TARGET_COMP_TYPE_ATTRIBUTES
26813#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
26814
3bac1e20
SE
26815#undef TARGET_GET_MULTILIB_ABI_NAME
26816#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
26817
002ffd3c
RS
26818#undef TARGET_FNTYPE_ABI
26819#define TARGET_FNTYPE_ABI aarch64_fntype_abi
26820
3bd87832
MM
26821#undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
26822#define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
26823
51b86113
DM
26824#if CHECKING_P
26825#undef TARGET_RUN_TARGET_SELFTESTS
26826#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
26827#endif /* #if CHECKING_P */
26828
8fc16d72
ST
26829#undef TARGET_ASM_POST_CFI_STARTPROC
26830#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
26831
c600df9a
RS
26832#undef TARGET_STRICT_ARGUMENT_NAMING
26833#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
26834
1a7a35c7
RH
26835#undef TARGET_MD_ASM_ADJUST
26836#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
26837
96b7f495
MM
26838#undef TARGET_ASM_FILE_END
26839#define TARGET_ASM_FILE_END aarch64_asm_file_end
26840
26841#undef TARGET_ASM_FUNCTION_EPILOGUE
26842#define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
26843
43e9d192
IB
26844struct gcc_target targetm = TARGET_INITIALIZER;
26845
26846#include "gt-aarch64.h"